Skip to content

Commit

Permalink
Add one more example skeleton for file content extraction. (#55)
Browse files Browse the repository at this point in the history
Add one more example skeleton for file content extraction.
  • Loading branch information
badmonster0 authored Mar 7, 2025
1 parent 9216313 commit 46aa004
Show file tree
Hide file tree
Showing 8 changed files with 157 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/manual_extraction/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Postgres database address for cocoindex
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
30 changes: 30 additions & 0 deletions examples/manual_extraction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
Simple example for cocoindex: extract structured information from PDF manuals by first converting them to Markdown.

## Prerequisite
Follow the [Setup Postgres](../../#setup-postgres) section in the root directory to set up the Postgres database.

## Run

Install dependencies:

```bash
pip install -e .
```

Setup:

```bash
python manual_extraction.py cocoindex setup
```

Update index:

```bash
python manual_extraction.py cocoindex update
```

Run:

```bash
python manual_extraction.py
```
119 changes: 119 additions & 0 deletions examples/manual_extraction/manual_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import dataclasses
import os
import tempfile

from dotenv import load_dotenv
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

import cocoindex

class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """Function spec for turning the bytes of a PDF into Markdown text."""

@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    """Executor for PdfToMarkdown.

    ``prepare`` builds a marker ``PdfConverter`` and stores it on the
    instance; ``__call__`` uses that converter to render a PDF to Markdown.
    """

    spec: PdfToMarkdown
    _converter: PdfConverter

    def prepare(self):
        """Create the ``PdfConverter`` from an empty marker configuration."""
        config_parser = ConfigParser({})
        self._converter = PdfConverter(
            create_model_dict(),
            config=config_parser.generate_config_dict(),
        )

    def __call__(self, content: bytes) -> str:
        """Convert a PDF document to Markdown.

        Args:
            content: Raw bytes of the PDF file.

        Returns:
            The text extracted from the rendered document.
        """
        # The converter is given a file path, so the bytes must be on disk.
        # Write them to a file that is fully closed before the converter opens
        # it: re-opening a still-open NamedTemporaryFile by name is not
        # portable (it fails on Windows). The directory and the file inside it
        # are removed when the ``with`` block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_path = os.path.join(temp_dir, "input.pdf")
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(content)
            text, _, _ = text_from_rendered(self._converter(pdf_path))
        return text

@dataclasses.dataclass
class ArgInfo:
    """A single argument of a documented method."""

    # Argument name.
    name: str
    # Free-text description of the argument.
    description: str

@dataclasses.dataclass
class MethodInfo:
    """A documented method (or function) together with its arguments."""

    # Method name.
    name: str
    # Arguments of the method, one ArgInfo each.
    args: list[ArgInfo]
    # Free-text description of the method.
    description: str

@dataclasses.dataclass
class ClassInfo:
    """A documented class together with its methods."""

    # Class name.
    name: str
    # Free-text description of the class.
    description: str
    # Methods belonging to the class.
    methods: list[MethodInfo]

@dataclasses.dataclass
class ManualInfo:
    """Structured information extracted from one manual."""

    # Title of the manual.
    title: str
    # Free-text description of the manual.
    description: str
    # Classes described in the manual.
    classes: list[ClassInfo]
    # Methods listed directly on the manual, outside any ClassInfo.
    methods: list[MethodInfo]


class ExtractManual(cocoindex.op.FunctionSpec):
    """Function spec for extracting manual information from Markdown text."""

@cocoindex.op.executor_class()
class ExtractManualExecutor:
    """Executor for ExtractManual.

    This is a skeleton: the input text is ignored and a fixed ManualInfo
    filled with placeholder strings is returned.
    """

    spec: ExtractManual

    def __call__(self, _markdown: str) -> ManualInfo:
        """Return a placeholder ManualInfo; ``_markdown`` is not inspected."""

        def placeholder_method() -> MethodInfo:
            # Build a fresh object on every call so the class-level and the
            # top-level method lists do not share instances.
            placeholder_arg = ArgInfo(
                name="arg_name_placeholder",
                description="arg_description_placeholder",
            )
            return MethodInfo(
                name="method_name_placeholder",
                args=[placeholder_arg],
                description="method_description_placeholder",
            )

        placeholder_class = ClassInfo(
            name="class_name_placeholder",
            description="class_description_placeholder",
            methods=[placeholder_method()],
        )
        top_level_methods = [placeholder_method()]

        return ManualInfo(
            title="title_placeholder",
            description="description_placeholder",
            classes=[placeholder_class],
            methods=top_level_methods,
        )

@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Define an example flow that extracts manual information from PDF files.

    Each source document is converted to Markdown with PdfToMarkdown, passed
    through ExtractManual, and the result is collected per filename and
    exported to Postgres under the name "manual_infos".
    """
    # The sample PDFs of this example live in the "manuals" directory next to
    # this script; they are read as binary content.
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="manuals", binary=True)
    )

    manual_infos = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["markdown"] = doc["content"].transform(PdfToMarkdown())
        doc["manual_info"] = doc["markdown"].transform(ExtractManual())
        manual_infos.collect(filename=doc["filename"], manual_info=doc["manual_info"])

    # One exported row per source file, keyed by its filename.
    manual_infos.export(
        "manual_infos",
        cocoindex.storages.Postgres(),
        primary_key_fields=["filename"],
    )

@cocoindex.main_fn()
def _run():
    """Entry point wrapped by ``cocoindex.main_fn``; the body itself does nothing."""

if __name__ == "__main__":
    # Load settings from the .env file, overriding variables already present
    # in the environment, before handing control to the entry point.
    load_dotenv(override=True)
    _run()
Binary file added examples/manual_extraction/manuals/array.pdf
Binary file not shown.
Binary file added examples/manual_extraction/manuals/base64.pdf
Binary file not shown.
Binary file added examples/manual_extraction/manuals/copy.pdf
Binary file not shown.
Binary file added examples/manual_extraction/manuals/glob.pdf
Binary file not shown.
6 changes: 6 additions & 0 deletions examples/manual_extraction/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[project]
name = "manual-extraction"
version = "0.1.0"
description = "Simple example for cocoindex: extract manual information from a Markdown."
requires-python = ">=3.10"
dependencies = ["cocoindex>=0.1.4", "python-dotenv>=1.0.1", "marker-pdf>=1.5.2"]

0 comments on commit 46aa004

Please sign in to comment.