Add one more example skeleton for file content extraction. (#55)

Add one more example skeleton for file content extraction.
cocoindex-io · Mar 7, 2025 · 46aa004 · 46aa004
1 parent 9216313
commit 46aa004
Show file tree

Hide file tree

Showing 8 changed files with 157 additions and 0 deletions.
diff --git a/examples/manual_extraction/.env b/examples/manual_extraction/.env
@@ -0,0 +1,2 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
diff --git a/examples/manual_extraction/README.md b/examples/manual_extraction/README.md
@@ -0,0 +1,30 @@
+Simple example for cocoindex: extract structured information from a Markdown file.
+
+## Prerequisite
+Follow [Setup Postgres](../../#setup-postgres) section on the root directory to setup Postgres database.
+
+## Run
+
+Install dependencies:
+
+```bash
+pip install -e .
+```
+
+Setup:
+
+```bash
+python manual_extraction.py cocoindex setup
+```
+
+Update index:
+
+```bash
+python manual_extraction.py cocoindex update
+```
+
+Run:
+
+```bash
+python manual_extraction.py
+```
diff --git a/examples/manual_extraction/manual_extraction.py b/examples/manual_extraction/manual_extraction.py
@@ -0,0 +1,119 @@
+import tempfile
+import dataclasses
+
+from dotenv import load_dotenv
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from marker.config.parser import ConfigParser
+
+import cocoindex
+
+class PdfToMarkdown(cocoindex.op.FunctionSpec):
+    """Convert a PDF to markdown."""
+
+@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
+class PdfToMarkdownExecutor:
+    """Executor for PdfToMarkdown."""
+
+    spec: PdfToMarkdown
+    _converter: PdfConverter
+
+    def prepare(self):
+        config_parser = ConfigParser({})
+        self._converter = PdfConverter(create_model_dict(), config=config_parser.generate_config_dict())
+
+    def __call__(self, content: bytes) -> str:
+        with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
+            temp_file.write(content)
+            temp_file.flush()
+            text, _, _ = text_from_rendered(self._converter(temp_file.name))
+            return text
+
+@dataclasses.dataclass
+class ArgInfo:
+    name: str
+    description: str
+
+@dataclasses.dataclass
+class MethodInfo:
+    name: str
+    args: list[ArgInfo]
+    description: str
+
+@dataclasses.dataclass
+class ClassInfo:
+    name: str
+    description: str
+    methods: list[MethodInfo]
+
+@dataclasses.dataclass
+class ManualInfo:
+    title: str
+    description: str
+    classes: list[ClassInfo]
+    methods: list[MethodInfo]
+
+
+class ExtractManual(cocoindex.op.FunctionSpec):
+    """Extract manual information from a Markdown."""
+
+@cocoindex.op.executor_class()
+class ExtractManualExecutor:
+    """Executor for ExtractManual."""
+
+    spec: ExtractManual
+
+    def __call__(self, _markdown: str) -> ManualInfo:
+        return ManualInfo(
+            title="title_placeholder",
+            description="description_placeholder",
+            classes=[
+                ClassInfo(
+                    name="class_name_placeholder",
+                    description="class_description_placeholder",
+                    methods=[
+                        MethodInfo(
+                            name="method_name_placeholder",
+                            args=[ArgInfo(name="arg_name_placeholder", description="arg_description_placeholder")],
+                            description="method_description_placeholder"
+                        )
+                    ]
+                )
+            ],
+            methods=[
+                MethodInfo(
+                    name="method_name_placeholder",
+                    args=[ArgInfo(name="arg_name_placeholder", description="arg_description_placeholder")],
+                    description="method_description_placeholder"
+                )
+            ]
+        )
+
+@cocoindex.flow_def(name="ManualExtraction")
+def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+    """
+    Define an example flow that extracts manual information from a Markdown.
+    """
+    data_scope["documents"] = flow_builder.add_source(cocoindex.sources.LocalFile(path="pdf_files", binary=True))
+
+    manual_infos = data_scope.add_collector()
+
+    with data_scope["documents"].row() as doc:
+        doc["markdown"] = doc["content"].transform(PdfToMarkdown())
+        doc["manual_info"] = doc["markdown"].transform(ExtractManual())
+        manual_infos.collect(filename=doc["filename"], manual_info=doc["manual_info"])
+
+    manual_infos.export(
+        "manual_infos",
+        cocoindex.storages.Postgres(),
+        primary_key_fields=["filename"],
+    )
+
+@cocoindex.main_fn()
+def _run():
+    pass
+
+if __name__ == "__main__":
+    load_dotenv(override=True)
+    _run()
diff --git a/examples/manual_extraction/manuals/array.pdf b/examples/manual_extraction/manuals/array.pdf
diff --git a/examples/manual_extraction/manuals/base64.pdf b/examples/manual_extraction/manuals/base64.pdf
diff --git a/examples/manual_extraction/manuals/copy.pdf b/examples/manual_extraction/manuals/copy.pdf
diff --git a/examples/manual_extraction/manuals/glob.pdf b/examples/manual_extraction/manuals/glob.pdf
diff --git a/examples/manual_extraction/pyproject.toml b/examples/manual_extraction/pyproject.toml
@@ -0,0 +1,6 @@
+[project]
+name = "manual-extraction"
+version = "0.1.0"
+description = "Simple example for cocoindex: extract manual information from a Markdown."
+requires-python = ">=3.10"
+dependencies = ["cocoindex>=0.1.4", "python-dotenv>=1.0.1", "marker-pdf>=1.5.2"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Postgres database address for cocoindex
		COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex