cocoindex-io
diff --git a/examples/amazon_s3_embedding/main.py b/examples/amazon_s3_embedding/main.py
Lines changed: 33 additions & 10 deletions
diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py
Lines changed: 43 additions & 14 deletions
diff --git a/examples/docs_to_knowledge_graph/main.py b/examples/docs_to_knowledge_graph/main.py
Lines changed: 52 additions & 25 deletions
@@ -3,8 +3,11 @@
 import cocoindex
 import os
 
+
 @cocoindex.flow_def(name="AmazonS3TextEmbedding")
-def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+def amazon_s3_text_embedding_flow(
+    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+):
     """
     Define an example flow that embeds text from Amazon S3 into a vector database.
     """
@@ -18,21 +21,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
             prefix=prefix,
             included_patterns=["*.md", "*.txt", "*.docx"],
             binary=False,
-            sqs_queue_url=sqs_queue_url))
+            sqs_queue_url=sqs_queue_url,
+        )
+    )
 
     doc_embeddings = data_scope.add_collector()
 
     with data_scope["documents"].row() as doc:
         doc["chunks"] = doc["content"].transform(
             cocoindex.functions.SplitRecursively(),
-            language="markdown", chunk_size=2000, chunk_overlap=500)
+            language="markdown",
+            chunk_size=2000,
+            chunk_overlap=500,
+        )
 
         with doc["chunks"].row() as chunk:
             chunk["embedding"] = chunk["text"].transform(
                 cocoindex.functions.SentenceTransformerEmbed(
-                 model="sentence-transformers/all-MiniLM-L6-v2")) 
-            doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
-                                   text=chunk["text"], embedding=chunk["embedding"])
+                    model="sentence-transformers/all-MiniLM-L6-v2"
+                )
+            )
+            doc_embeddings.collect(
+                filename=doc["filename"],
+                location=chunk["location"],
+                text=chunk["text"],
+                embedding=chunk["embedding"],
+            )
 
     doc_embeddings.export(
         "doc_embeddings",
@@ -41,24 +55,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
         vector_indexes=[
             cocoindex.VectorIndexDef(
                 field_name="embedding",
-                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
+                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
+            )
+        ],
+    )
+
 
 # Query-side handler for the "doc_embeddings" target exported by the flow above.
 # The query text is embedded with the same SentenceTransformer model name the flow
 # uses for its chunks, and results are ranked with the same cosine-similarity metric
 # declared on the target's vector index, so query and stored vectors are comparable.
 query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
     name="SemanticsSearch",
     flow=amazon_s3_text_embedding_flow,
     target_name="doc_embeddings",
     query_transform_flow=lambda text: text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
-            model="sentence-transformers/all-MiniLM-L6-v2")),
-    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
+            model="sentence-transformers/all-MiniLM-L6-v2"
+        )
+    ),
+    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
+)
+
 
 def _main():
     # Use a `FlowLiveUpdater` to keep the flow data updated.
     with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
         # Run queries in a loop to demonstrate the query capabilities.
         while True:
             query = input("Enter search query (or Enter to quit): ")
-            if query == '':
+            if query == "":
                 break
             results, _ = query_handler.search(query, 10)
             print("\nSearch results:")
@@ -68,6 +90,7 @@ def _main():
                 print("---")
             print()
 
+
 if __name__ == "__main__":
     load_dotenv()
     cocoindex.init()
 
@@ -3,40 +3,59 @@
 import cocoindex
 import os
 
+
 @cocoindex.op.function()
 def extract_extension(filename: str) -> str:
     """
     Extract the extension of a filename.
 
     The result is the second element of `os.path.splitext(filename)`: the final
     suffix including its leading dot (e.g. ".py"), or an empty string when the
     name has no extension. The flow below passes this value as the `language`
     argument of `SplitRecursively`.
     """
     return os.path.splitext(filename)[1]
 
+
 @cocoindex.transform_flow()
-def code_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
+def code_to_embedding(
+    text: cocoindex.DataSlice[str],
+) -> cocoindex.DataSlice[list[float]]:
     """
     Embed the text using a SentenceTransformer model.
 
     This transform flow is shared by both sides of the example: the indexing
     flow applies it to every chunk via `chunk["text"].call(code_to_embedding)`,
     and `search` evaluates it on the query string via `code_to_embedding.eval`,
     so stored and query vectors come from the same model
     ("sentence-transformers/all-MiniLM-L6-v2").
     """
     return text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
-            model="sentence-transformers/all-MiniLM-L6-v2"))
+            model="sentence-transformers/all-MiniLM-L6-v2"
+        )
+    )
+
 
 @cocoindex.flow_def(name="CodeEmbedding")
-def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+def code_embedding_flow(
+    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+):
     """
     Define an example flow that embeds files into a vector database.
     """
     data_scope["files"] = flow_builder.add_source(
-        cocoindex.sources.LocalFile(path="../..",
-                                    included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
-                                    excluded_patterns=["**/.*", "target", "**/node_modules"]))
+        cocoindex.sources.LocalFile(
+            path="../..",
+            included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
+            excluded_patterns=["**/.*", "target", "**/node_modules"],
+        )
+    )
     code_embeddings = data_scope.add_collector()
 
     with data_scope["files"].row() as file:
         file["extension"] = file["filename"].transform(extract_extension)
         file["chunks"] = file["content"].transform(
             cocoindex.functions.SplitRecursively(),
-            language=file["extension"], chunk_size=1000, chunk_overlap=300)
+            language=file["extension"],
+            chunk_size=1000,
+            chunk_overlap=300,
+        )
         with file["chunks"].row() as chunk:
             chunk["embedding"] = chunk["text"].call(code_to_embedding)
-            code_embeddings.collect(filename=file["filename"], location=chunk["location"],
-                                    code=chunk["text"], embedding=chunk["embedding"])
+            code_embeddings.collect(
+                filename=file["filename"],
+                location=chunk["location"],
+                code=chunk["text"],
+                embedding=chunk["embedding"],
+            )
 
     code_embeddings.export(
         "code_embeddings",
@@ -45,26 +64,35 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
         vector_indexes=[
             cocoindex.VectorIndexDef(
                 field_name="embedding",
-                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
+                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
+            )
+        ],
+    )
 
 
 def search(pool: ConnectionPool, query: str, top_k: int = 5):
     """
     Return the stored code chunks closest to `query`, best match first.
 
     The query string is embedded with `code_to_embedding`, then compared against
     the `embedding` column of the "code_embeddings" export target using the
     `<=>` operator, with rows ordered by ascending distance.
 
     Args:
         pool: Connection pool; only `pool.connection()` is used here.
         query: Free-text search query to embed.
         top_k: Maximum number of rows to return (bound to the SQL `LIMIT`).
 
     Returns:
         A list of dicts with keys "filename", "code" and "score", where
         "score" is computed as `1.0 - distance`.
     """
     # Get the table name, for the export target in the code_embedding_flow above.
-    table_name = cocoindex.utils.get_target_storage_default_name(code_embedding_flow, "code_embeddings")
+    table_name = cocoindex.utils.get_target_storage_default_name(
+        code_embedding_flow, "code_embeddings"
+    )
     # Evaluate the transform flow defined above with the input query, to get the embedding.
     query_vector = code_to_embedding.eval(query)
     # Run the query and get the results.
     with pool.connection() as conn:
         with conn.cursor() as cur:
             # The table name is interpolated into the SQL text (it comes from cocoindex,
             # not from user input); the query vector and the limit are bound parameters.
             # NOTE(review): `<=>` is presumably pgvector's cosine-distance operator, which
             # would match the COSINE_SIMILARITY index declared on the target - confirm.
-            cur.execute(f"""
+            cur.execute(
+                f"""
                 SELECT filename, code, embedding <=> %s::vector AS distance
                 FROM {table_name} ORDER BY distance LIMIT %s
-            """, (query_vector, top_k))
+            """,
+                (query_vector, top_k),
+            )
             return [
                 {"filename": row[0], "code": row[1], "score": 1.0 - row[2]}
                 for row in cur.fetchall()
             ]
 
+
 def _main():
     # Make sure the flow is built and up-to-date.
     stats = code_embedding_flow.update()
@@ -75,7 +103,7 @@ def _main():
     # Run queries in a loop to demonstrate the query capabilities.
     while True:
         query = input("Enter search query (or Enter to quit): ")
-        if query == '':
+        if query == "":
             break
         # Run the query function with the database connection pool and the query.
         results = search(pool, query)
@@ -86,6 +114,7 @@ def _main():
             print("---")
         print()
 
+
 if __name__ == "__main__":
     load_dotenv()
     cocoindex.init()
 
@@ -1,27 +1,35 @@
 """
 This example shows how to extract relationships from documents and build a knowledge graph.
 """
+
 import dataclasses
 import cocoindex
 
+
 # Passed as `output_type` to the document-summarization `ExtractByLlm` call in
 # docs_to_kg_flow; the flow reads the `title` and `summary` fields of the result
 # when collecting rows for the `document_node` collector.
 # NOTE(review): the class docstring may be forwarded to the LLM as part of the
 # output schema - confirm before rewording it.
 @dataclasses.dataclass
 class DocumentSummary:
     """Describe a summary of a document."""
+
     title: str
     summary: str
 
+
 # Passed (as `list[Relationship]`) as `output_type` to the relationship-extraction
 # `ExtractByLlm` call in docs_to_kg_flow; the flow then collects `subject` and
 # `object` of each relationship as entity mentions.
 # NOTE(review): the class docstring reads like guidance for the LLM and may be
 # forwarded as part of the output schema - confirm before rewording it.
 @dataclasses.dataclass
 class Relationship:
     """
     Describe a relationship between two entities.
     Subject and object should be Core CocoIndex concepts only, should be nouns. For example, `CocoIndex`, `Incremental Processing`, `ETL`,  `Data` etc.
     """
+
     subject: str
     predicate: str
     # The field name `object` is kept as-is: the Neo4j field mappings in the flow
     # refer to it by name (source="object").
     object: str
 
+
 @cocoindex.flow_def(name="DocsToKG")
-def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+def docs_to_kg_flow(
+    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+):
     """
     Define an example flow that extracts relationship from files and build knowledge graph.
     """
@@ -32,11 +40,14 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
             uri="bolt://localhost:7687",
             user="neo4j",
             password="cocoindex",
-    ))
+        ),
+    )
 
     data_scope["documents"] = flow_builder.add_source(
-        cocoindex.sources.LocalFile(path="../../docs/docs/core",
-                                    included_patterns=["*.md", "*.mdx"]))
+        cocoindex.sources.LocalFile(
+            path="../../docs/docs/core", included_patterns=["*.md", "*.mdx"]
+        )
+    )
 
     document_node = data_scope.add_collector()
     entity_relationship = data_scope.add_collector()
@@ -48,24 +59,34 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
             cocoindex.functions.ExtractByLlm(
                 llm_spec=cocoindex.LlmSpec(
                     # Supported LLM: https://cocoindex.io/docs/ai/llm
-                    api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
+                    api_type=cocoindex.LlmApiType.OPENAI,
+                    model="gpt-4o",
+                ),
                 output_type=DocumentSummary,
-                instruction="Please summarize the content of the document."))
+                instruction="Please summarize the content of the document.",
+            )
+        )
         document_node.collect(
-            filename=doc["filename"], title=doc["summary"]["title"],
-            summary=doc["summary"]["summary"])
+            filename=doc["filename"],
+            title=doc["summary"]["title"],
+            summary=doc["summary"]["summary"],
+        )
 
         # extract relationships from document
         doc["relationships"] = doc["content"].transform(
             cocoindex.functions.ExtractByLlm(
                 llm_spec=cocoindex.LlmSpec(
                     # Supported LLM: https://cocoindex.io/docs/ai/llm
-                    api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
-                    output_type=list[Relationship],
-                    instruction=(
-                        "Please extract relationships from CocoIndex documents. "
-                        "Focus on concepts and ignore examples and code. "
-                        )))
+                    api_type=cocoindex.LlmApiType.OPENAI,
+                    model="gpt-4o",
+                ),
+                output_type=list[Relationship],
+                instruction=(
+                    "Please extract relationships from CocoIndex documents. "
+                    "Focus on concepts and ignore examples and code. "
+                ),
+            )
+        )
 
         with doc["relationships"].row() as relationship:
             # relationship between two entities
@@ -77,22 +98,23 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
             )
             # mention of an entity in a document, for subject
             entity_mention.collect(
-                id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
+                id=cocoindex.GeneratedField.UUID,
+                entity=relationship["subject"],
                 filename=doc["filename"],
             )
             # mention of an entity in a document, for object
             entity_mention.collect(
-                id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
+                id=cocoindex.GeneratedField.UUID,
+                entity=relationship["object"],
                 filename=doc["filename"],
             )
 
-
     # export to neo4j
     document_node.export(
         "document_node",
         cocoindex.storages.Neo4j(
-            connection=conn_spec,
-            mapping=cocoindex.storages.Nodes(label="Document")),
+            connection=conn_spec, mapping=cocoindex.storages.Nodes(label="Document")
+        ),
         primary_key_fields=["filename"],
     )
     # Declare reference Node to reference entity node in a relationship
@@ -113,15 +135,17 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
                     label="Entity",
                     fields=[
                         cocoindex.storages.TargetFieldMapping(
-                            source="subject", target="value"),
-                    ]
+                            source="subject", target="value"
+                        ),
+                    ],
                 ),
                 target=cocoindex.storages.NodeFromFields(
                     label="Entity",
                     fields=[
                         cocoindex.storages.TargetFieldMapping(
-                            source="object", target="value"),
-                    ]
+                            source="object", target="value"
+                        ),
+                    ],
                 ),
             ),
         ),
@@ -139,8 +163,11 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
                 ),
                 target=cocoindex.storages.NodeFromFields(
                     label="Entity",
-                    fields=[cocoindex.storages.TargetFieldMapping(
-                        source="entity", target="value")],
+                    fields=[
+                        cocoindex.storages.TargetFieldMapping(
+                            source="entity", target="value"
+                        )
+                    ],
                 ),
             ),
         ),