Skip to content

Commit 20d5c33

Browse files
I've added the Ruff formatter and used it to format all Python code in the python/ and examples/ directories.
A `ruff.toml` configuration file has been added to the root of the repository to ensure consistent formatting.
1 parent d5314c8 commit 20d5c33

File tree

36 files changed

+1917
-958
lines changed

36 files changed

+1917
-958
lines changed

examples/amazon_s3_embedding/main.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
import cocoindex
44
import os
55

6+
67
@cocoindex.flow_def(name="AmazonS3TextEmbedding")
7-
def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
8+
def amazon_s3_text_embedding_flow(
9+
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
10+
):
811
"""
912
Define an example flow that embeds text from Amazon S3 into a vector database.
1013
"""
@@ -18,21 +21,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
1821
prefix=prefix,
1922
included_patterns=["*.md", "*.txt", "*.docx"],
2023
binary=False,
21-
sqs_queue_url=sqs_queue_url))
24+
sqs_queue_url=sqs_queue_url,
25+
)
26+
)
2227

2328
doc_embeddings = data_scope.add_collector()
2429

2530
with data_scope["documents"].row() as doc:
2631
doc["chunks"] = doc["content"].transform(
2732
cocoindex.functions.SplitRecursively(),
28-
language="markdown", chunk_size=2000, chunk_overlap=500)
33+
language="markdown",
34+
chunk_size=2000,
35+
chunk_overlap=500,
36+
)
2937

3038
with doc["chunks"].row() as chunk:
3139
chunk["embedding"] = chunk["text"].transform(
3240
cocoindex.functions.SentenceTransformerEmbed(
33-
model="sentence-transformers/all-MiniLM-L6-v2"))
34-
doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
35-
text=chunk["text"], embedding=chunk["embedding"])
41+
model="sentence-transformers/all-MiniLM-L6-v2"
42+
)
43+
)
44+
doc_embeddings.collect(
45+
filename=doc["filename"],
46+
location=chunk["location"],
47+
text=chunk["text"],
48+
embedding=chunk["embedding"],
49+
)
3650

3751
doc_embeddings.export(
3852
"doc_embeddings",
@@ -41,24 +55,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
4155
vector_indexes=[
4256
cocoindex.VectorIndexDef(
4357
field_name="embedding",
44-
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
58+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
59+
)
60+
],
61+
)
62+
4563

4664
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
4765
name="SemanticsSearch",
4866
flow=amazon_s3_text_embedding_flow,
4967
target_name="doc_embeddings",
5068
query_transform_flow=lambda text: text.transform(
5169
cocoindex.functions.SentenceTransformerEmbed(
52-
model="sentence-transformers/all-MiniLM-L6-v2")),
53-
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
70+
model="sentence-transformers/all-MiniLM-L6-v2"
71+
)
72+
),
73+
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
74+
)
75+
5476

5577
def _main():
5678
# Use a `FlowLiveUpdater` to keep the flow data updated.
5779
with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
5880
# Run queries in a loop to demonstrate the query capabilities.
5981
while True:
6082
query = input("Enter search query (or Enter to quit): ")
61-
if query == '':
83+
if query == "":
6284
break
6385
results, _ = query_handler.search(query, 10)
6486
print("\nSearch results:")
@@ -68,6 +90,7 @@ def _main():
6890
print("---")
6991
print()
7092

93+
7194
if __name__ == "__main__":
7295
load_dotenv()
7396
cocoindex.init()

examples/code_embedding/main.py

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,40 +3,59 @@
33
import cocoindex
44
import os
55

6+
67
@cocoindex.op.function()
78
def extract_extension(filename: str) -> str:
89
"""Extract the extension of a filename."""
910
return os.path.splitext(filename)[1]
1011

12+
1113
@cocoindex.transform_flow()
12-
def code_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
14+
def code_to_embedding(
15+
text: cocoindex.DataSlice[str],
16+
) -> cocoindex.DataSlice[list[float]]:
1317
"""
1418
Embed the text using a SentenceTransformer model.
1519
"""
1620
return text.transform(
1721
cocoindex.functions.SentenceTransformerEmbed(
18-
model="sentence-transformers/all-MiniLM-L6-v2"))
22+
model="sentence-transformers/all-MiniLM-L6-v2"
23+
)
24+
)
25+
1926

2027
@cocoindex.flow_def(name="CodeEmbedding")
21-
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
28+
def code_embedding_flow(
29+
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
30+
):
2231
"""
2332
Define an example flow that embeds files into a vector database.
2433
"""
2534
data_scope["files"] = flow_builder.add_source(
26-
cocoindex.sources.LocalFile(path="../..",
27-
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
28-
excluded_patterns=["**/.*", "target", "**/node_modules"]))
35+
cocoindex.sources.LocalFile(
36+
path="../..",
37+
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
38+
excluded_patterns=["**/.*", "target", "**/node_modules"],
39+
)
40+
)
2941
code_embeddings = data_scope.add_collector()
3042

3143
with data_scope["files"].row() as file:
3244
file["extension"] = file["filename"].transform(extract_extension)
3345
file["chunks"] = file["content"].transform(
3446
cocoindex.functions.SplitRecursively(),
35-
language=file["extension"], chunk_size=1000, chunk_overlap=300)
47+
language=file["extension"],
48+
chunk_size=1000,
49+
chunk_overlap=300,
50+
)
3651
with file["chunks"].row() as chunk:
3752
chunk["embedding"] = chunk["text"].call(code_to_embedding)
38-
code_embeddings.collect(filename=file["filename"], location=chunk["location"],
39-
code=chunk["text"], embedding=chunk["embedding"])
53+
code_embeddings.collect(
54+
filename=file["filename"],
55+
location=chunk["location"],
56+
code=chunk["text"],
57+
embedding=chunk["embedding"],
58+
)
4059

4160
code_embeddings.export(
4261
"code_embeddings",
@@ -45,26 +64,35 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
4564
vector_indexes=[
4665
cocoindex.VectorIndexDef(
4766
field_name="embedding",
48-
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
67+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
68+
)
69+
],
70+
)
4971

5072

5173
def search(pool: ConnectionPool, query: str, top_k: int = 5):
5274
# Get the table name, for the export target in the code_embedding_flow above.
53-
table_name = cocoindex.utils.get_target_storage_default_name(code_embedding_flow, "code_embeddings")
75+
table_name = cocoindex.utils.get_target_storage_default_name(
76+
code_embedding_flow, "code_embeddings"
77+
)
5478
# Evaluate the transform flow defined above with the input query, to get the embedding.
5579
query_vector = code_to_embedding.eval(query)
5680
# Run the query and get the results.
5781
with pool.connection() as conn:
5882
with conn.cursor() as cur:
59-
cur.execute(f"""
83+
cur.execute(
84+
f"""
6085
SELECT filename, code, embedding <=> %s::vector AS distance
6186
FROM {table_name} ORDER BY distance LIMIT %s
62-
""", (query_vector, top_k))
87+
""",
88+
(query_vector, top_k),
89+
)
6390
return [
6491
{"filename": row[0], "code": row[1], "score": 1.0 - row[2]}
6592
for row in cur.fetchall()
6693
]
6794

95+
6896
def _main():
6997
# Make sure the flow is built and up-to-date.
7098
stats = code_embedding_flow.update()
@@ -75,7 +103,7 @@ def _main():
75103
# Run queries in a loop to demonstrate the query capabilities.
76104
while True:
77105
query = input("Enter search query (or Enter to quit): ")
78-
if query == '':
106+
if query == "":
79107
break
80108
# Run the query function with the database connection pool and the query.
81109
results = search(pool, query)
@@ -86,6 +114,7 @@ def _main():
86114
print("---")
87115
print()
88116

117+
89118
if __name__ == "__main__":
90119
load_dotenv()
91120
cocoindex.init()

examples/docs_to_knowledge_graph/main.py

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,35 @@
11
"""
22
This example shows how to extract relationships from documents and build a knowledge graph.
33
"""
4+
45
import dataclasses
56
import cocoindex
67

8+
79
@dataclasses.dataclass
810
class DocumentSummary:
911
"""Describe a summary of a document."""
12+
1013
title: str
1114
summary: str
1215

16+
1317
@dataclasses.dataclass
1418
class Relationship:
1519
"""
1620
Describe a relationship between two entities.
1721
Subject and object should be Core CocoIndex concepts only, should be nouns. For example, `CocoIndex`, `Incremental Processing`, `ETL`, `Data` etc.
1822
"""
23+
1924
subject: str
2025
predicate: str
2126
object: str
2227

28+
2329
@cocoindex.flow_def(name="DocsToKG")
24-
def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
30+
def docs_to_kg_flow(
31+
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
32+
):
2533
"""
2634
Define an example flow that extracts relationships from files and builds a knowledge graph.
2735
"""
@@ -32,11 +40,14 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
3240
uri="bolt://localhost:7687",
3341
user="neo4j",
3442
password="cocoindex",
35-
))
43+
),
44+
)
3645

3746
data_scope["documents"] = flow_builder.add_source(
38-
cocoindex.sources.LocalFile(path="../../docs/docs/core",
39-
included_patterns=["*.md", "*.mdx"]))
47+
cocoindex.sources.LocalFile(
48+
path="../../docs/docs/core", included_patterns=["*.md", "*.mdx"]
49+
)
50+
)
4051

4152
document_node = data_scope.add_collector()
4253
entity_relationship = data_scope.add_collector()
@@ -48,24 +59,34 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
4859
cocoindex.functions.ExtractByLlm(
4960
llm_spec=cocoindex.LlmSpec(
5061
# Supported LLM: https://cocoindex.io/docs/ai/llm
51-
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
62+
api_type=cocoindex.LlmApiType.OPENAI,
63+
model="gpt-4o",
64+
),
5265
output_type=DocumentSummary,
53-
instruction="Please summarize the content of the document."))
66+
instruction="Please summarize the content of the document.",
67+
)
68+
)
5469
document_node.collect(
55-
filename=doc["filename"], title=doc["summary"]["title"],
56-
summary=doc["summary"]["summary"])
70+
filename=doc["filename"],
71+
title=doc["summary"]["title"],
72+
summary=doc["summary"]["summary"],
73+
)
5774

5875
# extract relationships from document
5976
doc["relationships"] = doc["content"].transform(
6077
cocoindex.functions.ExtractByLlm(
6178
llm_spec=cocoindex.LlmSpec(
6279
# Supported LLM: https://cocoindex.io/docs/ai/llm
63-
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
64-
output_type=list[Relationship],
65-
instruction=(
66-
"Please extract relationships from CocoIndex documents. "
67-
"Focus on concepts and ignore examples and code. "
68-
)))
80+
api_type=cocoindex.LlmApiType.OPENAI,
81+
model="gpt-4o",
82+
),
83+
output_type=list[Relationship],
84+
instruction=(
85+
"Please extract relationships from CocoIndex documents. "
86+
"Focus on concepts and ignore examples and code. "
87+
),
88+
)
89+
)
6990

7091
with doc["relationships"].row() as relationship:
7192
# relationship between two entities
@@ -77,22 +98,23 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
7798
)
7899
# mention of an entity in a document, for subject
79100
entity_mention.collect(
80-
id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
101+
id=cocoindex.GeneratedField.UUID,
102+
entity=relationship["subject"],
81103
filename=doc["filename"],
82104
)
83105
# mention of an entity in a document, for object
84106
entity_mention.collect(
85-
id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
107+
id=cocoindex.GeneratedField.UUID,
108+
entity=relationship["object"],
86109
filename=doc["filename"],
87110
)
88111

89-
90112
# export to neo4j
91113
document_node.export(
92114
"document_node",
93115
cocoindex.storages.Neo4j(
94-
connection=conn_spec,
95-
mapping=cocoindex.storages.Nodes(label="Document")),
116+
connection=conn_spec, mapping=cocoindex.storages.Nodes(label="Document")
117+
),
96118
primary_key_fields=["filename"],
97119
)
98120
# Declare reference Node to reference entity node in a relationship
@@ -113,15 +135,17 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
113135
label="Entity",
114136
fields=[
115137
cocoindex.storages.TargetFieldMapping(
116-
source="subject", target="value"),
117-
]
138+
source="subject", target="value"
139+
),
140+
],
118141
),
119142
target=cocoindex.storages.NodeFromFields(
120143
label="Entity",
121144
fields=[
122145
cocoindex.storages.TargetFieldMapping(
123-
source="object", target="value"),
124-
]
146+
source="object", target="value"
147+
),
148+
],
125149
),
126150
),
127151
),
@@ -139,8 +163,11 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
139163
),
140164
target=cocoindex.storages.NodeFromFields(
141165
label="Entity",
142-
fields=[cocoindex.storages.TargetFieldMapping(
143-
source="entity", target="value")],
166+
fields=[
167+
cocoindex.storages.TargetFieldMapping(
168+
source="entity", target="value"
169+
)
170+
],
144171
),
145172
),
146173
),

0 commit comments

Comments
 (0)