Skip to content

Commit 5305e4c

Browse files
Add ruff formatter (#547)
* I've added the Ruff formatter and used it to format all Python code in the `python/` and `examples/` directories. A `ruff.toml` configuration file has been added to the root of the repository to ensure consistent formatting.
* Refactor Ruff configuration and dependencies. This change consolidates Python dependencies into a single top-level `pyproject.toml` file, removing the nested `pyproject.toml` in `python/cocoindex` and `ruff` from example-specific `requirements.txt` files. Ruff is configured globally via `ruff.toml` and has been run to ensure consistent formatting across the codebase.
* I've updated the GitHub workflow to check Python formatting. This change updates the `_test.yml` GitHub workflow to include a step that verifies Python code formatting using `ruff format --check`. Ruff has also been added to the dependencies installed during the workflow.
* Auto-run the Ruff formatter in VS Code.
* Reorder to run faster checks (Python format) earlier.
* Use a separate job to check formatting instead.

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent d5314c8 commit 5305e4c

File tree

35 files changed

+1920
-957
lines changed

35 files changed

+1920
-957
lines changed

.github/workflows/CI.yml

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,21 @@ permissions:
2626
contents: read
2727

2828
jobs:
29+
format-check:
30+
name: Check Python formatting
31+
runs-on: ubuntu-latest
32+
steps:
33+
- uses: actions/checkout@v4
34+
- uses: actions/setup-python@v5
35+
with:
36+
python-version: 3.11
37+
- name: Install Ruff
38+
run: |
39+
pip install ruff
40+
- name: Check Python formatting
41+
run: |
42+
ruff format --check .
43+
2944
test:
3045
name: Run test
3146
uses: ./.github/workflows/_test.yml

.vscode/settings.json

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,5 +3,7 @@
33
"cocoindex",
44
"reindexing",
55
"timedelta"
6-
]
6+
],
7+
"editor.formatOnSave": true,
8+
"python.formatting.provider": "ruff"
79
}

examples/amazon_s3_embedding/main.py

Lines changed: 33 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,11 @@
33
import cocoindex
44
import os
55

6+
67
@cocoindex.flow_def(name="AmazonS3TextEmbedding")
7-
def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
8+
def amazon_s3_text_embedding_flow(
9+
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
10+
):
811
"""
912
Define an example flow that embeds text from Amazon S3 into a vector database.
1013
"""
@@ -18,21 +21,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
1821
prefix=prefix,
1922
included_patterns=["*.md", "*.txt", "*.docx"],
2023
binary=False,
21-
sqs_queue_url=sqs_queue_url))
24+
sqs_queue_url=sqs_queue_url,
25+
)
26+
)
2227

2328
doc_embeddings = data_scope.add_collector()
2429

2530
with data_scope["documents"].row() as doc:
2631
doc["chunks"] = doc["content"].transform(
2732
cocoindex.functions.SplitRecursively(),
28-
language="markdown", chunk_size=2000, chunk_overlap=500)
33+
language="markdown",
34+
chunk_size=2000,
35+
chunk_overlap=500,
36+
)
2937

3038
with doc["chunks"].row() as chunk:
3139
chunk["embedding"] = chunk["text"].transform(
3240
cocoindex.functions.SentenceTransformerEmbed(
33-
model="sentence-transformers/all-MiniLM-L6-v2"))
34-
doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
35-
text=chunk["text"], embedding=chunk["embedding"])
41+
model="sentence-transformers/all-MiniLM-L6-v2"
42+
)
43+
)
44+
doc_embeddings.collect(
45+
filename=doc["filename"],
46+
location=chunk["location"],
47+
text=chunk["text"],
48+
embedding=chunk["embedding"],
49+
)
3650

3751
doc_embeddings.export(
3852
"doc_embeddings",
@@ -41,24 +55,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
4155
vector_indexes=[
4256
cocoindex.VectorIndexDef(
4357
field_name="embedding",
44-
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
58+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
59+
)
60+
],
61+
)
62+
4563

4664
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
4765
name="SemanticsSearch",
4866
flow=amazon_s3_text_embedding_flow,
4967
target_name="doc_embeddings",
5068
query_transform_flow=lambda text: text.transform(
5169
cocoindex.functions.SentenceTransformerEmbed(
52-
model="sentence-transformers/all-MiniLM-L6-v2")),
53-
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
70+
model="sentence-transformers/all-MiniLM-L6-v2"
71+
)
72+
),
73+
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
74+
)
75+
5476

5577
def _main():
5678
# Use a `FlowLiveUpdater` to keep the flow data updated.
5779
with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
5880
# Run queries in a loop to demonstrate the query capabilities.
5981
while True:
6082
query = input("Enter search query (or Enter to quit): ")
61-
if query == '':
83+
if query == "":
6284
break
6385
results, _ = query_handler.search(query, 10)
6486
print("\nSearch results:")
@@ -68,6 +90,7 @@ def _main():
6890
print("---")
6991
print()
7092

93+
7194
if __name__ == "__main__":
7295
load_dotenv()
7396
cocoindex.init()

examples/code_embedding/main.py

Lines changed: 43 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -3,40 +3,59 @@
33
import cocoindex
44
import os
55

6+
67
@cocoindex.op.function()
78
def extract_extension(filename: str) -> str:
89
"""Extract the extension of a filename."""
910
return os.path.splitext(filename)[1]
1011

12+
1113
@cocoindex.transform_flow()
12-
def code_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
14+
def code_to_embedding(
15+
text: cocoindex.DataSlice[str],
16+
) -> cocoindex.DataSlice[list[float]]:
1317
"""
1418
Embed the text using a SentenceTransformer model.
1519
"""
1620
return text.transform(
1721
cocoindex.functions.SentenceTransformerEmbed(
18-
model="sentence-transformers/all-MiniLM-L6-v2"))
22+
model="sentence-transformers/all-MiniLM-L6-v2"
23+
)
24+
)
25+
1926

2027
@cocoindex.flow_def(name="CodeEmbedding")
21-
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
28+
def code_embedding_flow(
29+
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
30+
):
2231
"""
2332
Define an example flow that embeds files into a vector database.
2433
"""
2534
data_scope["files"] = flow_builder.add_source(
26-
cocoindex.sources.LocalFile(path="../..",
27-
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
28-
excluded_patterns=["**/.*", "target", "**/node_modules"]))
35+
cocoindex.sources.LocalFile(
36+
path="../..",
37+
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
38+
excluded_patterns=["**/.*", "target", "**/node_modules"],
39+
)
40+
)
2941
code_embeddings = data_scope.add_collector()
3042

3143
with data_scope["files"].row() as file:
3244
file["extension"] = file["filename"].transform(extract_extension)
3345
file["chunks"] = file["content"].transform(
3446
cocoindex.functions.SplitRecursively(),
35-
language=file["extension"], chunk_size=1000, chunk_overlap=300)
47+
language=file["extension"],
48+
chunk_size=1000,
49+
chunk_overlap=300,
50+
)
3651
with file["chunks"].row() as chunk:
3752
chunk["embedding"] = chunk["text"].call(code_to_embedding)
38-
code_embeddings.collect(filename=file["filename"], location=chunk["location"],
39-
code=chunk["text"], embedding=chunk["embedding"])
53+
code_embeddings.collect(
54+
filename=file["filename"],
55+
location=chunk["location"],
56+
code=chunk["text"],
57+
embedding=chunk["embedding"],
58+
)
4059

4160
code_embeddings.export(
4261
"code_embeddings",
@@ -45,26 +64,35 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
4564
vector_indexes=[
4665
cocoindex.VectorIndexDef(
4766
field_name="embedding",
48-
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
67+
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
68+
)
69+
],
70+
)
4971

5072

5173
def search(pool: ConnectionPool, query: str, top_k: int = 5):
5274
# Get the table name, for the export target in the code_embedding_flow above.
53-
table_name = cocoindex.utils.get_target_storage_default_name(code_embedding_flow, "code_embeddings")
75+
table_name = cocoindex.utils.get_target_storage_default_name(
76+
code_embedding_flow, "code_embeddings"
77+
)
5478
# Evaluate the transform flow defined above with the input query, to get the embedding.
5579
query_vector = code_to_embedding.eval(query)
5680
# Run the query and get the results.
5781
with pool.connection() as conn:
5882
with conn.cursor() as cur:
59-
cur.execute(f"""
83+
cur.execute(
84+
f"""
6085
SELECT filename, code, embedding <=> %s::vector AS distance
6186
FROM {table_name} ORDER BY distance LIMIT %s
62-
""", (query_vector, top_k))
87+
""",
88+
(query_vector, top_k),
89+
)
6390
return [
6491
{"filename": row[0], "code": row[1], "score": 1.0 - row[2]}
6592
for row in cur.fetchall()
6693
]
6794

95+
6896
def _main():
6997
# Make sure the flow is built and up-to-date.
7098
stats = code_embedding_flow.update()
@@ -75,7 +103,7 @@ def _main():
75103
# Run queries in a loop to demonstrate the query capabilities.
76104
while True:
77105
query = input("Enter search query (or Enter to quit): ")
78-
if query == '':
106+
if query == "":
79107
break
80108
# Run the query function with the database connection pool and the query.
81109
results = search(pool, query)
@@ -86,6 +114,7 @@ def _main():
86114
print("---")
87115
print()
88116

117+
89118
if __name__ == "__main__":
90119
load_dotenv()
91120
cocoindex.init()

0 commit comments

Comments
 (0)