Skip to content

Commit d2aef9b

Browse files
badmonster0chardoncs
authored andcommitted
update text-embedding example to use latest query handler (cocoindex-io#518)
1 parent 73cffdb commit d2aef9b

File tree

3 files changed

+34
-23
lines changed

3 files changed

+34
-23
lines changed

examples/text_embedding/README.md

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,24 @@
1-
Simple example for cocoindex: build embedding index based on local files.
2-
1+
# Build text embedding and semantic search 🔍
32
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb)
3+
[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
4+
5+
In this example, we will build an index flow that computes text embeddings from local markdown files, and then query the index.
6+
7+
We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
8+
9+
## Steps:
10+
🌱 A detailed step-by-step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart)
11+
12+
### Indexing Flow:
13+
<img width="461" alt="Screenshot 2025-05-19 at 5 48 28 PM" src="https://github.com/user-attachments/assets/b6825302-a0c7-4b86-9a2d-52da8286b4bd" />
14+
15+
1. We will ingest from a list of local files.
16+
2. For each file, perform chunking (Recursive Split) and then compute an embedding for each chunk.
17+
3. We will save the embeddings and the metadata in Postgres with PGVector.
18+
19+
### Query:
20+
We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow.
21+
422

523
## Prerequisite
624

@@ -34,9 +52,8 @@ python main.py
3452

3553
## CocoInsight
3654

37-
CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
38-
39-
Run CocoInsight to understand your RAG data pipeline:
55+
I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
56+
It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
4057

4158
```
4259
python main.py cocoindex server -ci

examples/text_embedding/main.py

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
import os
21
from dotenv import load_dotenv
32
from psycopg_pool import ConnectionPool
4-
53
import cocoindex
4+
import os
65

76
@cocoindex.transform_flow()
87
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
@@ -20,7 +19,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
2019
Define an example flow that embeds text into a vector database.
2120
"""
2221
data_scope["documents"] = flow_builder.add_source(
23-
cocoindex.sources.LocalFile(path="markdown_files", included_patterns=["*.md"]))
22+
cocoindex.sources.LocalFile(path="markdown_files"))
2423

2524
doc_embeddings = data_scope.add_collector()
2625

@@ -43,44 +42,39 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
4342
field_name="embedding",
4443
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
4544

46-
# Keep for now to allow CocoInsight to query.
47-
# Will be removed later after we expose `search()` below as a query function (https://github.com/cocoindex-io/cocoindex/issues/502).
48-
cocoindex.query.SimpleSemanticsQueryHandler(
49-
name="SemanticsSearch",
50-
flow=text_embedding_flow,
51-
target_name="doc_embeddings",
52-
query_transform_flow=text_to_embedding,
53-
default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
5445

5546
def search(pool: ConnectionPool, query: str, top_k: int = 5):
47+
# Get the table name, for the export target in the text_embedding_flow above.
5648
table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
49+
# Evaluate the transform flow defined above with the input query, to get the embedding.
5750
query_vector = text_to_embedding.eval(query)
51+
# Run the query and get the results.
5852
with pool.connection() as conn:
5953
with conn.cursor() as cur:
6054
cur.execute(f"""
61-
SELECT filename, location, text, embedding <=> %s::vector AS distance
62-
FROM {table_name}
63-
ORDER BY distance
64-
LIMIT %s
55+
SELECT filename, text, embedding <=> %s::vector AS distance
56+
FROM {table_name} ORDER BY distance LIMIT %s
6557
""", (query_vector, top_k))
6658
return [
67-
{"filename": row[0], "location": row[1], "text": row[2], "score": 1.0 - row[3]}
59+
{"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
6860
for row in cur.fetchall()
6961
]
7062

7163
@cocoindex.main_fn()
7264
def _run():
65+
# Initialize the database connection pool.
7366
pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
7467
# Run queries in a loop to demonstrate the query capabilities.
7568
while True:
7669
try:
7770
query = input("Enter search query (or Enter to quit): ")
7871
if query == '':
7972
break
73+
# Run the query function with the database connection pool and the query.
8074
results = search(pool, query)
8175
print("\nSearch results:")
8276
for result in results:
83-
print(f"[{result['score']:.3f}] {result['filename']} location:{result['location']}")
77+
print(f"[{result['score']:.3f}] {result['filename']}")
8478
print(f" {result['text']}")
8579
print("---")
8680
print()

examples/text_embedding/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ version = "0.1.0"
44
description = "Simple example for cocoindex: build embedding index based on local text files."
55
requires-python = ">=3.10"
66
dependencies = [
7-
"cocoindex>=0.1.39",
7+
"cocoindex>=0.1.40",
88
"python-dotenv>=1.0.1",
99
"psycopg[binary,pool]",
1010
]

0 commit comments

Comments
 (0)