Skip to content

Commit 080ba2f

Browse files
authored
Add example gdrive_text_embedding. (#164)
1 parent 871b527 commit 080ba2f

File tree

5 files changed

+129
-0
lines changed

5 files changed

+129
-0
lines changed
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Postgres database address for cocoindex
2+
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
3+
4+
# Google Drive service account credential path (required by the flow: uncomment and set)
5+
# GOOGLE_SERVICE_ACCOUNT_CREDENTIAL=/path/to/service_account_credential.json
6+
7+
# Google Drive root folder IDs, comma separated (required by the flow: uncomment and set)
8+
# GOOGLE_DRIVE_ROOT_FOLDER_IDS=id1,id2
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.env
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
Simple example for cocoindex: build embedding index based on Google Drive files.
2+
3+
## Prerequisite
4+
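[Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. Then set `GOOGLE_SERVICE_ACCOUNT_CREDENTIAL` and `GOOGLE_DRIVE_ROOT_FOLDER_IDS` in `.env`; the flow reads both from the environment and fails if either is missing.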
5+
6+
## Run
7+
8+
Install dependencies:
9+
10+
```bash
11+
pip install -e .
12+
```
13+
14+
Setup:
15+
16+
```bash
17+
python main.py cocoindex setup
18+
```
19+
20+
Update index:
21+
22+
```bash
23+
python main.py cocoindex update
24+
```
25+
26+
Run:
27+
28+
```bash
29+
python main.py
30+
```
31+
32+
## CocoInsight
33+
CocoInsight is in Early Access now (Free) 😊 You found us! Here is a quick 3-minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
34+
35+
Run CocoInsight to understand your RAG data pipeline:
36+
37+
```bash
38+
python main.py cocoindex server -c https://cocoindex.io
39+
```
40+
41+
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from dotenv import load_dotenv
2+
3+
import cocoindex
4+
import os
5+
6+
def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
    """
    Turn a text slice into its embedding with a SentenceTransformer model.

    Kept as a standalone function because the same transform is applied both
    to the chunks being indexed and to the search query.
    """
    embedder = cocoindex.functions.SentenceTransformerEmbed(
        model="sentence-transformers/all-MiniLM-L6-v2",
    )
    return text.transform(embedder)
14+
15+
@cocoindex.flow_def(name="GoogleDriveTextEmbedding")
def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Define an example flow that embeds Google Drive text into a vector database.

    Configuration is read from the environment:
      GOOGLE_SERVICE_ACCOUNT_CREDENTIAL: path to the service account credential file.
      GOOGLE_DRIVE_ROOT_FOLDER_IDS: comma-separated root folder IDs.

    Raises:
      KeyError: if either environment variable is not set.
      ValueError: if GOOGLE_DRIVE_ROOT_FOLDER_IDS contains no folder ID.
    """
    credential_path = os.environ["GOOGLE_SERVICE_ACCOUNT_CREDENTIAL"]

    # Accept "id1, id2" and trailing commas: strip whitespace around each ID
    # and drop empty entries instead of passing them on as folder IDs.
    raw_folder_ids = os.environ["GOOGLE_DRIVE_ROOT_FOLDER_IDS"]
    root_folder_ids = [
        folder_id.strip()
        for folder_id in raw_folder_ids.split(",")
        if folder_id.strip()
    ]
    if not root_folder_ids:
        raise ValueError(
            "GOOGLE_DRIVE_ROOT_FOLDER_IDS must contain at least one folder ID")

    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.GoogleDrive(
            service_account_credential_path=credential_path,
            root_folder_ids=root_folder_ids))

    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Split each document into overlapping chunks before embedding.
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown", chunk_size=2000, chunk_overlap=500)

        with doc["chunks"].row() as chunk:
            chunk["embedding"] = text_to_embedding(chunk["text"])
            doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
                                   text=chunk["text"], embedding=chunk["embedding"])

    # (filename, location) identifies one chunk; the vector index on
    # "embedding" uses the same cosine metric as the query handler.
    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.storages.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
45+
46+
# Query-side counterpart of the flow: it targets the "doc_embeddings" export
# of gdrive_text_embedding_flow and reuses text_to_embedding for the query,
# so indexing and querying share one embedding function and one metric.
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
    name="SemanticsSearch",
    flow=gdrive_text_embedding_flow,
    target_name="doc_embeddings",
    query_transform_flow=text_to_embedding,
    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
)
52+
53+
@cocoindex.main_fn()
def _run():
    """
    Prompt for search queries in a loop and print the top 10 matches for each.

    The loop ends on an empty query, on Ctrl-C, or when stdin reaches end of
    input (Ctrl-D, or piped input running out).
    """
    while True:
        try:
            query = input("Enter search query (or Enter to quit): ")
            if not query:
                break
            results, _ = query_handler.search(query, 10)
            print("\nSearch results:")
            for result in results:
                print(f"[{result.score:.3f}] {result.data['filename']}")
                print(f"    {result.data['text']}")
                print("---")
            print()
        except (KeyboardInterrupt, EOFError):
            # input() raises EOFError when stdin is closed; treat it like
            # Ctrl-C and quit quietly instead of dying with a traceback.
            break
70+
71+
if __name__ == "__main__":
    # Load .env before running so the variables the flow reads from
    # os.environ are present; override=True lets .env values replace any
    # already set in the process environment.
    load_dotenv(override=True)
    _run()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[project]
2+
name = "gdrive-text-embedding"
3+
version = "0.1.0"
4+
description = "Simple example for cocoindex: build embedding index based on Google Drive files."
5+
requires-python = ">=3.11"
6+
dependencies = ["cocoindex>=0.1.12", "python-dotenv>=1.0.1"]

0 commit comments

Comments
 (0)