Add example gdrive_text_embedding. (#164)

badmonster0 · web-flow · commit 080ba2f4afdf · 2025-03-18T17:06:32.000-07:00
diff --git a/examples/gdrive_text_embedding/.env.example b/examples/gdrive_text_embedding/.env.example
@@ -0,0 +1,8 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
+
+# Google Drive service account credential path
+# GOOGLE_SERVICE_ACCOUNT_CREDENTIAL=/path/to/service_account_credential.json
+
+# Google Drive root folder IDs, comma separated
+# GOOGLE_DRIVE_ROOT_FOLDER_IDS=id1,id2
diff --git a/examples/gdrive_text_embedding/.gitignore b/examples/gdrive_text_embedding/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/examples/gdrive_text_embedding/README.md b/examples/gdrive_text_embedding/README.md
@@ -0,0 +1,41 @@
+A simple example for cocoindex: build an embedding index from files in Google Drive.
+
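+## Prerequisites
+[Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
+
+Copy `.env.example` to `.env`, then uncomment and set `GOOGLE_SERVICE_ACCOUNT_CREDENTIAL` (path to your service account credential JSON file) and `GOOGLE_DRIVE_ROOT_FOLDER_IDS` (comma-separated root folder IDs). `main.py` reads both variables and fails with a `KeyError` if either is missing.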
+
+## Run
+
+Install dependencies:
+
+```bash
+pip install -e .
+```
+
+Setup:
+
+```bash
+python main.py cocoindex setup
+```
+
+Update index:
+
+```bash
+python main.py cocoindex update
+```
+
+Run:
+
+```bash
+python main.py
+```
+
+## CocoInsight
+CocoInsight is in Early Access now (Free) 😊 You found us! Here is a quick 3-minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
+
+Run CocoInsight to understand your RAG data pipeline:
+
+```bash
+python main.py cocoindex server -c https://cocoindex.io
+```
+
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py
@@ -0,0 +1,73 @@
+from dotenv import load_dotenv
+
+import cocoindex
+import os
+
+def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
+    """
+    Embed the text using a SentenceTransformer model.
+    This is a shared logic between indexing and querying, so extract it as a function.
+    """
+    return text.transform(
+        cocoindex.functions.SentenceTransformerEmbed(
+            model="sentence-transformers/all-MiniLM-L6-v2"))
+
+@cocoindex.flow_def(name="GoogleDriveTextEmbedding")
+def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+    """
+    Define an example flow that embeds text into a vector database.
+    """
+    credential_path = os.environ["GOOGLE_SERVICE_ACCOUNT_CREDENTIAL"]
+    root_folder_ids = os.environ["GOOGLE_DRIVE_ROOT_FOLDER_IDS"].split(",")
+    
+    data_scope["documents"] = flow_builder.add_source(
+        cocoindex.sources.GoogleDrive(
+            service_account_credential_path=credential_path,
+            root_folder_ids=root_folder_ids))
+
+    doc_embeddings = data_scope.add_collector()
+
+    with data_scope["documents"].row() as doc:
+        doc["chunks"] = doc["content"].transform(
+            cocoindex.functions.SplitRecursively(),
+            language="markdown", chunk_size=2000, chunk_overlap=500)
+
+        with doc["chunks"].row() as chunk:
+            chunk["embedding"] = text_to_embedding(chunk["text"])
+            doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
+                                   text=chunk["text"], embedding=chunk["embedding"])
+
+    doc_embeddings.export(
+        "doc_embeddings",
+        cocoindex.storages.Postgres(),
+        primary_key_fields=["filename", "location"],
+        vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
+
+query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
+    name="SemanticsSearch",
+    flow=gdrive_text_embedding_flow,
+    target_name="doc_embeddings",
+    query_transform_flow=text_to_embedding,
+    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
+
+@cocoindex.main_fn()
+def _run():
+    # Run queries in a loop to demonstrate the query capabilities.
+    while True:
+        try:
+            query = input("Enter search query (or Enter to quit): ")
+            if query == '':
+                break
+            results, _ = query_handler.search(query, 10)
+            print("\nSearch results:")
+            for result in results:
+                print(f"[{result.score:.3f}] {result.data['filename']}")
+                print(f"    {result.data['text']}")
+                print("---")
+            print()
+        except KeyboardInterrupt:
+            break
+
+if __name__ == "__main__":
+    # Load settings from a .env file before running; override=True lets values
+    # from .env replace variables already present in the process environment.
+    load_dotenv(override=True)
+    _run()
diff --git a/examples/gdrive_text_embedding/pyproject.toml b/examples/gdrive_text_embedding/pyproject.toml
@@ -0,0 +1,6 @@
+[project]
+name = "gdrive-text-embedding"
+version = "0.1.0"
+description = "Simple example for cocoindex: build embedding index based on Google Drive files."
+requires-python = ">=3.11"
+dependencies = ["cocoindex>=0.1.12", "python-dotenv>=1.0.1"]