Skip to content

Commit 24e9fc0

Browse files
committed
init
1 parent fd43562 commit 24e9fc0

File tree

10 files changed

+411
-0
lines changed

10 files changed

+411
-0
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,5 @@ rand = "0.9.0"
109109
indoc = "2.0.6"
110110
owo-colors = "4.2.0"
111111
json5 = "0.4.1"
112+
aws-config = "1"
113+
aws-sdk-s3 = "1"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Postgres database address for cocoindex
2+
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
3+
4+
# S3 bucket name for CocoIndex source
5+
S3_BUCKET_NAME=your-s3-bucket-name
6+
7+
# (Optional) S3 prefix to restrict to a subfolder
8+
# S3_PREFIX=optional/subfolder/

examples/s3_text_embedding/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.env

examples/s3_text_embedding/README.md

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
This example builds an embedding index based on files stored in an Amazon S3 bucket.
2+
It continuously updates the index as files are added / updated / deleted in the source bucket:
3+
it keeps the index in sync with the S3 bucket effortlessly.
4+
5+
## Prerequisites
6+
7+
Before running the example, you need to:
8+
9+
1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
10+
11+
2. Prepare for Amazon S3:
12+
13+
- **Create an S3 bucket:**
14+
- Go to the [AWS S3 Console](https://s3.console.aws.amazon.com/s3/home) and click **Create bucket**. Give it a unique name and choose a region.
15+
- Or, use the AWS CLI:
16+
```sh
17+
aws s3 mb s3://your-s3-bucket-name
18+
```
19+
20+
- **Upload your files to the bucket:**
21+
- In the AWS Console, click your bucket, then click **Upload** and add your `.md`, `.txt`, `.docx`, or other files.
22+
- Or, use the AWS CLI:
23+
```sh
24+
aws s3 cp localfile.txt s3://your-s3-bucket-name/
25+
aws s3 cp your-folder/ s3://your-s3-bucket-name/ --recursive
26+
```
27+
28+
- **Set up AWS credentials:**
29+
- The easiest way is to run:
30+
```sh
31+
aws configure
32+
```
33+
Enter your AWS Access Key ID, Secret Access Key, region (e.g., `us-east-1`), and output format (`json`).
34+
- This creates a credentials file at `~/.aws/credentials` and config at `~/.aws/config`.
35+
- Alternatively, you can set environment variables:
36+
```sh
37+
export AWS_ACCESS_KEY_ID=your-access-key-id
38+
export AWS_SECRET_ACCESS_KEY=your-secret-access-key
39+
export AWS_DEFAULT_REGION=us-east-1
40+
```
41+
- If running on AWS EC2 or Lambda, you can use an [IAM role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) with S3 read permissions.
42+
43+
- **(Optional) Specify a prefix** to restrict to a subfolder in the bucket by setting `S3_PREFIX` in your `.env`.
44+
45+
See [AWS S3 documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) for more details.
46+
47+
3. Create a `.env` file with your S3 bucket name and (optionally) prefix.
48+
Start from copying the `.env.example`, and then edit it to fill in your bucket name and prefix.
49+
50+
```bash
51+
cp .env.example .env
52+
$EDITOR .env
53+
```
54+
55+
## Run
56+
57+
Install dependencies:
58+
59+
```sh
60+
uv pip install -r requirements.txt
61+
```
62+
63+
Setup:
64+
65+
```sh
66+
uv run main.py cocoindex setup
67+
```
68+
69+
Run:
70+
71+
```sh
72+
uv run main.py
73+
```
74+
75+
While running, it keeps watching for changes in the S3 bucket and updates the index automatically.
76+
At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index.
77+
78+
79+
## CocoInsight
80+
CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
81+
82+
Run CocoInsight to understand your RAG data pipeline:
83+
84+
```sh
85+
uv run main.py cocoindex server -ci
86+
```
87+
88+
You can also add a `-L` flag to make the server keep updating the index to reflect source changes at the same time:
89+
90+
```sh
91+
uv run main.py cocoindex server -ci -L
92+
```
93+
94+
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).

examples/s3_text_embedding/main.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from dotenv import load_dotenv
2+
3+
import asyncio
4+
import cocoindex
5+
import datetime
6+
import os
7+
8+
@cocoindex.flow_def(name="S3TextEmbedding")
def s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Define an example flow that embeds text from S3 into a vector database.

    Documents are read from the bucket named by the `S3_BUCKET_NAME` env var
    (optionally restricted by `S3_PREFIX`), split into chunks, embedded, and
    exported to a Postgres vector index.
    """
    s3_bucket = os.environ["S3_BUCKET_NAME"]
    key_prefix = os.environ.get("S3_PREFIX", None)

    # Source: text files from S3, re-scanned every minute to pick up changes.
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.S3(
            bucket_name=s3_bucket,
            prefix=key_prefix,
            included_patterns=["*.md", "*.txt", "*.docx"],
            binary=False),
        refresh_interval=datetime.timedelta(minutes=1))

    embedding_collector = data_scope.add_collector()

    with data_scope["documents"].row() as document:
        # Split each document into overlapping chunks before embedding.
        document["chunks"] = document["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown", chunk_size=2000, chunk_overlap=500)

        with document["chunks"].row() as piece:
            piece["embedding"] = piece["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"))
            embedding_collector.collect(
                filename=document["filename"],
                location=piece["location"],
                text=piece["text"],
                embedding=piece["embedding"])

    # Sink: Postgres table keyed by (filename, location), with a cosine-
    # similarity vector index over the embedding column.
    embedding_collector.export(
        "doc_embeddings",
        cocoindex.storages.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
46+
47+
def _embed_query(text):
    """Embed query text with the same model used for the indexed chunks."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"))

# Semantic search handler over the exported "doc_embeddings" target.
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
    name="SemanticsSearch",
    flow=s3_text_embedding_flow,
    target_name="doc_embeddings",
    query_transform_flow=_embed_query,
    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
55+
56+
@cocoindex.main_fn()
def _run():
    """
    Keep the index live-updated while serving interactive search queries.

    A `FlowLiveUpdater` keeps the flow in sync with the S3 bucket in the
    background; meanwhile queries are read from stdin in a loop until the
    user enters an empty line, presses Ctrl-C, or stdin is closed.
    """
    # Use a `FlowLiveUpdater` to keep the flow data updated.
    with cocoindex.FlowLiveUpdater(s3_text_embedding_flow):
        # Run queries in a loop to demonstrate the query capabilities.
        while True:
            try:
                query = input("Enter search query (or Enter to quit): ")
                if query == '':
                    break
                results, _ = query_handler.search(query, 10)
                print("\nSearch results:")
                for result in results:
                    print(f"[{result.score:.3f}] {result.data['filename']}")
                    print(f"    {result.data['text']}")
                    print("---")
                print()
            except (KeyboardInterrupt, EOFError):
                # Ctrl-C, or EOF on stdin (e.g. piped input exhausted /
                # Ctrl-D) — `input()` raises EOFError in that case, which
                # previously crashed with a traceback. Exit cleanly instead.
                break
75+
76+
if __name__ == "__main__":
    # Load .env first (override=True lets .env values win over any
    # pre-existing environment variables), then start the flow runner.
    load_dotenv(override=True)
    _run()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
cocoindex
2+
python-dotenv
3+
boto3

python/cocoindex/sources.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,15 @@ class GoogleDrive(op.SourceSpec):
2828
root_folder_ids: list[str]
2929
binary: bool = False
3030
recent_changes_poll_interval: datetime.timedelta | None = None
31+
32+
33+
class S3(op.SourceSpec):
    """Import data from an Amazon S3 bucket. Supports optional prefix and file filtering by glob patterns."""

    _op_category = op.OpCategory.SOURCE

    # Name of the S3 bucket to read from (required).
    bucket_name: str
    # Optional key prefix restricting the import to a "subfolder" of the bucket.
    prefix: str | None = None
    # Whether file contents are treated as binary; defaults to text
    # (presumably decoded to str — confirm against the source executor).
    binary: bool = False
    # Glob patterns (e.g. "*.md") selecting which keys to import; None means no include filter.
    included_patterns: list[str] | None = None
    # Glob patterns for keys to skip; None means no exclude filter.
    excluded_patterns: list[str] | None = None

src/ops/registration.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use std::sync::{Arc, LazyLock, RwLock, RwLockReadGuard};
88
fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
99
sources::local_file::Factory.register(registry)?;
1010
sources::google_drive::Factory.register(registry)?;
11+
sources::s3::Factory.register(registry)?;
1112

1213
functions::parse_json::Factory.register(registry)?;
1314
functions::split_recursively::Factory.register(registry)?;

src/ops/sources/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub mod google_drive;
22
pub mod local_file;
3+
pub mod s3;

0 commit comments

Comments
 (0)