
Commit 61c759c

support s3 as native source (#475)
1 parent 3084754 commit 61c759c

10 files changed, +422 −0 lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions

```diff
@@ -109,3 +109,5 @@ rand = "0.9.0"
 indoc = "2.0.6"
 owo-colors = "4.2.0"
 json5 = "0.4.1"
+aws-config = "1.6.2"
+aws-sdk-s3 = "1.85.0"
```
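These two crates are the official AWS SDK for Rust: `aws-config` resolves credentials and region (environment variables, `~/.aws` profiles, or an IAM role), and `aws-sdk-s3` provides the S3 client. As a minimal sketch of how they fit together — not code from this commit; the bucket and prefix are placeholders, and a tokio runtime is assumed — listing the objects a source like this would scan looks roughly like:

```rust
use aws_config::BehaviorVersion;

#[tokio::main]
async fn main() -> Result<(), aws_sdk_s3::Error> {
    // Resolve credentials and region the same way the AWS CLI does:
    // environment variables, ~/.aws/{credentials,config}, or an IAM role.
    let config = aws_config::defaults(BehaviorVersion::latest()).load().await;
    let client = aws_sdk_s3::Client::new(&config);

    // List keys under an optional prefix (placeholder values).
    // Real code would paginate with `into_paginator()` for large buckets.
    let resp = client
        .list_objects_v2()
        .bucket("your-bucket-name")
        .prefix("optional/prefix/path")
        .send()
        .await?;

    for object in resp.contents() {
        println!("{}", object.key().unwrap_or_default());
    }
    Ok(())
}
```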
.env.example

Lines changed: 6 additions & 0 deletions

```
# Postgres database address for cocoindex
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex

# Amazon S3 Configuration
AMAZON_S3_BUCKET_NAME=your-bucket-name
AMAZON_S3_PREFIX=optional/prefix/path
```
.gitignore

Lines changed: 1 addition & 0 deletions

```
.env
```
README.md

Lines changed: 104 additions & 0 deletions

This example builds an embedding index based on files stored in an Amazon S3 bucket.
It continuously updates the index as files are added, updated, or deleted in the source bucket:
it keeps the index in sync with the Amazon S3 bucket effortlessly.

## Prerequisites

Before running the example, you need to:

1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.

2. Prepare for Amazon S3:

   - **Create an Amazon S3 bucket:**
     - Go to the [AWS S3 Console](https://s3.console.aws.amazon.com/s3/home) and click **Create bucket**. Give it a unique name and choose a region.
     - Or use the AWS CLI:
       ```sh
       aws s3 mb s3://your-s3-bucket-name
       ```

   - **Upload your files to the bucket:**
     - In the AWS Console, click your bucket, then click **Upload** and add your `.md`, `.txt`, `.docx`, or other files.
     - Or use the AWS CLI:
       ```sh
       aws s3 cp localfile.txt s3://your-s3-bucket-name/
       aws s3 cp your-folder/ s3://your-s3-bucket-name/ --recursive
       ```

   - **Set up AWS credentials:**
     - The easiest way is to run:
       ```sh
       aws configure
       ```
       Enter your AWS Access Key ID, Secret Access Key, region (e.g., `us-east-1`), and output format (`json`).
       This creates a credentials file at `~/.aws/credentials` and a config file at `~/.aws/config`.
     - Alternatively, you can set environment variables:
       ```sh
       export AWS_ACCESS_KEY_ID=your-access-key-id
       export AWS_SECRET_ACCESS_KEY=your-secret-access-key
       export AWS_DEFAULT_REGION=us-east-1
       ```
     - If running on AWS EC2 or Lambda, you can use an [IAM role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) with S3 read permissions.

   - **(Optional) Specify a prefix** to restrict the source to a subfolder of the bucket by setting `AMAZON_S3_PREFIX` in your `.env`.

   See the [AWS S3 documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) for more details.

3. Create a `.env` file with your Amazon S3 bucket name and (optionally) prefix.
   Start by copying `.env.example`, then edit it to fill in your bucket name and prefix.

   ```bash
   cp .env.example .env
   $EDITOR .env
   ```

   Example `.env` file:

   ```
   # Database Configuration
   DATABASE_URL=postgresql://localhost:5432/cocoindex

   # Amazon S3 Configuration
   AMAZON_S3_BUCKET_NAME=your-bucket-name
   AMAZON_S3_PREFIX=optional/prefix/path
   ```

## Run

Install dependencies:

```sh
uv pip install -r requirements.txt
```

Setup:

```sh
uv run main.py cocoindex setup
```

Run:

```sh
uv run main.py
```

While running, it keeps observing changes in the Amazon S3 bucket and updates the index automatically.
At the same time, it accepts queries from the terminal and performs searches on top of the up-to-date index.

## CocoInsight

CocoInsight is in Early Access now (free) 😊 You found us! A quick 3-minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).

Run CocoInsight to understand your RAG data pipeline:

```sh
uv run main.py cocoindex server -ci
```

You can also add a `-L` flag to make the server keep updating the index to reflect source changes at the same time:

```sh
uv run main.py cocoindex server -ci -L
```

Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
main.py

Lines changed: 78 additions & 0 deletions

```python
from dotenv import load_dotenv

import asyncio
import cocoindex
import datetime
import os

@cocoindex.flow_def(name="AmazonS3TextEmbedding")
def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Define an example flow that embeds text from Amazon S3 into a vector database.
    """
    bucket_name = os.environ["AMAZON_S3_BUCKET_NAME"]
    prefix = os.environ.get("AMAZON_S3_PREFIX", None)

    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.AmazonS3(
            bucket_name=bucket_name,
            prefix=prefix,
            included_patterns=["*.md", "*.txt", "*.docx"],
            binary=False),
        refresh_interval=datetime.timedelta(minutes=1))

    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown", chunk_size=2000, chunk_overlap=500)

        with doc["chunks"].row() as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"))
            doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
                                   text=chunk["text"], embedding=chunk["embedding"])

    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.storages.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])

query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
    name="SemanticsSearch",
    flow=amazon_s3_text_embedding_flow,
    target_name="doc_embeddings",
    query_transform_flow=lambda text: text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2")),
    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)

@cocoindex.main_fn()
def _run():
    # Use a `FlowLiveUpdater` to keep the flow data updated.
    with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
        # Run queries in a loop to demonstrate the query capabilities.
        while True:
            try:
                query = input("Enter search query (or Enter to quit): ")
                if query == '':
                    break
                results, _ = query_handler.search(query, 10)
                print("\nSearch results:")
                for result in results:
                    print(f"[{result.score:.3f}] {result.data['filename']}")
                    print(f"    {result.data['text']}")
                    print("---")
                print()
            except KeyboardInterrupt:
                break

if __name__ == "__main__":
    load_dotenv(override=True)
    _run()
```
requirements.txt

Lines changed: 3 additions & 0 deletions

```
cocoindex
python-dotenv
boto3
```

python/cocoindex/sources.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -28,3 +28,15 @@ class GoogleDrive(op.SourceSpec):
     root_folder_ids: list[str]
     binary: bool = False
     recent_changes_poll_interval: datetime.timedelta | None = None
+
+
+class AmazonS3(op.SourceSpec):
+    """Import data from an Amazon S3 bucket. Supports optional prefix and file filtering by glob patterns."""
+
+    _op_category = op.OpCategory.SOURCE
+
+    bucket_name: str
+    prefix: str | None = None
+    binary: bool = False
+    included_patterns: list[str] | None = None
+    excluded_patterns: list[str] | None = None
```

src/ops/registration.rs

Lines changed: 1 addition & 0 deletions

```diff
@@ -8,6 +8,7 @@ use std::sync::{Arc, LazyLock, RwLock, RwLockReadGuard};
 fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
     sources::local_file::Factory.register(registry)?;
     sources::google_drive::Factory.register(registry)?;
+    sources::amazon_s3::Factory.register(registry)?;
 
     functions::parse_json::Factory.register(registry)?;
     functions::split_recursively::Factory.register(registry)?;
```
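This one-line change is the whole wiring on the Rust side: each source module exposes a `Factory` value that registers itself with the shared registry at startup. The real trait and registry types live elsewhere in the crate; the following is only a hypothetical sketch of that self-registration pattern, with every name invented for illustration:

```rust
use std::collections::HashMap;

// Hypothetical stand-in for the factory interface; the real one in
// cocoindex carries construction logic for source executors.
trait SourceFactory {
    fn name(&self) -> &'static str;
}

#[derive(Default)]
struct FactoryRegistry {
    factories: HashMap<&'static str, Box<dyn SourceFactory>>,
}

impl FactoryRegistry {
    // Register a factory under its own name, rejecting duplicates so
    // two sources can never claim the same spec name.
    fn register(&mut self, factory: Box<dyn SourceFactory>) -> Result<(), String> {
        let name = factory.name();
        if self.factories.contains_key(name) {
            return Err(format!("factory `{name}` already registered"));
        }
        self.factories.insert(name, factory);
        Ok(())
    }
}

// The new source only needs a factory value...
struct AmazonS3Factory;

impl SourceFactory for AmazonS3Factory {
    fn name(&self) -> &'static str {
        "AmazonS3"
    }
}

// ...and one registration call, mirroring the diff above.
fn main() {
    let mut registry = FactoryRegistry::default();
    registry
        .register(Box::new(AmazonS3Factory))
        .expect("first registration succeeds");
}
```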

0 commit comments
