
Commit aad5eed

review comments

1 parent 24e9fc0

10 files changed, +39 −26 lines changed


Cargo.toml

Lines changed: 2 additions & 2 deletions
```diff
@@ -109,5 +109,5 @@ rand = "0.9.0"
 indoc = "2.0.6"
 owo-colors = "4.2.0"
 json5 = "0.4.1"
-aws-config = "1"
-aws-sdk-s3 = "1"
+aws-config = "1.6.2"
+aws-sdk-s3 = "1.85.0"
```

examples/s3_text_embedding/README.md renamed to examples/amazon_s3_text_embedding/README.md

Lines changed: 15 additions & 5 deletions
````diff
@@ -1,6 +1,6 @@
 This example builds an embedding index based on files stored in an Amazon S3 bucket.
 It continuously updates the index as files are added / updated / deleted in the source bucket:
-it keeps the index in sync with the S3 bucket effortlessly.
+it keeps the index in sync with the Amazon S3 bucket effortlessly.
 
 ## Prerequisite
 
@@ -10,7 +10,7 @@ Before running the example, you need to:
 
 2. Prepare for Amazon S3:
 
-   - **Create an S3 bucket:**
+   - **Create an Amazon S3 bucket:**
     - Go to the [AWS S3 Console](https://s3.console.aws.amazon.com/s3/home) and click **Create bucket**. Give it a unique name and choose a region.
     - Or, use the AWS CLI:
       ```sh
@@ -40,18 +40,28 @@ Before running the example, you need to:
      ```
    - If running on AWS EC2 or Lambda, you can use an [IAM role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) with S3 read permissions.
 
-   - **(Optional) Specify a prefix** to restrict to a subfolder in the bucket by setting `S3_PREFIX` in your `.env`.
+   - **(Optional) Specify a prefix** to restrict to a subfolder in the bucket by setting `AMAZON_S3_PREFIX` in your `.env`.
 
    See [AWS S3 documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) for more details.
 
-3. Create a `.env` file with your S3 bucket name and (optionally) prefix.
+3. Create a `.env` file with your Amazon S3 bucket name and (optionally) prefix.
    Start from copying the `.env.example`, and then edit it to fill in your bucket name and prefix.
 
    ```bash
    cp .env.example .env
    $EDITOR .env
    ```
 
+   Example `.env` file:
+   ```
+   # Database Configuration
+   DATABASE_URL=postgresql://localhost:5432/cocoindex
+
+   # Amazon S3 Configuration
+   AMAZON_S3_BUCKET_NAME=your-bucket-name
+   AMAZON_S3_PREFIX=optional/prefix/path
+   ```
+
 ## Run
 
 Install dependencies:
@@ -72,7 +82,7 @@ Run:
 uv run main.py
 ```
 
-During running, it will keep observing changes in the S3 bucket and update the index automatically.
+During running, it will keep observing changes in the Amazon S3 bucket and update the index automatically.
 At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index.
 
 
````
examples/s3_text_embedding/main.py renamed to examples/amazon_s3_text_embedding/main.py

Lines changed: 8 additions & 8 deletions
```diff
@@ -5,16 +5,16 @@
 import datetime
 import os
 
-@cocoindex.flow_def(name="S3TextEmbedding")
-def s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+@cocoindex.flow_def(name="AmazonS3TextEmbedding")
+def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
     """
-    Define an example flow that embeds text from S3 into a vector database.
+    Define an example flow that embeds text from Amazon S3 into a vector database.
     """
-    bucket_name = os.environ["S3_BUCKET_NAME"]
-    prefix = os.environ.get("S3_PREFIX", None)
+    bucket_name = os.environ["AMAZON_S3_BUCKET_NAME"]
+    prefix = os.environ.get("AMAZON_S3_PREFIX", None)
 
     data_scope["documents"] = flow_builder.add_source(
-        cocoindex.sources.S3(
+        cocoindex.sources.AmazonS3(
             bucket_name=bucket_name,
             prefix=prefix,
             included_patterns=["*.md", "*.txt", "*.docx"],
@@ -46,7 +46,7 @@ def s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco
 
 query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
     name="SemanticsSearch",
-    flow=s3_text_embedding_flow,
+    flow=amazon_s3_text_embedding_flow,
     target_name="doc_embeddings",
     query_transform_flow=lambda text: text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
@@ -56,7 +56,7 @@ def s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco
 @cocoindex.main_fn()
 def _run():
     # Use a `FlowLiveUpdater` to keep the flow data updated.
-    with cocoindex.FlowLiveUpdater(s3_text_embedding_flow):
+    with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
         # Run queries in a loop to demonstrate the query capabilities.
         while True:
             try:
```

python/cocoindex/sources.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -30,7 +30,7 @@ class GoogleDrive(op.SourceSpec):
     recent_changes_poll_interval: datetime.timedelta | None = None
 
 
-class S3(op.SourceSpec):
+class AmazonS3(op.SourceSpec):
     """Import data from an Amazon S3 bucket. Supports optional prefix and file filtering by glob patterns."""
 
     _op_category = op.OpCategory.SOURCE
```

src/ops/registration.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@ use std::sync::{Arc, LazyLock, RwLock, RwLockReadGuard};
 fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
     sources::local_file::Factory.register(registry)?;
     sources::google_drive::Factory.register(registry)?;
-    sources::s3::Factory.register(registry)?;
+    sources::amazon_s3::Factory.register(registry)?;
 
     functions::parse_json::Factory.register(registry)?;
     functions::split_recursively::Factory.register(registry)?;
```
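
For context on the pattern this hunk touches: each source module exposes a `Factory` that registers itself into a shared registry keyed by source name, which is why the rename only changes the module path here. A minimal, hypothetical sketch of that shape — the `SourceFactory` trait, the `register` direction, and the `HashMap`-backed registry below are simplified stand-ins, not CocoIndex's actual definitions:

```rust
use std::collections::HashMap;

// Simplified stand-in for a factory that knows how to build one source type.
trait SourceFactory {
    fn name(&self) -> &str;
}

// Simplified stand-in for ExecutorFactoryRegistry: a name -> factory map.
struct ExecutorFactoryRegistry {
    factories: HashMap<String, Box<dyn SourceFactory>>,
}

impl ExecutorFactoryRegistry {
    fn register(&mut self, factory: Box<dyn SourceFactory>) -> Result<(), String> {
        let name = factory.name().to_string();
        // Refuse duplicates so two sources can never claim the same name.
        if self.factories.contains_key(&name) {
            return Err(format!("factory `{name}` is already registered"));
        }
        self.factories.insert(name, factory);
        Ok(())
    }
}
```

Under this shape, the renamed `sources::amazon_s3::Factory` registers under whatever `fn name()` returns — which this commit changes to `"AmazonS3"` in the source file below.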

src/ops/sources/s3.rs renamed to src/ops/sources/amazon_s3.rs

Lines changed: 11 additions & 8 deletions
```diff
@@ -49,17 +49,17 @@ impl SourceExecutor for Executor {
         &'a self,
         _options: &'a SourceExecutorListOptions,
     ) -> BoxStream<'a, Result<Vec<SourceRowMetadata>>> {
-        let client = self.client.clone();
-        let bucket = self.bucket_name.clone();
-        let prefix = self.prefix.clone();
-        let included_glob_set = self.included_glob_set.clone();
-        let excluded_glob_set = self.excluded_glob_set.clone();
+        let client = &self.client;
+        let bucket = &self.bucket_name;
+        let prefix = &self.prefix;
+        let included_glob_set = &self.included_glob_set;
+        let excluded_glob_set = &self.excluded_glob_set;
         try_stream! {
             let mut continuation_token = None;
             loop {
                 let mut req = client
                     .list_objects_v2()
-                    .bucket(&bucket);
+                    .bucket(bucket);
                 if let Some(ref p) = prefix {
                     req = req.prefix(p);
                 }
```
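
A note on the hunk above: the listing method ties its returned stream to `&'a self` through `BoxStream<'a, …>`, so the generator created by `try_stream!` can capture the executor's fields by reference for that same lifetime, instead of cloning the client, bucket name, prefix, and glob sets on every call. A minimal sketch of the same borrowing pattern, assuming the `async-stream`, `futures`, and `anyhow` crates (the `Lister` type is a made-up stand-in for the executor):

```rust
use anyhow::Result;
use async_stream::try_stream;
use futures::stream::BoxStream;

struct Lister {
    items: Vec<String>,
}

impl Lister {
    // The stream's lifetime is tied to `&'a self`, so fields can be
    // captured by reference instead of being cloned up front.
    fn list<'a>(&'a self) -> BoxStream<'a, Result<String>> {
        let items = &self.items; // borrow, no per-call clone
        Box::pin(try_stream! {
            for item in items {
                yield item.clone(); // clone only what is actually yielded
            }
        })
    }
}
```

The caller just has to finish consuming the stream before mutating or dropping the owner, which the `'a` bound enforces at compile time.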
```diff
@@ -151,7 +151,7 @@ impl SourceFactoryBase for Factory {
     type Spec = Spec;
 
     fn name(&self) -> &str {
-        "S3"
+        "AmazonS3"
     }
 
     fn get_output_schema(
@@ -189,7 +189,10 @@
         _context: Arc<FlowInstanceContext>,
     ) -> Result<Box<dyn SourceExecutor>> {
         let region_provider = RegionProviderChain::default_provider().or_else(Region::new("us-east-1"));
-        let config = aws_config::from_env().region(region_provider).load().await;
+        let config = aws_config::defaults(aws_config::BehaviorVersion::latest())
+            .region(region_provider)
+            .load()
+            .await;
         let client = Client::new(&config);
         Ok(Box::new(Executor {
             client,
```
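
The other substantive change in this file swaps the `aws_config::from_env()` entry point (deprecated in aws-config 1.x) for `aws_config::defaults(...)`, which names a `BehaviorVersion` explicitly; that is also why Cargo.toml above pins `aws-config = "1.6.2"`. A standalone sketch of the same client construction, assuming aws-config and aws-sdk-s3 1.x (the `make_client` wrapper is illustrative, not part of the commit):

```rust
use aws_config::meta::region::RegionProviderChain;
use aws_config::{BehaviorVersion, Region};
use aws_sdk_s3::Client;

async fn make_client() -> Client {
    // Resolve the region from the environment or profile, falling back to us-east-1.
    let region_provider =
        RegionProviderChain::default_provider().or_else(Region::new("us-east-1"));
    // `defaults` makes the SDK's behavior version explicit; the deprecated
    // `from_env()` entry point did not take one.
    let config = aws_config::defaults(BehaviorVersion::latest())
        .region(region_provider)
        .load()
        .await;
    Client::new(&config)
}
```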

src/ops/sources/mod.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 pub mod google_drive;
 pub mod local_file;
-pub mod s3;
+pub mod amazon_s3;
```
