
Commit aad5eed

review comments

1 parent 24e9fc0

10 files changed, +39 −26 lines changed


Cargo.toml

Lines changed: 2 additions & 2 deletions
```diff
@@ -109,5 +109,5 @@ rand = "0.9.0"
 indoc = "2.0.6"
 owo-colors = "4.2.0"
 json5 = "0.4.1"
-aws-config = "1"
-aws-sdk-s3 = "1"
+aws-config = "1.6.2"
+aws-sdk-s3 = "1.85.0"
```

examples/s3_text_embedding/README.md renamed to examples/amazon_s3_text_embedding/README.md

Lines changed: 15 additions & 5 deletions
````diff
@@ -1,6 +1,6 @@
 This example builds an embedding index based on files stored in an Amazon S3 bucket.
 It continuously updates the index as files are added / updated / deleted in the source bucket:
-it keeps the index in sync with the S3 bucket effortlessly.
+it keeps the index in sync with the Amazon S3 bucket effortlessly.
 
 ## Prerequisite
 
@@ -10,7 +10,7 @@ Before running the example, you need to:
 
 2. Prepare for Amazon S3:
 
-   - **Create an S3 bucket:**
+   - **Create an Amazon S3 bucket:**
     - Go to the [AWS S3 Console](https://s3.console.aws.amazon.com/s3/home) and click **Create bucket**. Give it a unique name and choose a region.
     - Or, use the AWS CLI:
       ```sh
@@ -40,18 +40,28 @@ Before running the example, you need to:
      ```
    - If running on AWS EC2 or Lambda, you can use an [IAM role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) with S3 read permissions.
 
-   - **(Optional) Specify a prefix** to restrict to a subfolder in the bucket by setting `S3_PREFIX` in your `.env`.
+   - **(Optional) Specify a prefix** to restrict to a subfolder in the bucket by setting `AMAZON_S3_PREFIX` in your `.env`.
 
    See [AWS S3 documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) for more details.
 
-3. Create a `.env` file with your S3 bucket name and (optionally) prefix.
+3. Create a `.env` file with your Amazon S3 bucket name and (optionally) prefix.
    Start from copying the `.env.example`, and then edit it to fill in your bucket name and prefix.
 
    ```bash
    cp .env.example .env
    $EDITOR .env
    ```
 
+   Example `.env` file:
+   ```
+   # Database Configuration
+   DATABASE_URL=postgresql://localhost:5432/cocoindex
+
+   # Amazon S3 Configuration
+   AMAZON_S3_BUCKET_NAME=your-bucket-name
+   AMAZON_S3_PREFIX=optional/prefix/path
+   ```
+
 ## Run
 
 Install dependencies:
@@ -72,7 +82,7 @@ Run:
 uv run main.py
 ```
 
-During running, it will keep observing changes in the S3 bucket and update the index automatically.
+During running, it will keep observing changes in the Amazon S3 bucket and update the index automatically.
 At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index.
 
 
````
examples/s3_text_embedding/main.py renamed to examples/amazon_s3_text_embedding/main.py

Lines changed: 8 additions & 8 deletions
```diff
@@ -5,16 +5,16 @@
 import datetime
 import os
 
-@cocoindex.flow_def(name="S3TextEmbedding")
-def s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+@cocoindex.flow_def(name="AmazonS3TextEmbedding")
+def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
     """
-    Define an example flow that embeds text from S3 into a vector database.
+    Define an example flow that embeds text from Amazon S3 into a vector database.
     """
-    bucket_name = os.environ["S3_BUCKET_NAME"]
-    prefix = os.environ.get("S3_PREFIX", None)
+    bucket_name = os.environ["AMAZON_S3_BUCKET_NAME"]
+    prefix = os.environ.get("AMAZON_S3_PREFIX", None)
 
     data_scope["documents"] = flow_builder.add_source(
-        cocoindex.sources.S3(
+        cocoindex.sources.AmazonS3(
             bucket_name=bucket_name,
             prefix=prefix,
             included_patterns=["*.md", "*.txt", "*.docx"],
@@ -46,7 +46,7 @@ def s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco
 
 query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
     name="SemanticsSearch",
-    flow=s3_text_embedding_flow,
+    flow=amazon_s3_text_embedding_flow,
     target_name="doc_embeddings",
     query_transform_flow=lambda text: text.transform(
         cocoindex.functions.SentenceTransformerEmbed(
@@ -56,7 +56,7 @@ def s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco
 @cocoindex.main_fn()
 def _run():
     # Use a `FlowLiveUpdater` to keep the flow data updated.
-    with cocoindex.FlowLiveUpdater(s3_text_embedding_flow):
+    with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
         # Run queries in a loop to demonstrate the query capabilities.
         while True:
             try:
```

python/cocoindex/sources.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -30,7 +30,7 @@ class GoogleDrive(op.SourceSpec):
     recent_changes_poll_interval: datetime.timedelta | None = None
 
 
-class S3(op.SourceSpec):
+class AmazonS3(op.SourceSpec):
     """Import data from an Amazon S3 bucket. Supports optional prefix and file filtering by glob patterns."""
 
     _op_category = op.OpCategory.SOURCE
```

src/ops/registration.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@ use std::sync::{Arc, LazyLock, RwLock, RwLockReadGuard};
 fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
     sources::local_file::Factory.register(registry)?;
     sources::google_drive::Factory.register(registry)?;
-    sources::s3::Factory.register(registry)?;
+    sources::amazon_s3::Factory.register(registry)?;
 
     functions::parse_json::Factory.register(registry)?;
     functions::split_recursively::Factory.register(registry)?;
```
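
For context on the pattern this hunk touches: each source module exposes a `Factory` that registers itself into a shared registry keyed by source name, which is why the rename only changes the module path here. A minimal, hypothetical sketch of that shape — the `SourceFactory` trait, the `register` direction, and the `HashMap`-backed registry below are simplified stand-ins, not CocoIndex's actual definitions:

```rust
use std::collections::HashMap;

// Simplified stand-in for a factory that knows how to build one source type.
trait SourceFactory {
    fn name(&self) -> &str;
}

// Simplified stand-in for ExecutorFactoryRegistry: a name -> factory map.
struct ExecutorFactoryRegistry {
    factories: HashMap<String, Box<dyn SourceFactory>>,
}

impl ExecutorFactoryRegistry {
    fn register(&mut self, factory: Box<dyn SourceFactory>) -> Result<(), String> {
        let name = factory.name().to_string();
        // Refuse duplicates so two sources can never claim the same name.
        if self.factories.contains_key(&name) {
            return Err(format!("factory `{name}` is already registered"));
        }
        self.factories.insert(name, factory);
        Ok(())
    }
}
```

Under this shape, the renamed `sources::amazon_s3::Factory` registers under whatever `fn name()` returns — which this commit changes to `"AmazonS3"` in the source file below.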

src/ops/sources/s3.rs renamed to src/ops/sources/amazon_s3.rs

Lines changed: 11 additions & 8 deletions
```diff
@@ -49,17 +49,17 @@ impl SourceExecutor for Executor {
         &'a self,
         _options: &'a SourceExecutorListOptions,
     ) -> BoxStream<'a, Result<Vec<SourceRowMetadata>>> {
-        let client = self.client.clone();
-        let bucket = self.bucket_name.clone();
-        let prefix = self.prefix.clone();
-        let included_glob_set = self.included_glob_set.clone();
-        let excluded_glob_set = self.excluded_glob_set.clone();
+        let client = &self.client;
+        let bucket = &self.bucket_name;
+        let prefix = &self.prefix;
+        let included_glob_set = &self.included_glob_set;
+        let excluded_glob_set = &self.excluded_glob_set;
         try_stream! {
             let mut continuation_token = None;
             loop {
                 let mut req = client
                     .list_objects_v2()
-                    .bucket(&bucket);
+                    .bucket(bucket);
                 if let Some(ref p) = prefix {
                     req = req.prefix(p);
                 }
```
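
A note on the hunk above: the listing method ties its returned stream to `&'a self` through `BoxStream<'a, …>`, so the generator created by `try_stream!` can capture the executor's fields by reference for that same lifetime, instead of cloning the client, bucket name, prefix, and glob sets on every call. A minimal sketch of the same borrowing pattern, assuming the `async-stream`, `futures`, and `anyhow` crates (the `Lister` type is a made-up stand-in for the executor):

```rust
use anyhow::Result;
use async_stream::try_stream;
use futures::stream::BoxStream;

struct Lister {
    items: Vec<String>,
}

impl Lister {
    // The stream's lifetime is tied to `&'a self`, so fields can be
    // captured by reference instead of being cloned up front.
    fn list<'a>(&'a self) -> BoxStream<'a, Result<String>> {
        let items = &self.items; // borrow, no per-call clone
        Box::pin(try_stream! {
            for item in items {
                yield item.clone(); // clone only what is actually yielded
            }
        })
    }
}
```

The caller just has to finish consuming the stream before mutating or dropping the owner, which the `'a` bound enforces at compile time.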
```diff
@@ -151,7 +151,7 @@ impl SourceFactoryBase for Factory {
     type Spec = Spec;
 
     fn name(&self) -> &str {
-        "S3"
+        "AmazonS3"
     }
 
     fn get_output_schema(
@@ -189,7 +189,10 @@
         _context: Arc<FlowInstanceContext>,
     ) -> Result<Box<dyn SourceExecutor>> {
         let region_provider = RegionProviderChain::default_provider().or_else(Region::new("us-east-1"));
-        let config = aws_config::from_env().region(region_provider).load().await;
+        let config = aws_config::defaults(aws_config::BehaviorVersion::latest())
+            .region(region_provider)
+            .load()
+            .await;
         let client = Client::new(&config);
         Ok(Box::new(Executor {
             client,
```
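
The other substantive change in this file swaps the `aws_config::from_env()` entry point (deprecated in aws-config 1.x) for `aws_config::defaults(...)`, which names a `BehaviorVersion` explicitly; that is also why Cargo.toml above pins `aws-config = "1.6.2"`. A standalone sketch of the same client construction, assuming aws-config and aws-sdk-s3 1.x (the `make_client` wrapper is illustrative, not part of the commit):

```rust
use aws_config::meta::region::RegionProviderChain;
use aws_config::{BehaviorVersion, Region};
use aws_sdk_s3::Client;

async fn make_client() -> Client {
    // Resolve the region from the environment or profile, falling back to us-east-1.
    let region_provider =
        RegionProviderChain::default_provider().or_else(Region::new("us-east-1"));
    // `defaults` makes the SDK's behavior version explicit; the deprecated
    // `from_env()` entry point did not take one.
    let config = aws_config::defaults(BehaviorVersion::latest())
        .region(region_provider)
        .load()
        .await;
    Client::new(&config)
}
```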

src/ops/sources/mod.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 pub mod google_drive;
 pub mod local_file;
-pub mod s3;
+pub mod amazon_s3;
```
