feat: DIA-2202: Support imported tasks that point to different buckets (#7458)

hakan458 · hakan458 · matt-bernstein · web-flow · commit b8e917a04655 · 2025-05-08T13:27:11.000-07:00
Co-authored-by: hakan458 <hakan@heartex.com>
Co-authored-by: matt-bernstein <matt-bernstein@users.noreply.github.com>
diff --git a/label_studio/io_storages/README.md b/label_studio/io_storages/README.md
@@ -184,4 +184,29 @@ The Storage Proxy API behavior can be configured using the following environment
 | `RESOLVER_PROXY_MAX_RANGE_SIZE` | Maximum size in bytes for a single range request | 7*1024*1024 |
 | `RESOLVER_PROXY_CACHE_TIMEOUT` | Cache TTL in seconds for proxy responses | 3600 |
 
-These optimizations ensure that the Proxy API remains responsive and resource-efficient, even when handling large files or many concurrent requests.
+These optimizations ensure that the Proxy API remains responsive and resource-efficient, even when handling large files or many concurrent requests.
+
+## Multiple Storages and URL Resolving
+
+There are use cases where multiple storages can/must be used in a single project. This can cause some confusion as to which storage gets used when. Here are some common cases and how to set up multiple storages properly.
+
+### Case 1 - Tasks Referencing Other Buckets
+* bucket-A containing JSON tasks
+* bucket-B containing images/text/other data
+* Tasks synced from bucket-A have references to data in bucket-B
+
+##### How To Set Up
+* Add storage 1 for bucket-A
+* Add storage 2 for bucket-B (might be same or different credentials than bucket-A)
+* Sync storage 1
+* All references to data in bucket-B will be resolved using storage 2 automatically
+
+### Case 2 - Buckets with Different Credentials
+* bucket-A accessible by credentials 1
+* bucket-B accessible by credentials 2
+
+##### How To Set Up
+* Add storage 1 for bucket-A with credentials 1
+* Add storage 2 for bucket-B with credentials 2
+* Sync both storages
+* The appropriate storage will be used to resolve URLs/generate presigned URLs
diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py
@@ -27,7 +27,7 @@
 from django.utils import timezone
 from django.utils.translation import gettext_lazy as _
 from django_rq import job
-from io_storages.utils import get_uri_via_regex
+from io_storages.utils import get_uri_via_regex, parse_bucket_uri
 from rq.job import Job
 from tasks.models import Annotation, Task
 from tasks.serializers import AnnotationSerializer, PredictionSerializer
@@ -255,8 +255,19 @@ def can_resolve_scheme(self, url: Union[str, None]) -> bool:
             return False
         # TODO: Search for occurrences inside string, e.g. for cases like "gs://bucket/file.pdf" or "<embed src='gs://bucket/file.pdf'/>"
         _, prefix = get_uri_via_regex(url, prefixes=(self.url_scheme,))
-        if prefix == self.url_scheme:
-            return True
+        bucket_uri = parse_bucket_uri(url, self)
+
+        # If there is a prefix and the bucket matches the storage's bucket/container/path
+        if prefix == self.url_scheme and bucket_uri:
+            # bucket is used for s3 and gcs
+            if hasattr(self, 'bucket') and bucket_uri.bucket == self.bucket:
+                return True
+            # container is used for azure blob
+            if hasattr(self, 'container') and bucket_uri.bucket == self.container:
+                return True
+            # path is used for redis
+            if hasattr(self, 'path') and bucket_uri.bucket == self.path:
+                return True
         # if not found any occurrences - this Storage can't resolve url
         return False
 
diff --git a/label_studio/io_storages/functions.py b/label_studio/io_storages/functions.py
@@ -54,6 +54,5 @@ def get_storage_by_url(url: Union[str, List, Dict], storage_objects: Iterable[Im
         for storage_object in storage_objects:
             if storage_object.can_resolve_url(url):
                 # note: only first found storage_object will be used for link resolving
-                # probably we need to use more advanced can_resolve_url mechanics
-                # that takes into account not only prefixes, but bucket path too
+                # can_resolve_url now checks both the scheme and the bucket to ensure the correct storage is used
                 return storage_object
diff --git a/label_studio/io_storages/s3/serializers.py b/label_studio/io_storages/s3/serializers.py
@@ -68,6 +68,8 @@ def validate(self, data):
         except TypeError as e:
             logger.info(f'It seems access keys are incorrect: {e}', exc_info=True)
             raise ValidationError('It seems access keys are incorrect')
+        except KeyError:
+            raise ValidationError(f'{storage.url_scheme}://{storage.bucket}/{storage.prefix} not found.')
         return data
 
 
diff --git a/label_studio/tasks/models.py b/label_studio/tasks/models.py
@@ -420,12 +420,10 @@ def prepare_filename(filename):
     def resolve_storage_uri(self, url) -> Optional[Mapping[str, Any]]:
         from io_storages.functions import get_storage_by_url
 
-        storage = self.storage
-        project = self.project
-
-        if not storage:
-            storage_objects = project.get_all_import_storage_objects
-            storage = get_storage_by_url(url, storage_objects)
+        # Instead of using self.storage, we check all storage objects for the project to
+        # support imported tasks that point to another bucket
+        storage_objects = self.project.get_all_import_storage_objects
+        storage = get_storage_by_url(url, storage_objects)
 
         if storage:
             return {
@@ -468,10 +466,9 @@ def resolve_uri(self, task_data, project):
 
                 # project storage
                 # TODO: to resolve nested lists and dicts we should improve get_storage_by_url(),
-                # TODO: problem with current approach: it can be used only the first storage that get_storage_by_url
-                # TODO: returns. However, maybe the second storage will resolve uris properly.
-                # TODO: resolve_uri() already supports them
-                storage = self.storage or get_storage_by_url(task_data[field], storage_objects)
+                # Now always using get_storage_by_url to ensure the storage with the correct bucket is used
+                # As a last fallback we can use self.storage which is the storage the Task was imported from
+                storage = get_storage_by_url(task_data[field], storage_objects) or self.storage
                 if storage:
                     try:
                         resolved_uri = storage.resolve_uri(task_data[field], self)