fixup! Encapsulate file mirroring in service class

nadove-ucsc · nadove-ucsc · commit 37c49006f834 · 2025-05-05T18:44:47.000-07:00
diff --git a/scripts/mirror_file.py b/scripts/mirror_file.py
@@ -65,7 +65,7 @@ def mirror_file(catalog: CatalogName, file_uuid: str, part_size: int) -> str:
     upload_id = service.begin_mirroring_file(file)
 
     def mirror_parts():
-        part = FilePart.head(file, part_size)
+        part = FilePart.first(file, part_size)
         while part is not None:
             yield service.mirror_file_part(catalog, file, part, upload_id)
             part = part.next(file)
diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py
@@ -49,40 +49,69 @@
 
 @attrs.frozen(auto_attribs=True, kw_only=True)
 class FilePart(SerializableAttrs):
-    part_number: int  # Starts at 1
-    start: int
-    end: int  # Included in the part
-
-    # https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
+    """
+    A part of a mirrored file
+    """
+    #: The part number, starting at 0 for the first part. Note that the S3 API
+    #: numbers parts starting at 1.
+    #:
+    index: int
+
+    #: Offset of the first byte of this part, relative to the start of the file
+    offset: int
+
+    #: The size of this part
+    #:
+    size: int
+
+    #: Various quotas related to parts and part sizes
+    #: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
+    #:
     min_size: ClassVar[int] = 5 * 1024 ** 2
-    default_size: ClassVar[int] = 50 * 1024 ** 2
     max_size: ClassVar[int] = 5 * 1024 ** 3
-    max_number: ClassVar[int] = 10000
-
-    @property
-    def size(self) -> int:
-        return self.end - self.start + 1
+    max_num_parts: ClassVar[int] = 10000
+
+    #: We observe a download rate of ~14 MB/s. Download time should ideally be
+    #: 1/4 of the Lambda timeout. Since we track the ETag of each part in SQS
+    #: messages, message size becomes another constraint: we observe ETags to be
+    #: 32 byte hexadecimal strings which, if represented in a JSON array, take
+    #: up 35 bytes per item, 36 if the comma is followed by a space. With a
+    #: maximum SQS message size of 256 KiB, we can store approximately 7280
+    #: ETags in an SQS message, so the largest file we can mirror using a part
+    #: size of 256 MiB is 1.5 TiB.
+    #:
+    default_size: ClassVar[int] = 256 * 1024 ** 2
 
     @classmethod
-    def head(cls, file: File, part_size: int) -> Self:
-        assert file.size is not None, R('File size unknown', file)
-        assert cls.min_size <= part_size <= cls.max_size, R('Invalid part size', part_size)
+    def first(cls, file: File, part_size: int) -> Self:
+        """
+        The first part of the given file, using the given part size.
+        """
+        assert file.size is not None, R(
+            'File size unknown', file)
+        assert cls.min_size <= part_size <= cls.max_size, R(
+            'Invalid part size', part_size)
         part_count = math.ceil(file.size / part_size)
-        assert part_count <= 10000, R(
+        assert part_count <= cls.max_num_parts, R(
             'Part size is too small for this file', part_size, file)
-        return cls(part_number=1, start=0, end=min(part_size, file.size))
+        return cls(index=0, offset=0, size=min(part_size, file.size))
 
     def next(self, file: File) -> Self | None:
+        """
+        The part following this part in the given file, or None if this is the
+        last part.
+        """
         assert file.size is not None, R('File size unknown', file)
-        if self.end == file.size:
+        stop = self.offset + self.size
+        if stop == file.size:
             return None
-        elif self.end < file.size:
+        elif 0 < stop < file.size:
             return attr.evolve(self,
-                               part_number=self.part_number + 1,
-                               start=self.end + 1,
-                               end=min(self.end + self.size, file.size))
+                               index=self.index + 1,
+                               offset=stop,
+                               size=min(self.size, file.size - stop))
         else:
-            assert False, R('Invalid part range for this file', self, file)
+            assert False, R('Part range exceeds file size', self, file)
 
 
 class MirrorService(HasCachedHttpClient):
@@ -116,7 +145,7 @@ def mirror_file_part(self,
         upload = self._get_upload(file, upload_id)
         file_content = self._download(catalog, file, part)
         return self._storage.upload_multipart_part(file_content,
-                                                   part.part_number,
+                                                   part.index + 1,
                                                    upload)
 
     def finish_mirroring_file(self,
@@ -163,7 +192,8 @@ def _download(self,
             size = file.size
             expected_status = 200
         else:
-            headers = {'Range': f'bytes={part.start}-{part.end}'}
+            # The last byte position in an HTTP Range header is inclusive
+            headers = {'Range': f'bytes={part.offset}-{part.offset + part.size - 1}'}
             size = part.size
             expected_status = 206
         # Ideally we would stream the response, but boto only supports uploading