From 60e7ec87cda08649b71b0951a3bda8e2be70bea9 Mon Sep 17 00:00:00 2001 From: thomas Date: Tue, 6 Feb 2024 11:35:36 +0000 Subject: [PATCH 1/7] update --- .../data/streaming/data_processor.py | 4 +++- src/lightning/data/streaming/serializers.py | 21 ++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/lightning/data/streaming/data_processor.py b/src/lightning/data/streaming/data_processor.py index 3fdebf29b5448..8d5a5e25a0918 100644 --- a/src/lightning/data/streaming/data_processor.py +++ b/src/lightning/data/streaming/data_processor.py @@ -498,7 +498,7 @@ def is_path(element: Any) -> bool: element: str = str(Path(element).resolve()) return ( element.startswith(self.input_dir.path) - if self.input_dir.path is not None + if self.input_dir.path is not None and _IS_IN_STUDIO else os.path.exists(element) ) @@ -509,6 +509,8 @@ def is_path(element: Any) -> bool: index: str(Path(element).resolve()) for index, element in enumerate(flattened_item) if is_path(element) } + print(indexed_paths) + if len(indexed_paths) == 0: raise ValueError( f"The provided item {item} didn't contain any filepaths. The input_dir is {self.input_dir.path}." diff --git a/src/lightning/data/streaming/serializers.py b/src/lightning/data/streaming/serializers.py index 82f0583bd9696..a79b6e628db29 100644 --- a/src/lightning/data/streaming/serializers.py +++ b/src/lightning/data/streaming/serializers.py @@ -28,6 +28,7 @@ _PIL_AVAILABLE = RequirementCache("PIL") _TORCH_VISION_AVAILABLE = RequirementCache("torchvision") _AV_AVAILABLE = RequirementCache("av") +_RASTERIO_AVAILABLE = RequirementCache("rasterio") if _PIL_AVAILABLE: from PIL import Image @@ -40,7 +41,6 @@ from torchvision.io import decode_jpeg from torchvision.transforms.functional import pil_to_tensor - class Serializer(ABC): """The base interface for any serializers. @@ -323,10 +323,29 @@ def can_serialize(self, data: Any) -> bool: return isinstance(data, str) and os.path.exists(data) and any(data.endswith(ext) for ext in self._EXTENSIONS) +class TifSerializer(Serializer): + def serialize(self, filepath: str) -> Tuple[bytes, Optional[str]]: + _, file_extension = os.path.splitext(filepath) + with open(filepath, "rb") as f: + return f.read(), file_extension.replace(".", "").lower() + + def deserialize(self, data: bytes) -> Any: + if not _RASTERIO_AVAILABLE: + raise ModuleNotFoundError("rasterio is required. Run: `pip install rasterio`") + + from rasterio.io import MemoryFile + + return MemoryFile(data) + + def can_serialize(self, data: Any) -> bool: + return isinstance(data, str) and os.path.exists(data) + + _SERIALIZERS = OrderedDict( **{ "video": VideoSerializer(), "file": FileSerializer(), + "tif": TifSerializer(), "pil": PILSerializer(), "int": IntSerializer(), "jpeg": JPEGSerializer(), From 1cff257b9e5cef0656a3d436fe5c08d426de5401 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 6 Feb 2024 12:38:39 +0000 Subject: [PATCH 2/7] update --- _notebooks | 1 - src/lightning/data/streaming/data_processor.py | 11 ++++++----- src/lightning/data/streaming/serializers.py | 13 ++----------- 3 files changed, 8 insertions(+), 17 deletions(-) delete mode 160000 _notebooks diff --git a/_notebooks b/_notebooks deleted file mode 160000 index 543a8d8200662..0000000000000 --- a/_notebooks +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 543a8d82006620906dc9eb669eab18d06ebe6863 diff --git a/src/lightning/data/streaming/data_processor.py b/src/lightning/data/streaming/data_processor.py index 8d5a5e25a0918..9714e1b4fdf72 100644 --- a/src/lightning/data/streaming/data_processor.py +++ b/src/lightning/data/streaming/data_processor.py @@ -496,11 +496,12 @@ def is_path(element: Any) -> bool: return False element: str = str(Path(element).resolve()) - return ( - element.startswith(self.input_dir.path) - if self.input_dir.path is not None and _IS_IN_STUDIO - else os.path.exists(element) - ) + if _IS_IN_STUDIO: + if self.input_dir.path is not None: + if self.input_dir.path.startswith("/teamspace/studios/this_studio"): + return os.path.exists(element) + return element.startswith(self.input_dir.path) + return os.path.exists(element) # For speed reasons, we assume starting with `self.input_dir` is enough to be a real file. # Other alternative would be too slow. diff --git a/src/lightning/data/streaming/serializers.py b/src/lightning/data/streaming/serializers.py index a79b6e628db29..9fdec1e959ad9 100644 --- a/src/lightning/data/streaming/serializers.py +++ b/src/lightning/data/streaming/serializers.py @@ -323,12 +323,7 @@ def can_serialize(self, data: Any) -> bool: return isinstance(data, str) and os.path.exists(data) and any(data.endswith(ext) for ext in self._EXTENSIONS) -class TifSerializer(Serializer): - def serialize(self, filepath: str) -> Tuple[bytes, Optional[str]]: - _, file_extension = os.path.splitext(filepath) - with open(filepath, "rb") as f: - return f.read(), file_extension.replace(".", "").lower() - +class TifSerializer(FileSerializer): def deserialize(self, data: bytes) -> Any: if not _RASTERIO_AVAILABLE: raise ModuleNotFoundError("rasterio is required. Run: `pip install rasterio`") @@ -337,15 +332,11 @@ def deserialize(self, data: bytes) -> Any: return MemoryFile(data) - def can_serialize(self, data: Any) -> bool: - return isinstance(data, str) and os.path.exists(data) - - _SERIALIZERS = OrderedDict( **{ "video": VideoSerializer(), - "file": FileSerializer(), "tif": TifSerializer(), + "file": FileSerializer(), "pil": PILSerializer(), "int": IntSerializer(), "jpeg": JPEGSerializer(), From d45f72c6f6f92acaad7e00aae9b139ceb7210d29 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 12:39:34 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/data/streaming/data_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/data/streaming/data_processor.py b/src/lightning/data/streaming/data_processor.py index 9714e1b4fdf72..a0b0fb34d9f9f 100644 --- a/src/lightning/data/streaming/data_processor.py +++ b/src/lightning/data/streaming/data_processor.py @@ -499,7 +499,7 @@ def is_path(element: Any) -> bool: if _IS_IN_STUDIO: if self.input_dir.path is not None: if self.input_dir.path.startswith("/teamspace/studios/this_studio"): - return os.path.exists(element) + return os.path.exists(element) return element.startswith(self.input_dir.path) return os.path.exists(element) From 1b3b723fb103e6657007afd68525c8ef50942a30 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 6 Feb 2024 12:47:04 +0000 Subject: [PATCH 4/7] update --- src/lightning/data/streaming/serializers.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/lightning/data/streaming/serializers.py b/src/lightning/data/streaming/serializers.py index 9fdec1e959ad9..b689429953c21 100644 --- a/src/lightning/data/streaming/serializers.py +++ b/src/lightning/data/streaming/serializers.py @@ -28,7 +28,6 @@ _PIL_AVAILABLE = RequirementCache("PIL") _TORCH_VISION_AVAILABLE = RequirementCache("torchvision") _AV_AVAILABLE = RequirementCache("av") -_RASTERIO_AVAILABLE = RequirementCache("rasterio") if _PIL_AVAILABLE: from PIL import Image @@ -288,7 +287,7 @@ def serialize(self, filepath: str) -> Tuple[bytes, Optional[str]]: return f.read(), file_extension.replace(".", "").lower() def deserialize(self, data: bytes) -> Any: - pass + return data def can_serialize(self, data: Any) -> bool: return isinstance(data, str) and os.path.exists(data) @@ -323,19 +322,10 @@ def can_serialize(self, data: Any) -> bool: return isinstance(data, str) and os.path.exists(data) and any(data.endswith(ext) for ext in self._EXTENSIONS) -class TifSerializer(FileSerializer): - def deserialize(self, data: bytes) -> Any: - if not _RASTERIO_AVAILABLE: - raise ModuleNotFoundError("rasterio is required. Run: `pip install rasterio`") - - from rasterio.io import MemoryFile - - return MemoryFile(data) - _SERIALIZERS = OrderedDict( **{ "video": VideoSerializer(), - "tif": TifSerializer(), + "tif": FileSerializer(), "file": FileSerializer(), "pil": PILSerializer(), "int": IntSerializer(), From fbdbc4d17197e33edbe281428be0b4634176b523 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 6 Feb 2024 12:47:40 +0000 Subject: [PATCH 5/7] update --- _notebooks | 1 + 1 file changed, 1 insertion(+) create mode 160000 _notebooks diff --git a/_notebooks b/_notebooks new file mode 160000 index 0000000000000..543a8d8200662 --- /dev/null +++ b/_notebooks @@ -0,0 +1 @@ +Subproject commit 543a8d82006620906dc9eb669eab18d06ebe6863 From c02da003d533ca738544034bf60129203d33683b Mon Sep 17 00:00:00 2001 From: thomas Date: Tue, 6 Feb 2024 12:48:47 +0000 Subject: [PATCH 6/7] update --- src/lightning/data/streaming/data_processor.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lightning/data/streaming/data_processor.py b/src/lightning/data/streaming/data_processor.py index 9714e1b4fdf72..0449a1218623a 100644 --- a/src/lightning/data/streaming/data_processor.py +++ b/src/lightning/data/streaming/data_processor.py @@ -496,11 +496,10 @@ def is_path(element: Any) -> bool: return False element: str = str(Path(element).resolve()) - if _IS_IN_STUDIO: - if self.input_dir.path is not None: - if self.input_dir.path.startswith("/teamspace/studios/this_studio"): - return os.path.exists(element) - return element.startswith(self.input_dir.path) + if _IS_IN_STUDIO and self.input_dir.path is not None: + if self.input_dir.path.startswith("/teamspace/studios/this_studio"): + return os.path.exists(element) + return element.startswith(self.input_dir.path) return os.path.exists(element) # For speed reasons, we assume starting with `self.input_dir` is enough to be a real file. @@ -510,8 +509,6 @@ def is_path(element: Any) -> bool: index: str(Path(element).resolve()) for index, element in enumerate(flattened_item) if is_path(element) } - print(indexed_paths) - if len(indexed_paths) == 0: raise ValueError( f"The provided item {item} didn't contain any filepaths. The input_dir is {self.input_dir.path}." From fc94cb12898891e45d91fde060bfbe5f4bcad0ec Mon Sep 17 00:00:00 2001 From: thomas Date: Tue, 6 Feb 2024 13:49:02 +0000 Subject: [PATCH 7/7] update --- tests/tests_data/streaming/test_serializer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tests_data/streaming/test_serializer.py b/tests/tests_data/streaming/test_serializer.py index 26578d41df89f..6aff7318b2e30 100644 --- a/tests/tests_data/streaming/test_serializer.py +++ b/tests/tests_data/streaming/test_serializer.py @@ -44,6 +44,7 @@ def test_serializers(): assert list(_SERIALIZERS.keys()) == [ "video", + "tif", "file", "pil", "int",