
Commit 41bb11e

Merge branch 'main' into per_stream_batching
2 parents: 7cf1b41 + 012baa2

22 files changed: +933 −136 lines

README.md

+93
@@ -236,6 +236,99 @@ dataset = StreamingDataset('s3://my-bucket/my-data', cache_dir="/path/to/cache")

</details>

<details>
<summary> ✅ Stream Hugging Face 🤗 datasets</summary>

&nbsp;

To use your favorite Hugging Face dataset with LitData, simply pass its URL to `StreamingDataset`.

<details>
<summary>How to get the HF dataset URI?</summary>

https://github.com/user-attachments/assets/3ba9e2ef-bf6b-41fc-a578-e4b4113a0e72

</details>

```python
import litdata as ld

hf_uri = "hf://datasets/leonardPKU/clevr_cogen_a_train/data"

ds = ld.StreamingDataset(hf_uri)

for _ds in ds:
    print(f"{_ds[1]}; {_ds[2]}")
```

You don’t need to worry about indexing the dataset or any other setup. **LitData** will **handle all the necessary steps automatically** and `cache` the `index.json` file, so you won’t have to index it again.

This ensures that the next time you stream the dataset, the indexing step is skipped.

&nbsp;

### Indexing the HF dataset (Optional)

If the Hugging Face dataset hasn’t been indexed yet, you can index it first using the `index_hf_dataset` function, and then stream it using the code above.

```python
import litdata as ld

hf_uri = "hf://datasets/leonardPKU/clevr_cogen_a_train/data"

ld.index_hf_dataset(hf_uri)
```

- Indexing the Hugging Face dataset ahead of time makes streaming faster, as it avoids real-time indexing during streaming.

- To use an `HF gated dataset`, ensure the `HF_TOKEN` environment variable is set (see the sketch below).

**Note**: For Hugging Face datasets, `indexing` & `streaming` are supported only for datasets in **`Parquet format`**.
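For example, a minimal sketch for a gated dataset; the token value and the dataset URI below are placeholders, not real identifiers:

```python
import os

import litdata as ld

# Placeholder token and gated dataset URI -- substitute your own.
os.environ["HF_TOKEN"] = "hf_xxxxxxxxxxxxxxxxxxxx"

ds = ld.StreamingDataset("hf://datasets/your-org/your-gated-dataset/data")

for item in ds:
    print(item)
    break
```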
&nbsp;
### Full Workflow for Hugging Face Datasets

For full control over the cache path (i.e., where the `index.json` file will be stored) and other configurations, follow these steps:

1. Index the Hugging Face dataset first:

```python
import litdata as ld

hf_uri = "hf://datasets/open-thoughts/OpenThoughts-114k/data"

ld.index_parquet_dataset(hf_uri, "hf-index-dir")
```

2. Now stream the HF dataset by passing the `HF dataset URI`, the path where the `index.json` file is stored, and `ParquetLoader` as the `item_loader` to the **`StreamingDataset`**:

```python
import litdata as ld
from litdata.streaming.item_loader import ParquetLoader

hf_uri = "hf://datasets/open-thoughts/OpenThoughts-114k/data"

ds = ld.StreamingDataset(hf_uri, item_loader=ParquetLoader(), index_path="hf-index-dir")

for _ds in ds:
    print(f"{_ds[0]}; {_ds[1]}\n")
```
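To feed these samples into a training loop, the dataset can be wrapped in a dataloader. A minimal sketch using `ld.StreamingDataLoader`; the batch size and worker count below are illustrative assumptions:

```python
import litdata as ld
from litdata.streaming.item_loader import ParquetLoader

hf_uri = "hf://datasets/open-thoughts/OpenThoughts-114k/data"

ds = ld.StreamingDataset(hf_uri, item_loader=ParquetLoader(), index_path="hf-index-dir")

# Illustrative settings; tune batch_size and num_workers for your machine.
dl = ld.StreamingDataLoader(ds, batch_size=16, num_workers=4)

for batch in dl:
    print(len(batch))
    break
```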
&nbsp;
### LitData `Optimize` vs `Parquet`

Below is a benchmark on the `ImageNet dataset (155 GB)`, showing that **`optimizing the dataset with LitData is faster and produces a smaller output than raw Parquet files`**.

| **Operation**                     | **Size (GB)** | **Time (seconds)** | **Throughput (images/sec)** |
|-----------------------------------|---------------|--------------------|-----------------------------|
| LitData Optimize Dataset          | 45            | 283.17             | 4000-4700                   |
| Parquet Optimize Dataset          | 51            | 465.96             | 3600-3900                   |
| Index Parquet Dataset (overhead)  | N/A           | 6                   | N/A                         |
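For reference, the "LitData Optimize Dataset" row corresponds to converting the raw images with `ld.optimize`. A minimal sketch of such a conversion; the paths, conversion function, and chunk size are illustrative assumptions, not the exact benchmark setup:

```python
import os
from glob import glob

from PIL import Image

import litdata as ld


def convert(image_path):
    # Store the decoded image together with its class (the parent folder name).
    return {"image": Image.open(image_path), "class": os.path.basename(os.path.dirname(image_path))}


if __name__ == "__main__":
    inputs = glob("imagenet/train/**/*.JPEG", recursive=True)  # assumed local ImageNet layout
    ld.optimize(
        fn=convert,
        inputs=inputs,
        output_dir="optimized-imagenet",  # local or cloud path
        chunk_bytes="64MB",  # illustrative chunk size
        num_workers=8,
    )
```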
</details>
<details>
<summary> ✅ Streams on multi-GPU, multi-node</summary>

pyproject.toml

-1
@@ -72,7 +72,6 @@ lint.ignore = [
     "E731", # Do not assign a lambda expression, use a def
     "S101", # todo: Use of `assert` detected
 ]
-lint.ignore-init-module-imports = true
 # Unlike Flake8, default to a complexity level of 10.
 lint.mccabe.max-complexity = 10
 # Use Google-style docstrings.

requirements/extras.txt

+1
@@ -6,3 +6,4 @@ tqdm
 lightning-sdk==0.1.46 # Must be pinned to ensure compatibility
 google-cloud-storage
 polars
+fsspec

src/litdata/__about__.py

+1-1
@@ -14,7 +14,7 @@

 import time

-__version__ = "0.2.38"
+__version__ = "0.2.39"
 __author__ = "Lightning AI et al."
 __author_email__ = "pytorch@lightning.ai"
 __license__ = "Apache-2.0"

src/litdata/__init__.py

+2
@@ -20,6 +20,7 @@
 from litdata.streaming.item_loader import TokensLoader
 from litdata.streaming.writer import index_parquet_dataset
 from litdata.utilities.breakpoint import breakpoint
+from litdata.utilities.hf_dataset import index_hf_dataset
 from litdata.utilities.train_test_split import train_test_split

 __all__ = [
@@ -33,6 +34,7 @@
     "train_test_split",
     "merge_datasets",
     "index_parquet_dataset",
+    "index_hf_dataset",
     "breakpoint",
 ]
 if RequirementCache("lightning_sdk"):

src/litdata/constants.py

+2
@@ -29,13 +29,15 @@
 _TORCH_GREATER_EQUAL_2_1_0 = RequirementCache("torch>=2.1.0")
 _VIZ_TRACKER_AVAILABLE = RequirementCache("viztracer")
 _BOTO3_AVAILABLE = RequirementCache("boto3")
+_FSSPEC_AVAILABLE = RequirementCache("fsspec")
 _TORCH_AUDIO_AVAILABLE = RequirementCache("torchaudio")
 _ZSTD_AVAILABLE = RequirementCache("zstd")
 _CRYPTOGRAPHY_AVAILABLE = RequirementCache("cryptography")
 _GOOGLE_STORAGE_AVAILABLE = RequirementCache("google.cloud.storage")
 _AZURE_STORAGE_AVAILABLE = RequirementCache("azure.storage.blob")
 _TQDM_AVAILABLE = RequirementCache("tqdm")
 _LIGHTNING_SDK_AVAILABLE = RequirementCache("lightning_sdk")
+_HF_HUB_AVAILABLE = RequirementCache("huggingface_hub")
 _POLARS_AVAILABLE = RequirementCache("polars>1.0.0")
 _DEBUG = bool(int(os.getenv("DEBUG", "1")))
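These `RequirementCache` flags are truthy only when the corresponding package is importable, so call sites can guard optional imports. A minimal sketch of the usual pattern; the helper below is illustrative, not code from this commit:

```python
from litdata.constants import _HF_HUB_AVAILABLE


def _list_parquet_files(repo_id: str) -> list:
    # Import huggingface_hub lazily, and fail with a clear message if it is missing.
    if not _HF_HUB_AVAILABLE:
        raise ModuleNotFoundError("Support for hf:// datasets requires `pip install huggingface_hub`")
    from huggingface_hub import HfFileSystem

    fs = HfFileSystem()
    return [path for path in fs.ls(f"datasets/{repo_id}", detail=False) if path.endswith(".parquet")]
```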

src/litdata/processing/functions.py

+6-3
@@ -22,7 +22,7 @@
 from functools import partial
 from pathlib import Path
 from types import FunctionType
-from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union
 from urllib import parse

 import torch
@@ -52,6 +52,9 @@
 from litdata.utilities.encryption import Encryption
 from litdata.utilities.format import _get_tqdm_iterator_if_available

+if TYPE_CHECKING:
+    from lightning_sdk import Machine
+

 def _is_remote_file(path: str) -> bool:
     obj = parse.urlparse(path)
@@ -194,7 +197,7 @@ def map(
     num_workers: Optional[int] = None,
     fast_dev_run: Union[bool, int] = False,
     num_nodes: Optional[int] = None,
-    machine: Optional[str] = None,
+    machine: Optional[Union["Machine", str]] = None,
     num_downloaders: Optional[int] = None,
     num_uploaders: Optional[int] = None,
     reorder_files: bool = True,
@@ -312,7 +315,7 @@ def optimize(
     num_workers: Optional[int] = None,
     fast_dev_run: bool = False,
     num_nodes: Optional[int] = None,
-    machine: Optional[str] = None,
+    machine: Optional[Union["Machine", str]] = None,
     num_downloaders: Optional[int] = None,
     num_uploaders: Optional[int] = None,
     reorder_files: bool = True,

src/litdata/streaming/config.py

+17-2
@@ -120,11 +120,18 @@ def download_chunk_from_index(self, chunk_index: int) -> None:

         if os.path.exists(local_chunkpath):
             self.try_decompress(local_chunkpath)
+            if self._downloader is not None:
+                # We don't want to redownload the base, but we should mark
+                # it as having been requested by something
+                self._downloader._increment_local_lock(local_chunkpath.replace(f".{self._compressor_name}", ""))
+                pass
             return

         if self._downloader is None:
             return

+        self._downloader._increment_local_lock(local_chunkpath.replace(f".{self._compressor_name}", ""))
+
         self._downloader.download_chunk_from_index(chunk_index)

         self.try_decompress(local_chunkpath)
257264
cache_index_filepath = os.path.join(cache_dir, _INDEX_FILENAME)
258265

259266
if isinstance(remote_dir, str):
260-
downloader = get_downloader_cls(remote_dir, cache_dir, [], storage_options)
261-
downloader.download_file(os.path.join(remote_dir, _INDEX_FILENAME), cache_index_filepath)
267+
# for remote_dir, we try downloading `index.json` file.
268+
# If the files are stored on HF, they don't have an index file, so we can skip downloading it.
269+
if remote_dir.startswith("hf://"):
270+
if not os.path.exists(cache_index_filepath):
271+
raise RuntimeError(
272+
f"This should not have happened. No index.json file found in cache: {cache_index_filepath}"
273+
)
274+
else:
275+
downloader = get_downloader_cls(remote_dir, cache_dir, [], storage_options)
276+
downloader.download_file(os.path.join(remote_dir, _INDEX_FILENAME), cache_index_filepath)
262277

263278
if not os.path.exists(cache_index_filepath):
264279
return None

src/litdata/streaming/dataset.py

+15-2
@@ -26,14 +26,15 @@
 from litdata.helpers import _check_version_and_prompt_upgrade
 from litdata.streaming import Cache
 from litdata.streaming.downloader import get_downloader_cls  # noqa: F401
-from litdata.streaming.item_loader import BaseItemLoader
+from litdata.streaming.item_loader import BaseItemLoader, ParquetLoader
 from litdata.streaming.resolver import Dir, _resolve_dir
 from litdata.streaming.sampler import ChunkedIndex
 from litdata.streaming.serializers import Serializer
 from litdata.streaming.shuffle import FullShuffle, NoShuffle, Shuffle
 from litdata.utilities.dataset_utilities import _should_replace_path, _try_create_cache_dir, subsample_streaming_dataset
 from litdata.utilities.encryption import Encryption
 from litdata.utilities.env import _DistributedEnv, _is_in_dataloader_worker, _WorkerEnv
+from litdata.utilities.hf_dataset import index_hf_dataset
 from litdata.utilities.shuffle import (
     _find_chunks_per_workers_on_which_to_skip_deletion,
     _map_node_worker_rank_to_chunk_indexes_to_not_delete,
@@ -59,6 +60,7 @@ def __init__(
         encryption: Optional[Encryption] = None,
         storage_options: Optional[Dict] = {},
         max_pre_download: int = 2,
+        index_path: Optional[str] = None,
     ) -> None:
         """The streaming dataset can be used once your data have been optimised using the DatasetOptimiser class.

@@ -79,6 +81,9 @@
             encryption: The encryption object to use for decrypting the data.
             storage_options: Additional connection options for accessing storage services.
             max_pre_download: Maximum number of chunks that can be pre-downloaded by the StreamingDataset.
+            index_path: Path to `index.json` for the Parquet dataset.
+                If `index_path` is a directory, the function will look for `index.json` within it.
+                If `index_path` is a full file path, it will use that directly.

         """
         _check_version_and_prompt_upgrade(__version__)
@@ -93,12 +98,20 @@
         input_dir = _resolve_dir(input_dir)
         cache_dir = _resolve_dir(cache_dir)

+        if input_dir.url is not None and input_dir.url.startswith("hf://"):
+            if index_path is None:
+                # no index path provide, load from cache, or try indexing on the go.
+                index_path = index_hf_dataset(input_dir.url)
+            cache_dir.path = index_path
+            input_dir.path = index_path
+            item_loader = ParquetLoader()
+
         self.input_dir = input_dir
         self.cache_dir = cache_dir
         self.subsampled_files: List[str] = []
         self.region_of_interest: List[Tuple[int, int]] = []
         self.subsampled_files, self.region_of_interest = subsample_streaming_dataset(
-            self.input_dir, self.cache_dir, item_loader, subsample, shuffle, seed, storage_options
+            self.input_dir, self.cache_dir, item_loader, subsample, shuffle, seed, storage_options, index_path
         )

         self.item_loader = item_loader