Skip to content

Commit 71bfdc3

Browse files
authored
Remove __len__ from CombinedStreamingDataset (#19321)
1 parent b446b08 commit 71bfdc3

File tree

1 file changed

+5
-6
lines changed

1 file changed

+5
-6
lines changed

src/lightning/data/streaming/combined.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,11 @@ class CombinedStreamingDataset(IterableDataset):
2727
"""The `CombinedStreamingDataset` enables to stream data from multiple StreamingDataset with the sampling ratio of
2828
your choice.
2929
30-
Additionally, the `CombinedStreamingDataset` keeps track of the number of
31-
samples fetched to enable resumability of the datasets.
30+
Additionally, the `CombinedStreamingDataset` keeps track of the number of samples fetched to enable resumability
31+
of the datasets.
32+
33+
Note that due to the random sampling, the number of samples returned from the iterator is variable and a function
34+
of the given seed. The combined dataset will raise a StopIteration as soon as any of the datasets is exhausted.
3235
3336
"""
3437

@@ -71,10 +74,6 @@ def _set_use_streaming_dataloader(self, use_streaming_dataloader: bool) -> None:
7174
# Used to prevent returning num_samples_yielded when using PyTorch DataLoader
7275
self._use_streaming_dataloader = use_streaming_dataloader
7376

74-
def __len__(self) -> int:
75-
assert self._weights
76-
return int(min([1 / w * len(d) for w, d in zip(self._weights, self._datasets) if w > 0]))
77-
7877
def __iter__(self) -> Iterator[Any]:
7978
assert self._weights
8079

0 commit comments

Comments
 (0)