
Add ParallelStreamingDataset #576


Merged: 35 commits, May 25, 2025
Changes from 31 commits
Commits (35)
0c03590
Add ParallelStreamingDataset
philgzl Apr 19, 2025
aec7632
Add tests
philgzl Apr 24, 2025
a8cb189
Make __len__ an abstract method of base dataset wrapper
philgzl Apr 24, 2025
f81c25b
Update tests
philgzl Apr 28, 2025
77e1847
Add num_cycles attribute
philgzl Apr 29, 2025
aeb6e3f
Finish cooking
philgzl May 1, 2025
adc97e6
Try to clean diff and add docs
philgzl May 1, 2025
6e2c4fe
Fix mypy and 3.8 type hint errors
philgzl May 1, 2025
a5bb83a
Fix mypy errors for real this time
philgzl May 1, 2025
376a28a
Merge branch 'main' into parallel-dataset
bhimrazy May 3, 2025
a7d941a
Merge branch 'main' into parallel-dataset
bhimrazy May 5, 2025
b9a21e4
Remove _BaseDatasetWrapperIterator
philgzl May 5, 2025
2a19ff5
Update tests
philgzl May 5, 2025
bb34245
Fix RuntimeError when some workers are assigned 0 samples
philgzl May 5, 2025
5131107
Fix mypy errors
philgzl May 5, 2025
b299b1b
Increase CI timeout from 35 to 45 minutes
deependujha May 7, 2025
216ee0e
Update test_parallel.py to skip tests on macOS in addition to Windows
deependujha May 7, 2025
65d0776
Add transform
philgzl May 8, 2025
1f6563f
Update README.md
philgzl May 8, 2025
6462652
Fix can't pickle local object error
philgzl May 9, 2025
63d0bbe
Update README.md
philgzl May 9, 2025
47aa5c5
Skip more tests on win32 and darwin
philgzl May 9, 2025
2b92d46
Update docstrings
philgzl May 9, 2025
673b716
Add comment in get_len
philgzl May 9, 2025
f49bd19
Replace tmpdir and tmdir_factory with tmp_path and tmp_path_factory
philgzl May 9, 2025
9e06851
Merge branch 'main' into parallel-dataset
deependujha May 11, 2025
696e471
Apply suggestions
philgzl May 18, 2025
f880bbd
Update tests
philgzl May 18, 2025
1f6bebe
Update README.md
philgzl May 18, 2025
3e81d4d
Merge branch 'main' into parallel-dataset
philgzl May 18, 2025
59a52ad
Fix list type hint
philgzl May 18, 2025
ec4d706
Change samples_yieled to samples_yielded
philgzl May 22, 2025
f7c63a9
Skip even more tests to fit in macos CI time limit
philgzl May 22, 2025
b1a821a
Merge branch 'main' into parallel-dataset
philgzl May 22, 2025
67c0269
Merge branch 'main' into parallel-dataset
bhimrazy May 25, 2025
77 changes: 77 additions & 0 deletions README.md
@@ -649,6 +649,83 @@ for batch in tqdm(train_dataloader):
```
</details>

<details>
<summary> ✅ Parallel streaming</summary>
&nbsp;

While `CombinedStreamingDataset` fetches a sample from one of the wrapped datasets at each iteration, `ParallelStreamingDataset` fetches a sample from all the wrapped datasets at each iteration:

```python
from litdata import StreamingDataset, ParallelStreamingDataset, StreamingDataLoader
from tqdm import tqdm

parallel_dataset = ParallelStreamingDataset(
[
StreamingDataset(input_dir="input_dir_1"),
StreamingDataset(input_dir="input_dir_2"),
],
)

dataloader = StreamingDataLoader(parallel_dataset)

for batch_1, batch_2 in tqdm(dataloader):  # one batch from each wrapped dataset
pass
```

This is useful for generating new data on the fly from one sample of each dataset. To do so, provide a `transform` function to `ParallelStreamingDataset`:

```python
def transform(samples: Tuple[Any, ...]):
    sample_1, sample_2 = samples  # as many samples as wrapped datasets
    return sample_1 + sample_2  # example transformation

parallel_dataset = ParallelStreamingDataset([dset_1, dset_2], transform=transform)

dataloader = StreamingDataLoader(parallel_dataset)

for transformed_batch in tqdm(dataloader):
pass
```

If the transformation requires random number generation, the internal random number generators provided by `ParallelStreamingDataset` can be used. These are seeded with the current dataset state at the beginning of each epoch, which makes the transformation reproducible and resumable. To use them, define a `transform` that takes a dictionary of random number generators as its second argument:

```python
def transform(samples: Tuple[Any, ...], rngs: Dict[str, Any]):
    sample_1, sample_2 = samples  # as many samples as wrapped datasets
    rng = rngs["random"]  # "random", "numpy" and "torch" keys available
    return rng.random() * sample_1 + rng.random() * sample_2  # example transformation

parallel_dataset = ParallelStreamingDataset([dset_1, dset_2], transform=transform)
```
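
The same pattern works with the other generators in the dictionary. Below is a minimal sketch using the `"torch"` entry; it assumes the wrapped datasets yield tensors and that the entry behaves like a `torch.Generator`, neither of which is guaranteed by the snippet above:

```python
import torch

def transform(samples, rngs):
    sample_1, sample_2 = samples
    gen = rngs["torch"]  # assumed to be a torch.Generator
    weight = torch.rand(1, generator=gen)  # reproducible mixing weight
    return weight * sample_1 + (1 - weight) * sample_2

parallel_dataset = ParallelStreamingDataset([dset_1, dset_2], transform=transform)
```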
</details>

<details>
<summary> ✅ Cycle datasets</summary>
&nbsp;

`ParallelStreamingDataset` can also be used to cycle a `StreamingDataset`. This decouples the epoch length from the number of samples in the dataset.

To do so, set the `length` option to the desired number of samples to yield per epoch. If `length` is greater than the number of samples in the dataset, the dataset is cycled. At the beginning of a new epoch, the dataset resumes from where it left off at the end of the previous epoch.

```python
from litdata import StreamingDataset, ParallelStreamingDataset, StreamingDataLoader
from tqdm import tqdm

dataset = StreamingDataset(input_dir="input_dir")

cycled_dataset = ParallelStreamingDataset([dataset], length=100)

print(len(cycled_dataset))  # 100

dataloader = StreamingDataLoader(cycled_dataset)

for batch, in tqdm(dataloader):  # note the trailing comma: each batch is a 1-tuple
pass
```
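
To make the resume-on-new-epoch behaviour concrete, here is a hedged sketch reusing `cycled_dataset` and `dataloader` from the snippet above; the 60-sample dataset size and the index arithmetic in the comments are hypothetical and assume shuffling is disabled:

```python
# Suppose the wrapped dataset holds 60 samples and length=100.
# Epoch 1 yields samples 0..59, cycles, then yields samples 0..39 again.
# Epoch 2 does not restart from 0: it resumes the cycle at sample 40.
for epoch in range(2):
    for batch, in dataloader:
        pass  # 100 samples per epoch, continuing the cycle across epochs
```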

You can even set `length` to `float("inf")` for an infinite dataset!
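
Since an infinite dataset never ends on its own, the consuming loop needs its own stopping condition. A minimal sketch, using `itertools.islice` as one way to bound the loop (the cutoff of 1000 batches is arbitrary):

```python
from itertools import islice

infinite_dataset = ParallelStreamingDataset([dataset], length=float("inf"))

dataloader = StreamingDataLoader(infinite_dataset)

for batch, in islice(dataloader, 1000):  # stop after 1000 batches of the endless stream
    pass
```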
</details>

<details>
<summary> ✅ Merge datasets</summary>
&nbsp;
2 changes: 2 additions & 0 deletions src/litdata/__init__.py
@@ -18,6 +18,7 @@
from litdata.streaming.dataloader import StreamingDataLoader
from litdata.streaming.dataset import StreamingDataset
from litdata.streaming.item_loader import TokensLoader
from litdata.streaming.parallel import ParallelStreamingDataset
from litdata.streaming.writer import index_parquet_dataset
from litdata.utilities.breakpoint import breakpoint
from litdata.utilities.hf_dataset import index_hf_dataset
@@ -28,6 +29,7 @@
"CombinedStreamingDataset",
"StreamingDataLoader",
"TokensLoader",
"ParallelStreamingDataset",
"map",
"optimize",
"walk",
2 changes: 2 additions & 0 deletions src/litdata/streaming/__init__.py
@@ -16,11 +16,13 @@
from litdata.streaming.dataloader import StreamingDataLoader
from litdata.streaming.dataset import StreamingDataset
from litdata.streaming.item_loader import TokensLoader
from litdata.streaming.parallel import ParallelStreamingDataset

__all__ = [
"Cache",
"StreamingDataset",
"CombinedStreamingDataset",
"StreamingDataLoader",
"TokensLoader",
"ParallelStreamingDataset",
]
78 changes: 7 additions & 71 deletions src/litdata/streaming/combined.py
@@ -16,15 +16,15 @@
from copy import deepcopy
from typing import Any, Dict, Iterator, List, Literal, Optional, Sequence

from torch.utils.data import IterableDataset

from litdata.debugger import ChromeTraceColors, _get_log_msg
from litdata.streaming.dataset import StreamingDataset
from litdata.utilities.base import (
__NUM_SAMPLES_YIELDED_KEY__,
__SAMPLES_KEY__,
_BaseStreamingDatasetWrapper,
)
from litdata.utilities.env import _WorkerEnv

__NUM_SAMPLES_YIELDED_KEY__ = "__NUM_SAMPLES_YIELDED__"
__SAMPLES_KEY__ = "__SAMPLES__"

logger = logging.getLogger("litdata.streaming.combined")


@@ -36,7 +36,7 @@ class BatchingMethod:
BatchingMethodType = Literal["stratified", "per_stream"]


class CombinedStreamingDataset(IterableDataset):
class CombinedStreamingDataset(_BaseStreamingDatasetWrapper):
"""Enables to stream data from multiple StreamingDataset with the sampling ratio of
your choice.

@@ -99,7 +99,7 @@ def __init__(

self._iterator: Optional[_CombinedDatasetIterator] = None
self._use_streaming_dataloader = False
self._num_samples_yielded: Optional[List[int]] = None
self._num_samples_yielded: Optional[Dict[int, List[int]]] = None
self._current_epoch = 0
self.num_workers = 1
self.batch_size = 1
Expand All @@ -119,11 +119,6 @@ def __len__(self) -> Optional[int]:
def _get_total_length(self) -> int:
return sum(self._get_len(d) for d in self._datasets)

def _get_len(self, d: Any) -> int:
if isinstance(d, StreamingDataset):
return d.get_len(self.num_workers, self.batch_size)
return len(d)

def set_epoch(self, current_epoch: int) -> None:
"""Set the current epoch to the datasets on epoch starts.

@@ -134,40 +129,6 @@ def set_epoch(self, current_epoch: int) -> None:
for dataset in self._datasets:
dataset.set_epoch(current_epoch)

def set_shuffle(self, shuffle: bool) -> None:
"""Set the current shuffle to the datasets."""
for dataset in self._datasets:
dataset.set_shuffle(shuffle)

def set_batch_size(self, batch_size: int) -> None:
"""Set the current batch size to the datasets."""
self.batch_size = batch_size
for dataset in self._datasets:
dataset.set_batch_size(batch_size)

def set_num_workers(self, num_workers: int) -> None:
"""Set the current number of workers to the datasets."""
for dataset in self._datasets:
dataset.set_num_workers(num_workers)

def set_drop_last(self, drop_last: bool) -> None:
"""Set the current drop_last to the datasets."""
for dataset in self._datasets:
dataset.set_drop_last(drop_last)

def reset_state_dict(self) -> None:
"""Reset the state of the dataset."""
for dataset in self._datasets:
dataset.reset_state_dict()

def _check_datasets(self, datasets: List[StreamingDataset]) -> None:
if any(not isinstance(d, StreamingDataset) for d in datasets):
raise RuntimeError("The provided datasets should be instances of the StreamingDataset.")

def _set_use_streaming_dataloader(self, use_streaming_dataloader: bool) -> None:
# Used to prevent returning num_samples_yielded when using PyTorch DataLoader
self._use_streaming_dataloader = use_streaming_dataloader

def __iter__(self) -> Iterator[Any]:
assert self._weights

@@ -199,31 +160,6 @@ def state_dict(
return _state_dict(self._datasets, num_samples_yielded, num_workers, batch_size)
return self._iterator.state_dict(num_workers, batch_size)

def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
if not state_dict:
return

if len(state_dict["dataset"]) != len(self._datasets):
if not self._force_override_state_dict:
raise RuntimeError(
f"The provided state doesn't match the current number of datasets: {self._datasets}."
)
if len(state_dict["dataset"]) > len(self._datasets):
raise RuntimeError(
"Currently it's only possible to add datasets to the end of the dataset list when overriding state"
)

for dataset_idx, dataset in enumerate(self._datasets):
if str(dataset_idx) in state_dict["dataset"]:
dataset.load_state_dict(state_dict["dataset"][str(dataset_idx)])

elif not self._force_override_state_dict:
raise RuntimeError(f"The provided state doesn't contain the index {dataset_idx}.")

# Used to iterate over the sampler to avoid sampling the same samples
if self._use_streaming_dataloader:
self._num_samples_yielded = state_dict["num_samples_yielded"]


class _CombinedDatasetIterator(Iterator):
def __init__(