
Commit cb3fc81

Authored by schopra8, pre-commit-ci[bot], bhimrazy, and Borda
Feat: Add per_stream batching method to CombinedStreamingDataset (#438)
* Init implementation of per_stream batching
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Address pre-commit issues
* fixed bug in combined.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* expose _set_new_dataset_index() to dataloader
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* WIP - communicate to worker loop
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* remove print
* cleanup print
* updated doc string
* added types and also update docs
* revert: changes from dataloader
* update the combined streaming dataset with batching method
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* revert changes from dataloader
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* add test for combined dataset with per-stream batching
* remove set_batching_method from CombinedStreamingDataset to simplify batch handling
* update comment
* refactor: replace string literals with BatchingMethod constants for clarity
* cleanup
* refactor: variables initialization

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Bhimraj Yadav <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
1 parent 68d23cd commit cb3fc81
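
As a quick orientation before the diff, here is a minimal usage sketch of the new parameter. The `data/dataset_a` and `data/dataset_b` paths are hypothetical placeholders for directories already prepared in litdata's streaming format, and the top-level imports follow the public API exercised by the test added below; treat this as an illustration, not part of the patch.

# Minimal sketch of per_stream batching (hypothetical dataset paths).
from litdata import CombinedStreamingDataset, StreamingDataLoader, StreamingDataset

dataset_a = StreamingDataset(input_dir="data/dataset_a")
dataset_b = StreamingDataset(input_dir="data/dataset_b")

# With batching_method="per_stream", every batch is drawn from a single
# dataset, chosen at random per batch; the default "stratified" method
# instead mixes samples from all datasets within each batch.
combined = CombinedStreamingDataset(
    datasets=[dataset_a, dataset_b],
    seed=42,
    batching_method="per_stream",
)
dataloader = StreamingDataLoader(combined, batch_size=8, drop_last=True)

for batch in dataloader:
    ...  # each batch contains samples from exactly one of the two datasets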

File tree

2 files changed: +68 -1 lines changed


src/litdata/streaming/combined.py (+43 -1)

@@ -14,7 +14,7 @@
 import logging
 import random
 from copy import deepcopy
-from typing import Any, Dict, Iterator, List, Optional, Sequence
+from typing import Any, Dict, Iterator, List, Literal, Optional, Sequence
 
 from torch.utils.data import IterableDataset
 
@@ -28,6 +28,14 @@
 logger = logging.getLogger("litdata.streaming.combined")
 
 
+class BatchingMethod:
+    STRATIFIED = "stratified"
+    PER_STREAM = "per_stream"
+
+
+BatchingMethodType = Literal["stratified", "per_stream"]
+
+
 class CombinedStreamingDataset(IterableDataset):
     """Enables to stream data from multiple StreamingDataset with the sampling ratio of
     your choice.
@@ -46,6 +54,7 @@ def __init__(
         seed: int = 42,
         weights: Optional[Sequence[float]] = None,
         iterate_over_all: bool = True,
+        batching_method: BatchingMethodType = "stratified",
        force_override_state_dict: bool = False,
     ) -> None:
         """Enable to stream data from multiple StreamingDataset with the sampling ratio of your choice.
@@ -56,7 +65,11 @@
             weights: The sampling ratio for the datasets
             iterate_over_all: When iterate_over_all is True, the combined dataset iterates over all the datasets.
                 Otherwise, it stops as soon as one raises a StopIteration.
+            batching_method (str, optional): When batching_method is set to "stratified" (default),
+                batches will include samples from all datasets. On the other hand, when batching_method is "per_stream",
+                batches will consist of samples from a single dataset, which is selected randomly.
             force_override_state_dict: Boolean flag for allowing local arguments to override a loaded state dict.
+
         """
         self._check_datasets(datasets)
 
@@ -90,6 +103,7 @@ def __init__(
         self._current_epoch = 0
         self.num_workers = 1
         self.batch_size = 1
+        self._batching_method: BatchingMethodType = batching_method
 
     def get_len(self, num_workers: int, batch_size: int) -> Optional[int]:
         self.num_workers = num_workers
@@ -170,6 +184,8 @@ def __iter__(self) -> Iterator[Any]:
             self._weights,
             self._use_streaming_dataloader,
             num_samples_yielded,
+            self.batch_size,
+            self._batching_method,
             self._iterate_over_all,
         )
         return self._iterator
@@ -217,6 +233,8 @@ def __init__(
         weights: Sequence[Optional[float]],
         use_streaming_dataloader: bool,
         num_samples_yielded: Any,
+        batch_size: int,
+        batching_method: BatchingMethodType,
         iterate_over_all: bool = False,
     ) -> None:
         self._datasets = datasets
@@ -227,6 +245,8 @@ def __init__(
         self._weights = deepcopy(weights)
         self._rng = random.Random(seed)  # noqa: S311
         self._iterate_over_all = iterate_over_all
+        self._batching_method = batching_method
+        self._batch_size = batch_size
         self._is_done = False
 
         if num_samples_yielded is not None:
@@ -238,6 +258,13 @@ def __init__(
 
         self._use_streaming_dataloader = use_streaming_dataloader
         self._is_done = False
+
+        # Used to track the number of samples yielded in the current batch
+        # and the current dataset index
+        # This is used only when batching_method is set to "per_stream"
+        self._samples_yielded_in_batch = 0
+        self._cur_dataset_index = -1
+
         logger.debug(
             _get_log_msg({"name": "iterating_combined_dataset", "ph": "B", "cname": ChromeTraceColors.LIGHT_BLUE})
         )
@@ -272,6 +299,21 @@ def __next__(self) -> Any:
         return self._get_sample(self._get_dataset_index())
 
     def _get_dataset_index(self) -> int:
+        if self._batching_method == BatchingMethod.STRATIFIED:
+            # For every sample, randomly select a dataset (weighted)
+            dataset_idx = self._set_new_dataset_index()
+        elif self._batching_method == BatchingMethod.PER_STREAM:
+            # For each batch, pick a dataset and stick with it for the whole batch
+            if self._cur_dataset_index == -1 or self._samples_yielded_in_batch >= self._batch_size:
+                self._cur_dataset_index = self._set_new_dataset_index()
+                self._samples_yielded_in_batch = 0
+            dataset_idx = self._cur_dataset_index
+            self._samples_yielded_in_batch += 1
+        else:
+            raise ValueError(f"Invalid batching method: {self._batching_method}")
+        return dataset_idx
+
+    def _set_new_dataset_index(self) -> int:
        # randomly select a dataset index
        indexes = [index for index in self._dataset_indexes if index is not None]
        weights = [w for w in self._weights if w is not None]
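
The heart of the patch is `_get_dataset_index`. To show the per-stream mechanics outside the diff, below is a self-contained, simplified stand-in (an illustration, not the library code): it keeps the same two pieces of state, `_cur_dataset_index` and `_samples_yielded_in_batch`, and re-rolls the weighted choice only at batch boundaries.

import random
from typing import List

# Simplified stand-in for the iterator's per-stream selection logic.
class PerStreamSelector:
    def __init__(self, weights: List[float], batch_size: int, seed: int = 42) -> None:
        self._weights = weights
        self._batch_size = batch_size
        self._rng = random.Random(seed)
        self._samples_yielded_in_batch = 0  # samples drawn from the current dataset
        self._cur_dataset_index = -1        # -1 means "no dataset picked yet"

    def _set_new_dataset_index(self) -> int:
        # Weighted random choice over dataset indexes, as in the patch.
        indexes = list(range(len(self._weights)))
        return self._rng.choices(indexes, weights=self._weights)[0]

    def get_dataset_index(self) -> int:
        # Pick a new dataset at a batch boundary, then stick with it
        # until batch_size samples have been yielded.
        if self._cur_dataset_index == -1 or self._samples_yielded_in_batch >= self._batch_size:
            self._cur_dataset_index = self._set_new_dataset_index()
            self._samples_yielded_in_batch = 0
        self._samples_yielded_in_batch += 1
        return self._cur_dataset_index

selector = PerStreamSelector(weights=[0.5, 0.5], batch_size=4)
picks = [selector.get_dataset_index() for _ in range(12)]
# Every run of 4 consecutive picks shares one dataset index, e.g.
# [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]
print(picks)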

tests/streaming/test_combined.py (+25)

@@ -287,6 +287,31 @@ def test_combined_dataset():
     assert torch.equal(next(dataloader_iter), torch.Tensor([0, 1]))
 
 
+@pytest.mark.parametrize("batch_size", [2, 4])
+@pytest.mark.parametrize("num_workers", [1, 2])
+def test_combined_dataset_with_per_stream_batching(tmpdir, batch_size, num_workers):
+    num_of_datasets = 2
+    dataset_ranges = [(0, 10), (10, 20)]
+    dataset_paths = [str(tmpdir.join(f"dataset_{i}")) for i in range(num_of_datasets)]
+    for dataset_path, (start, end) in zip(dataset_paths, dataset_ranges):
+        os.makedirs(dataset_path)
+        cache = Cache(input_dir=dataset_path, chunk_size=2)
+        for i in range(start, end):
+            cache[i] = i
+        cache.done()
+        cache.merge()
+
+    datasets = [StreamingDataset(input_dir=str(dataset_path)) for dataset_path in dataset_paths]
+    dataset = CombinedStreamingDataset(datasets=datasets, seed=12345, batching_method="per_stream")
+    dataloader = StreamingDataLoader(dataset, batch_size=batch_size, num_workers=num_workers, drop_last=True)
+
+    for batch in dataloader:
+        # Ensure that the batch contains items exclusively from a single dataset
+        assert all(x in range(0, 10) for x in batch) or all(x in range(10, 20) for x in batch), (
+            f"Batch should contain elements from only one dataset but got {batch}"
+        )
+
+
 @pytest.mark.parametrize("batch_size", [1, 2])
 def test_combined_dataset_with_dataloader_and_one_worker(batch_size):
     dataset1 = SimpleDataset(0, 10)
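
To run only the new test locally, invoking pytest by file and keyword should work; the path assumes the repository root, and `-q` just quiets the output.

import pytest

# Run only the new per-stream batching test from the litdata repo root.
pytest.main(["tests/streaming/test_combined.py", "-k", "per_stream_batching", "-q"])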
