Add support for a custom collate function (collate_fn) in the StreamingDataLoader (#163)

tchaton · web-flow · commit 07f0483c2ad5 · 2024-06-12T12:05:26.000+01:00
diff --git a/src/litdata/streaming/dataloader.py b/src/litdata/streaming/dataloader.py
@@ -476,6 +476,23 @@ def _try_put_index(self) -> None:
             super()._try_put_index()
 
 
+class StreamingDataLoaderCollateFn:
+    def __init__(self, collate_fn: Optional[Callable] = None) -> None:
+        self.collate_fn = collate_fn or default_collate
+
+    def __call__(self, items: List[Any]) -> Any:
+        if len(items) > 0 and isinstance(items[0], dict) and __NUM_SAMPLES_YIELDED_KEY__ in items[0]:
+            batch = self.collate_fn([item[__SAMPLES_KEY__] for item in items])
+            return {
+                __SAMPLES_KEY__: batch,
+                __NUM_SAMPLES_YIELDED_KEY__: [
+                    torch.cumsum([torch.tensor(item[__NUM_SAMPLES_YIELDED_KEY__]) for item in items][-1], dim=0)
+                ],
+            }
+
+        return self.collate_fn(items)
+
+
 class StreamingDataLoader(DataLoader):
     r"""The StreamingDataLoader combines a dataset and a sampler, and provides an iterable over the given dataset.
 
@@ -541,6 +558,7 @@ def __init__(
         prefetch_factor: Optional[int] = None,
         shuffle: Optional[bool] = None,
         drop_last: Optional[bool] = False,
+        collate_fn: Optional[Callable] = None,
         **kwargs: Any,
     ) -> None:  # pyright: ignore
         if not isinstance(dataset, (StreamingDataset, CombinedStreamingDataset)):
@@ -563,6 +581,9 @@ def __init__(
         if profile_batches and num_workers == 0:
             raise ValueError("Profiling is supported only with num_workers >= 1.")
 
+        if collate_fn:
+            collate_fn = StreamingDataLoaderCollateFn(collate_fn)
+
         self.current_epoch = 0
         self.batch_size = batch_size
         self.num_workers = num_workers
@@ -581,6 +602,7 @@ def __init__(
             batch_size=batch_size,
             num_workers=num_workers,
             prefetch_factor=(10 if num_workers > 0 else None) if prefetch_factor is None else prefetch_factor,
+            collate_fn=collate_fn,
             **kwargs,
         )  # type: ignore
 
diff --git a/tests/streaming/test_dataloader.py b/tests/streaming/test_dataloader.py
@@ -119,3 +119,31 @@ def test_dataloader_shuffle():
     StreamingDataLoader(dataset, batch_size=2, num_workers=1, shuffle=True)
     assert dataset._datasets[0].shuffle
     assert dataset._datasets[1].shuffle
+
+
+class TestStatefulDatasetDict(TestStatefulDataset):
+    def __next__(self):
+        return {"value": super().__next__()}
+
+
+def custom_collate_fn(samples):
+    assert len(samples) == 2
+    assert "value" in samples[0]
+    return "received"
+
+
+def test_custom_collate():
+    dataset = TestCombinedStreamingDataset(
+        [TestStatefulDatasetDict(10, 1), TestStatefulDatasetDict(10, -1)],
+        42,
+        weights=(0.5, 0.5),
+        iterate_over_all=False,
+    )
+    assert dataset._datasets[0].shuffle is None
+    assert dataset._datasets[1].shuffle is None
+    dataloader = StreamingDataLoader(dataset, batch_size=2, num_workers=0, shuffle=True, collate_fn=custom_collate_fn)
+    assert dataset._datasets[0].shuffle
+    assert dataset._datasets[1].shuffle
+    dataloader_iter = iter(dataloader)
+    assert next(dataloader_iter) == "received"
+    assert dataloader._num_samples_yielded_combined[0] == [2]