all tensors in ModelInput should be on pinned memory for non-blocking host-to-device data transfer (#2985)

TroyGarden · facebook-github-bot · commit 60442e6aa904 · 2025-05-20T12:29:00.000-07:00
Summary: Pull Request resolved: #2985 # context * `KeyedJaggedTensor` has a `pin_memory` method, so there's no need to pin its tensors manually. * The `pin_memory()` call on input KJTs is important for training. NOTE: In the prod training scenario it's recommended that `TrainModelInput` be created on pinned memory for a fast transfer to GPU. For more, see [pin_memory](https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html#pin-memory). * ModelInput example ``` if pin_memory: float_features = float_features.pin_memory() label = label.pin_memory() idlist_features: Optional[KeyedJaggedTensor] = ( None if idlist_features is None else idlist_features.pin_memory() ) idscore_features: Optional[KeyedJaggedTensor] = ( None if idscore_features is None else idscore_features.pin_memory() ) return ModelInput( float_features=float_features, idlist_features=idlist_features, idscore_features=idscore_features, label=label, ) ``` WARNING: All the tensors in `TrainModelInput` should be pinned in memory, not just the KJTs. Otherwise you'll find that CPU execution is still blocked by `_to_copy` even though most of the (host-to-device) data transfer is non-blocking. {F1978313151} {F1978313156} Reviewed By: tao-jia Differential Revision: D74434209 fbshipit-source-id: c7ad466b8d278044b2e2b9dd8f89489545f3060a
diff --git a/torchrec/distributed/test_utils/test_input.py b/torchrec/distributed/test_utils/test_input.py
@@ -287,7 +287,6 @@ def generate(
                 offsets_dtype=offsets_dtype,
                 lengths_dtype=lengths_dtype,
                 all_zeros=all_zeros,
-                pin_memory=pin_memory,
             )
             if tables is not None and len(tables) > 0
             else None
@@ -306,7 +305,6 @@ def generate(
                 offsets_dtype=offsets_dtype,
                 lengths_dtype=lengths_dtype,
                 all_zeros=all_zeros,
-                pin_memory=pin_memory,
             )
             if weighted_tables is not None and len(weighted_tables) > 0
             else None
@@ -317,8 +315,16 @@ def generate(
             else torch.rand((batch_size,), device=device)
         )
         if pin_memory:
+            # all tensors in `ModelInput` should be on pinned memory otherwise
+            # the `_to_copy` (host-to-device) data transfer still blocks cpu execution
             float_features = float_features.pin_memory()
             label = label.pin_memory()
+            idlist_features: Optional[KeyedJaggedTensor] = (
+                None if idlist_features is None else idlist_features.pin_memory()
+            )
+            idscore_features: Optional[KeyedJaggedTensor] = (
+                None if idscore_features is None else idscore_features.pin_memory()
+            )
         return ModelInput(
             float_features=float_features,
             idlist_features=idlist_features,
@@ -417,18 +423,12 @@ def _assemble_kjt(
         device: Optional[torch.device] = None,
         use_offsets: bool = False,
         offsets_dtype: torch.dtype = torch.int64,
-        pin_memory: bool = False,
     ) -> KeyedJaggedTensor:
         """
         Assembles a KeyedJaggedTensor (KJT) from the provided per-feature lengths and indices.
 
         This method is used to generate corresponding local_batches and global_batch KJTs.
         It concatenates the lengths and indices for each feature to form a complete KJT.
-
-        The `pin_memory()` call for all KJT tensors are important for training benchmark, and
-        also valid argument for the prod training scenario: TrainModelInput should be created
-        on pinned memory for a fast transfer to gpu. For more on pin_memory:
-        https://pytorch.org/tutorials/intermediate/pinmem_nonblock.html#pin-memory
         """
 
         lengths = torch.cat(lengths_per_feature)
@@ -440,11 +440,6 @@ def _assemble_kjt(
                 [torch.tensor([0], device=device), lengths.cumsum(0)]
             ).to(offsets_dtype)
             lengths = None
-        if pin_memory:
-            indices = indices.pin_memory()
-            lengths = lengths.pin_memory() if lengths is not None else None
-            weights = weights.pin_memory() if weights is not None else None
-            offsets = offsets.pin_memory() if offsets is not None else None
         return KeyedJaggedTensor(features, indices, weights, lengths, offsets)
 
     @staticmethod
@@ -463,7 +458,6 @@ def create_standard_kjt(
         offsets_dtype: torch.dtype = torch.int64,
         lengths_dtype: torch.dtype = torch.int64,
         all_zeros: bool = False,
-        pin_memory: bool = False,
     ) -> KeyedJaggedTensor:
         features, lengths_per_feature, indices_per_feature = (
             ModelInput._create_features_lengths_indices(
@@ -486,7 +480,6 @@ def create_standard_kjt(
             device=device,
             use_offsets=use_offsets,
             offsets_dtype=offsets_dtype,
-            pin_memory=pin_memory,
         )
 
     @staticmethod