Commit ceebcf0

faran928 authored and facebook-github-bot committed
Support ssd device propagation in Torch Rec for RecSys Inference (#2961)
Summary: Pull Request resolved: #2961

For RecSys inference when tables are offloaded onto SSD:

1. Specify and propagate the tables to be offloaded to SSD in TorchRec via FUSED_PARAMS, as discussed with TroyGarden.
2. Continue using torch.device("cpu") as the compute device while using a separate input/output dist for SSD (the in-house SSD TBE kernel, based on EmbeddingDB, is different from the CPU TBE kernel) by creating a new device group for SSD.

device_type_from_sharding_info will be renamed to storage_device_type_from_sharding_info to clarify its role.

Reviewed By: jiayisuse

Differential Revision: D74378974

fbshipit-source-id: ad528cb35230837ccfc9dac23eff8cf4f9adac6f
1 parent d6031f9 commit ceebcf0
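As a usage sketch of the flow described in the summary (hedged: the table names are made up, and passing fused_params through the sharder constructor is assumed from the existing TorchRec inference sharder API rather than confirmed by this diff), a caller lists the tables to offload in FUSED_PARAM_SSD_TABLE_LIST while compute stays on cpu:

# Hypothetical sketch: request SSD offload for two (made-up) tables at
# inference sharding time. The fused_params constructor argument is assumed
# from the existing TorchRec inference sharders; table names are examples only.
import torch
from torchrec.distributed.fused_params import FUSED_PARAM_SSD_TABLE_LIST
from torchrec.distributed.quant_embeddingbag import QuantEmbeddingBagCollectionSharder

fused_params = {
    # Tables listed here are routed to the new "ssd" device group; all other
    # tables keep the device group derived from their parameter sharding.
    FUSED_PARAM_SSD_TABLE_LIST: ["large_table_0", "large_table_1"],
}

sharder = QuantEmbeddingBagCollectionSharder(fused_params=fused_params)
# Compute still happens on cpu; only the storage / TBE kernel grouping differs
# for the SSD-flagged tables.
compute_device = torch.device("cpu")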

File tree

7 files changed: +84 -29 lines

torchrec/distributed/embedding.py (+16 -3)

@@ -45,6 +45,10 @@
     ShardedEmbeddingModule,
     ShardingType,
 )
+from torchrec.distributed.fused_params import (
+    FUSED_PARAM_IS_SSD_TABLE,
+    FUSED_PARAM_SSD_TABLE_LIST,
+)
 from torchrec.distributed.sharding.cw_sequence_sharding import (
     CwSequenceEmbeddingSharding,
 )
@@ -184,9 +188,16 @@ def create_sharding_infos_by_sharding_device_group(
         assert param_name in parameter_by_name or param_name in state_dict
         param = parameter_by_name.get(param_name, state_dict[param_name])

-        device_group: TypeUnion[str, Tuple[str, ...]] = (
-            get_device_from_parameter_sharding(parameter_sharding)
-        )
+        # if a table name is overridden to be offloaded to ssd storage for inference
+        # update the device group accordingly
+        if fused_params and table_name in fused_params.get(
+            FUSED_PARAM_SSD_TABLE_LIST, {}
+        ):
+            device_group: TypeUnion[str, Tuple[str, ...]] = "ssd"
+        else:
+            device_group: TypeUnion[str, Tuple[str, ...]] = (
+                get_device_from_parameter_sharding(parameter_sharding)
+            )
         if (
             parameter_sharding.sharding_type,
             device_group,
@@ -214,6 +225,8 @@ def create_sharding_infos_by_sharding_device_group(
                 per_table_fused_params, parameter_sharding
             )
             per_table_fused_params = convert_to_fbgemm_types(per_table_fused_params)
+            if device_group == "ssd":
+                per_table_fused_params.update({FUSED_PARAM_IS_SSD_TABLE: True})

             sharding_type_device_group_to_sharding_infos[
                 (parameter_sharding.sharding_type, device_group)
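As a small illustration of the grouping key used above (table names and sharding type are hypothetical), SSD-flagged tables end up in their own (sharding_type, device_group) bucket even though both buckets compute on cpu:

# Illustration only: mimics the (sharding_type, device_group) bucketing above
# with plain Python; "table_a"/"table_b" and the sharding type are made up.
from collections import defaultdict

sharding_type_device_group_to_sharding_infos = defaultdict(list)
for table_name, device_group in [("table_a", "cpu"), ("table_b", "ssd")]:
    key = ("row_wise", device_group)  # (parameter_sharding.sharding_type, device_group)
    sharding_type_device_group_to_sharding_infos[key].append(table_name)

print(dict(sharding_type_device_group_to_sharding_infos))
# {('row_wise', 'cpu'): ['table_a'], ('row_wise', 'ssd'): ['table_b']}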

torchrec/distributed/embedding_sharding.py (+5 -1)

@@ -34,6 +34,7 @@
     ListOfKJTList,
     ShardedEmbeddingTable,
 )
+from torchrec.distributed.fused_params import FUSED_PARAM_SSD_TABLE_LIST
 from torchrec.distributed.types import (
     Awaitable,
     EmbeddingEvent,
@@ -420,7 +421,7 @@ def _get_grouping_fused_params(
 ) -> Optional[Dict[str, Any]]:
     """
     Only shallow copy the fused params we need for grouping tables into TBEs. In
-    particular, we do not copy cache_load_factor.
+    particular, we do not copy cache_load_factor or ssd embedding table list.
     """
     grouping_fused_params: Optional[Dict[str, Any]] = copy.copy(fused_params)

@@ -430,6 +431,9 @@ def _get_grouping_fused_params(
         if CACHE_LOAD_FACTOR_STR in grouping_fused_params:
             del grouping_fused_params[CACHE_LOAD_FACTOR_STR]

+        if FUSED_PARAM_SSD_TABLE_LIST in grouping_fused_params:
+            del grouping_fused_params[FUSED_PARAM_SSD_TABLE_LIST]
+
         if grouping_fused_params.get(USE_ONE_TBE_PER_TABLE, False):
             # Replace with unique value to force it into singleton group.
             # Name is used as unique value so we won't group multiple shard belonging

torchrec/distributed/embeddingbag.py (+16 -1)

@@ -51,6 +51,10 @@
     KJTList,
     ShardedEmbeddingModule,
 )
+from torchrec.distributed.fused_params import (
+    FUSED_PARAM_IS_SSD_TABLE,
+    FUSED_PARAM_SSD_TABLE_LIST,
+)
 from torchrec.distributed.sharding.cw_sharding import CwPooledEmbeddingSharding
 from torchrec.distributed.sharding.dp_sharding import DpPooledEmbeddingSharding
 from torchrec.distributed.sharding.dynamic_sharding import (
@@ -227,7 +231,16 @@ def create_sharding_infos_by_sharding_device_group(
         assert param_name in parameter_by_name or param_name in state_dict
         param = parameter_by_name.get(param_name, state_dict[param_name])

-        device_group = get_device_from_parameter_sharding(parameter_sharding)
+        # if a table name is overridden to be offloaded to ssd storage for inference
+        # update the device group accordingly
+        if fused_params and table_name in fused_params.get(
+            FUSED_PARAM_SSD_TABLE_LIST, {}
+        ):
+            device_group: Union[str, Tuple[str, ...]] = "ssd"
+        else:
+            device_group: Union[str, Tuple[str, ...]] = (
+                get_device_from_parameter_sharding(parameter_sharding)
+            )

         if (
             parameter_sharding.sharding_type,
@@ -257,6 +270,8 @@ def create_sharding_infos_by_sharding_device_group(
                 per_table_fused_params, parameter_sharding
             )
             per_table_fused_params = convert_to_fbgemm_types(per_table_fused_params)
+            if device_group == "ssd":
+                per_table_fused_params.update({FUSED_PARAM_IS_SSD_TABLE: True})

             sharding_type_device_group_to_sharding_infos[
                 (parameter_sharding.sharding_type, device_group)

torchrec/distributed/fused_params.py (+8)

@@ -28,6 +28,12 @@
 # with certain ways to split models.
 FUSED_PARAM_LENGTHS_TO_OFFSETS_LOOKUP: str = "__register_lengths_to_offsets_lookup"

+# Fused param storing list of cpu embedding tables offloaded to ssd to scale
+# the embedding table size
+FUSED_PARAM_SSD_TABLE_LIST: str = "__register_ssd_table_list"
+# Bool fused param per table to check if the table is offloaded to SSD
+FUSED_PARAM_IS_SSD_TABLE: str = "__register_is_ssd_table"
+

 class TBEToRegisterMixIn:
     def get_tbes_to_register(
@@ -111,5 +117,7 @@ def tbe_fused_params(
         fused_params_for_tbe.pop(FUSED_PARAM_BOUNDS_CHECK_MODE)
     if FUSED_PARAM_LENGTHS_TO_OFFSETS_LOOKUP in fused_params_for_tbe:
         fused_params_for_tbe.pop(FUSED_PARAM_LENGTHS_TO_OFFSETS_LOOKUP)
+    if FUSED_PARAM_SSD_TABLE_LIST in fused_params_for_tbe:
+        fused_params_for_tbe.pop(FUSED_PARAM_SSD_TABLE_LIST)

     return fused_params_for_tbe
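A minimal sketch of how the two new constants divide responsibilities (plain dicts, not the real TorchRec call path; the "output_dtype" entry is a made-up placeholder): the module-level table list drives grouping and is stripped before params reach the TBE kernel, while the per-table bool set during sharding-info creation travels with the table's fused params.

# Sketch only: plain-dict illustration of the intended split between the two
# new fused params; not the actual TorchRec sharding code path.
from torchrec.distributed.fused_params import (
    FUSED_PARAM_IS_SSD_TABLE,
    FUSED_PARAM_SSD_TABLE_LIST,
)

# Module-level fused params supplied by the caller.
fused_params = {
    FUSED_PARAM_SSD_TABLE_LIST: ["table_b"],
    "output_dtype": "fp16",  # made-up placeholder for an unrelated fused param
}

# Per-table fused params for an SSD table, as derived during sharding-info
# creation: the grouping-only table list is dropped, the per-table flag is set.
per_table_fused_params = dict(fused_params)
per_table_fused_params.pop(FUSED_PARAM_SSD_TABLE_LIST)
per_table_fused_params[FUSED_PARAM_IS_SSD_TABLE] = True

print(per_table_fused_params)
# {'output_dtype': 'fp16', '__register_is_ssd_table': True}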

torchrec/distributed/quant_embedding.py (+21 -15)

@@ -47,6 +47,7 @@
     ShardingType,
 )
 from torchrec.distributed.fused_params import (
+    FUSED_PARAM_IS_SSD_TABLE,
     FUSED_PARAM_QUANT_STATE_DICT_SPLIT_SCALE_BIAS,
     FUSED_PARAM_REGISTER_TBE_BOOL,
     get_tbes_to_register_from_iterable,
@@ -173,12 +174,17 @@ def get_device_from_parameter_sharding(
 def get_device_from_sharding_infos(
     emb_shard_infos: List[EmbeddingShardingInfo],
 ) -> Union[str, Tuple[str, ...]]:
-    res = list(
-        {
-            get_device_from_parameter_sharding(ps.param_sharding)
-            for ps in emb_shard_infos
-        }
-    )
+    res_set = set()
+    for emb_shard_info in emb_shard_infos:
+        if emb_shard_info.fused_params and emb_shard_info.fused_params.get(
+            FUSED_PARAM_IS_SSD_TABLE, False
+        ):
+            res_set.add("ssd")
+        else:
+            res_set.add(
+                get_device_from_parameter_sharding(emb_shard_info.param_sharding)
+            )
+    res = list(res_set)
     assert len(res) == 1, "All shards should be on the same type of device"
     return res[0]

@@ -201,11 +207,11 @@ def create_infer_embedding_sharding(
     List[torch.Tensor],
     List[torch.Tensor],
 ]:
-    device_type_from_sharding_infos: Union[str, Tuple[str, ...]] = (
+    storage_device_type_from_sharding_infos: Union[str, Tuple[str, ...]] = (
         get_device_from_sharding_infos(sharding_infos)
     )

-    if device_type_from_sharding_infos in ["cuda", "mtia"]:
+    if storage_device_type_from_sharding_infos in ["cuda", "mtia"]:
         if sharding_type == ShardingType.TABLE_WISE.value:
             return InferTwSequenceEmbeddingSharding(sharding_infos, env, device)
         elif sharding_type == ShardingType.COLUMN_WISE.value:
@@ -215,31 +221,31 @@ def create_infer_embedding_sharding(
                 sharding_infos=sharding_infos,
                 env=env,
                 device=device,
-                device_type_from_sharding_infos=device_type_from_sharding_infos,
+                device_type_from_sharding_infos=storage_device_type_from_sharding_infos,
             )
         else:
             raise ValueError(
-                f"Sharding type not supported {sharding_type} for {device_type_from_sharding_infos} sharding"
+                f"Sharding type not supported {sharding_type} for {storage_device_type_from_sharding_infos} sharding"
             )
-    elif device_type_from_sharding_infos == "cpu" or isinstance(
-        device_type_from_sharding_infos, tuple
+    elif storage_device_type_from_sharding_infos in ["cpu", "ssd"] or isinstance(
+        storage_device_type_from_sharding_infos, tuple
     ):
         if sharding_type == ShardingType.ROW_WISE.value:
             return InferRwSequenceEmbeddingSharding(
                 sharding_infos=sharding_infos,
                 env=env,
                 device=device,
-                device_type_from_sharding_infos=device_type_from_sharding_infos,
+                device_type_from_sharding_infos=storage_device_type_from_sharding_infos,
             )
         elif sharding_type == ShardingType.TABLE_WISE.value:
             return InferTwSequenceEmbeddingSharding(sharding_infos, env, device)
         else:
             raise ValueError(
-                f"Sharding type not supported {sharding_type} for {device_type_from_sharding_infos} sharding"
+                f"Sharding type not supported {sharding_type} for {storage_device_type_from_sharding_infos} sharding"
             )
     else:
         raise ValueError(
-            f"Sharding type not supported {sharding_type} for {device_type_from_sharding_infos} sharding"
+            f"Sharding type not supported {sharding_type} for {storage_device_type_from_sharding_infos} sharding"
         )
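For intuition on the rewritten get_device_from_sharding_infos, a tiny sketch using SimpleNamespace stand-ins for EmbeddingShardingInfo (only the two attributes the function reads are provided; real infos carry many more fields): a group whose tables all carry FUSED_PARAM_IS_SSD_TABLE resolves to the "ssd" storage device type, and mixing SSD and non-SSD tables in one group would trip the single-device assert.

# Sketch with SimpleNamespace stand-ins for EmbeddingShardingInfo; only
# .fused_params and .param_sharding are read by the function below.
from types import SimpleNamespace
from torchrec.distributed.fused_params import FUSED_PARAM_IS_SSD_TABLE
from torchrec.distributed.quant_embedding import get_device_from_sharding_infos

ssd_info = SimpleNamespace(
    fused_params={FUSED_PARAM_IS_SSD_TABLE: True},
    param_sharding=None,  # never consulted for SSD-flagged tables
)
# All infos in the group are SSD-flagged, so the group resolves to "ssd".
assert get_device_from_sharding_infos([ssd_info]) == "ssd"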

torchrec/distributed/quant_embeddingbag.py (+14 -8)

@@ -35,6 +35,7 @@
     create_sharding_infos_by_sharding_device_group,
 )
 from torchrec.distributed.fused_params import (
+    FUSED_PARAM_IS_SSD_TABLE,
     FUSED_PARAM_QUANT_STATE_DICT_SPLIT_SCALE_BIAS,
     FUSED_PARAM_REGISTER_TBE_BOOL,
     get_tbes_to_register_from_iterable,
@@ -97,12 +98,17 @@ def get_device_from_parameter_sharding(
 def get_device_from_sharding_infos(
     emb_shard_infos: List[EmbeddingShardingInfo],
 ) -> Union[str, Tuple[str, ...]]:
-    res = list(
-        {
-            get_device_from_parameter_sharding(ps.param_sharding)
-            for ps in emb_shard_infos
-        }
-    )
+    res_set = set()
+    for emb_shard_info in emb_shard_infos:
+        if emb_shard_info.fused_params and emb_shard_info.fused_params.get(
+            FUSED_PARAM_IS_SSD_TABLE, False
+        ):
+            res_set.add("ssd")
+        else:
+            res_set.add(
+                get_device_from_parameter_sharding(emb_shard_info.param_sharding)
+            )
+    res = list(res_set)
     assert len(res) == 1, "All shards should be on the same type of device"
     return res[0]

@@ -131,7 +137,7 @@ def create_infer_embedding_bag_sharding(
     NullShardingContext, InputDistOutputs, List[torch.Tensor], torch.Tensor
 ]:
     propogate_device: bool = get_propogate_device()
-    device_type_from_sharding_infos: Union[str, Tuple[str, ...]] = (
+    storage_device_type_from_sharding_infos: Union[str, Tuple[str, ...]] = (
         get_device_from_sharding_infos(sharding_infos)
     )
     if sharding_type == ShardingType.TABLE_WISE.value:
@@ -143,7 +149,7 @@ def create_infer_embedding_bag_sharding(
             sharding_infos,
             env,
             device=device if propogate_device else None,
-            device_type_from_sharding_infos=device_type_from_sharding_infos,
+            device_type_from_sharding_infos=storage_device_type_from_sharding_infos,
         )
     elif sharding_type == ShardingType.COLUMN_WISE.value:
         return InferCwPooledEmbeddingSharding(

torchrec/distributed/sharding/rw_sequence_sharding.py (+4 -1)

@@ -214,6 +214,9 @@ def forward(
             # using _device_type_from_sharding_infos to iterate on local_embs list as
             # that's a better practice.
             for i, device_type in enumerate(self._device_type_from_sharding_infos):
+                assert (
+                    device_type != "ssd"
+                ), "Heterogeneous sharding across multiple storage device types for a single table is not supported for the ssd storage device type"
                 if device_type != "cpu":
                     non_cpu_local_embs.append(
                         _get_batching_hinted_output(
@@ -235,7 +238,7 @@ def forward(
                     result.append(non_cpu_local_embs_dist[index])
                     index += 1
             return result
-        elif self._device_type_from_sharding_infos == "cpu":
+        elif self._device_type_from_sharding_infos in ["cpu", "ssd"]:
             # for cpu sharder, output dist should be a no-op
             return local_embs
         else:
