Commit ee0264c

Jianbo Liu authored and facebook-github-bot committed
add load checkpoint support for virtual table (#4250)
Summary:
X-link: pytorch/torchrec#3037
X-link: facebookresearch/FBGEMM#1329
Pull Request resolved: #4250

After all of the rebasing and landing, trunk was still missing some of the changes needed for checkpoint loading:

* change `create_virtual_table_global_metadata` to respect `local_weight_count` on each rank, or just use the param size as the number of rows on each rank
* register a `load_state_dict` post-hook (via `register_load_state_dict_post_hook`) in ShardedEmbeddingCollection so that it ignores loading the weight tensor

Reviewed By: emlin

Differential Revision: D75843542

Privacy Context Container: L1138451

fbshipit-source-id: 8b3c8d76bb2e7ba2137c8899de2c03d534f1365c
1 parent 2d3e7e5 commit ee0264c
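
The torchrec half of this fix (registering a load_state_dict post-hook on ShardedEmbeddingCollection) lands via the cross-linked pytorch/torchrec#3037 and is not shown in the diff below. As background, here is a minimal sketch of the mechanism using stock PyTorch; the module and hook names are illustrative, not the TorchRec implementation:

import torch

class ToyShardedCollection(torch.nn.Module):
    """Toy stand-in for torchrec's ShardedEmbeddingCollection."""

    def __init__(self) -> None:
        super().__init__()
        self.dense = torch.nn.Linear(4, 4)

def _ignore_weight_keys(module, incompatible_keys) -> None:
    # Drop the weight keys that virtual-table checkpoint loading intentionally
    # skips, so a strict load_state_dict() no longer raises on them.
    incompatible_keys.missing_keys[:] = [
        k for k in incompatible_keys.missing_keys if not k.endswith(".weight")
    ]

m = ToyShardedCollection()
m.register_load_state_dict_post_hook(_ignore_weight_keys)
# The checkpoint below is missing "dense.weight"; the hook clears that entry
# from missing_keys before the strict check, so the load succeeds.
m.load_state_dict({"dense.bias": torch.zeros(4)}, strict=True)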

File tree

1 file changed: +7 -5 lines

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 7 additions & 5 deletions
@@ -1959,7 +1959,7 @@ def split_optimizer_states(
         # init for checkpointing loading
         assert (
             self._cached_kvzch_data is not None
-            and self._cached_kvzch_data.cached_optimizer_state_per_table is not None
+            and self._cached_kvzch_data.cached_optimizer_state_per_table
         ), "optimizer state is not initialized for load checkpointing"
         return self._cached_kvzch_data.cached_optimizer_state_per_table
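
One behavioral note on this hunk: replacing `is not None` with a plain truthiness check means an empty per-table dict now also trips the assertion, which appears to be the intent, since an empty `cached_optimizer_state_per_table` means checkpoint loading was never initialized:

cached = {}  # an uninitialized per-table optimizer-state dict
print(cached is not None)  # True  -> the old check would pass here
print(bool(cached))        # False -> the truthiness check catches this case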

@@ -2365,13 +2365,17 @@ def streaming_write_weight_and_id_per_table(
         D_rounded = pad4(weight_state.size(1))  # padded to 4 bytes alignment
         dtype = self.weights_precision.as_dtype()
         kvt = torch.classes.fbgemm.KVTensorWrapper(
-            db=self.ssd_db,
             shape=[weight_state.size(0), self.cache_row_dim],
             dtype=dtype,
             row_offset=row_offset,
             snapshot_handle=None,
             sorted_indices=id_tensor,
         )
+        (
+            kvt.set_embedding_rocks_dp_wrapper(self.ssd_db)
+            if self.backend_type == BackendType.SSD
+            else kvt.set_dram_db_wrapper(self.ssd_db)
+        )
         # TODO: make chunk_size configurable or dynamic
         chunk_size = 10000
         row = weight_state.size(0)
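
The point of this hunk is that the backing store is no longer passed through the `db=` constructor argument; it is attached after construction, picking the RocksDB wrapper for the SSD backend and the DRAM wrapper otherwise. A minimal sketch of that dispatch with stand-ins for the fbgemm types (the `BackendType` values and the fake wrapper class below are assumptions for illustration, not the real bindings):

from enum import Enum

class BackendType(Enum):
    # Stand-in for fbgemm_gpu's BackendType enum referenced in the patch.
    SSD = 0
    DRAM = 1

def attach_db(kvt, db, backend_type: BackendType) -> None:
    # Mirrors the patched logic as a plain if/else; the patch itself uses a
    # parenthesized conditional expression evaluated for its side effect.
    if backend_type == BackendType.SSD:
        kvt.set_embedding_rocks_dp_wrapper(db)  # RocksDB-backed SSD store
    else:
        kvt.set_dram_db_wrapper(db)  # DRAM-backed store

class _FakeKVT:
    # Duck-typed stand-in for torch.classes.fbgemm.KVTensorWrapper.
    def set_embedding_rocks_dp_wrapper(self, db) -> None:
        print("attached RocksDB wrapper to", db)

    def set_dram_db_wrapper(self, db) -> None:
        print("attached DRAM wrapper to", db)

attach_db(_FakeKVT(), "ssd_db_handle", BackendType.SSD)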
@@ -2417,9 +2421,7 @@ def enable_load_state_dict_mode(self) -> None:
         logging.info(
             f"for checkpoint loading, table {i}, weight_state shape is {weight_state.shape}, opt_state shape is {opt_state.shape}"
         )
-        id_tensor = torch.zeros(
-            (self.local_weight_counts[i], 1), dtype=torch.int64, device="cpu"
-        )
+        id_tensor = torch.zeros((rows, 1), dtype=torch.int64, device="cpu")
         # pyre-ignore [16]
         self._cached_kvzch_data.cached_id_tensor_per_table.append(id_tensor)
         # pyre-ignore [16]
