Skip to content

Commit 71ff1a3

Browse files
Raahul Kalyaan Jakka and facebook-github-bot
Raahul Kalyaan Jakka
authored and committed
Adding function to create a snapshot and exposing it from EmbeddingRocksDBWrapper (#3024)
Summary: X-link: pytorch/FBGEMM#4223 X-link: facebookresearch/FBGEMM#1299 Design doc: https://docs.google.com/document/d/149LdAEHOLP7ei4hwVVkAFXGa4N9uLs1J7efxfBZp3dY/edit?tab=t.0#heading=h.49t3yfaqmt54 Context: We are enabling the usage of the RocksDB checkpoint feature in KVTensorWrapper. This allows us to create checkpoints of the embedding tables on SSD. Later, these checkpoints are used by the checkpointing component to create a checkpoint and upload it to Manifold. In this diff: creating a function to create a checkpoint and exposing it through EmbeddingRocksDBWrapper. Reviewed By: duduyi2013 Differential Revision: D75489841
1 parent e7dc586 commit 71ff1a3

File tree

4 files changed

+46
-0
lines changed

4 files changed

+46
-0
lines changed

torchrec/distributed/batched_embedding_kernel.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,6 +1225,12 @@ def purge(self) -> None:
12251225
self.emb_module.lxu_cache_weights.zero_()
12261226
self.emb_module.lxu_cache_state.fill_(-1)
12271227

1228+
def create_rocksdb_hard_link_snapshot(self) -> None:
    """Ask the wrapped emb_module to take a RocksDB hard-link checkpoint.

    Must be invoked before state_dict() when publishing, so the SSD-backed
    embedding state is captured in a consistent snapshot.
    """
    emb = self.emb_module
    emb.create_rocksdb_hard_link_snapshot()
1233+
12281234
# pyre-ignore [15]
12291235
def split_embedding_weights(self, no_snapshot: bool = True) -> Tuple[
12301236
List[PartiallyMaterializedTensor],
@@ -1520,6 +1526,12 @@ def purge(self) -> None:
15201526
self.emb_module.lxu_cache_weights.zero_()
15211527
self.emb_module.lxu_cache_state.fill_(-1)
15221528

1529+
def create_rocksdb_hard_link_snapshot(self) -> None:
    """Delegate RocksDB hard-link checkpoint creation to the wrapped emb_module.

    Callers must run this ahead of state_dict() for publish so that the
    embedding tables on SSD are snapshotted consistently.
    """
    module = self.emb_module
    module.create_rocksdb_hard_link_snapshot()
1534+
15231535
# pyre-ignore [15]
15241536
def split_embedding_weights(
15251537
self, no_snapshot: bool = True, should_flush: bool = True
@@ -2033,6 +2045,12 @@ def purge(self) -> None:
20332045
self.emb_module.lxu_cache_weights.zero_()
20342046
self.emb_module.lxu_cache_state.fill_(-1)
20352047

2048+
def create_rocksdb_hard_link_snapshot(self) -> None:
    """Forward the RocksDB hard-link checkpoint request to `self.emb_module`.

    Required before calling state_dict() for publish; the underlying module
    creates the actual on-SSD snapshot.
    """
    target = self.emb_module
    target.create_rocksdb_hard_link_snapshot()
2053+
20362054
# pyre-ignore [15]
20372055
def split_embedding_weights(self, no_snapshot: bool = True) -> Tuple[
20382056
List[PartiallyMaterializedTensor],

torchrec/distributed/embedding.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1533,6 +1533,15 @@ def _embedding_dim_for_sharding_type(self, sharding_type: str) -> int:
15331533
else self._embedding_dim
15341534
)
15351535

1536+
def create_rocksdb_hard_link_snapshot(self) -> None:
    """Create a RocksDB hard-link snapshot on every lookup that supports it.

    Needed before state_dict() is called for publish, so the SSD-backed
    embedding state is captured in a consistent checkpoint.
    """
    for lookup in self._lookups:
        # Unwrap any DistributedDataParallel layers to reach the real module.
        while isinstance(lookup, DistributedDataParallel):
            lookup = lookup.module
        # Bug fix: the original passed the RESULT of calling the method to
        # callable() -- that invoked the snapshot inside the condition and,
        # since the method returns None, callable(None) was always False, so
        # the guarded call never ran (and a callable return value would have
        # triggered a double invocation). Check the attribute itself and call
        # it exactly once.
        snapshot_fn = getattr(lookup, "create_rocksdb_hard_link_snapshot", None)
        if callable(snapshot_fn):
            snapshot_fn()
1544+
15361545
@property
15371546
def fused_optimizer(self) -> KeyedOptimizer:
15381547
return self._optim

torchrec/distributed/embedding_lookup.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,11 @@ def flush(self) -> None:
399399
# pyre-fixme[29]: `Union[Module, Tensor]` is not a function.
400400
emb_module.flush()
401401

402+
def create_rocksdb_hard_link_snapshot(self) -> None:
    """Propagate a RocksDB hard-link snapshot request to each SSD-backed
    (KeyValueEmbedding) module in this lookup; other module types are skipped.
    """
    kv_modules = (m for m in self._emb_modules if isinstance(m, KeyValueEmbedding))
    for kv in kv_modules:
        kv.create_rocksdb_hard_link_snapshot()
406+
402407
def purge(self) -> None:
403408
for emb_module in self._emb_modules:
404409
# pyre-fixme[29]: `Union[Module, Tensor]` is not a function.
@@ -723,6 +728,11 @@ def flush(self) -> None:
723728
# pyre-fixme[29]: `Union[Module, Tensor]` is not a function.
724729
emb_module.flush()
725730

731+
def create_rocksdb_hard_link_snapshot(self) -> None:
    """Request a RocksDB hard-link snapshot from every KeyValueEmbedding in
    `self._emb_modules`; modules of any other type are left untouched.
    """
    for candidate in self._emb_modules:
        if not isinstance(candidate, KeyValueEmbedding):
            continue
        candidate.create_rocksdb_hard_link_snapshot()
735+
726736
def purge(self) -> None:
727737
for emb_module in self._emb_modules:
728738
# pyre-fixme[29]: `Union[Module, Tensor]` is not a function.

torchrec/distributed/embeddingbag.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,6 +1638,15 @@ def update_shards(
16381638
update_module_sharding_plan(self, changed_sharding_params)
16391639
return
16401640

1641+
def create_rocksdb_hard_link_snapshot(self) -> None:
    """Create a RocksDB hard-link snapshot on every lookup that supports it.

    Needed before state_dict() is called for publish, so the SSD-backed
    embedding state is captured in a consistent checkpoint.
    """
    for lookup in self._lookups:
        # Unwrap any DistributedDataParallel layers to reach the real module.
        while isinstance(lookup, DistributedDataParallel):
            lookup = lookup.module
        # Bug fix: the original passed the RESULT of calling the method to
        # callable() -- that invoked the snapshot inside the condition and,
        # since the method returns None, callable(None) was always False, so
        # the guarded call never ran (and a callable return value would have
        # triggered a double invocation). Check the attribute itself and call
        # it exactly once.
        snapshot_fn = getattr(lookup, "create_rocksdb_hard_link_snapshot", None)
        if callable(snapshot_fn):
            snapshot_fn()
1649+
16411650
@property
16421651
def fused_optimizer(self) -> KeyedOptimizer:
16431652
return self._optim

0 commit comments

Comments
 (0)