chunk processing l2 cache flush (#4216)

duduyi2013 · facebook-github-bot · commit 792e68a923a8 · 2025-06-05T00:57:15.000-07:00
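Summary: Pull Request resolved: #4216 X-link: facebookresearch/FBGEMM#1292 As the title says, flush the L2 cache in chunks. With a large L2 cache, flushing every item at once doubles the memory footprint, because all cached items are first copied into temporary tensors before being written out; this change copies and writes at most flushing_block_size bytes (default 2GB) per step instead. Reviewed By: emlin Differential Revision: D75314575 fbshipit-source-id: c0963665aed9065d833bc94e961a528d239b1ada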
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -168,6 +168,7 @@ def __init__(
         kv_zch_params: Optional[KVZCHParams] = None,
         enable_raw_embedding_streaming: bool = False,  # whether enable raw embedding streaming
         res_params: Optional[RESParams] = None,  # raw embedding streaming sharding info
+        flushing_block_size: int = 2_000_000_000,  # 2GB
     ) -> None:
         super(SSDTableBatchedEmbeddingBags, self).__init__()
 
@@ -520,15 +521,19 @@ def __init__(
         logging.info(f"tbe_unique_id: {tbe_unique_id}")
         if self.backend_type == BackendType.SSD:
             logging.info(
-                f"Logging SSD offloading setup, tbe_unique_id:{tbe_unique_id}, l2_cache_size:{l2_cache_size}GB, enable_async_update:{enable_async_update}"
-                f"passed_in_path={ssd_directory}, num_shards={ssd_rocksdb_shards},num_threads={ssd_rocksdb_shards},"
-                f"memtable_flush_period={ssd_memtable_flush_period},memtable_flush_offset={ssd_memtable_flush_offset},"
-                f"l0_files_per_compact={ssd_l0_files_per_compact},max_D={self.max_D},cache_row_dim={self.cache_row_dim},rate_limit_mbps={ssd_rate_limit_mbps},"
-                f"size_ratio={ssd_size_ratio},compaction_trigger={ssd_compaction_trigger}, lazy_bulk_init_enabled={lazy_bulk_init_enabled},"
-                f"write_buffer_size_per_tbe={ssd_rocksdb_write_buffer_size},max_write_buffer_num_per_db_shard={ssd_max_write_buffer_num},"
-                f"uniform_init_lower={ssd_uniform_init_lower},uniform_init_upper={ssd_uniform_init_upper},"
-                f"row_storage_bitwidth={weights_precision.bit_rate()},block_cache_size_per_tbe={ssd_block_cache_size_per_tbe},"
-                f"use_passed_in_path:{use_passed_in_path}, real_path will be printed in EmbeddingRocksDB, enable_raw_embedding_streaming:{self.enable_raw_embedding_streaming}"
+                f"Logging SSD offloading setup, tbe_unique_id:{tbe_unique_id}, l2_cache_size:{l2_cache_size}GB, "
+                f"enable_async_update:{enable_async_update}, passed_in_path={ssd_directory}, "
+                f"num_shards={ssd_rocksdb_shards}, num_threads={ssd_rocksdb_shards}, "
+                f"memtable_flush_period={ssd_memtable_flush_period}, memtable_flush_offset={ssd_memtable_flush_offset}, "
+                f"l0_files_per_compact={ssd_l0_files_per_compact}, max_D={self.max_D}, "
+                f"cache_row_size={self.cache_row_dim}, rate_limit_mbps={ssd_rate_limit_mbps}, "
+                f"size_ratio={ssd_size_ratio}, compaction_trigger={ssd_compaction_trigger}, "
+                f"lazy_bulk_init_enabled={lazy_bulk_init_enabled}, write_buffer_size_per_tbe={ssd_rocksdb_write_buffer_size}, "
+                f"max_write_buffer_num_per_db_shard={ssd_max_write_buffer_num}, "
+                f"uniform_init_lower={ssd_uniform_init_lower}, uniform_init_upper={ssd_uniform_init_upper}, "
+                f"row_storage_bitwidth={weights_precision.bit_rate()}, block_cache_size_per_tbe={ssd_block_cache_size_per_tbe}, "
+                f"use_passed_in_path:{use_passed_in_path}, real_path will be printed in EmbeddingRocksDB, "
+                f"enable_raw_embedding_streaming:{self.enable_raw_embedding_streaming}, flushing_block_size:{flushing_block_size}"
             )
             # pyre-fixme[4]: Attribute must be annotated.
             self._ssd_db = torch.classes.fbgemm.EmbeddingRocksDBWrapper(
@@ -568,6 +573,7 @@ def __init__(
                     if self.enable_optimizer_offloading
                     else None
                 ),
+                flushing_block_size,
             )
             if self.bulk_init_chunk_size > 0:
                 self.ssd_uniform_init_lower: float = ssd_uniform_init_lower
diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/cachelib_cache.h b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/cachelib_cache.h
@@ -43,6 +43,8 @@ class CacheLibCache {
       const CacheConfig& cache_config,
       int64_t unique_tbe_id);
 
+  size_t get_cache_item_size() const;
+  Cache::AccessIterator begin();
   std::unique_ptr<Cache> initializeCacheLib(const CacheConfig& config);
 
   std::unique_ptr<facebook::cachelib::CacheAdmin> createCacheAdmin(
@@ -99,7 +101,7 @@ class CacheLibCache {
   /// @note cache_->allocation will trigger eviction callback func
   bool put(const at::Tensor& key_tensor, const at::Tensor& data);
 
-  /// iterate through all items in L2 cache, fill them in indices and weights
+  /// iterate through N items in L2 cache, fill them in indices and weights
   /// respectively and return indices, weights and count
   ///
   /// @return optional value, if cache is empty return none
@@ -109,11 +111,11 @@ class CacheLibCache {
   /// relative element in <indices>
   /// @return count A single element tensor that contains the number of indices
   /// to be processed
-  ///
   /// @note this isn't thread safe, caller needs to make sure put isn't called
   /// while this is executed.
-  folly::Optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
-  get_all_items();
+  folly::Optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>> get_n_items(
+      int n,
+      Cache::AccessIterator& start_itr);
 
   /// instantiate eviction related indices and weights tensors(size of <count>)
   /// for L2 eviction using the same dtype and device from <indices> and
diff --git a/fbgemm_gpu/src/split_embeddings_cache/cachelib_cache.cpp b/fbgemm_gpu/src/split_embeddings_cache/cachelib_cache.cpp
@@ -34,6 +34,14 @@ CacheLibCache::CacheLibCache(
   }
 }
 
+size_t CacheLibCache::get_cache_item_size() const {
+  return cache_config_.item_size_bytes;
+}
+
+Cache::AccessIterator CacheLibCache::begin() {
+  return cache_->begin();
+}
+
 std::unique_ptr<Cache> CacheLibCache::initializeCacheLib(
     const CacheConfig& config) {
   auto eviction_cb = [this](
@@ -177,50 +185,41 @@ bool CacheLibCache::put(const at::Tensor& key_tensor, const at::Tensor& data) {
 }
 
 folly::Optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
-CacheLibCache::get_all_items() {
+CacheLibCache::get_n_items(int n, Cache::AccessIterator& itr) {
   if (!index_dtype_.has_value() || !weights_dtype_.has_value()) {
     return folly::none;
   }
-  int total_num_items = 0;
-  for (auto& pool_id : pool_ids_) {
-    total_num_items += cache_->getPoolStats(pool_id).numItems();
-  }
   auto weight_dim = cache_config_.max_D_;
   auto indices = at::empty(
-      total_num_items,
-      at::TensorOptions().dtype(index_dtype_.value()).device(at::kCPU));
+      n, at::TensorOptions().dtype(index_dtype_.value()).device(at::kCPU));
   auto weights = at::empty(
-      {total_num_items, weight_dim},
+      {n, weight_dim},
       at::TensorOptions().dtype(weights_dtype_.value()).device(at::kCPU));
+  int cnt = 0;
   FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(
-      weights.scalar_type(), "get_all_items", [&] {
+      weights.scalar_type(), "get_n_items", [&] {
         using value_t = scalar_t;
         FBGEMM_DISPATCH_INTEGRAL_TYPES(
-            indices.scalar_type(), "get_all_items", [&] {
+            indices.scalar_type(), "get_n_items", [&] {
               using index_t = scalar_t;
               auto indices_data_ptr = indices.data_ptr<index_t>();
               auto weights_data_ptr = weights.data_ptr<value_t>();
-              int64_t item_idx = 0;
-              for (auto itr = cache_->begin(); itr != cache_->end(); ++itr) {
+              for (; itr != cache_->end() && cnt < n; ++itr, ++cnt) {
                 const auto key_ptr =
                     reinterpret_cast<const index_t*>(itr->getKey().data());
-                indices_data_ptr[item_idx] = *key_ptr;
+                indices_data_ptr[cnt] = *key_ptr;
                 std::copy(
                     reinterpret_cast<const value_t*>(itr->getMemory()),
                     reinterpret_cast<const value_t*>(itr->getMemory()) +
                         weight_dim,
-                    &weights_data_ptr[item_idx * weight_dim]); // dst_start
-                item_idx++;
+                    &weights_data_ptr[cnt * weight_dim]); // dst_start
               }
-              CHECK_EQ(total_num_items, item_idx);
             });
       });
   return std::make_tuple(
       indices,
       weights,
-      at::tensor(
-          {total_num_items},
-          at::TensorOptions().dtype(at::kLong).device(at::kCPU)));
+      at::tensor({cnt}, at::TensorOptions().dtype(at::kLong).device(at::kCPU)));
 }
 
 void CacheLibCache::init_tensor_for_l2_eviction(
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/embedding_rocksdb_wrapper.h b/fbgemm_gpu/src/ssd_split_embeddings_cache/embedding_rocksdb_wrapper.h
@@ -43,7 +43,8 @@ class EmbeddingRocksDBWrapper : public torch::jit::CustomClassHolder {
       std::vector<int64_t> table_offsets = {},
       const std::vector<int64_t>& table_sizes = {},
       std::optional<at::Tensor> table_dims = std::nullopt,
-      std::optional<at::Tensor> hash_size_cumsum = std::nullopt)
+      std::optional<at::Tensor> hash_size_cumsum = std::nullopt,
+      int64_t flushing_block_size = 2000000000 /*2GB*/)
       : impl_(std::make_shared<ssd::EmbeddingRocksDB>(
             path,
             num_shards,
@@ -72,7 +73,8 @@ class EmbeddingRocksDBWrapper : public torch::jit::CustomClassHolder {
             std::move(table_offsets),
             table_sizes,
             table_dims,
-            hash_size_cumsum)) {}
+            hash_size_cumsum,
+            flushing_block_size)) {}
 
   void set_cuda(
       at::Tensor indices,
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/kv_db_table_batched_embeddings.cpp b/fbgemm_gpu/src/ssd_split_embeddings_cache/kv_db_table_batched_embeddings.cpp
@@ -108,8 +108,10 @@ EmbeddingKVDB::EmbeddingKVDB(
     int64_t res_server_port,
     std::vector<std::string> table_names,
     std::vector<int64_t> table_offsets,
-    const std::vector<int64_t>& table_sizes)
-    : unique_id_(unique_id),
+    const std::vector<int64_t>& table_sizes,
+    int64_t flushing_block_size)
+    : flushing_block_size_(flushing_block_size),
+      unique_id_(unique_id),
       num_shards_(num_shards),
       max_D_(max_D),
       executor_tp_(std::make_unique<folly::CPUThreadPoolExecutor>(num_shards)),
@@ -333,17 +335,29 @@ void EmbeddingKVDB::update_cache_and_storage(
 void EmbeddingKVDB::flush() {
   wait_util_filling_work_done();
   if (l2_cache_) {
-    auto tensor_tuple_opt = l2_cache_->get_all_items();
-    if (!tensor_tuple_opt.has_value()) {
-      XLOG(INFO) << "[TBE_ID" << unique_id_
-                 << "]no items exist in L2 cache, flush nothing";
-      return;
+    int block_size = std::max(
+        (int)(flushing_block_size_ / l2_cache_->get_cache_item_size()), 1);
+    folly::Optional<l2_cache::CacheLibCache::Cache::AccessIterator> start_itr =
+        folly::none;
+    folly::Optional<at::Tensor> count = folly::none;
+    auto itr = l2_cache_->begin();
+    while (count == folly::none || count->item<int64_t>() > 0) {
+      auto res_tuple_opt = l2_cache_->get_n_items(block_size, itr);
+      if (!res_tuple_opt.has_value()) {
+        XLOG(INFO) << "[TBE_ID" << unique_id_
+                   << "]no items exist in L2 cache, flush nothing";
+        return;
+      }
+      auto& indices = std::get<0>(res_tuple_opt.value());
+      auto& weights = std::get<1>(res_tuple_opt.value());
+      count = std::get<2>(res_tuple_opt.value());
+
+      if (count->item<int64_t>() > 0) {
+        set_kv_db_async(
+            indices, weights, count.value(), kv_db::RocksdbWriteMode::FLUSH)
+            .wait();
+      }
     }
-    auto& indices = std::get<0>(tensor_tuple_opt.value());
-    auto& weights = std::get<1>(tensor_tuple_opt.value());
-    auto& count = std::get<2>(tensor_tuple_opt.value());
-    set_kv_db_async(indices, weights, count, kv_db::RocksdbWriteMode::FLUSH)
-        .wait();
   }
 }
 
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/kv_db_table_batched_embeddings.h b/fbgemm_gpu/src/ssd_split_embeddings_cache/kv_db_table_batched_embeddings.h
@@ -142,7 +142,8 @@ class EmbeddingKVDB : public std::enable_shared_from_this<EmbeddingKVDB> {
       int64_t res_server_port = 0,
       std::vector<std::string> table_names = {},
       std::vector<int64_t> table_offsets = {},
-      const std::vector<int64_t>& table_sizes = {});
+      const std::vector<int64_t>& table_sizes = {},
+      int64_t flushing_block_size = 2000000000 /*2GB*/);
 
   virtual ~EmbeddingKVDB();
 
@@ -396,6 +397,8 @@ class EmbeddingKVDB : public std::enable_shared_from_this<EmbeddingKVDB> {
       const at::Tensor& weights);
 
   std::unique_ptr<l2_cache::CacheLibCache> l2_cache_;
+  // block size in bytes used when flushing the L2 cache progressively
+  int64_t flushing_block_size_;
   const int64_t unique_id_;
   const int64_t num_shards_;
   const int64_t max_D_;
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp
@@ -379,6 +379,7 @@ void KVTensorWrapper::set_range(
     const at::Tensor& weights) {
   // Mutex lock for disabling concurrent writes to the same KVTensor
   std::lock_guard<std::mutex> lock(mtx);
+  CHECK_EQ(weights.device(), at::kCPU);
   CHECK_EQ(dim, 0) << "Only set_range on dim 0 is supported";
   CHECK_TRUE(db_ != nullptr);
   CHECK_GE(db_->get_max_D(), shape_[1]);
@@ -396,6 +397,7 @@ void KVTensorWrapper::set_range(
 void KVTensorWrapper::set_weights_and_ids(
     const at::Tensor& weights,
     const at::Tensor& ids) {
+  CHECK_EQ(weights.device(), at::kCPU);
   CHECK_TRUE(db_ != nullptr);
   CHECK_EQ(ids.size(0), weights.size(0))
       << "ids and weights must have same # rows";
@@ -502,7 +504,8 @@ static auto embedding_rocks_db_wrapper =
                 std::vector<int64_t>,
                 std::vector<int64_t>,
                 std::optional<at::Tensor>,
-                std::optional<at::Tensor>>(),
+                std::optional<at::Tensor>,
+                int64_t>(),
             "",
             {
                 torch::arg("path"),
@@ -533,6 +536,7 @@ static auto embedding_rocks_db_wrapper =
                 torch::arg("table_sizes") = torch::List<int64_t>(),
                 torch::arg("table_dims") = std::nullopt,
                 torch::arg("hash_size_cumsum") = std::nullopt,
+                torch::arg("flushing_block_size") = 2000000000 /* 2GB */,
             })
         .def(
             "set_cuda",
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h
@@ -103,7 +103,8 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
       std::vector<int64_t> table_offsets = {},
       const std::vector<int64_t>& table_sizes = {},
       std::optional<at::Tensor> table_dims = std::nullopt,
-      std::optional<at::Tensor> hash_size_cumsum = std::nullopt)
+      std::optional<at::Tensor> hash_size_cumsum = std::nullopt,
+      int64_t flushing_block_size = 2000000000 /*2GB*/)
       : kv_db::EmbeddingKVDB(
             num_shards,
             max_D,
@@ -116,7 +117,8 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
             res_server_port,
             std::move(table_names),
             std::move(table_offsets),
-            table_sizes),
+            table_sizes,
+            flushing_block_size),
         auto_compaction_enabled_(true),
         max_D_(max_D),
         elem_size_(row_storage_bitwidth / 8) {
diff --git a/fbgemm_gpu/test/tbe/ssd/kv_backend_test.py b/fbgemm_gpu/test/tbe/ssd/kv_backend_test.py
@@ -61,6 +61,7 @@ def generate_fbgemm_kv_tbe(
         ssd_rocksdb_shards: int = 1,
         kv_zch_params: Optional[KVZCHParams] = None,
         backend_type: BackendType = BackendType.SSD,
+        flushing_block_size: int = 1000,
     ) -> Tuple[SSDTableBatchedEmbeddingBags, List[int], List[int]]:
         E = int(10**log_E)
         D = D * 4
@@ -89,6 +90,7 @@ def generate_fbgemm_kv_tbe(
             ssd_rocksdb_shards=ssd_rocksdb_shards,
             kv_zch_params=kv_zch_params,
             backend_type=backend_type,
+            flushing_block_size=flushing_block_size,
         )
         return emb, Es, Ds
 
@@ -103,7 +105,9 @@ def test_l2_flush(
         weights_precision: SparseType,
         do_flush: bool,
     ) -> None:
-        emb, Es, _ = self.generate_fbgemm_kv_tbe(T, D, log_E, weights_precision, mixed)
+        emb, Es, _ = self.generate_fbgemm_kv_tbe(
+            T, D, log_E, weights_precision, mixed, flushing_block_size=1
+        )
         indices = torch.arange(start=0, end=sum(Es))
         weights = torch.randn(
             indices.numel(), emb.cache_row_dim, dtype=weights_precision.as_dtype()