extend mempool (#112)

lyppg · web-flow · commit 82a8e1e9c44f · 2025-01-29T10:11:04.000-08:00
* extend mempool

* fix write perf issue

* add autoincrease arg

* minor change
diff --git a/infinistore/lib.py b/infinistore/lib.py
@@ -86,6 +86,7 @@ class ServerConfig:
             prealloc_size (int): The preallocation size. Defaults to 16.
             minimal_allocate_size (int): The minimal allocation size. Defaults to 64.
             num_stream (int): The number of streams. Defaults to 1.
+            auto_increase (bool): indicate if infinistore will be automatically increased. 10GB each time. Default False.
         """
 
     def __init__(self, **kwargs):
@@ -99,6 +100,7 @@ def __init__(self, **kwargs):
         self.prealloc_size = kwargs.get("prealloc_size", 16)
         self.minimal_allocate_size = kwargs.get("minimal_allocate_size", 64)
         self.num_stream = kwargs.get("num_stream", 1)
+        self.auto_increase = kwargs.get("auto_increase", False)
 
     def __repr__(self):
         return (
diff --git a/infinistore/server.py b/infinistore/server.py
@@ -44,6 +44,12 @@ def check_p2p_access():
 
 def parse_args():
     parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--auto-increase",
+        required=False,
+        action="store_true",
+        help="increase allocated memory automatically, 10GB each time, default False",
+    )
     parser.add_argument(
         "--host",
         required=False,
@@ -144,6 +150,7 @@ def main():
         link_type=args.link_type,
         minimal_allocate_size=args.minimal_allocate_size,
         num_stream=args.num_stream,
+        auto_increase=args.auto_increase,
     )
     config.verify()
     # check_p2p_access()
diff --git a/src/config.h b/src/config.h
@@ -19,6 +19,7 @@ typedef struct ServerConfig {
     std::string link_type;
     int minimal_allocate_size;  // unit: KB
     int num_stream;             // can only be 1,2,4, number of stream for each client
+    bool auto_increase;
 } server_config_t;
 
 typedef struct ClientConfig {
diff --git a/src/infinistore.cpp b/src/infinistore.cpp
@@ -43,6 +43,11 @@ uint8_t ib_port = -1;
 // local active_mtu attr, after exchanging with remote, we will use the min of the two for path.mtu
 ibv_mtu active_mtu;
 
+// indicate if the MM extend is in flight
+bool extend_in_flight = false;
+// indicate the number of cudaIpcOpenMemHandle
+std::atomic<unsigned int> opened_ipc{0};
+
 // PTR is shared by kv_map and inflight_rdma_kv_map
 class PTR : public IntrusivePtrTarget {
    public:
@@ -587,6 +592,7 @@ int Client::read_cache(const LocalMetaRequest *meta_req) {
 
     cudaIpcMemHandle_t ipc_handle = *(cudaIpcMemHandle_t *)meta_req->ipc_handle()->data();
     CHECK_CUDA(cudaIpcOpenMemHandle(&d_ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+    opened_ipc++;
 
     size_t block_size = meta_req->block_size();
     int idx = 0;
@@ -674,6 +680,7 @@ void Client::wait_for_ipc_close(std::shared_ptr<CudaTaskQueue> cuda_task_queue)
         CHECK_CUDA(cudaEventDestroy(task->event));
         CHECK_CUDA(cudaStreamDestroy(task->stream));
         CHECK_CUDA(cudaIpcCloseMemHandle(task->d_ptr));
+        opened_ipc--;
         DEBUG("CUDA_TASK done");
 
         if (task->type == CUDA_WRITE) {
@@ -690,6 +697,19 @@ void Client::wait_for_ipc_close(std::shared_ptr<CudaTaskQueue> cuda_task_queue)
     INFO("quit the waiting_for_ipc_close thread");
 }
 
+void add_mempool(uv_work_t *req) {
+    while (opened_ipc > 0) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    }
+    mm->add_mempool(pd);
+}
+
+void add_mempool_completion(uv_work_t *req, int status) {
+    extend_in_flight = false;
+    mm->need_extend = false;
+    delete req;
+}
+
 int Client::write_cache(const LocalMetaRequest *meta_req) {
     INFO("do write_cache..., num of blocks: {}, stream num {}", meta_req->blocks()->size(),
          global_config.num_stream);
@@ -702,6 +722,7 @@ int Client::write_cache(const LocalMetaRequest *meta_req) {
     CHECK_CUDA(cudaSetDevice(meta_req->device()));
 
     CHECK_CUDA(cudaIpcOpenMemHandle(&d_ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess));
+    opened_ipc++;
 
     int key_idx = 0;
     size_t block_size = meta_req->block_size();
@@ -743,6 +764,12 @@ int Client::write_cache(const LocalMetaRequest *meta_req) {
             task->ptrs.push_back(ptr);
             key_idx++;
         });
+    if (global_config.auto_increase && mm->need_extend && !extend_in_flight) {
+        INFO("Extend another mempool");
+        uv_work_t *req = new uv_work_t();
+        uv_queue_work(loop, req, add_mempool, add_mempool_completion);
+        extend_in_flight = true;
+    }
 
     CHECK_CUDA(cudaEventRecord(event, cuda_stream));
 
diff --git a/src/mempool.cpp b/src/mempool.cpp
@@ -16,7 +16,8 @@ MemoryPool::MemoryPool(size_t pool_size, size_t block_size, struct ibv_pd* pd)
       block_size_(block_size),
       pd_(pd),
       mr_(nullptr),
-      last_search_position_(0) {
+      last_search_position_(0),
+      allocated_blocks_(0) {
     // 计算总的内存块数量
     total_blocks_ = pool_size_ / block_size_;
     assert(pool_size % block_size == 0);
@@ -53,28 +54,32 @@ MemoryPool::~MemoryPool() {
     }
 }
 
-bool MemoryPool::allocate(size_t size, size_t n, SimpleAllocationCallback callback) {
+int MemoryPool::allocate(size_t size, size_t n, SimpleAllocationCallback callback) {
     size_t required_blocks = (size + block_size_ - 1) / block_size_;  // round up
+    int num_allocated = 0;
 
     if (required_blocks > total_blocks_) {
-        return false;
+        return 0;
     }
 
-    int num_allocated = 0;
     size_t bit_per_word = 64;
     size_t shift = 6;
 
     for (size_t word_index = last_search_position_; word_index < bitmap_.size(); ++word_index) {
+        if (num_allocated == n) {
+            break;
+        }
+
         uint64_t word = bitmap_[word_index];
         if (word == 0xFFFFFFFFFFFFFFFFULL) {
             continue;
         }
-
         for (size_t bit_index = __builtin_ctzll(~word); bit_index < bit_per_word; ++bit_index) {
             size_t start_block = (word_index << shift) + bit_index;
 
             if (start_block + required_blocks > total_blocks_) {
-                return false;
+                allocated_blocks_ += num_allocated * required_blocks;
+                return num_allocated;
             }
 
             bool found = true;
@@ -98,13 +103,14 @@ bool MemoryPool::allocate(size_t size, size_t n, SimpleAllocationCallback callba
                 callback(addr, mr_->lkey, mr_->rkey);
                 last_search_position_ = word_index;
                 if (++num_allocated == n) {
-                    return true;
+                    break;
                 }
             }
         }
     }
 
-    return num_allocated == n;
+    allocated_blocks_ += num_allocated * required_blocks;
+    return num_allocated;
 }
 
 void MemoryPool::deallocate(void* ptr, size_t size) {
@@ -142,18 +148,42 @@ void MemoryPool::deallocate(void* ptr, size_t size) {
     }
 }
 
+void MM::add_mempool(struct ibv_pd* pd) {
+    mempools_.push_back(new MemoryPool((size_t)EXTEND_POOL_SIZE, (size_t)EXTEND_BLOCK_SIZE, pd));
+}
+
+void MM::add_mempool(size_t pool_size, size_t block_size, struct ibv_pd* pd) {
+    mempools_.push_back(new MemoryPool(pool_size, block_size, pd));
+}
+
 bool MM::allocate(size_t size, size_t n, AllocationCallback callback) {
-    for (int i = 0; i < mempools_.size(); ++i) {
+    bool allocated = false;
+    int mempool_cnt = mempools_.size();
+    for (int i = 0; i < mempool_cnt; ++i) {
         // create a new callback from the original callback
         auto simple_callback = [callback, i](void* ptr, uint32_t lkey, uint32_t rkey) {
             callback(ptr, lkey, rkey, i);
         };
 
-        if (mempools_[i]->allocate(size, n, simple_callback)) {
-            return true;
+        int num_allocated = mempools_[i]->allocate(size, n, simple_callback);
+        n -= num_allocated;
+
+        auto total_blocks = mempools_[i]->get_total_blocks();
+        auto allocated_blocks = mempools_[i]->get_allocated_blocks();
+        DEBUG(
+            "Mempool Count: {}, Pool idx: {}, Total blocks: {}, allocated blocks: {}, block usage: "
+            "{}%",
+            mempool_cnt, i, total_blocks, allocated_blocks, 100 * allocated_blocks / total_blocks);
+        if (i == mempools_.size() - 1 &&
+            (float)allocated_blocks / total_blocks > BLOCK_USAGE_RATIO) {
+            need_extend = true;
+        }
+        if (n == 0) {
+            allocated = true;
+            break;
         }
     }
-    return false;
+    return allocated;
 }
 
 void MM::deallocate(void* ptr, size_t size, int pool_idx) {
diff --git a/src/mempool.h b/src/mempool.h
@@ -10,6 +10,10 @@
 #include <functional>
 #include <vector>
 
+#define BLOCK_USAGE_RATIO 0.5            // usage fraction of the last pool above which MM::allocate sets need_extend
+#define EXTEND_POOL_SIZE (10ULL << 30)   // 10 GiB; parenthesized 64-bit constant so the shift cannot overflow int
+#define EXTEND_BLOCK_SIZE (64ULL << 10)  // 64 KiB; parenthesized so the macro is safe inside larger expressions
+
 using AllocationCallback =
     std::function<void(void* ptr, uint32_t lkey, uint32_t rkey, int pool_idx)>;
 using SimpleAllocationCallback = std::function<void(void* ptr, uint32_t lkey, uint32_t rkey)>;
@@ -23,21 +27,24 @@ class MemoryPool {
     /*
     @brief size should be aligned to block size
     */
-    bool allocate(size_t size, size_t n, SimpleAllocationCallback callback);
+    int allocate(size_t size, size_t n, SimpleAllocationCallback callback);
     /*
     @brief size should be aligned to block size
     */
     void deallocate(void* ptr, size_t size);
 
     uint32_t get_lkey() const { return mr_->lkey; }
     uint32_t get_rkey() const { return mr_->rkey; }
+    uint32_t get_total_blocks() const { return total_blocks_; }
+    uint32_t get_allocated_blocks() const { return allocated_blocks_; }
 
    private:
     void* pool_;
     size_t pool_size_;
     size_t block_size_;
     size_t total_blocks_;
     size_t last_search_position_;
+    size_t allocated_blocks_;
 
     // TODO: use judy libray to speed up the bitmap?
     std::vector<uint64_t> bitmap_;
@@ -52,9 +59,12 @@ class MM {
 
    public:
     MM(size_t pool_size, size_t block_size, struct ibv_pd* pd) {
-        mempools_.push_back(new MemoryPool(pool_size, block_size, pd));
+        add_mempool(pool_size, block_size, pd);
     }
     MM(const MM& mm) = delete;
+    bool need_extend = false;
+    void add_mempool(struct ibv_pd* pd);
+    void add_mempool(size_t pool_size, size_t block_size, struct ibv_pd* pd);
     bool allocate(size_t size, size_t n, AllocationCallback callback);
     void deallocate(void* ptr, size_t size, int pool_idx);
     uint32_t get_lkey(int pool_idx) const {
diff --git a/src/pybind.cpp b/src/pybind.cpp
@@ -120,7 +120,8 @@ PYBIND11_MODULE(_infinistore, m) {
         .def_readwrite("link_type", &ServerConfig::link_type)
         .def_readwrite("prealloc_size", &ServerConfig::prealloc_size)
         .def_readwrite("minimal_allocate_size", &ServerConfig::minimal_allocate_size)
-        .def_readwrite("num_stream", &ServerConfig::num_stream);
+        .def_readwrite("num_stream", &ServerConfig::num_stream)
+        .def_readwrite("auto_increase", &ServerConfig::auto_increase);
     m.def("get_kvmap_len", &get_kvmap_len, "get kv map size");
     m.def("register_server", &register_server, "register the server");