Skip to content

Commit 80267fe

Browse files
dongmao zhangthesues
dongmao zhang
authored andcommitted
feature: thread safe delete any keys
1. new http API to purge all keys 2. add inflight_rdma_reads to track all outstanding rdma read requests 3. fix a bug in mempool(reset last_search_ after deallcoate) 4. create a new file infinistore.h
1 parent 82a8e1e commit 80267fe

9 files changed

+140
-66
lines changed

infinistore/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
LINK_ETHERNET,
1111
LINK_IB,
1212
register_server,
13+
purge_kv_map,
14+
get_kvmap_len,
1315
)
1416

1517
__all__ = [
@@ -24,4 +26,6 @@
2426
"check_supported",
2527
"LINK_ETHERNET",
2628
"LINK_IB",
29+
"purge_kv_map",
30+
"get_kvmap_len",
2731
]

infinistore/lib.py

+26
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,32 @@ def set_log_level(level):
148148
_infinistore.set_log_level(level)
149149

150150

151+
def get_kvmap_len():
152+
"""
153+
Returns the length of the key-value map in the infinistore.
154+
155+
This function calls the underlying _infinistore.get_kvmap_len() method to
156+
get the length of the key-value map.
157+
158+
Returns:
159+
The result of the _infinistore.get_kvmap_len() method call.
160+
"""
161+
return _infinistore.get_kvmap_len()
162+
163+
164+
def purge_kv_map():
165+
"""
166+
Purges the key-value map in the infinistore.
167+
168+
This function calls the underlying _infinistore.purge_kv_map() method to
169+
clear all entries in the key-value map, effectively resetting it.
170+
171+
Returns:
172+
The result of the _infinistore.purge_kv_map() method call.
173+
"""
174+
return _infinistore.purge_kv_map()
175+
176+
151177
def register_server(loop, config: ServerConfig):
152178
"""
153179
Registers a server with the given event loop and configuration.

infinistore/server.py

+15-7
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
from infinistore.lib import (
1+
from infinistore import (
22
register_server,
3+
purge_kv_map,
4+
get_kvmap_len,
35
check_supported,
46
ServerConfig,
57
Logger,
68
)
79

8-
from infinistore._infinistore import get_kvmap_len
9-
1010
import asyncio
1111
import uvloop
1212
from fastapi import FastAPI
@@ -24,9 +24,17 @@
2424
app = FastAPI()
2525

2626

27-
@app.get("/kvmapSize")
28-
async def read_status():
29-
return get_kvmap_len()
27+
@app.post("/purge")
28+
async def purge():
29+
Logger.info("clear kvmap")
30+
num = get_kvmap_len()
31+
purge_kv_map()
32+
return {"status": "ok", "num": num}
33+
34+
35+
@app.get("/kvmap_len")
36+
async def kvmap_len():
37+
return {"len": get_kvmap_len()}
3038

3139

3240
def check_p2p_access():
@@ -188,7 +196,7 @@ def main():
188196

189197
server = uvicorn.Server(http_config)
190198

191-
Logger.info("server started")
199+
Logger.warn("server started")
192200
loop.run_until_complete(server.serve())
193201

194202

src/infinistore.cpp

+32-53
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
// single thread right now.
2+
#include "infinistore.h"
3+
24
#include <arpa/inet.h>
35
#include <assert.h>
46
#include <cuda.h>
@@ -11,7 +13,6 @@
1113
#include <sys/socket.h>
1214
#include <time.h>
1315
#include <unistd.h>
14-
#include <uv.h>
1516

1617
#include <boost/lockfree/spsc_queue.hpp>
1718
#include <chrono>
@@ -21,12 +22,8 @@
2122
#include <string>
2223
#include <unordered_map>
2324

24-
#include "config.h"
2525
#include "ibv_helper.h"
26-
#include "log.h"
27-
#include "mempool.h"
2826
#include "protocol.h"
29-
#include "utils.h"
3027

3128
server_config_t global_config;
3229

@@ -48,22 +45,6 @@ bool extend_in_flight = false;
4845
// indicate the number of cudaIpcOpenMemHandle
4946
std::atomic<unsigned int> opened_ipc{0};
5047

51-
// PTR is shared by kv_map and inflight_rdma_kv_map
52-
class PTR : public IntrusivePtrTarget {
53-
public:
54-
void *ptr;
55-
size_t size;
56-
int pool_idx;
57-
bool committed;
58-
PTR(void *ptr, size_t size, int pool_idx, bool committed = false)
59-
: ptr(ptr), size(size), pool_idx(pool_idx), committed(committed) {}
60-
~PTR() {
61-
if (ptr) {
62-
DEBUG("deallocate ptr: {}, size: {}, pool_idx: {}", ptr, size, pool_idx);
63-
mm->deallocate(ptr, size, pool_idx);
64-
}
65-
}
66-
};
6748

6849
enum CUDA_TASK_TYPE {
6950
CUDA_READ,
@@ -80,10 +61,9 @@ struct CUDA_TASK {
8061
cudaEvent_t event;
8162
};
8263

83-
std::unordered_map<uintptr_t, boost::intrusive_ptr<PTR>> inflight_rdma_kv_map;
84-
std::unordered_map<std::string, boost::intrusive_ptr<PTR>> kv_map;
64+
std::unordered_map<uintptr_t, boost::intrusive_ptr<PTR>> inflight_rdma_writes;
8565

86-
int get_kvmap_len() { return kv_map.size(); }
66+
std::unordered_map<std::string, boost::intrusive_ptr<PTR>> kv_map;
8767

8868
typedef enum {
8969
READ_HEADER,
@@ -130,11 +110,6 @@ struct Client {
130110

131111
uv_poll_t poll_handle_;
132112

133-
struct block {
134-
uint32_t lkey;
135-
uintptr_t local_addr;
136-
};
137-
138113
Client() = default;
139114
Client(const Client &) = delete;
140115
~Client();
@@ -173,6 +148,7 @@ Client::~Client() {
173148
if (poll_handle_.data) {
174149
uv_poll_stop(&poll_handle_);
175150
}
151+
176152
if (handle_) {
177153
free(handle_);
178154
handle_ = NULL;
@@ -274,15 +250,15 @@ void Client::cq_poll_handle(uv_poll_t *handle, int status, int events) {
274250
ERROR("remote_addrs size should not be 0");
275251
}
276252
for (auto addr : *request->remote_addrs()) {
277-
auto it = inflight_rdma_kv_map.find(addr);
278-
if (it == inflight_rdma_kv_map.end()) {
253+
auto it = inflight_rdma_writes.find(addr);
254+
if (it == inflight_rdma_writes.end()) {
279255
ERROR("commit msg: Key not found: {}", addr);
280256
continue;
281257
}
282258
it->second->committed = true;
283-
inflight_rdma_kv_map.erase(it);
259+
inflight_rdma_writes.erase(it);
284260
}
285-
DEBUG("inflight_rdma_kv_map size: {}", inflight_rdma_kv_map.size());
261+
DEBUG("inflight_rdma_kv_map size: {}", inflight_rdma_writes.size());
286262
break;
287263
}
288264
default:
@@ -332,6 +308,12 @@ void Client::cq_poll_handle(uv_poll_t *handle, int status, int events) {
332308
delete[] sges;
333309
outstanding_rdma_writes_queue_.pop_front();
334310
}
311+
312+
if (wc.wr_id > 0) {
313+
// last WR will inform that all RDMA write is finished,so we can dereference PTR
314+
auto inflight_rdma_reads = (std::vector<boost::intrusive_ptr<PTR>> *)wc.wr_id;
315+
delete inflight_rdma_reads;
316+
}
335317
}
336318
else {
337319
ERROR("Unexpected wc opcode: {}", (int)wc.opcode);
@@ -376,7 +358,7 @@ int Client::allocate_rdma(const RemoteMetaRequest *req) {
376358

377359
// save in inflight_rdma_kv_map, when write is finished, we can merge it
378360
// into kv_map
379-
inflight_rdma_kv_map[(uintptr_t)addr] = ptr;
361+
inflight_rdma_writes[(uintptr_t)addr] = ptr;
380362

381363
blocks.push_back(RemoteBlock(rkey, (uint64_t)addr));
382364
key_idx++;
@@ -439,8 +421,9 @@ int Client::read_rdma_cache(const RemoteMetaRequest *remote_meta_req) {
439421
return -1;
440422
}
441423

442-
std::vector<block> blocks;
443-
blocks.reserve(remote_meta_req->keys()->size());
424+
auto *inflight_rdma_reads = new std::vector<boost::intrusive_ptr<PTR>>;
425+
426+
inflight_rdma_reads->reserve(remote_meta_req->keys()->size());
444427

445428
for (const auto *key : *remote_meta_req->keys()) {
446429
auto it = kv_map.find(key->str());
@@ -459,7 +442,7 @@ int Client::read_rdma_cache(const RemoteMetaRequest *remote_meta_req) {
459442
DEBUG("rkey: {}, local_addr: {}, size : {}", mm->get_lkey(ptr->pool_idx),
460443
(uintptr_t)ptr->ptr, ptr->size);
461444

462-
blocks.push_back({.lkey = mm->get_lkey(ptr->pool_idx), .local_addr = (uintptr_t)ptr->ptr});
445+
inflight_rdma_reads->push_back(ptr);
463446
}
464447

465448
const size_t max_wr = MAX_WR_BATCH;
@@ -477,13 +460,12 @@ int Client::read_rdma_cache(const RemoteMetaRequest *remote_meta_req) {
477460
wrs = new struct ibv_send_wr[max_wr];
478461
sges = new struct ibv_sge[max_wr];
479462
}
480-
481463
for (size_t i = 0; i < remote_meta_req->keys()->size(); i++) {
482-
sges[num_wr].addr = blocks[i].local_addr;
464+
sges[num_wr].addr = (uintptr_t)(*inflight_rdma_reads)[i]->ptr;
483465
sges[num_wr].length = remote_meta_req->block_size();
484-
sges[num_wr].lkey = blocks[i].lkey;
466+
sges[num_wr].lkey = mm->get_lkey((*inflight_rdma_reads)[i]->pool_idx);
485467

486-
wrs[num_wr].wr_id = i;
468+
wrs[num_wr].wr_id = 0;
487469
wrs[num_wr].opcode = (i == remote_meta_req->keys()->size() - 1) ? IBV_WR_RDMA_WRITE_WITH_IMM
488470
: IBV_WR_RDMA_WRITE;
489471
wrs[num_wr].sg_list = &sges[num_wr];
@@ -498,6 +480,10 @@ int Client::read_rdma_cache(const RemoteMetaRequest *remote_meta_req) {
498480
? IBV_SEND_SIGNALED
499481
: 0;
500482

483+
if (i == remote_meta_req->keys()->size() - 1) {
484+
wrs[num_wr].wr_id = (uintptr_t)inflight_rdma_reads;
485+
}
486+
501487
num_wr++;
502488

503489
if (num_wr == max_wr || i == remote_meta_req->keys()->size() - 1) {
@@ -617,19 +603,13 @@ int Client::read_cache(const LocalMetaRequest *meta_req) {
617603

618604
DEBUG("key: {}, local_addr: {}, size : {}", key, (uintptr_t)h_src, block_size);
619605

620-
task->ptrs.push_back(kv_map[key]);
606+
task->ptrs.push_back(kv_map[key]); // keep PTR in task as reference count.
621607
remote_addrs.push_back((uintptr_t)((char *)d_ptr + block->offset()));
622608
idx++;
623609
}
624610

625611
assert(task->ptrs.size() == remote_addrs.size());
626612

627-
// for (auto &task : tasks) {
628-
// // CHECK_CUDA(cudaMemcpyAsync((void *)task.dst, task.ptr->ptr, block_size,
629-
// // cudaMemcpyHostToDevice, cuda_streams[task.stream_idx]));
630-
631-
// }
632-
633613
for (int i = 0; i < task->ptrs.size(); i++) {
634614
CHECK_CUDA(cudaMemcpyAsync((void *)remote_addrs[i], task->ptrs[i]->ptr, block_size,
635615
cudaMemcpyHostToDevice, cuda_stream));
@@ -711,8 +691,7 @@ void add_mempool_completion(uv_work_t *req, int status) {
711691
}
712692

713693
int Client::write_cache(const LocalMetaRequest *meta_req) {
714-
INFO("do write_cache..., num of blocks: {}, stream num {}", meta_req->blocks()->size(),
715-
global_config.num_stream);
694+
INFO("do write_cache..., num of blocks: {}", meta_req->blocks()->size());
716695

717696
void *d_ptr;
718697
int return_code = TASK_ACCEPTED;
@@ -1270,11 +1249,11 @@ void on_new_connection(uv_stream_t *server, int status) {
12701249
}
12711250

12721251
void signal_handler(int signum) {
1273-
void *array[10];
1252+
void *array[32];
12741253
size_t size;
12751254
if (signum == SIGSEGV) {
12761255
ERROR("Caught SIGSEGV: segmentation fault");
1277-
size = backtrace(array, 10);
1256+
size = backtrace(array, 32);
12781257
// print signum's name
12791258
ERROR("Error: signal {}", signum);
12801259
// backtrace_symbols_fd(array, size, STDERR_FILENO);
@@ -1335,4 +1314,4 @@ int register_server(unsigned long loop_ptr, server_config_t config) {
13351314
INFO("register server done");
13361315

13371316
return 0;
1338-
}
1317+
}

src/infinistore.h

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#ifndef INFINISTORE_H
2+
#define INFINISTORE_H
3+
#include <uv.h>
4+
5+
#include "config.h"
6+
#include "log.h"
7+
#include "mempool.h"
8+
#include "utils.h"
9+
10+
extern server_config_t global_config;
11+
extern uv_loop_t *loop;
12+
extern uv_tcp_t server;
13+
// global ibv context
14+
extern struct ibv_context *ib_ctx;
15+
extern struct ibv_pd *pd;
16+
extern MM *mm;
17+
18+
extern int gidx;
19+
extern int lid;
20+
extern uint8_t ib_port;
21+
// local active_mtu attr, after exchanging with remote, we will use the min of the two for path.mtu
22+
extern ibv_mtu active_mtu;
23+
24+
// indicate if the MM extend is in flight
25+
extern bool extend_in_flight;
26+
// indicate the number of cudaIpcOpenMemHandle
27+
extern std::atomic<unsigned int> opened_ipc;
28+
29+
// PTR is shared by kv_map and inflight_rdma_kv_map
30+
class PTR : public IntrusivePtrTarget {
31+
public:
32+
void *ptr;
33+
size_t size;
34+
int pool_idx;
35+
bool committed;
36+
PTR(void *ptr, size_t size, int pool_idx, bool committed = false)
37+
: ptr(ptr), size(size), pool_idx(pool_idx), committed(committed) {}
38+
~PTR() {
39+
if (ptr) {
40+
DEBUG("deallocate ptr: {}, size: {}, pool_idx: {}", ptr, size, pool_idx);
41+
mm->deallocate(ptr, size, pool_idx);
42+
}
43+
}
44+
};
45+
extern std::unordered_map<std::string, boost::intrusive_ptr<PTR>> kv_map;
46+
extern std::unordered_map<uintptr_t, boost::intrusive_ptr<PTR>> inflight_rdma_writes;
47+
48+
// global function to bind with python
49+
int register_server(unsigned long loop_ptr, server_config_t config);
50+
void purge_kv_map();
51+
52+
#endif

0 commit comments

Comments
 (0)