Commit 1f1a2ba

optimize curve-client
1 parent 3c69933 commit 1f1a2ba

88 files changed: +2145 −2087 lines


conf/client.conf

+2 −2

@@ -52,7 +52,7 @@ schedule.queueCapacity=1000000
 # Dequeuing a request and finishing the rpc send takes roughly 20us-100us; 20us is the normal case where the leader does not need to be fetched
 # If the leader has to be fetched during the send, it takes about 100us; a single thread sustains roughly 100k-500k requests per second
 # Performance already meets the requirement
-schedule.threadpoolSize=1
+schedule.threadpoolSize=2

 # Task queue introduced to isolate the qemu-side thread, since qemu has only one IO thread
 # When qemu calls the aio interface, the call is simply pushed onto the task queue and returns,
@@ -112,7 +112,7 @@ chunkserver.maxRetryTimesBeforeConsiderSuspend=20
 ################# file-level configuration #############
 #
 # Maximum number of outstanding rpcs allowed by libcurve's underlying rpc scheduling; each file's inflight RPCs are counted independently
-global.fileMaxInFlightRPCNum=64
+global.fileMaxInFlightRPCNum=128

 # Maximum split size in KB for file IO issued to the underlying chunkserver
 global.fileIOSplitMaxSizeKB=64
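
The two values doubled above work together: schedule.threadpoolSize sets how many client threads drain the scheduling queue, while global.fileMaxInFlightRPCNum caps how many RPCs a single file may have outstanding at once. Purely as an illustration of what such an inflight cap means (this is not libcurve's actual implementation, and all names below are hypothetical), a per-file gate of this kind can be expressed as:

#include <condition_variable>
#include <cstdint>
#include <mutex>

// Hypothetical per-file inflight-RPC gate; maxInFlight would correspond to
// global.fileMaxInFlightRPCNum (128 after this commit).
class InflightGate {
 public:
    explicit InflightGate(uint64_t maxInFlight) : max_(maxInFlight) {}

    // Called before issuing an RPC: blocks while the file already has
    // max_ RPCs outstanding.
    void Acquire() {
        std::unique_lock<std::mutex> lk(mtx_);
        cv_.wait(lk, [this] { return inflight_ < max_; });
        ++inflight_;
    }

    // Called from the RPC completion callback.
    void Release() {
        std::lock_guard<std::mutex> lk(mtx_);
        --inflight_;
        cv_.notify_one();
    }

 private:
    std::mutex mtx_;
    std::condition_variable cv_;
    const uint64_t max_;
    uint64_t inflight_ = 0;
};

Raising the cap from 64 to 128 lets more requests be in flight per file before the issuing path has to wait.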

curve-ansible/roles/generate_config/defaults/main.yml

+3 −2

@@ -169,7 +169,7 @@ client_metacache_get_leader_timeout_ms: 500
 client_metacache_get_leader_retry: 5
 client_metacache_rpc_retry_interval_us: 100000
 client_schedule_queue_capacity: 1000000
-client_schedule_threadpool_size: 1
+client_schedule_threadpool_size: 2
 client_isolation_task_queue_capacity: 1000000
 client_isolation_task_thread_pool_size: 1
 client_chunkserver_op_retry_interval_us: 100000
@@ -183,7 +183,7 @@ client_chunkserver_check_health_timeout_ms: 100
 client_chunkserver_server_stable_threshold: 3
 client_chunkserver_min_retry_times_force_timeout_backoff: 5
 client_chunkserver_max_retry_times_before_consider_suspend: 20
-client_file_max_inflight_rpc_num: 64
+client_file_max_inflight_rpc_num: 128
 client_file_io_split_max_size_kb: 64
 client_log_level: 0
 client_log_path: /data/log/curve/
@@ -198,6 +198,7 @@ nebd_client_rpc_retry_max_inverval_us: 64000000
 nebd_client_rpc_hostdown_retry_inverval_us: 10000
 nebd_client_health_check_internal_s: 1
 nebd_client_delay_health_check_internal_ms: 100
+nebd_client_rpc_send_exec_queue_num: 2
 nebd_client_heartbeat_inverval_s: 5
 nebd_client_heartbeat_rpc_timeout_ms: 500
 nebd_server_heartbeat_timeout_s: 30

curve-ansible/roles/generate_config/templates/nebd-client.conf.j2

+2

@@ -16,6 +16,8 @@ request.rpcHostDownRetryIntervalUs={{ nebd_client_rpc_hostdown_retry_inverval_us }}
 request.rpcHealthCheckIntervalS={{ nebd_client_health_check_internal_s }}
 # Maximum interval, in ms, from an rpc failure until brpc runs a health check
 request.rpcMaxDelayHealthCheckIntervalMs={{ nebd_client_delay_health_check_internal_ms }}
+# Number of rpc send execution queues
+request.rpcSendExecQueueNum={{ nebd_client_rpc_send_exec_queue_num }}

 # heartbeat interval
 heartbeat.intervalS={{ nebd_client_heartbeat_inverval_s }}
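
For reference, with the ansible default added above (nebd_client_rpc_send_exec_queue_num: 2), rendering this template yields the same line that the static config ships with (see nebd/etc/nebd/nebd-client.conf below):

request.rpcSendExecQueueNum=2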

curvesnapshot_python/libcurveSnapshot.cpp

+7 −2

@@ -30,11 +30,16 @@
 #include "src/client/client_config.h"
 #include "include/client/libcurve.h"
 #include "src/client/client_common.h"
+#include "src/common/concurrent/concurrent.h"

 using curve::client::UserInfo;
 using curve::client::ClientConfig;
 using curve::client::SnapshotClient;
 using curve::client::SnapCloneClosure;
+using curve::client::FileServiceOption;
+using curve::client::ClientConfigOption;
+using curve::common::Mutex;
+using curve::common::ConditionVariable;

 class TaskTracker {
  public:
@@ -128,8 +133,8 @@ int Init(const char* path) {
         return -LIBCURVE_ERROR::FAILED;
     }

-    FileServiceOption_t fileopt = cc.GetFileServiceOption();
-    ClientConfigOption_t copt;
+    FileServiceOption fileopt = cc.GetFileServiceOption();
+    ClientConfigOption copt;
     copt.loginfo = fileopt.loginfo;
     copt.ioOpt = fileopt.ioOpt;
     copt.metaServerOpt = fileopt.metaServerOpt;

nebd/etc/nebd/nebd-client.conf

+2

@@ -16,6 +16,8 @@ request.rpcHostDownRetryIntervalUs=10000
 request.rpcHealthCheckIntervalS=1
 # Maximum interval, in ms, from an rpc failure until brpc runs a health check
 request.rpcMaxDelayHealthCheckIntervalMs=100
+# Number of rpc send execution queues
+request.rpcSendExecQueueNum=2

 # heartbeat interval
 heartbeat.intervalS=5

nebd/src/part1/BUILD

+1

@@ -51,6 +51,7 @@ cc_library(
         "//external:brpc",
         "//nebd/src/common:nebd_common",
        "//nebd/proto:client_cc_proto",
+        "//include:include-common"
    ],
    copts = COPTS,
    linkopts = ["-Wl,-rpath=/usr/lib/nebd"],

nebd/src/part1/nebd_client.cpp

+100 −43

@@ -102,13 +102,31 @@ int NebdClient::Init(const char* confpath) {

     heartbeatMgr_->Run();

+    // init rpc send exec-queue
+    rpcTaskQueues_.resize(option_.requestOption.rpcSendExecQueueNum);
+    for (auto& q : rpcTaskQueues_) {
+        int rc = bthread::execution_queue_start(
+            &q, nullptr, &NebdClient::ExecAsyncRpcTask, this);
+        if (rc != 0) {
+            LOG(ERROR) << "Init AsyncRpcQueues failed";
+            return -1;
+        }
+    }
+
     return 0;
 }

 void NebdClient::Uninit() {
     if (heartbeatMgr_ != nullptr) {
         heartbeatMgr_->Stop();
     }
+
+    // stop exec queue
+    for (auto& q : rpcTaskQueues_) {
+        bthread::execution_queue_stop(q);
+        bthread::execution_queue_join(q);
+    }
+
     LOG(INFO) << "NebdClient uninit success.";
     google::ShutdownGoogleLogging();
 }
@@ -289,67 +307,85 @@ int64_t NebdClient::GetFileSize(int fd) {
 }

 int NebdClient::Discard(int fd, NebdClientAioContext* aioctx) {
-    nebd::client::NebdFileService_Stub stub(&channel_);
-    nebd::client::DiscardRequest request;
-    request.set_fd(fd);
-    request.set_offset(aioctx->offset);
-    request.set_size(aioctx->length);
-
-    AioDiscardClosure* done = new(std::nothrow) AioDiscardClosure(
-        fd, aioctx, option_.requestOption);
-    done->cntl.set_timeout_ms(-1);
-    done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
-    stub.Discard(&done->cntl, &request, &done->response, done);
+    auto task = [this, fd, aioctx]() {
+        nebd::client::NebdFileService_Stub stub(&channel_);
+        nebd::client::DiscardRequest request;
+        request.set_fd(fd);
+        request.set_offset(aioctx->offset);
+        request.set_size(aioctx->length);
+
+        AioDiscardClosure* done = new(std::nothrow) AioDiscardClosure(
+            fd, aioctx, option_.requestOption);
+        done->cntl.set_timeout_ms(-1);
+        done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
+        stub.Discard(&done->cntl, &request, &done->response, done);
+    };
+
+    PushAsyncTask(task);

     return 0;
 }

 int NebdClient::AioRead(int fd, NebdClientAioContext* aioctx) {
-    nebd::client::NebdFileService_Stub stub(&channel_);
-    nebd::client::ReadRequest request;
-    request.set_fd(fd);
-    request.set_offset(aioctx->offset);
-    request.set_size(aioctx->length);
-
-    AioReadClosure* done = new(std::nothrow) AioReadClosure(
-        fd, aioctx, option_.requestOption);
-    done->cntl.set_timeout_ms(-1);
-    done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
-    stub.Read(&done->cntl, &request, &done->response, done);
+    auto task = [this, fd, aioctx]() {
+        nebd::client::NebdFileService_Stub stub(&channel_);
+        nebd::client::ReadRequest request;
+        request.set_fd(fd);
+        request.set_offset(aioctx->offset);
+        request.set_size(aioctx->length);
+
+        AioReadClosure* done = new(std::nothrow) AioReadClosure(
+            fd, aioctx, option_.requestOption);
+        done->cntl.set_timeout_ms(-1);
+        done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
+        stub.Read(&done->cntl, &request, &done->response, done);
+    };
+
+    PushAsyncTask(task);
+
     return 0;
 }

 static void EmptyDeleter(void* m) {}

 int NebdClient::AioWrite(int fd, NebdClientAioContext* aioctx) {
-    nebd::client::NebdFileService_Stub stub(&channel_);
-    nebd::client::WriteRequest request;
-    request.set_fd(fd);
-    request.set_offset(aioctx->offset);
-    request.set_size(aioctx->length);
+    auto task = [this, fd, aioctx]() {
+        nebd::client::NebdFileService_Stub stub(&channel_);
+        nebd::client::WriteRequest request;
+        request.set_fd(fd);
+        request.set_offset(aioctx->offset);
+        request.set_size(aioctx->length);
+
+        AioWriteClosure* done = new(std::nothrow) AioWriteClosure(
+            fd, aioctx, option_.requestOption);

-    AioWriteClosure* done = new(std::nothrow) AioWriteClosure(
-        fd, aioctx, option_.requestOption);
+        done->cntl.set_timeout_ms(-1);
+        done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
+        done->cntl.request_attachment().append_user_data(
+            aioctx->buf, aioctx->length, EmptyDeleter);
+        stub.Write(&done->cntl, &request, &done->response, done);
+    };

-    done->cntl.set_timeout_ms(-1);
-    done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
-    done->cntl.request_attachment().append_user_data(
-        aioctx->buf, aioctx->length, EmptyDeleter);
-    stub.Write(&done->cntl, &request, &done->response, done);
+    PushAsyncTask(task);

     return 0;
 }

 int NebdClient::Flush(int fd, NebdClientAioContext* aioctx) {
-    nebd::client::NebdFileService_Stub stub(&channel_);
-    nebd::client::FlushRequest request;
-    request.set_fd(fd);
-
-    AioFlushClosure* done = new(std::nothrow) AioFlushClosure(
-        fd, aioctx, option_.requestOption);
-    done->cntl.set_timeout_ms(-1);
-    done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
-    stub.Flush(&done->cntl, &request, &done->response, done);
+    auto task = [this, fd, aioctx]() {
+        nebd::client::NebdFileService_Stub stub(&channel_);
+        nebd::client::FlushRequest request;
+        request.set_fd(fd);
+
+        AioFlushClosure* done = new(std::nothrow) AioFlushClosure(
+            fd, aioctx, option_.requestOption);
+        done->cntl.set_timeout_ms(-1);
+        done->cntl.set_log_id(logId_.fetch_add(1, std::memory_order_relaxed));
+        stub.Flush(&done->cntl, &request, &done->response, done);
+    };
+
+    PushAsyncTask(task);

     return 0;
 }
@@ -473,6 +509,13 @@ int NebdClient::InitNebdClientOption(Configuration* conf) {
     LOG_IF(ERROR, ret != true) << "Load request.rpcMaxDelayHealthCheckIntervalMs failed";  // NOLINT
     RETURN_IF_FALSE(ret);

+    ret = conf->GetUInt32Value("request.rpcSendExecQueueNum",
+                               &requestOption.rpcSendExecQueueNum);
+    LOG_IF(ERROR, ret != true)
+        << "Load request.rpcSendExecQueueNum from config file failed, current "
+           "value is "
+        << requestOption.rpcSendExecQueueNum;
+
     option_.requestOption = requestOption;

     ret = conf->GetStringValue("log.path", &option_.logOption.logPath);
@@ -564,5 +607,19 @@ void NebdClient::InitLogger(const LogOption& logOption) {
     google::InitGoogleLogging(kProcessName);
 }

+int NebdClient::ExecAsyncRpcTask(void* meta,
+                                 bthread::TaskIterator<AsyncRpcTask>& iter) {  // NOLINT
+    if (iter.is_queue_stopped()) {
+        return 0;
+    }
+
+    for (; iter; ++iter) {
+        auto& task = *iter;
+        task();
+    }
+
+    return 0;
+}
+
 }  // namespace client
 }  // namespace nebd
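
The pattern added above is: Init() starts rpcSendExecQueueNum bthread execution queues whose consumer is ExecAsyncRpcTask, each Discard/AioRead/AioWrite/Flush call now only wraps its RPC send in a lambda and enqueues it, and Uninit() stops and joins the queues. A self-contained sketch of that same bthread execution-queue API, with illustrative names and a trivial task body (not Curve code), looks like this:

#include <bthread/execution_queue.h>
#include <cstdio>
#include <functional>

using Task = std::function<void()>;

// Consumer: runs in a bthread and drains whatever was queued since the last
// wake-up; returning 0 tells bthread the batch was handled.
static int ConsumeTasks(void* /*meta*/, bthread::TaskIterator<Task>& iter) {
    if (iter.is_queue_stopped()) {
        return 0;
    }
    for (; iter; ++iter) {
        (*iter)();
    }
    return 0;
}

int main() {
    bthread::ExecutionQueueId<Task> queue;
    // nullptr -> default ExecutionQueueOptions; the last argument is the meta pointer.
    if (bthread::execution_queue_start(&queue, nullptr, ConsumeTasks, nullptr) != 0) {
        return -1;
    }

    Task task = [] { std::printf("issue the RPC here\n"); };
    bthread::execution_queue_execute(queue, task);  // non-blocking enqueue

    bthread::execution_queue_stop(queue);  // refuse new tasks
    bthread::execution_queue_join(queue);  // wait for queued tasks to finish
    return 0;
}

The commit shards requests over several such queues: PushAsyncTask() in nebd_client.h below picks one with rand_r() on a thread_local seed, and if execution_queue_execute() fails it runs the task inline, so a queue error degrades to the previous synchronous send path instead of dropping the IO.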

nebd/src/part1/nebd_client.h

+22

@@ -24,10 +24,12 @@
 #define NEBD_SRC_PART1_NEBD_CLIENT_H_

 #include <brpc/channel.h>
+#include <bthread/execution_queue.h>

 #include <functional>
 #include <string>
 #include <memory>
+#include <vector>

 #include "nebd/src/part1/nebd_common.h"
 #include "nebd/src/common/configuration.h"
@@ -36,6 +38,8 @@
 #include "nebd/src/part1/heartbeat_manager.h"
 #include "nebd/src/part1/nebd_metacache.h"

+#include "include/curve_compiler_specific.h"
+
 namespace nebd {
 namespace client {

@@ -171,6 +175,24 @@ class NebdClient {
     brpc::Channel channel_;

     std::atomic<uint64_t> logId_{1};
+
+ private:
+    using AsyncRpcTask = std::function<void()>;
+
+    std::vector<bthread::ExecutionQueueId<AsyncRpcTask>> rpcTaskQueues_;
+
+    static int ExecAsyncRpcTask(void* meta, bthread::TaskIterator<AsyncRpcTask>& iter);  // NOLINT
+
+    void PushAsyncTask(const AsyncRpcTask& task) {
+        static thread_local unsigned int seed = time(nullptr);
+
+        int idx = rand_r(&seed) % rpcTaskQueues_.size();
+        int rc = bthread::execution_queue_execute(rpcTaskQueues_[idx], task);
+
+        if (CURVE_UNLIKELY(rc != 0)) {
+            task();
+        }
+    }
 };

 extern NebdClient &nebdClient;

nebd/src/part1/nebd_common.h

+2

@@ -39,6 +39,8 @@ struct RequestOption {
    int64_t rpcHealthCheckIntervalS;
    // Maximum interval from an rpc failure until brpc runs a health check
    int64_t rpcMaxDelayHealthCheckIntervalMs;
+    // Number of rpc send execution queues
+    uint32_t rpcSendExecQueueNum = 2;
 };

 // log options

src/chunkserver/clone_manager.cpp

+1 −1

@@ -42,7 +42,7 @@ int CloneManager::Run() {
         return 0;
     // start the thread pool
     LOG(INFO) << "Begin to run clone manager.";
-    tp_ = std::make_shared<TaskThreadPool>();
+    tp_ = std::make_shared<TaskThreadPool<>>();
     int ret = tp_->Start(options_.threadNum, options_.queueCapacity);
     if (ret < 0) {
         LOG(ERROR) << "clone manager start error."

src/chunkserver/clone_manager.h

+1 −1

@@ -105,7 +105,7 @@ class CloneManager {
     // options for clone task management, initialized in Init
     CloneOptions options_;
     // asynchronous thread pool that handles clone tasks
-    std::shared_ptr<TaskThreadPool> tp_;
+    std::shared_ptr<TaskThreadPool<>> tp_;
     // whether the thread pool is currently in a working state
     std::atomic<bool> isRunning_;
 };

src/chunkserver/copyset_node_manager.cpp

+1 −1

@@ -51,7 +51,7 @@ std::once_flag addServiceFlag;
 int CopysetNodeManager::Init(const CopysetNodeOptions &copysetNodeOptions) {
     copysetNodeOptions_ = copysetNodeOptions;
     if (copysetNodeOptions_.loadConcurrency > 0) {
-        copysetLoader_ = std::make_shared<TaskThreadPool>();
+        copysetLoader_ = std::make_shared<TaskThreadPool<>>();
     } else {
         copysetLoader_ = nullptr;
     }

src/chunkserver/copyset_node_manager.h

+1 −1

@@ -205,7 +205,7 @@ class CopysetNodeManager : public curve::common::Uncopyable {
     // copyset configuration options
     CopysetNodeOptions copysetNodeOptions_;
     // controls how many copysets are started concurrently
-    std::shared_ptr<TaskThreadPool> copysetLoader_;
+    std::shared_ptr<TaskThreadPool<>> copysetLoader_;
     // whether the copyset node manager is currently running
     Atomic<bool> running_;
     // whether the copyset node manager has finished loading
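
All four TaskThreadPool call sites in this commit change only their spelling, from TaskThreadPool to TaskThreadPool<>, which is consistent with the pool having become a class template with defaulted parameters elsewhere in the commit. The shape below is a hypothetical illustration of why existing users now have to write the empty angle brackets; it is not the actual curve::common::TaskThreadPool definition:

#include <condition_variable>
#include <memory>
#include <mutex>

// Hypothetical: a pool templated on its synchronization primitives with std
// types as defaults; only the "<>" at the call sites is taken from the diff.
template <typename MutexT = std::mutex,
          typename CondVarT = std::condition_variable>
class TaskThreadPool {
 public:
    int Start(int threadNum, int queueCapacity) {
        // Would spawn threadNum workers draining a bounded queue of
        // queueCapacity; omitted, since only the signature matters here.
        return 0;
    }
};

int main() {
    // A template always needs an argument list, even an empty one that takes
    // every default, hence make_shared<TaskThreadPool<>>() at the call sites.
    auto tp = std::make_shared<TaskThreadPool<>>();
    return tp->Start(/*threadNum=*/4, /*queueCapacity=*/1000);
}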
