-
Notifications
You must be signed in to change notification settings - Fork 522
Device api v2 perftest #11125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Device api v2 perftest #11125
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -112,6 +112,9 @@ __host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid) | |
| case UCX_PERF_CMD_PUT_SINGLE: \ | ||
| _func(UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \ | ||
| break; \ | ||
| case UCX_PERF_CMD_PUT_SINGLE_V2: \ | ||
| _func(UCX_PERF_CMD_PUT_SINGLE_V2, __VA_ARGS__); \ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have a general suggestion here: #define UCX_PERF_SWITCH_CMD(_cmd, _func, ...) \
switch (_cmd) { \
case UCX_PERF_CMD_PUT_SINGLE: \
case UCX_PERF_CMD_PUT_SINGLE_V2: \
case UCX_PERF_CMD_PUT_MULTI: \
case UCX_PERF_CMD_PUT_PARTIAL: \
_func(_cmd, __VA_ARGS__); \
break; \
default: \
ucs_error("Unsupported cmd: %d", _cmd); \
break; \
}

The same could be applied for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is that |
||
| break; \ | ||
| case UCX_PERF_CMD_PUT_MULTI: \ | ||
| _func(UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \ | ||
| break; \ | ||
|
|
@@ -230,6 +233,7 @@ ucx_perf_cuda_dispatch(ucx_perf_context_t *perf) | |
| Runner runner(*perf); | ||
| if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) || | ||
| (perf->params.command == UCX_PERF_CMD_PUT_SINGLE) || | ||
| (perf->params.command == UCX_PERF_CMD_PUT_SINGLE_V2) || | ||
| (perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) { | ||
| if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) { | ||
| return runner.run_pingpong(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -129,14 +129,16 @@ private: | |
| }; | ||
|
|
||
| struct ucp_perf_cuda_params { | ||
| ucp_device_mem_list_handle_h mem_list; | ||
| size_t length; | ||
| unsigned *indices; | ||
| size_t *local_offsets; | ||
| size_t *remote_offsets; | ||
| size_t *lengths; | ||
| uint64_t *counter_send; | ||
| uint64_t *counter_recv; | ||
| ucp_device_mem_list_handle_h mem_list; | ||
| ucp_device_local_mem_list_h local_mem_list; | ||
| ucp_device_remote_mem_list_h remote_mem_list; | ||
| size_t length; | ||
| unsigned *indices; | ||
| size_t *local_offsets; | ||
| size_t *remote_offsets; | ||
| size_t *lengths; | ||
| uint64_t *counter_send; | ||
| uint64_t *counter_recv; | ||
| }; | ||
|
|
||
| class ucp_perf_cuda_params_handler { | ||
|
|
@@ -151,6 +153,8 @@ public: | |
| ~ucp_perf_cuda_params_handler() | ||
| { | ||
| ucp_device_mem_list_release(m_params.mem_list); | ||
| ucp_device_mem_list_release(m_params.local_mem_list); | ||
| ucp_device_mem_list_release(m_params.remote_mem_list); | ||
| CUDA_CALL_WARN(cudaFree, m_params.indices); | ||
| CUDA_CALL_WARN(cudaFree, m_params.local_offsets); | ||
| CUDA_CALL_WARN(cudaFree, m_params.remote_offsets); | ||
|
|
@@ -162,7 +166,8 @@ public: | |
| private: | ||
| static bool has_counter(const ucx_perf_context_t &perf) | ||
| { | ||
| return (perf.params.command != UCX_PERF_CMD_PUT_SINGLE); | ||
| return ((perf.params.command != UCX_PERF_CMD_PUT_SINGLE) && | ||
| (perf.params.command != UCX_PERF_CMD_PUT_SINGLE_V2)); | ||
| } | ||
|
|
||
| void init_mem_list(const ucx_perf_context_t &perf) | ||
|
|
@@ -171,28 +176,44 @@ private: | |
| size_t count = data_count + (has_counter(perf) ? 1 : 0); | ||
| size_t offset = 0; | ||
| ucp_device_mem_list_elem_t elems[count]; | ||
| ucp_device_mem_list_elem_t local_elems[count]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. With latest API we can use existing |
||
| ucp_device_mem_list_elem_t remote_elems[count]; | ||
|
|
||
| for (size_t i = 0; i < data_count; ++i) { | ||
| elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH; | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR; | ||
| elems[i].memh = perf.ucp.send_memh; | ||
| elems[i].rkey = perf.ucp.rkey; | ||
| elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset); | ||
| elems[i].remote_addr = perf.ucp.remote_addr + offset; | ||
| elems[i].length = perf.params.msg_size_list[i]; | ||
| offset += elems[i].length; | ||
|
|
||
| /* local elements - API v2 */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe init mem list elems on demand according to test and not both always. |
||
| local_elems[i].field_mask = | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR; | ||
| local_elems[i].memh = perf.ucp.send_memh; | ||
| local_elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, | ||
| offset); | ||
|
|
||
| /* remote elements - API v2 */ | ||
| remote_elems[i].field_mask = | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_EP | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR; | ||
| remote_elems[i].ep = perf.ucp.ep; | ||
| remote_elems[i].rkey = perf.ucp.rkey; | ||
| remote_elems[i].remote_addr = perf.ucp.remote_addr + offset; | ||
|
|
||
| offset += perf.params.msg_size_list[i]; | ||
| } | ||
|
|
||
| if (has_counter(perf)) { | ||
| elems[data_count].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR | | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH; | ||
| UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR; | ||
| elems[data_count].rkey = perf.ucp.rkey; | ||
| elems[data_count].remote_addr = perf.ucp.remote_addr + offset; | ||
| elems[data_count].length = ONESIDED_SIGNAL_SIZE; | ||
| } | ||
|
|
||
| ucp_device_mem_list_params_t params; | ||
|
|
@@ -219,6 +240,48 @@ private: | |
| if (status != UCS_OK) { | ||
| throw std::runtime_error("Failed to create memory list"); | ||
| } | ||
|
|
||
| ucp_device_mem_list_params_t local_params; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe create mem list handle on demand according to test and not both always. |
||
| local_params.field_mask = | ||
| UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS | | ||
| UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE | | ||
| UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS | | ||
| UCP_DEVICE_MEM_LIST_PARAMS_FIELD_WORKER; | ||
| local_params.element_size = sizeof(ucp_device_mem_list_elem_t); | ||
| local_params.num_elements = count; | ||
| local_params.elements = local_elems; | ||
| local_params.worker = perf.ucp.worker; | ||
|
|
||
| status = ucp_device_local_mem_list_create(&local_params, | ||
| &m_params.local_mem_list); | ||
| if (status != UCS_OK) { | ||
| throw std::runtime_error("Failed to create local memory list"); | ||
| } | ||
|
|
||
| ucp_device_mem_list_params_t remote_params; | ||
| remote_params.field_mask = | ||
| UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS | | ||
| UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE | | ||
| UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS; | ||
| remote_params.element_size = sizeof(ucp_device_mem_list_elem_t); | ||
| remote_params.num_elements = count; | ||
| remote_params.elements = remote_elems; | ||
|
|
||
| deadline = ucs_get_time() + ucs_time_from_sec(60.0); | ||
| do { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe use a helper function to improve code reuse and better separation? |
||
| if (ucs_get_time() > deadline) { | ||
| ucs_warn("timeout creating remote device memory list"); | ||
| deadline = ULONG_MAX; | ||
| } | ||
|
|
||
| ucp_worker_progress(perf.ucp.worker); | ||
| status = ucp_device_remote_mem_list_create(&remote_params, | ||
| &m_params.remote_mem_list); | ||
| } while (status == UCS_ERR_NOT_CONNECTED); | ||
|
|
||
| if (status != UCS_OK) { | ||
| throw std::runtime_error("Failed to create remote memory list"); | ||
| } | ||
| } | ||
|
|
||
| void init_elements(const ucx_perf_context_t &perf) | ||
|
|
@@ -283,6 +346,14 @@ ucp_perf_cuda_send_async(const ucp_perf_cuda_params ¶ms, | |
| 0, 0, | ||
| params.length + ONESIDED_SIGNAL_SIZE, | ||
| channel_id, flags, req); | ||
| case UCX_PERF_CMD_PUT_SINGLE_V2: | ||
| *params.counter_send = idx + 1; | ||
| return ucp_device_put<level>(params.local_mem_list, | ||
| params.indices[0], 0, | ||
| params.remote_mem_list, | ||
| params.indices[0], 0, | ||
| params.length + ONESIDED_SIGNAL_SIZE, | ||
| channel_id, flags, req); | ||
| case UCX_PERF_CMD_PUT_MULTI: | ||
| return ucp_device_put_multi<level>(params.mem_list, 1, channel_id, | ||
| flags, req); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we maybe use the existing
`UCX_PERF_CMD_PUT` instead of `UCX_PERF_CMD_PUT_SINGLE_V2`? `UCX_PERF_CMD_PUT` is used for host put tests, but maybe we can use it also for the device put test if we can differentiate between them by the `-a` option.