Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/tools/perf/api/libperf.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ typedef enum {
UCX_PERF_CMD_AM,
UCX_PERF_CMD_PUT,
UCX_PERF_CMD_PUT_SINGLE,
UCX_PERF_CMD_PUT_SINGLE_V2,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we maybe use existing UCX_PERF_CMD_PUT instead of UCX_PERF_CMD_PUT_SINGLE_V2?
UCX_PERF_CMD_PUT is used for host put tests, but maybe we can use it also for device put test if we can differ between them by the -a option

UCX_PERF_CMD_PUT_MULTI,
UCX_PERF_CMD_PUT_PARTIAL,
UCX_PERF_CMD_GET,
Expand Down
4 changes: 4 additions & 0 deletions src/tools/perf/cuda/cuda_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ __host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
case UCX_PERF_CMD_PUT_SINGLE: \
_func(UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \
break; \
case UCX_PERF_CMD_PUT_SINGLE_V2: \
_func(UCX_PERF_CMD_PUT_SINGLE_V2, __VA_ARGS__); \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a general suggestion here:
Could we utilize the fall through feature of the switch statement to avoid repetition?
Something like:

#define UCX_PERF_SWITCH_CMD(_cmd, _func, ...) \
    switch (_cmd) { \
    case UCX_PERF_CMD_PUT_SINGLE: \
    case UCX_PERF_CMD_PUT_SINGLE_V2: \
    case UCX_PERF_CMD_PUT_MULTI: \
    case UCX_PERF_CMD_PUT_PARTIAL: \
        _func(_cmd, __VA_ARGS__); \
        break; \
    default: \
        ucs_error("Unsupported cmd: %d", _cmd); \
        break; \
    }

The same could be applied for #define UCX_PERF_SWITCH_LEVEL

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that _cmd's value eventually gets used as a template parameter in the kernel launch:
_kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>>(__VA_ARGS__);
and nvcc requires template parameters to be compile-time constants, but when using fall-through, _cmd remains a runtime variable.

break; \
case UCX_PERF_CMD_PUT_MULTI: \
_func(UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \
break; \
Expand Down Expand Up @@ -230,6 +233,7 @@ ucx_perf_cuda_dispatch(ucx_perf_context_t *perf)
Runner runner(*perf);
if ((perf->params.command == UCX_PERF_CMD_PUT_MULTI) ||
(perf->params.command == UCX_PERF_CMD_PUT_SINGLE) ||
(perf->params.command == UCX_PERF_CMD_PUT_SINGLE_V2) ||
(perf->params.command == UCX_PERF_CMD_PUT_PARTIAL)) {
if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) {
return runner.run_pingpong();
Expand Down
103 changes: 87 additions & 16 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,16 @@ private:
};

struct ucp_perf_cuda_params {
ucp_device_mem_list_handle_h mem_list;
size_t length;
unsigned *indices;
size_t *local_offsets;
size_t *remote_offsets;
size_t *lengths;
uint64_t *counter_send;
uint64_t *counter_recv;
ucp_device_mem_list_handle_h mem_list;
ucp_device_local_mem_list_h local_mem_list;
ucp_device_remote_mem_list_h remote_mem_list;
size_t length;
unsigned *indices;
size_t *local_offsets;
size_t *remote_offsets;
size_t *lengths;
uint64_t *counter_send;
uint64_t *counter_recv;
};

class ucp_perf_cuda_params_handler {
Expand All @@ -151,6 +153,8 @@ public:
~ucp_perf_cuda_params_handler()
{
ucp_device_mem_list_release(m_params.mem_list);
ucp_device_mem_list_release(m_params.local_mem_list);
ucp_device_mem_list_release(m_params.remote_mem_list);
CUDA_CALL_WARN(cudaFree, m_params.indices);
CUDA_CALL_WARN(cudaFree, m_params.local_offsets);
CUDA_CALL_WARN(cudaFree, m_params.remote_offsets);
Expand All @@ -162,7 +166,8 @@ public:
private:
static bool has_counter(const ucx_perf_context_t &perf)
{
return (perf.params.command != UCX_PERF_CMD_PUT_SINGLE);
return ((perf.params.command != UCX_PERF_CMD_PUT_SINGLE) &&
(perf.params.command != UCX_PERF_CMD_PUT_SINGLE_V2));
}

void init_mem_list(const ucx_perf_context_t &perf)
Expand All @@ -171,28 +176,44 @@ private:
size_t count = data_count + (has_counter(perf) ? 1 : 0);
size_t offset = 0;
ucp_device_mem_list_elem_t elems[count];
ucp_device_mem_list_elem_t local_elems[count];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With latest API we can use existing ucp_device_mem_list_elem_t elems[count] to create all handles types

ucp_device_mem_list_elem_t remote_elems[count];

for (size_t i = 0; i < data_count; ++i) {
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR;
elems[i].memh = perf.ucp.send_memh;
elems[i].rkey = perf.ucp.rkey;
elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset);
elems[i].remote_addr = perf.ucp.remote_addr + offset;
elems[i].length = perf.params.msg_size_list[i];
offset += elems[i].length;

/* local elements - API v2 */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe init mem list elems on demand according to test and not both always.

local_elems[i].field_mask =
UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR;
local_elems[i].memh = perf.ucp.send_memh;
local_elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer,
offset);

/* remote elements - API v2 */
remote_elems[i].field_mask =
UCP_DEVICE_MEM_LIST_ELEM_FIELD_EP |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR;
remote_elems[i].ep = perf.ucp.ep;
remote_elems[i].rkey = perf.ucp.rkey;
remote_elems[i].remote_addr = perf.ucp.remote_addr + offset;

offset += perf.params.msg_size_list[i];
}

if (has_counter(perf)) {
elems[data_count].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR;
elems[data_count].rkey = perf.ucp.rkey;
elems[data_count].remote_addr = perf.ucp.remote_addr + offset;
elems[data_count].length = ONESIDED_SIGNAL_SIZE;
}

ucp_device_mem_list_params_t params;
Expand All @@ -219,6 +240,48 @@ private:
if (status != UCS_OK) {
throw std::runtime_error("Failed to create memory list");
}

ucp_device_mem_list_params_t local_params;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe create mem list handle on demand according to test and not both always.

local_params.field_mask =
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS |
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE |
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS |
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_WORKER;
local_params.element_size = sizeof(ucp_device_mem_list_elem_t);
local_params.num_elements = count;
local_params.elements = local_elems;
local_params.worker = perf.ucp.worker;

status = ucp_device_local_mem_list_create(&local_params,
&m_params.local_mem_list);
if (status != UCS_OK) {
throw std::runtime_error("Failed to create local memory list");
}

ucp_device_mem_list_params_t remote_params;
remote_params.field_mask =
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS |
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE |
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_NUM_ELEMENTS;
remote_params.element_size = sizeof(ucp_device_mem_list_elem_t);
remote_params.num_elements = count;
remote_params.elements = remote_elems;

deadline = ucs_get_time() + ucs_time_from_sec(60.0);
do {
Copy link
Contributor

@ofirfarjun7 ofirfarjun7 Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use helper func to improve code reuse and better separation ?

if (ucs_get_time() > deadline) {
ucs_warn("timeout creating remote device memory list");
deadline = ULONG_MAX;
}

ucp_worker_progress(perf.ucp.worker);
status = ucp_device_remote_mem_list_create(&remote_params,
&m_params.remote_mem_list);
} while (status == UCS_ERR_NOT_CONNECTED);

if (status != UCS_OK) {
throw std::runtime_error("Failed to create remote memory list");
}
}

void init_elements(const ucx_perf_context_t &perf)
Expand Down Expand Up @@ -283,6 +346,14 @@ ucp_perf_cuda_send_async(const ucp_perf_cuda_params &params,
0, 0,
params.length + ONESIDED_SIGNAL_SIZE,
channel_id, flags, req);
case UCX_PERF_CMD_PUT_SINGLE_V2:
*params.counter_send = idx + 1;
return ucp_device_put<level>(params.local_mem_list,
params.indices[0], 0,
params.remote_mem_list,
params.indices[0], 0,
params.length + ONESIDED_SIGNAL_SIZE,
channel_id, flags, req);
case UCX_PERF_CMD_PUT_MULTI:
return ucp_device_put_multi<level>(params.mem_list, 1, channel_id,
flags, req);
Expand Down
2 changes: 2 additions & 0 deletions src/tools/perf/lib/libperf.c
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params,
max_iov = attr.cap.put.max_iov;
break;
case UCX_PERF_CMD_PUT_SINGLE:
case UCX_PERF_CMD_PUT_SINGLE_V2:
case UCX_PERF_CMD_PUT_MULTI:
case UCX_PERF_CMD_PUT_PARTIAL:
min_size = 0;
Expand Down Expand Up @@ -888,6 +889,7 @@ static ucs_status_t ucp_perf_test_fill_params(ucx_perf_params_t *params,
switch (params->command) {
case UCX_PERF_CMD_PUT:
case UCX_PERF_CMD_PUT_SINGLE:
case UCX_PERF_CMD_PUT_SINGLE_V2:
case UCX_PERF_CMD_PUT_MULTI:
case UCX_PERF_CMD_PUT_PARTIAL:
case UCX_PERF_CMD_GET:
Expand Down
1 change: 1 addition & 0 deletions src/tools/perf/lib/libperf_memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf)
flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT;
break;
case UCX_PERF_CMD_PUT_SINGLE:
case UCX_PERF_CMD_PUT_SINGLE_V2:
case UCX_PERF_CMD_PUT_MULTI:
case UCX_PERF_CMD_PUT_PARTIAL:
flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT | UCT_MD_MEM_ACCESS_REMOTE_ATOMIC;
Expand Down
6 changes: 6 additions & 0 deletions src/tools/perf/perftest.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ test_type_t tests[] = {
{"ucp_put_single_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_SINGLE, UCX_PERF_TEST_TYPE_PINGPONG,
"put single latency", "latency", 1},

{"ucp_put_single_v2_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_SINGLE_V2, UCX_PERF_TEST_TYPE_STREAM_UNI,
"put single v2 bandwidth", "overhead", 32},

{"ucp_put_single_v2_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_SINGLE_V2, UCX_PERF_TEST_TYPE_PINGPONG,
"put single v2 latency", "latency", 1},

{"ucp_put_multi_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT_MULTI, UCX_PERF_TEST_TYPE_STREAM_UNI,
"put multi bandwidth", "overhead", 32},

Expand Down
Loading