Skip to content

Commit 0183280

Browse files
UCT/CUDA: fix review comments
1 parent ea8d74e commit 0183280

File tree

1 file changed

+55
-44
lines changed

1 file changed

+55
-44
lines changed

src/uct/cuda/cuda_ipc/cuda_ipc.cuh

Lines changed: 55 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,15 @@ UCS_F_DEVICE void uct_cuda_ipc_level_sync()
9393
return;
9494
}
9595

96-
UCS_F_DEVICE void* uct_cuda_ipc_map_remote(const uct_cuda_ipc_device_mem_element_t* elem,
97-
uint64_t remote_address)
96+
UCS_F_DEVICE void*
97+
uct_cuda_ipc_map_remote(const uct_cuda_ipc_device_mem_element_t* elem,
98+
uint64_t remote_address)
9899
{
99100
return reinterpret_cast<void*>((uintptr_t)remote_address + elem->mapped_offset);
100101
}
101102

102-
UCS_F_DEVICE void uct_cuda_ipc_atomic_inc(uint64_t *dst, uint64_t inc_value)
103+
UCS_F_DEVICE void
104+
uct_cuda_ipc_atomic_inc(uint64_t *dst, uint64_t inc_value)
103105
{
104106
cuda::atomic_ref<uint64_t, cuda::thread_scope_system> dst_ref{*dst};
105107
dst_ref.fetch_add(inc_value, cuda::memory_order_relaxed);
@@ -110,18 +112,22 @@ template<ucs_device_level_t level>
110112
UCS_F_DEVICE void uct_cuda_ipc_copy_level(void *dst, const void *src, size_t len);
111113

112114
template<>
113-
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_THREAD>(void *dst, const void *src, size_t len)
115+
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_THREAD>(void *dst, const void *src,
116+
size_t len)
114117
{
115118
memcpy(dst, src, len);
116119
}
117120

118121
template<>
119-
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src, size_t len)
122+
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
123+
size_t len)
120124
{
121-
using vec4 = int4;
122-
using vec2 = int2;
123-
auto s1 = reinterpret_cast<const char*>(src);
124-
auto d1 = reinterpret_cast<char *>(dst);
125+
using vec4 = int4;
126+
using vec2 = int2;
127+
auto s1 = reinterpret_cast<const char*>(src);
128+
auto d1 = reinterpret_cast<char *>(dst);
129+
constexpr unsigned lanes_unroll = UCS_DEVICE_NUM_THREADS_IN_WARP *
130+
UCT_CUDA_IPC_COPY_LOOP_UNROLL;
125131
unsigned int lane_id, num_lanes;
126132
size_t num_lines;
127133

@@ -130,14 +136,13 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
130136
/* 16B-aligned fast path using vec4 with unrolling */
131137
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec4)) &&
132138
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec4))) {
133-
const vec4 *s4 = reinterpret_cast<const vec4*>(s1);
134-
vec4 *d4 = reinterpret_cast<vec4*>(d1);
139+
auto s4 = reinterpret_cast<const vec4*>(s1);
140+
auto d4 = reinterpret_cast<vec4*>(d1);
135141
vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
136142

137-
num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
138-
(num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
143+
num_lines = (len / (lanes_unroll * sizeof(vec4))) * lanes_unroll;
139144

140-
for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
145+
for (size_t line = lane_id; line < num_lines; line += lanes_unroll) {
141146
#pragma unroll
142147
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
143148
tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + num_lanes * i));
@@ -176,14 +181,13 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
176181
/* 8B-aligned fast path using vec2 with unrolling */
177182
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec2)) &&
178183
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec2))) {
179-
const vec2 *s2 = reinterpret_cast<const vec2*>(s1);
180-
vec2 *d2 = reinterpret_cast<vec2*>(d1);
184+
auto s2 = reinterpret_cast<const vec2*>(s1);
185+
auto d2 = reinterpret_cast<vec2*>(d1);
181186
vec2 tmp2[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
182187

183-
num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
184-
(num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
188+
num_lines = (len / (lanes_unroll * sizeof(vec2))) * lanes_unroll;
185189

186-
for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
190+
for (size_t line = lane_id; line < num_lines; line += lanes_unroll) {
187191
#pragma unroll
188192
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
189193
tmp2[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + num_lanes * i));
@@ -226,38 +230,43 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
226230
}
227231

228232
template<>
229-
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src, size_t len)
233+
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
234+
size_t len)
230235
{
231-
using vec4 = int4;
232-
using vec2 = int2;
233-
auto s1 = reinterpret_cast<const char*>(src);
234-
auto d1 = reinterpret_cast<char *>(dst);
235-
const int warp = threadIdx.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
236-
const int num_warps = blockDim.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
237-
const int idx = threadIdx.x % UCS_DEVICE_NUM_THREADS_IN_WARP;
236+
using vec4 = int4;
237+
using vec2 = int2;
238+
auto s1 = reinterpret_cast<const char*>(src);
239+
auto d1 = reinterpret_cast<char *>(dst);
240+
const int warp = threadIdx.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
241+
const int num_warps = blockDim.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
242+
const int idx = threadIdx.x % UCS_DEVICE_NUM_THREADS_IN_WARP;
243+
constexpr unsigned lanes_unroll = UCS_DEVICE_NUM_THREADS_IN_WARP *
244+
UCT_CUDA_IPC_COPY_LOOP_UNROLL;
238245
size_t num_lines;
239246

240247
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec4)) &&
241248
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec4))) {
242-
const vec4 *s4 = reinterpret_cast<const vec4*>(s1);
243-
vec4 *d4 = reinterpret_cast<vec4*>(d1);
249+
auto s4 = reinterpret_cast<const vec4*>(s1);
250+
auto d4 = reinterpret_cast<vec4*>(d1);
244251
vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
245252

246-
num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
247-
(UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
253+
num_lines = (len / (lanes_unroll * sizeof(vec4))) * lanes_unroll;
248254

249-
for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
250-
line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
255+
for (size_t line = warp * lanes_unroll + idx; line < num_lines;
256+
line += num_warps * lanes_unroll) {
251257
#pragma unroll
252258
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
253-
tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
259+
tmp[i] = uct_cuda_ipc_ld_global_cg(
260+
s4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
254261
}
255262

256263
#pragma unroll
257264
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
258-
uct_cuda_ipc_st_global_cg(d4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
265+
uct_cuda_ipc_st_global_cg(
266+
d4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
259267
}
260268
}
269+
261270
len = len - num_lines * sizeof(vec4);
262271
if (len == 0) {
263272
return;
@@ -283,23 +292,24 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
283292
/* If not 16B-aligned, try 8B-aligned fast path using vec2 */
284293
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec2)) &&
285294
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec2))) {
286-
const vec2 *s2 = reinterpret_cast<const vec2*>(s1);
287-
vec2 *d2 = reinterpret_cast<vec2*>(d1);
295+
auto s2 = reinterpret_cast<const vec2*>(s1);
296+
auto d2 = reinterpret_cast<vec2*>(d1);
288297
vec2 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
289298

290-
num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
291-
(UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
299+
num_lines = (len / (lanes_unroll * sizeof(vec2))) * lanes_unroll;
292300

293-
for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
294-
line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
301+
for (size_t line = warp * lanes_unroll + idx; line < num_lines;
302+
line += num_warps * lanes_unroll) {
295303
#pragma unroll
296304
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
297-
tmp[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
305+
tmp[i] = uct_cuda_ipc_ld_global_cg(
306+
s2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
298307
}
299308

300309
#pragma unroll
301310
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
302-
uct_cuda_ipc_st_global_cg(d2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
311+
uct_cuda_ipc_st_global_cg(
312+
d2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
303313
}
304314
}
305315

@@ -331,7 +341,8 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
331341
}
332342

333343
template<>
334-
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_GRID>(void *dst, const void *src, size_t len)
344+
void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_GRID>(void *dst, const void *src,
345+
size_t len)
335346
{/* not implemented */}
336347

337348
template<ucs_device_level_t level = UCS_DEVICE_LEVEL_BLOCK>

0 commit comments

Comments
 (0)