@@ -93,13 +93,15 @@ UCS_F_DEVICE void uct_cuda_ipc_level_sync()
9393 return ;
9494}
9595
96- UCS_F_DEVICE void * uct_cuda_ipc_map_remote (const uct_cuda_ipc_device_mem_element_t * elem,
97- uint64_t remote_address)
96+ UCS_F_DEVICE void *
97+ uct_cuda_ipc_map_remote (const uct_cuda_ipc_device_mem_element_t * elem,
98+ uint64_t remote_address)
9899{
99100 return reinterpret_cast <void *>((uintptr_t )remote_address + elem->mapped_offset );
100101}
101102
102- UCS_F_DEVICE void uct_cuda_ipc_atomic_inc (uint64_t *dst, uint64_t inc_value)
103+ UCS_F_DEVICE void
104+ uct_cuda_ipc_atomic_inc (uint64_t *dst, uint64_t inc_value)
103105{
104106 cuda::atomic_ref<uint64_t , cuda::thread_scope_system> dst_ref{*dst};
105107 dst_ref.fetch_add (inc_value, cuda::memory_order_relaxed);
@@ -110,18 +112,22 @@ template<ucs_device_level_t level>
110112UCS_F_DEVICE void uct_cuda_ipc_copy_level (void *dst, const void *src, size_t len);
111113
112114template <>
113- void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_THREAD>(void *dst, const void *src, size_t len)
115+ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_THREAD>(void *dst, const void *src,
116+ size_t len)
114117{
115118 memcpy (dst, src, len);
116119}
117120
118121template <>
119- void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src, size_t len)
122+ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
123+ size_t len)
120124{
121- using vec4 = int4 ;
122- using vec2 = int2 ;
123- auto s1 = reinterpret_cast <const char *>(src);
124- auto d1 = reinterpret_cast <char *>(dst);
125+ using vec4 = int4 ;
126+ using vec2 = int2 ;
127+ auto s1 = reinterpret_cast <const char *>(src);
128+ auto d1 = reinterpret_cast <char *>(dst);
129+ constexpr unsigned lanes_unroll = UCS_DEVICE_NUM_THREADS_IN_WARP *
130+ UCT_CUDA_IPC_COPY_LOOP_UNROLL;
125131 unsigned int lane_id, num_lanes;
126132 size_t num_lines;
127133
@@ -130,14 +136,13 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
130136 /* 16B-aligned fast path using vec4 with unrolling */
131137 if (UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )s1, sizeof (vec4)) &&
132138 UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )d1, sizeof (vec4))) {
133- const vec4 * s4 = reinterpret_cast <const vec4*>(s1);
134- vec4 * d4 = reinterpret_cast <vec4*>(d1);
139+ auto s4 = reinterpret_cast <const vec4*>(s1);
140+ auto d4 = reinterpret_cast <vec4*>(d1);
135141 vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
136142
137- num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof (vec4))) *
138- (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
143+ num_lines = (len / (lanes_unroll * sizeof (vec4))) * lanes_unroll;
139144
140- for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL ) {
145+ for (size_t line = lane_id; line < num_lines; line += lanes_unroll ) {
141146#pragma unroll
142147 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
143148 tmp[i] = uct_cuda_ipc_ld_global_cg (s4 + (line + num_lanes * i));
@@ -176,14 +181,13 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
176181 /* 8B-aligned fast path using vec2 with unrolling */
177182 if (UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )s1, sizeof (vec2)) &&
178183 UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )d1, sizeof (vec2))) {
179- const vec2 * s2 = reinterpret_cast <const vec2*>(s1);
180- vec2 * d2 = reinterpret_cast <vec2*>(d1);
184+ auto s2 = reinterpret_cast <const vec2*>(s1);
185+ auto d2 = reinterpret_cast <vec2*>(d1);
181186 vec2 tmp2[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
182187
183- num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof (vec2))) *
184- (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
188+ num_lines = (len / (lanes_unroll * sizeof (vec2))) * lanes_unroll;
185189
186- for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL ) {
190+ for (size_t line = lane_id; line < num_lines; line += lanes_unroll ) {
187191#pragma unroll
188192 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
189193 tmp2[i] = uct_cuda_ipc_ld_global_cg (s2 + (line + num_lanes * i));
@@ -226,38 +230,43 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
226230}
227231
228232template <>
229- void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src, size_t len)
233+ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
234+ size_t len)
230235{
231- using vec4 = int4 ;
232- using vec2 = int2 ;
233- auto s1 = reinterpret_cast <const char *>(src);
234- auto d1 = reinterpret_cast <char *>(dst);
235- const int warp = threadIdx .x / UCS_DEVICE_NUM_THREADS_IN_WARP;
236- const int num_warps = blockDim .x / UCS_DEVICE_NUM_THREADS_IN_WARP;
237- const int idx = threadIdx .x % UCS_DEVICE_NUM_THREADS_IN_WARP;
236+ using vec4 = int4 ;
237+ using vec2 = int2 ;
238+ auto s1 = reinterpret_cast <const char *>(src);
239+ auto d1 = reinterpret_cast <char *>(dst);
240+ const int warp = threadIdx .x / UCS_DEVICE_NUM_THREADS_IN_WARP;
241+ const int num_warps = blockDim .x / UCS_DEVICE_NUM_THREADS_IN_WARP;
242+ const int idx = threadIdx .x % UCS_DEVICE_NUM_THREADS_IN_WARP;
243+ constexpr unsigned lanes_unroll = UCS_DEVICE_NUM_THREADS_IN_WARP *
244+ UCT_CUDA_IPC_COPY_LOOP_UNROLL;
238245 size_t num_lines;
239246
240247 if (UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )s1, sizeof (vec4)) &&
241248 UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )d1, sizeof (vec4))) {
242- const vec4 * s4 = reinterpret_cast <const vec4*>(s1);
243- vec4 * d4 = reinterpret_cast <vec4*>(d1);
249+ auto s4 = reinterpret_cast <const vec4*>(s1);
250+ auto d4 = reinterpret_cast <vec4*>(d1);
244251 vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
245252
246- num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof (vec4))) *
247- (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
253+ num_lines = (len / (lanes_unroll * sizeof (vec4))) * lanes_unroll;
248254
249- for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
250- line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL ) {
255+ for (size_t line = warp * lanes_unroll + idx; line < num_lines;
256+ line += num_warps * lanes_unroll ) {
251257#pragma unroll
252258 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
253- tmp[i] = uct_cuda_ipc_ld_global_cg (s4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
259+ tmp[i] = uct_cuda_ipc_ld_global_cg (
260+ s4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
254261 }
255262
256263#pragma unroll
257264 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
258- uct_cuda_ipc_st_global_cg (d4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
265+ uct_cuda_ipc_st_global_cg (
266+ d4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
259267 }
260268 }
269+
261270 len = len - num_lines * sizeof (vec4);
262271 if (len == 0 ) {
263272 return ;
@@ -283,23 +292,24 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
283292 /* If not 16B-aligned, try 8B-aligned fast path using vec2 */
284293 if (UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )s1, sizeof (vec2)) &&
285294 UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )d1, sizeof (vec2))) {
286- const vec2 * s2 = reinterpret_cast <const vec2*>(s1);
287- vec2 * d2 = reinterpret_cast <vec2*>(d1);
295+ auto s2 = reinterpret_cast <const vec2*>(s1);
296+ auto d2 = reinterpret_cast <vec2*>(d1);
288297 vec2 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
289298
290- num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof (vec2))) *
291- (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
299+ num_lines = (len / (lanes_unroll * sizeof (vec2))) * lanes_unroll;
292300
293- for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
294- line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL ) {
301+ for (size_t line = warp * lanes_unroll + idx; line < num_lines;
302+ line += num_warps * lanes_unroll ) {
295303#pragma unroll
296304 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
297- tmp[i] = uct_cuda_ipc_ld_global_cg (s2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
305+ tmp[i] = uct_cuda_ipc_ld_global_cg (
306+ s2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
298307 }
299308
300309#pragma unroll
301310 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
302- uct_cuda_ipc_st_global_cg (d2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
311+ uct_cuda_ipc_st_global_cg (
312+ d2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
303313 }
304314 }
305315
@@ -331,7 +341,8 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
331341}
332342
333343template <>
334- void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_GRID>(void *dst, const void *src, size_t len)
344+ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_GRID>(void *dst, const void *src,
345+ size_t len)
335346{/* not implemented */ }
336347
337348template <ucs_device_level_t level = UCS_DEVICE_LEVEL_BLOCK>
0 commit comments