 #include <cuda/atomic>
 
 #define UCT_CUDA_IPC_IS_ALIGNED_POW2(_n, _p) (!((_n) & ((_p) - 1)))
-#define UCT_CUDA_IPC_WARP_SIZE 32
 #define UCT_CUDA_IPC_COPY_LOOP_UNROLL 8
 
 UCS_F_DEVICE int4 uct_cuda_ipc_ld_global_cg(const int4 *p)
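
The alignment check above uses the standard power-of-two bitmask trick: for `_p == 2^k`, `(_n) & ((_p) - 1)` isolates the low `k` bits of `_n`, which are all zero exactly when `_n` is a multiple of `_p`. A minimal host-side sketch (names are illustrative, not part of the patch):

```cuda
#include <cstdint>
#include <cassert>

/* Same test as UCT_CUDA_IPC_IS_ALIGNED_POW2: valid only when _p is a
 * power of two, so that (_p - 1) is a contiguous low-bit mask. */
#define IS_ALIGNED_POW2(_n, _p) (!((_n) & ((_p) - 1)))

int main()
{
    assert(IS_ALIGNED_POW2((uintptr_t)0x1000, 16));  /* 16B-aligned     */
    assert(!IS_ALIGNED_POW2((uintptr_t)0x1008, 16)); /* only 8B-aligned */
    assert(IS_ALIGNED_POW2((uintptr_t)0x1008, 8));
    return 0;
}
```
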
@@ -60,8 +59,8 @@ uct_cuda_ipc_get_lane(unsigned &lane_id, unsigned &num_lanes)
         num_lanes = 1;
         break;
     case UCS_DEVICE_LEVEL_WARP:
-        lane_id   = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
-        num_lanes = UCT_CUDA_IPC_WARP_SIZE;
+        lane_id   = threadIdx.x % UCS_DEVICE_NUM_THREADS_IN_WARP;
+        num_lanes = UCS_DEVICE_NUM_THREADS_IN_WARP;
         break;
     case UCS_DEVICE_LEVEL_BLOCK:
         lane_id   = threadIdx.x;
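
This hunk replaces the transport-local `UCT_CUDA_IPC_WARP_SIZE` with the shared `UCS_DEVICE_NUM_THREADS_IN_WARP` constant from the UCS device headers. The `(lane_id, num_lanes)` pair it produces drives lane-strided loops of the shape sketched below (a simplified illustration, not the UCX code itself):

```cuda
/* Illustrative only: the copy loops in this file consume (lane_id,
 * num_lanes) as a lane-strided iteration space. At WARP level, lane_id is
 * threadIdx.x % UCS_DEVICE_NUM_THREADS_IN_WARP and num_lanes is the warp
 * width, so consecutive lanes touch consecutive elements (coalesced). */
__device__ void lane_strided_copy(int *dst, const int *src, size_t n,
                                  unsigned lane_id, unsigned num_lanes)
{
    for (size_t i = lane_id; i < n; i += num_lanes) {
        dst[i] = src[i];
    }
}
```
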
@@ -74,19 +73,6 @@ uct_cuda_ipc_get_lane(unsigned &lane_id, unsigned &num_lanes)
     }
 }
 
-UCS_F_DEVICE void *uct_cuda_ipc_map_remote(const uct_cuda_ipc_device_mem_element_t *elem,
-                                           uint64_t remote_address)
-{
-    return reinterpret_cast<void*>((uintptr_t)remote_address + elem->mapped_offset);
-}
-
-UCS_F_DEVICE void uct_cuda_ipc_atomic_inc(uint64_t *dst, uint64_t inc_value)
-{
-    cuda::atomic_ref<uint64_t, cuda::thread_scope_system> dst_ref{*dst};
-    dst_ref.fetch_add(inc_value, cuda::memory_order_relaxed);
-    cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
-}
-
 template<ucs_device_level_t level>
 UCS_F_DEVICE void uct_cuda_ipc_level_sync()
 {
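
`uct_cuda_ipc_map_remote()` and `uct_cuda_ipc_atomic_inc()` are removed here and re-added verbatim after `uct_cuda_ipc_level_sync()` in the next hunk, so this is a pure move. For reference, the address translation the first function performs, sketched with an illustrative struct (the real `uct_cuda_ipc_device_mem_element_t` layout is defined elsewhere in the transport):

```cuda
#include <cstdint>

/* Illustrative stand-in for uct_cuda_ipc_device_mem_element_t: the only
 * field used here is the offset between the exporter's virtual address
 * and the local address of the CUDA IPC mapping. */
struct mem_element {
    uint64_t mapped_offset;
};

/* Remote VA -> local pointer: the region is mapped contiguously, so the
 * translation is a single addition (wrap-around arithmetic handles a
 * mapping below the remote base as well). */
static inline void *map_remote(const mem_element *elem, uint64_t remote_address)
{
    return reinterpret_cast<void*>((uintptr_t)remote_address +
                                   elem->mapped_offset);
}
```
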
@@ -107,6 +93,19 @@ UCS_F_DEVICE void uct_cuda_ipc_level_sync()
     return;
 }
 
+UCS_F_DEVICE void *uct_cuda_ipc_map_remote(const uct_cuda_ipc_device_mem_element_t *elem,
+                                           uint64_t remote_address)
+{
+    return reinterpret_cast<void*>((uintptr_t)remote_address + elem->mapped_offset);
+}
+
+UCS_F_DEVICE void uct_cuda_ipc_atomic_inc(uint64_t *dst, uint64_t inc_value)
+{
+    cuda::atomic_ref<uint64_t, cuda::thread_scope_system> dst_ref{*dst};
+    dst_ref.fetch_add(inc_value, cuda::memory_order_relaxed);
+    cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
+}
+
 template<ucs_device_level_t level>
 UCS_F_DEVICE void uct_cuda_ipc_copy_level(void *dst, const void *src, size_t len);
 
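
`uct_cuda_ipc_atomic_inc()` pairs a relaxed system-scope `fetch_add` with a system-scope release fence, so a peer process polling the counter through the IPC mapping can pair it with an acquire load. A hedged sketch of such a consumer (an assumed helper, not part of this patch):

```cuda
#include <cuda/atomic>
#include <cstdint>

/* Hypothetical consumer side: spin until the shared counter reaches a
 * target value. thread_scope_system makes the load meaningful across
 * processes that share the allocation (e.g. via cudaIpcOpenMemHandle).
 * The producer side is uct_cuda_ipc_atomic_inc() above. */
__device__ void wait_counter_at_least(uint64_t *counter, uint64_t target)
{
    cuda::atomic_ref<uint64_t, cuda::thread_scope_system> ref{*counter};
    while (ref.load(cuda::memory_order_acquire) < target) {
        /* busy-wait until the increment becomes visible */
    }
}
```
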
@@ -121,50 +120,103 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
 {
     using vec4 = int4;
     using vec2 = int2;
+    auto s1 = reinterpret_cast<const char*>(src);
+    auto d1 = reinterpret_cast<char*>(dst);
     unsigned int lane_id, num_lanes;
+    size_t num_lines;
 
     uct_cuda_ipc_get_lane<UCS_DEVICE_LEVEL_WARP>(lane_id, num_lanes);
-    auto s1 = reinterpret_cast<const char*>(src);
-    auto d1 = reinterpret_cast<char*>(dst);
 
-    /* 16B-aligned fast path using vec4 */
+    /* 16B-aligned fast path using vec4 with unrolling */
     if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec4)) &&
         UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec4))) {
         const vec4 *s4 = reinterpret_cast<const vec4*>(s1);
-        vec4 *d4 = reinterpret_cast<vec4*>(d1);
-        size_t n4 = len / sizeof(vec4);
-        for (size_t i = lane_id; i < n4; i += num_lanes) {
-            vec4 v = uct_cuda_ipc_ld_global_cg(s4 + i);
-            uct_cuda_ipc_st_global_cg(d4 + i, v);
+        vec4 *d4 = reinterpret_cast<vec4*>(d1);
+        vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
+
+        num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
+                    (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
+
+        for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
+#pragma unroll
+            for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
+                tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + num_lanes * i));
+            }
+
+#pragma unroll
+            for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
+                uct_cuda_ipc_st_global_cg(d4 + (line + num_lanes * i), tmp[i]);
+            }
+        }
+
+        len = len - num_lines * sizeof(vec4);
+        if (len == 0) {
+            return;
         }
 
-        len = len - n4 * sizeof(vec4);
+        s4 = s4 + num_lines;
+        d4 = d4 + num_lines;
+
+        /* leftover vec4 lines without unrolling */
+        num_lines = len / sizeof(vec4);
+        for (size_t line = lane_id; line < num_lines; line += num_lanes) {
+            vec4 v = uct_cuda_ipc_ld_global_cg(s4 + line);
+            uct_cuda_ipc_st_global_cg(d4 + line, v);
+        }
+
+        len = len - num_lines * sizeof(vec4);
         if (len == 0) {
             return;
         }
 
-        s1 = reinterpret_cast<const char*>(s4 + n4);
-        d1 = reinterpret_cast<char*>(d4 + n4);
+        s1 = reinterpret_cast<const char*>(s4 + num_lines);
+        d1 = reinterpret_cast<char*>(d4 + num_lines);
     }
 
-    /* 8B-aligned fast path using vec2 */
+    /* 8B-aligned fast path using vec2 with unrolling */
     if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec2)) &&
         UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec2))) {
         const vec2 *s2 = reinterpret_cast<const vec2*>(s1);
-        vec2 *d2 = reinterpret_cast<vec2*>(d1);
-        size_t n2 = len / sizeof(vec2);
-        for (size_t i = lane_id; i < n2; i += num_lanes) {
-            vec2 v2 = uct_cuda_ipc_ld_global_cg(s2 + i);
-            uct_cuda_ipc_st_global_cg(d2 + i, v2);
+        vec2 *d2 = reinterpret_cast<vec2*>(d1);
+        vec2 tmp2[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
+
+        num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
+                    (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
+
+        for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
+#pragma unroll
+            for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
+                tmp2[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + num_lanes * i));
+            }
+
+#pragma unroll
+            for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
+                uct_cuda_ipc_st_global_cg(d2 + (line + num_lanes * i), tmp2[i]);
+            }
         }
 
-        len = len - n2 * sizeof(vec2);
+        len = len - num_lines * sizeof(vec2);
         if (len == 0) {
             return;
         }
 
-        s1 = reinterpret_cast<const char*>(s2 + n2);
-        d1 = reinterpret_cast<char*>(d2 + n2);
+        s2 = s2 + num_lines;
+        d2 = d2 + num_lines;
+
+        /* leftover vec2 lines without unrolling */
+        num_lines = len / sizeof(vec2);
+        for (size_t line = lane_id; line < num_lines; line += num_lanes) {
+            vec2 v2 = uct_cuda_ipc_ld_global_cg(s2 + line);
+            uct_cuda_ipc_st_global_cg(d2 + line, v2);
+        }
+
+        len = len - num_lines * sizeof(vec2);
+        if (len == 0) {
+            return;
+        }
+
+        s1 = reinterpret_cast<const char*>(s2 + num_lines);
+        d1 = reinterpret_cast<char*>(d2 + num_lines);
     }
 
     /* byte tail */
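
Worked example of the new remainder arithmetic: with `num_lanes = 32`, `UCT_CUDA_IPC_COPY_LOOP_UNROLL = 8` and 16-byte `vec4` lines, one unrolled pass covers 32 × 8 × 16 B = 4 KiB, so `num_lines` rounds `len` down to a multiple of 256 lines. For `len = 10000`, the unrolled loop handles 512 lines (8192 B), the plain loop handles the 113 leftover lines (1808 B), and no byte tail remains. The staging pattern itself, reduced to a standalone kernel (assumed names, single-warp launch):

```cuda
constexpr int UNROLL = 8; /* mirrors UCT_CUDA_IPC_COPY_LOOP_UNROLL */

/* Sketch of the staged copy: each lane buffers UNROLL independent 16B
 * loads in registers before issuing any store, so UNROLL loads are in
 * flight at once. Launch with one warp, e.g.
 * warp_copy_vec4<<<1, 32>>>(dst, src, n_lines). */
__global__ void warp_copy_vec4(int4 *dst, const int4 *src, size_t n_lines)
{
    const unsigned lane      = threadIdx.x % warpSize;
    const unsigned num_lanes = warpSize;
    const size_t   unrolled  = (n_lines / (num_lanes * UNROLL)) *
                               (num_lanes * UNROLL);
    int4 tmp[UNROLL];

    for (size_t line = lane; line < unrolled; line += num_lanes * UNROLL) {
#pragma unroll
        for (int i = 0; i < UNROLL; i++) {
            tmp[i] = src[line + num_lanes * i];
        }
#pragma unroll
        for (int i = 0; i < UNROLL; i++) {
            dst[line + num_lanes * i] = tmp[i];
        }
    }

    /* leftover lines, one per lane per iteration */
    for (size_t line = unrolled + lane; line < n_lines; line += num_lanes) {
        dst[line] = src[line];
    }
}
```
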
@@ -178,34 +230,32 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
 {
     using vec4 = int4;
     using vec2 = int2;
-    auto s1 = reinterpret_cast<const char*>(src);
-    auto d1 = reinterpret_cast<char*>(dst);
-    const vec4 *s4;
-    vec4 *d4;
-    int warp, num_warps, idx;
+    auto s1 = reinterpret_cast<const char*>(src);
+    auto d1 = reinterpret_cast<char*>(dst);
+    const int warp      = threadIdx.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
+    const int num_warps = blockDim.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
+    const int idx       = threadIdx.x % UCS_DEVICE_NUM_THREADS_IN_WARP;
     size_t num_lines;
 
     if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec4)) &&
         UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec4))) {
+        const vec4 *s4 = reinterpret_cast<const vec4*>(s1);
+        vec4 *d4 = reinterpret_cast<vec4*>(d1);
         vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
-        warp      = threadIdx.x / UCT_CUDA_IPC_WARP_SIZE;
-        num_warps = blockDim.x / UCT_CUDA_IPC_WARP_SIZE;
-        idx       = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
-        s4 = reinterpret_cast<const vec4*>(s1);
-        d4 = reinterpret_cast<vec4*>(d1);
-        num_lines = (len / (UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
-                    (UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
-
-        for (size_t line = warp * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
-             line += num_warps * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
+
+        num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
+                    (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
+
+        for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
+             line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
 #pragma unroll
             for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
-                tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + UCT_CUDA_IPC_WARP_SIZE * i));
+                tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
             }
 
 #pragma unroll
             for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
-                uct_cuda_ipc_st_global_cg(d4 + (line + UCT_CUDA_IPC_WARP_SIZE * i), tmp[i]);
+                uct_cuda_ipc_st_global_cg(d4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
             }
         }
         len = len - num_lines * sizeof(vec4);
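
The block-level indexing differs from the warp-level path: here each warp owns a contiguous chunk of `UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL` lines (4 KiB of `vec4`), warps stride over chunks, and `idx` keeps the lanes of a warp coalesced within their chunk. A host-side dry run of the arithmetic (assumed constants, not UCX code):

```cuda
#include <cstdio>
#include <cstddef>

/* Dry run of the block-level vec4 partitioning: warp w handles lines
 * [w*chunk, (w+1)*chunk), then jumps by num_warps*chunk, where
 * chunk = 32 lanes * 8-way unroll = 256 lines = 4 KiB. */
int main()
{
    const size_t warp_size = 32, unroll = 8, line_sz = 16; /* sizeof(int4) */
    const size_t chunk     = warp_size * unroll;           /* lines        */
    const size_t len       = 100000;                       /* bytes        */
    const size_t num_lines = (len / (chunk * line_sz)) * chunk;

    /* prints: unrolled: 6144 lines (98304 bytes), remainder: 1696 bytes */
    printf("unrolled: %zu lines (%zu bytes), remainder: %zu bytes\n",
           num_lines, num_lines * line_sz, len - num_lines * line_sz);
    return 0;
}
```
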
@@ -233,28 +283,23 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
     /* If not 16B-aligned, try 8B-aligned fast path using vec2 */
     if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec2)) &&
         UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec2))) {
-        const vec2 *s2;
-        vec2 *d2;
-        vec2 tmp2[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
+        const vec2 *s2 = reinterpret_cast<const vec2*>(s1);
+        vec2 *d2 = reinterpret_cast<vec2*>(d1);
+        vec2 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
 
-        warp      = threadIdx.x / UCT_CUDA_IPC_WARP_SIZE;
-        num_warps = blockDim.x / UCT_CUDA_IPC_WARP_SIZE;
-        idx       = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
-        s2 = reinterpret_cast<const vec2*>(s1);
-        d2 = reinterpret_cast<vec2*>(d1);
-        num_lines = (len / (UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
-                    (UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
+        num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
+                    (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
 
-        for (size_t line = warp * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
-             line += num_warps * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
+        for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
+             line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
 #pragma unroll
             for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
-                tmp2[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + UCT_CUDA_IPC_WARP_SIZE * i));
+                tmp[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
             }
 
 #pragma unroll
             for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
-                uct_cuda_ipc_st_global_cg(d2 + (line + UCT_CUDA_IPC_WARP_SIZE * i), tmp2[i]);
+                uct_cuda_ipc_st_global_cg(d2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
             }
         }
 
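
For context, a hedged sketch of how a kernel might drive the block-level path end to end (illustrative launch, not part of this patch):

```cuda
/* Illustrative kernel: the whole thread block cooperates on one copy,
 * then synchronizes at block level. The launch shape is an assumption;
 * blockDim.x should be a multiple of UCS_DEVICE_NUM_THREADS_IN_WARP,
 * since num_warps is computed as
 * blockDim.x / UCS_DEVICE_NUM_THREADS_IN_WARP. */
__global__ void block_copy_kernel(void *dst, const void *src, size_t len)
{
    uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(dst, src, len);
    uct_cuda_ipc_level_sync<UCS_DEVICE_LEVEL_BLOCK>();
}

/* e.g.: block_copy_kernel<<<1, 256>>>(mapped_dst, local_src, len); */
```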