Skip to content

Commit 85b7fc6

Browse files
UCT/CUDA: add loop unroll for warp copy
1 parent 1be106e commit 85b7fc6

File tree

3 files changed

+117
-70
lines changed

3 files changed

+117
-70
lines changed

src/uct/cuda/cuda_ipc/cuda_ipc.cuh

Lines changed: 112 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
#include <cuda/atomic>
1616

1717
#define UCT_CUDA_IPC_IS_ALIGNED_POW2(_n, _p) (!((_n) & ((_p) - 1)))
18-
#define UCT_CUDA_IPC_WARP_SIZE 32
1918
#define UCT_CUDA_IPC_COPY_LOOP_UNROLL 8
2019

2120
UCS_F_DEVICE int4 uct_cuda_ipc_ld_global_cg(const int4* p)
@@ -60,8 +59,8 @@ uct_cuda_ipc_get_lane(unsigned &lane_id, unsigned &num_lanes)
6059
num_lanes = 1;
6160
break;
6261
case UCS_DEVICE_LEVEL_WARP:
63-
lane_id = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
64-
num_lanes = UCT_CUDA_IPC_WARP_SIZE;
62+
lane_id = threadIdx.x % UCS_DEVICE_NUM_THREADS_IN_WARP;
63+
num_lanes = UCS_DEVICE_NUM_THREADS_IN_WARP;
6564
break;
6665
case UCS_DEVICE_LEVEL_BLOCK:
6766
lane_id = threadIdx.x;
@@ -74,19 +73,6 @@ uct_cuda_ipc_get_lane(unsigned &lane_id, unsigned &num_lanes)
7473
}
7574
}
7675

77-
UCS_F_DEVICE void* uct_cuda_ipc_map_remote(const uct_cuda_ipc_device_mem_element_t* elem,
78-
uint64_t remote_address)
79-
{
80-
return reinterpret_cast<void*>((uintptr_t)remote_address + elem->mapped_offset);
81-
}
82-
83-
UCS_F_DEVICE void uct_cuda_ipc_atomic_inc(uint64_t *dst, uint64_t inc_value)
84-
{
85-
cuda::atomic_ref<uint64_t, cuda::thread_scope_system> dst_ref{*dst};
86-
dst_ref.fetch_add(inc_value, cuda::memory_order_relaxed);
87-
cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
88-
}
89-
9076
template<ucs_device_level_t level>
9177
UCS_F_DEVICE void uct_cuda_ipc_level_sync()
9278
{
@@ -107,6 +93,19 @@ UCS_F_DEVICE void uct_cuda_ipc_level_sync()
10793
return;
10894
}
10995

96+
UCS_F_DEVICE void* uct_cuda_ipc_map_remote(const uct_cuda_ipc_device_mem_element_t* elem,
97+
uint64_t remote_address)
98+
{
99+
return reinterpret_cast<void*>((uintptr_t)remote_address + elem->mapped_offset);
100+
}
101+
102+
UCS_F_DEVICE void uct_cuda_ipc_atomic_inc(uint64_t *dst, uint64_t inc_value)
103+
{
104+
cuda::atomic_ref<uint64_t, cuda::thread_scope_system> dst_ref{*dst};
105+
dst_ref.fetch_add(inc_value, cuda::memory_order_relaxed);
106+
cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
107+
}
108+
110109
template<ucs_device_level_t level>
111110
UCS_F_DEVICE void uct_cuda_ipc_copy_level(void *dst, const void *src, size_t len);
112111

@@ -121,50 +120,103 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_WARP>(void *dst, const void *src,
121120
{
122121
using vec4 = int4;
123122
using vec2 = int2;
123+
auto s1 = reinterpret_cast<const char*>(src);
124+
auto d1 = reinterpret_cast<char *>(dst);
124125
unsigned int lane_id, num_lanes;
126+
size_t num_lines;
125127

126128
uct_cuda_ipc_get_lane<UCS_DEVICE_LEVEL_WARP>(lane_id, num_lanes);
127-
auto s1 = reinterpret_cast<const char*>(src);
128-
auto d1 = reinterpret_cast<char *>(dst);
129129

130-
/* 16B-aligned fast path using vec4 */
130+
/* 16B-aligned fast path using vec4 with unrolling */
131131
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec4)) &&
132132
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec4))) {
133133
const vec4 *s4 = reinterpret_cast<const vec4*>(s1);
134-
vec4 *d4 = reinterpret_cast<vec4*>(d1);
135-
size_t n4 = len / sizeof(vec4);
136-
for (size_t i = lane_id; i < n4; i += num_lanes) {
137-
vec4 v = uct_cuda_ipc_ld_global_cg(s4 + i);
138-
uct_cuda_ipc_st_global_cg(d4 + i, v);
134+
vec4 *d4 = reinterpret_cast<vec4*>(d1);
135+
vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
136+
137+
num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
138+
(num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
139+
140+
for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
141+
#pragma unroll
142+
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
143+
tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + num_lanes * i));
144+
}
145+
146+
#pragma unroll
147+
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
148+
uct_cuda_ipc_st_global_cg(d4 + (line + num_lanes * i), tmp[i]);
149+
}
150+
}
151+
152+
len = len - num_lines * sizeof(vec4);
153+
if (len == 0) {
154+
return;
139155
}
140156

141-
len = len - n4 * sizeof(vec4);
157+
s4 = s4 + num_lines;
158+
d4 = d4 + num_lines;
159+
160+
/* leftover vec4 lines without unrolling */
161+
num_lines = len / sizeof(vec4);
162+
for (size_t line = lane_id; line < num_lines; line += num_lanes) {
163+
vec4 v = uct_cuda_ipc_ld_global_cg(s4 + line);
164+
uct_cuda_ipc_st_global_cg(d4 + line, v);
165+
}
166+
167+
len = len - num_lines * sizeof(vec4);
142168
if (len == 0) {
143169
return;
144170
}
145171

146-
s1 = reinterpret_cast<const char*>(s4 + n4);
147-
d1 = reinterpret_cast<char*>(d4 + n4);
172+
s1 = reinterpret_cast<const char*>(s4 + num_lines);
173+
d1 = reinterpret_cast<char*>(d4 + num_lines);
148174
}
149175

150-
/* 8B-aligned fast path using vec2 */
176+
/* 8B-aligned fast path using vec2 with unrolling */
151177
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec2)) &&
152178
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec2))) {
153179
const vec2 *s2 = reinterpret_cast<const vec2*>(s1);
154-
vec2 *d2 = reinterpret_cast<vec2*>(d1);
155-
size_t n2 = len / sizeof(vec2);
156-
for (size_t i = lane_id; i < n2; i += num_lanes) {
157-
vec2 v2 = uct_cuda_ipc_ld_global_cg(s2 + i);
158-
uct_cuda_ipc_st_global_cg(d2 + i, v2);
180+
vec2 *d2 = reinterpret_cast<vec2*>(d1);
181+
vec2 tmp2[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
182+
183+
num_lines = (len / (num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
184+
(num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
185+
186+
for (size_t line = lane_id; line < num_lines; line += num_lanes * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
187+
#pragma unroll
188+
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
189+
tmp2[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + num_lanes * i));
190+
}
191+
192+
#pragma unroll
193+
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
194+
uct_cuda_ipc_st_global_cg(d2 + (line + num_lanes * i), tmp2[i]);
195+
}
159196
}
160197

161-
len = len - n2 * sizeof(vec2);
198+
len = len - num_lines * sizeof(vec2);
162199
if (len == 0) {
163200
return;
164201
}
165202

166-
s1 = reinterpret_cast<const char*>(s2 + n2);
167-
d1 = reinterpret_cast<char*>(d2 + n2);
203+
s2 = s2 + num_lines;
204+
d2 = d2 + num_lines;
205+
206+
/* leftover vec2 lines without unrolling */
207+
num_lines = len / sizeof(vec2);
208+
for (size_t line = lane_id; line < num_lines; line += num_lanes) {
209+
vec2 v2 = uct_cuda_ipc_ld_global_cg(s2 + line);
210+
uct_cuda_ipc_st_global_cg(d2 + line, v2);
211+
}
212+
213+
len = len - num_lines * sizeof(vec2);
214+
if (len == 0) {
215+
return;
216+
}
217+
218+
s1 = reinterpret_cast<const char*>(s2 + num_lines);
219+
d1 = reinterpret_cast<char*>(d2 + num_lines);
168220
}
169221

170222
/* byte tail */
@@ -178,34 +230,32 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
178230
{
179231
using vec4 = int4;
180232
using vec2 = int2;
181-
auto s1 = reinterpret_cast<const char*>(src);
182-
auto d1 = reinterpret_cast<char *>(dst);
183-
const vec4 *s4;
184-
vec4 *d4;
185-
int warp, num_warps, idx;
233+
auto s1 = reinterpret_cast<const char*>(src);
234+
auto d1 = reinterpret_cast<char *>(dst);
235+
const int warp = threadIdx.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
236+
const int num_warps = blockDim.x / UCS_DEVICE_NUM_THREADS_IN_WARP;
237+
const int idx = threadIdx.x % UCS_DEVICE_NUM_THREADS_IN_WARP;
186238
size_t num_lines;
187239

188240
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec4)) &&
189241
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec4))) {
242+
const vec4 *s4 = reinterpret_cast<const vec4*>(s1);
243+
vec4 *d4 = reinterpret_cast<vec4*>(d1);
190244
vec4 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
191-
warp = threadIdx.x / UCT_CUDA_IPC_WARP_SIZE;
192-
num_warps = blockDim.x / UCT_CUDA_IPC_WARP_SIZE;
193-
idx = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
194-
s4 = reinterpret_cast<const vec4*>(s1);
195-
d4 = reinterpret_cast<vec4*>(d1);
196-
num_lines = (len / (UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
197-
(UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
198-
199-
for (size_t line = warp * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
200-
line += num_warps * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
245+
246+
num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec4))) *
247+
(UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
248+
249+
for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
250+
line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
201251
#pragma unroll
202252
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
203-
tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + UCT_CUDA_IPC_WARP_SIZE * i));
253+
tmp[i] = uct_cuda_ipc_ld_global_cg(s4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
204254
}
205255

206256
#pragma unroll
207257
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
208-
uct_cuda_ipc_st_global_cg(d4 + (line + UCT_CUDA_IPC_WARP_SIZE * i), tmp[i]);
258+
uct_cuda_ipc_st_global_cg(d4 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
209259
}
210260
}
211261
len = len - num_lines * sizeof(vec4);
@@ -233,28 +283,23 @@ void uct_cuda_ipc_copy_level<UCS_DEVICE_LEVEL_BLOCK>(void *dst, const void *src,
233283
/* If not 16B-aligned, try 8B-aligned fast path using vec2 */
234284
if (UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)s1, sizeof(vec2)) &&
235285
UCT_CUDA_IPC_IS_ALIGNED_POW2((intptr_t)d1, sizeof(vec2))) {
236-
const vec2 *s2;
237-
vec2 *d2;
238-
vec2 tmp2[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
286+
const vec2 *s2 = reinterpret_cast<const vec2*>(s1);
287+
vec2 *d2 = reinterpret_cast<vec2*>(d1);
288+
vec2 tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
239289

240-
warp = threadIdx.x / UCT_CUDA_IPC_WARP_SIZE;
241-
num_warps = blockDim.x / UCT_CUDA_IPC_WARP_SIZE;
242-
idx = threadIdx.x % UCT_CUDA_IPC_WARP_SIZE;
243-
s2 = reinterpret_cast<const vec2*>(s1);
244-
d2 = reinterpret_cast<vec2*>(d1);
245-
num_lines = (len / (UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
246-
(UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
290+
num_lines = (len / (UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL * sizeof(vec2))) *
291+
(UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL);
247292

248-
for (size_t line = warp * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
249-
line += num_warps * UCT_CUDA_IPC_WARP_SIZE * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
293+
for (size_t line = warp * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL + idx; line < num_lines;
294+
line += num_warps * UCS_DEVICE_NUM_THREADS_IN_WARP * UCT_CUDA_IPC_COPY_LOOP_UNROLL) {
250295
#pragma unroll
251296
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
252-
tmp2[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + UCT_CUDA_IPC_WARP_SIZE * i));
297+
tmp[i] = uct_cuda_ipc_ld_global_cg(s2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
253298
}
254299

255300
#pragma unroll
256301
for (int i = 0; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
257-
uct_cuda_ipc_st_global_cg(d2 + (line + UCT_CUDA_IPC_WARP_SIZE * i), tmp2[i]);
302+
uct_cuda_ipc_st_global_cg(d2 + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
258303
}
259304
}
260305

test/gtest/ucp/test_ucp_device.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ UCS_TEST_P(test_ucp_device, create_fail)
256256
}
257257

258258
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, rc_gda, "rc,rc_gda")
259-
259+
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, cuda_ipc, "rc,cuda_copy,cuda_ipc")
260260

261261
class test_ucp_device_kernel : public test_ucp_device {
262262
public:
@@ -347,7 +347,8 @@ UCS_TEST_P(test_ucp_device_kernel, local_counter)
347347

348348
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device_kernel, rc_gda,
349349
"rc,rc_gda")
350-
350+
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device_kernel, cuda_ipc,
351+
"rc,cuda_copy,cuda_ipc")
351352

352353
class test_ucp_device_xfer : public test_ucp_device_kernel {
353354
public:
@@ -640,3 +641,5 @@ UCS_TEST_P(test_ucp_device_xfer, counter)
640641

641642
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device_xfer, rc_gda,
642643
"rc,rc_gda")
644+
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device_xfer, cuda_ipc,
645+
"rc,cuda_copy,cuda_ipc")

test/gtest/uct/cuda/test_cuda_ipc_device.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ class test_cuda_ipc_rma : public uct_test {
5353
CUdevice m_cuda_dev;
5454
static const uint64_t SEED1 = 0xABClu;
5555
static const uint64_t SEED2 = 0xDEFlu;
56-
static const unsigned WARP_SIZE = 32;
5756
};
5857

5958
UCS_TEST_P(test_cuda_ipc_rma, has_device_ep_capability)

0 commit comments

Comments
 (0)