@@ -93,56 +93,58 @@ UCS_F_DEVICE void uct_cuda_ipc_level_sync()
9393 return ;
9494}
9595
96- template <typename VecT >
96+ template <typename vec_t >
9797UCS_F_DEVICE void uct_cuda_ipc_try_copy_aligned (const char * &src, char * &dst,
9898 size_t &len,
9999 unsigned warp_id,
100100 unsigned num_warps,
101101 unsigned lane_id,
102102 unsigned num_lanes)
103103{
104- if (!(UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )src, sizeof (VecT)) &&
105- UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )dst, sizeof (VecT)))) {
104+ constexpr size_t vec_size = sizeof (vec_t );
105+
106+ if (!(UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )src, vec_size) &&
107+ UCT_CUDA_IPC_IS_ALIGNED_POW2 ((intptr_t )dst, vec_size))) {
106108 return ;
107109 }
108110
109- auto src_vec = reinterpret_cast <const VecT *>(src);
110- auto dst_vec = reinterpret_cast <VecT *>(dst);
111- constexpr unsigned lanes_unroll = UCS_DEVICE_NUM_THREADS_IN_WARP *
111+ auto src_vec = reinterpret_cast <const vec_t *>(src);
112+ auto dst_vec = reinterpret_cast <vec_t *>(dst);
113+ constexpr size_t lanes_unroll = UCS_DEVICE_NUM_THREADS_IN_WARP *
112114 UCT_CUDA_IPC_COPY_LOOP_UNROLL;
113- size_t num_lines = (len / (lanes_unroll * sizeof (VecT) )) *
115+ size_t num_vectors = (len / (lanes_unroll * vec_size )) *
114116 lanes_unroll;
115- VecT tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
116117
117- for (size_t line = warp_id * lanes_unroll + lane_id % UCS_DEVICE_NUM_THREADS_IN_WARP;
118- line < num_lines;
119- line += num_warps * lanes_unroll) {
118+ for (size_t vec = warp_id * lanes_unroll + lane_id % UCS_DEVICE_NUM_THREADS_IN_WARP;
119+ vec < num_vectors;
120+ vec += num_warps * lanes_unroll) {
121+ vec_t tmp[UCT_CUDA_IPC_COPY_LOOP_UNROLL];
120122#pragma unroll
121123 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
122124 tmp[i] = uct_cuda_ipc_ld_global_cg (
123- src_vec + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
125+ src_vec + (vec + UCS_DEVICE_NUM_THREADS_IN_WARP * i));
124126 }
125127
126128#pragma unroll
127129 for (int i = 0 ; i < UCT_CUDA_IPC_COPY_LOOP_UNROLL; i++) {
128130 uct_cuda_ipc_st_global_cg (
129- dst_vec + (line + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
131+ dst_vec + (vec + UCS_DEVICE_NUM_THREADS_IN_WARP * i), tmp[i]);
130132 }
131133 }
132134
133- src_vec += num_lines ;
134- dst_vec += num_lines ;
135- len = len - num_lines * sizeof (VecT) ;
135+ src_vec += num_vectors ;
136+ dst_vec += num_vectors ;
137+ len = len - num_vectors * vec_size ;
136138
137- num_lines = len / sizeof (VecT) ;
138- for (size_t line = lane_id; line < num_lines; line += num_lanes) {
139- VecT v = uct_cuda_ipc_ld_global_cg (src_vec + line );
140- uct_cuda_ipc_st_global_cg (dst_vec + line , v);
139+ num_vectors = len / vec_size ;
140+ for (size_t vec = lane_id; vec < num_vectors; vec += num_lanes) {
141+ vec_t v = uct_cuda_ipc_ld_global_cg (src_vec + vec );
142+ uct_cuda_ipc_st_global_cg (dst_vec + vec , v);
141143 }
142144
143- len -= num_lines * sizeof (VecT) ;
144- src = reinterpret_cast <const char *>(src_vec + num_lines );
145- dst = reinterpret_cast <char *>(dst_vec + num_lines );
145+ len -= num_vectors * vec_size ;
146+ src = reinterpret_cast <const char *>(src_vec + num_vectors );
147+ dst = reinterpret_cast <char *>(dst_vec + num_vectors );
146148}
147149
148150UCS_F_DEVICE void *
@@ -165,11 +167,12 @@ UCS_F_DEVICE void uct_cuda_ipc_copy_level(void *dst, const void *src, size_t len
165167{
166168 auto s1 = reinterpret_cast <const char *>(src);
167169 auto d1 = reinterpret_cast <char *>(dst);
168- unsigned int lane_id, num_lanes, warp_id, num_warps ;
170+ unsigned int lane_id, num_lanes;
169171
170172 uct_cuda_ipc_get_lane<level>(lane_id, num_lanes);
171- warp_id = lane_id / UCS_DEVICE_NUM_THREADS_IN_WARP;
172- num_warps = num_lanes / UCS_DEVICE_NUM_THREADS_IN_WARP;
173+
174+ const unsigned warp_id = lane_id / UCS_DEVICE_NUM_THREADS_IN_WARP;
175+ const unsigned num_warps = num_lanes / UCS_DEVICE_NUM_THREADS_IN_WARP;
173176
174177 uct_cuda_ipc_try_copy_aligned<int4 >(s1, d1, len, warp_id, num_warps,
175178 lane_id, num_lanes);
0 commit comments