Skip to content

Commit 822e820

Browse files
authored
Merge pull request #2310 from bureddy/delay-stream-create-v1.3
UCT/CUDA: create cuda streams on first use - v1.3.x
2 parents 3aef6b6 + 94a632c commit 822e820

File tree

3 files changed

+38
-22
lines changed

3 files changed

+38
-22
lines changed

src/uct/cuda/cuda_copy/cuda_copy_ep.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,13 @@ uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src, size_t
5555
return UCS_ERR_NO_MEMORY;
5656
}
5757

58-
status = CUDA_FUNC(cudaMemcpyAsync(dst, src, length, direction, stream));
58+
status = UCT_CUDA_FUNC(cudaMemcpyAsync(dst, src, length, direction, stream));
5959
if (UCS_OK != status) {
60-
ucs_error("cudaMemcpyAsync Failed ");
6160
return UCS_ERR_IO_ERROR;
6261
}
6362

64-
status = CUDA_FUNC(cudaEventRecord(cuda_event->event, stream));
63+
status = UCT_CUDA_FUNC(cudaEventRecord(cuda_event->event, stream));
6564
if (UCS_OK != status) {
66-
ucs_error("cudaEventRecord Failed ");
6765
return UCS_ERR_IO_ERROR;
6866
}
6967
ucs_queue_push(outstanding_queue, &cuda_event->queue);
@@ -81,6 +79,14 @@ ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si
8179
uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t);
8280
ucs_status_t status;
8381

82+
if (iface->stream_d2h == 0) {
83+
status = UCT_CUDA_FUNC(cudaStreamCreateWithFlags(&iface->stream_d2h,
84+
cudaStreamNonBlocking));
85+
if (UCS_OK != status) {
86+
return UCS_ERR_IO_ERROR;
87+
}
88+
}
89+
8490
status = uct_cuda_copy_post_cuda_async_copy(tl_ep, iov[0].buffer, (void *)remote_addr,
8591
iov[0].length, cudaMemcpyDeviceToHost,
8692
iface->stream_d2h,
@@ -101,6 +107,14 @@ ucs_status_t uct_cuda_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si
101107
uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t);
102108
ucs_status_t status;
103109

110+
if (iface->stream_h2d == 0) {
111+
status = UCT_CUDA_FUNC(cudaStreamCreateWithFlags(&iface->stream_h2d,
112+
cudaStreamNonBlocking));
113+
if (UCS_OK != status) {
114+
return UCS_ERR_IO_ERROR;
115+
}
116+
}
117+
104118
status = uct_cuda_copy_post_cuda_async_copy(tl_ep, (void *)remote_addr, iov[0].buffer,
105119
iov[0].length, cudaMemcpyHostToDevice,
106120
iface->stream_h2d,

src/uct/cuda/cuda_copy/cuda_copy_iface.c

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,8 @@ static void uct_cuda_copy_event_desc_init(ucs_mpool_t *mp, void *obj, void *chun
177177
ucs_status_t status;
178178

179179
memset(base, 0 , sizeof(*base));
180-
status = CUDA_FUNC(cudaEventCreateWithFlags(&(base->event), cudaEventDisableTiming));
180+
status = UCT_CUDA_FUNC(cudaEventCreateWithFlags(&(base->event),
181+
cudaEventDisableTiming));
181182
if (UCS_OK != status) {
182183
ucs_error("cudaEventCreateWithFlags Failed");
183184
}
@@ -188,7 +189,7 @@ static void uct_cuda_copy_event_desc_cleanup(ucs_mpool_t *mp, void *obj)
188189
uct_cuda_copy_event_desc_t *base = (uct_cuda_copy_event_desc_t *) obj;
189190
ucs_status_t status;
190191

191-
status = CUDA_FUNC(cudaEventDestroy(base->event));
192+
status = UCT_CUDA_FUNC(cudaEventDestroy(base->event));
192193
if (UCS_OK != status) {
193194
ucs_error("cudaEventDestroy Failed");
194195
}
@@ -215,7 +216,7 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work
215216

216217
if (strncmp(params->mode.device.dev_name,
217218
UCT_CUDA_DEV_NAME, strlen(UCT_CUDA_DEV_NAME)) != 0) {
218-
ucs_error("No device was found: %s", params->mode.device.dev_name);
219+
ucs_error("no device was found: %s", params->mode.device.dev_name);
219220
return UCS_ERR_NO_DEVICE;
220221
}
221222

@@ -232,20 +233,12 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work
232233
"CUDA EVENT objects");
233234

234235
if (UCS_OK != status) {
235-
ucs_error("Mpool creation failed");
236+
ucs_error("mpool creation failed");
236237
return UCS_ERR_IO_ERROR;
237238
}
238239

239-
status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_d2h, cudaStreamNonBlocking));
240-
if (UCS_OK != status) {
241-
ucs_error("cudaStreamCreateWithFlags creation failed");
242-
return UCS_ERR_IO_ERROR;
243-
}
244-
status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_h2d, cudaStreamNonBlocking));
245-
if (UCS_OK != status) {
246-
ucs_error("cudaStreamCreateWithFlags creation failed");
247-
return UCS_ERR_IO_ERROR;
248-
}
240+
self->stream_d2h = 0;
241+
self->stream_h2d = 0;
249242

250243
ucs_queue_head_init(&self->outstanding_d2h_cuda_event_q);
251244
ucs_queue_head_init(&self->outstanding_h2d_cuda_event_q);
@@ -257,6 +250,14 @@ static UCS_CLASS_CLEANUP_FUNC(uct_cuda_copy_iface_t)
257250
{
258251
uct_base_iface_progress_disable(&self->super.super,
259252
UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
253+
if (self->stream_h2d != 0) {
254+
UCT_CUDA_FUNC(cudaStreamDestroy(self->stream_h2d));
255+
}
256+
257+
if (self->stream_d2h != 0) {
258+
UCT_CUDA_FUNC(cudaStreamDestroy(self->stream_d2h));
259+
}
260+
260261
ucs_mpool_cleanup(&self->cuda_event_desc, 1);
261262
}
262263

src/uct/cuda/cuda_copy/cuda_copy_iface.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,21 @@
88

99
#include <uct/base/uct_iface.h>
1010
#include <ucs/arch/cpu.h>
11+
#include <ucs/sys/preprocessor.h>
1112
#include <cuda_runtime.h>
1213
#include <cuda.h>
1314

1415

1516
#define UCT_CUDA_COPY_TL_NAME "cuda_copy"
1617
#define UCT_CUDA_DEV_NAME "cudacopy0"
1718

18-
#define CUDA_FUNC(func) ({ \
19+
#define UCT_CUDA_FUNC(_func) ({ \
1920
ucs_status_t _status = UCS_OK; \
2021
do { \
21-
cudaError_t _result = (func); \
22+
cudaError_t _result = (_func); \
2223
if (cudaSuccess != _result) { \
23-
ucs_error("[%s:%d] cuda failed with %d \n", \
24-
__FILE__, __LINE__,_result); \
24+
ucs_error("%s failed with %d \n", \
25+
UCS_PP_MAKE_STRING(_func), _result); \
2526
_status = UCS_ERR_IO_ERROR; \
2627
} \
2728
} while (0); \

0 commit comments

Comments
 (0)