diff --git a/doc/mpich/tuning_parameters.md b/doc/mpich/tuning_parameters.md index 8d1e0aff474..b581a6297a8 100644 --- a/doc/mpich/tuning_parameters.md +++ b/doc/mpich/tuning_parameters.md @@ -1096,27 +1096,22 @@ GPU pipeline uses host buffer and pipelining technique to send internode messages instead of GPU RDMA. To enable this mode, use the following two CVARs: -* `MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE`: This CVAR enables GPU pipeline -for inter-node pt2pt messages -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD`: The threshold to start using -GPU pipelining. Default is 1MB. +* `MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD`: This CVAR enables the RNDV +(rendezvous) path for large messages above the threshold. The recommended value +is 1MB. -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ`: Specifies the chunk size -(in bytes) for GPU pipeline data transfer. +* `MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline`: Forces the RNDV algorithm to +use pipelining. The default is "auto", which selects the best algorithm +based on message attributes. Other protocols include "read", which uses RDMA +read, and "direct", which relies on the underlying network library implementation. -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK`: Specifies the +* `MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ`: Specifies the chunk size +(in bytes) for pipeline data transfer. + +* `MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS`: Specifies the number of buffers for GPU pipeline data transfer in each block/chunk of the pool. -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS`: Specifies the maximum -total number of buffers MPICH buffer pool can allocate. - -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE`: Specify engine type -for copying from device to host (sender side), default 0 - -* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE`: Specify engine type -for copying from host to device (receiver side), default 0 - To enable GPU Direct RDMA support for pt2pt communication, use the following CVARs: * `MPIR_CVAR_CH4_OFI_ENABLE_HMEM`: This CVAR with a value of `1` enables diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h index ebe8b779464..09f9079d934 100644 --- a/src/include/mpiimpl.h +++ b/src/include/mpiimpl.h @@ -155,7 +155,6 @@ typedef struct MPIR_Stream MPIR_Stream; /******************* PART 3: DEVICE INDEPENDENT HEADERS **********************/ /*****************************************************************************/ -#include "mpir_misc.h" #include "mpir_dbg.h" #include "mpir_objects.h" #include "mpir_strerror.h" @@ -166,6 +165,7 @@ typedef struct MPIR_Stream MPIR_Stream; #include "mpir_mem.h" #include "mpir_info.h" #include "mpir_errcodes.h" +#include "mpir_misc.h" #include "mpir_errhandler.h" #include "mpir_attr_generic.h" #include "mpir_contextid.h" diff --git a/src/include/mpir_misc.h b/src/include/mpir_misc.h index 4937cac5342..6743412eff4 100644 --- a/src/include/mpir_misc.h +++ b/src/include/mpir_misc.h @@ -74,6 +74,26 @@ typedef struct { MPIR_request_type_t type; } MPIR_gpu_req; +MPL_STATIC_INLINE_PREFIX void MPIR_async_test(MPIR_gpu_req * areq, int *is_done) +{ + int err; + switch (areq->type) { + case MPIR_NULL_REQUEST: + /* a dummy, immediately complete */ + *is_done = 1; + break; + case MPIR_TYPEREP_REQUEST: + MPIR_Typerep_test(areq->u.y_req, is_done); + break; + case MPIR_GPU_REQUEST: + err = MPL_gpu_test(&areq->u.gpu_req, is_done); + MPIR_Assertp(err == MPL_SUCCESS); + break; + default: + MPIR_Assert(0); + } +} + int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype, void *recvbuf, 
MPI_Aint recvcount, MPI_Datatype recvtype); int MPIR_Ilocalcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype, diff --git a/src/include/mpir_typerep.h b/src/include/mpir_typerep.h index e6758be3aa5..7f544fd44e8 100644 --- a/src/include/mpir_typerep.h +++ b/src/include/mpir_typerep.h @@ -78,8 +78,6 @@ int MPIR_Typerep_ipack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatyp int MPIR_Typerep_iunpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_Aint outcount, MPI_Datatype datatype, MPI_Aint outoffset, MPI_Aint * actual_unpack_bytes, MPIR_Typerep_req * typerep_req, uint32_t flags); -int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req); -int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed); int MPIR_Typerep_size_external32(MPI_Datatype type); int MPIR_Typerep_pack_external(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype, diff --git a/src/mpi/datatype/typerep/src/typerep_pre.h b/src/mpi/datatype/typerep/src/typerep_pre.h index 022510fbe2c..347bed20a41 100644 --- a/src/mpi/datatype/typerep/src/typerep_pre.h +++ b/src/mpi/datatype/typerep/src/typerep_pre.h @@ -28,4 +28,7 @@ typedef struct { #define MPIR_TYPEREP_HANDLE_NULL NULL #endif +int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req); +int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed); + #endif /* TYPEREP_PRE_H_INCLUDED */ diff --git a/src/mpi/misc/utils.c b/src/mpi/misc/utils.c index bff9a3f9b7b..5b2334f11a2 100644 --- a/src/mpi/misc/utils.c +++ b/src/mpi/misc/utils.c @@ -500,7 +500,12 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se do_localcopy(sendbuf, sendcount, sendtype, sendoffset, recvbuf, recvcount, recvtype, recvoffset, LOCALCOPY_NONBLOCKING, &req->u.y_req); MPIR_ERR_CHECK(mpi_errno); - req->type = MPIR_TYPEREP_REQUEST; + + if (req->u.y_req.req == MPIR_TYPEREP_REQ_NULL) { + req->type = MPIR_NULL_REQUEST; + } else { + req->type = MPIR_TYPEREP_REQUEST; + } #endif fn_exit: diff --git a/src/mpid/ch4/ch4_api.txt b/src/mpid/ch4/ch4_api.txt index a7c8bdaeb99..631144bd573 100644 --- a/src/mpid/ch4/ch4_api.txt +++ b/src/mpid/ch4/ch4_api.txt @@ -71,8 +71,8 @@ Non Native API: NM*: am_hdr_sz, data_sz, data, count, datatype, sreq SHM*: am_hdr_sz, data_sz, data, count, datatype, sreq am_can_do_tag: bool - NM*: void - SHM*: void + NM*: rreq + SHM*: rreq am_tag_send : int NM*: rank, comm, handler_id, tag, buf, count, datatype, src_vci, dst_vci, sreq SHM*: rank, comm, handler_id, tag, buf, count, datatype, src_vci, dst_vci, sreq diff --git a/src/mpid/ch4/netmod/ofi/Makefile.mk b/src/mpid/ch4/netmod/ofi/Makefile.mk index 0ee8c26e384..6bf7e2ca3fb 100644 --- a/src/mpid/ch4/netmod/ofi/Makefile.mk +++ b/src/mpid/ch4/netmod/ofi/Makefile.mk @@ -18,7 +18,9 @@ mpi_core_sources += src/mpid/ch4/netmod/ofi/func_table.c \ src/mpid/ch4/netmod/ofi/ofi_part.c \ src/mpid/ch4/netmod/ofi/ofi_events.c \ src/mpid/ch4/netmod/ofi/ofi_rndv.c \ - src/mpid/ch4/netmod/ofi/ofi_huge.c \ + src/mpid/ch4/netmod/ofi/ofi_rndv_read.c \ + src/mpid/ch4/netmod/ofi/ofi_rndv_write.c \ + src/mpid/ch4/netmod/ofi/ofi_pipeline.c \ src/mpid/ch4/netmod/ofi/ofi_progress.c \ src/mpid/ch4/netmod/ofi/ofi_am_events.c \ src/mpid/ch4/netmod/ofi/ofi_nic.c \ diff --git a/src/mpid/ch4/netmod/ofi/ofi_am.h b/src/mpid/ch4/netmod/ofi/ofi_am.h index 9abef986825..9fa5607e552 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_am.h +++ b/src/mpid/ch4/netmod/ofi/ofi_am.h @@ -215,9 +215,17 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_check_eager(MPI_Aint am_hdr_sz, MPI_Ai } } -MPL_STATIC_INLINE_PREFIX bool 
MPIDI_NM_am_can_do_tag(void) +MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(MPIR_Request * rreq) { - return MPIDI_OFI_ENABLE_TAGGED; + if (MPIDI_OFI_ENABLE_TAGGED) { + MPI_Aint data_sz; + MPIR_Datatype_get_size_macro(MPIDIG_REQUEST(rreq, datatype), data_sz); + data_sz *= MPIDIG_REQUEST(rreq, count); + if (data_sz <= MPIDI_OFI_global.max_msg_size) { + return true; + } + } + return false; } MPL_STATIC_INLINE_PREFIX MPIDIG_recv_data_copy_cb MPIDI_NM_am_get_data_copy_cb(uint32_t attr) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 1945f15811a..7dc2023cf38 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -14,7 +14,6 @@ static int peek_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); static int peek_empty_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -static int send_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq); static int ssend_ack_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq); static int chunk_done_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); static int inject_emu_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); @@ -38,9 +37,6 @@ static int peek_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rre if (MPIDI_OFI_is_tag_rndv(wc->tag)) { mpi_errno = MPIDI_OFI_peek_rndv_event(vci, wc, rreq); goto fn_exit; - } else if (MPIDI_OFI_is_tag_huge(wc->tag)) { - mpi_errno = MPIDI_OFI_peek_huge_event(vci, wc, rreq); - goto fn_exit; } MPIR_STATUS_SET_COUNT(rreq->status, wc->len); @@ -77,211 +73,6 @@ static int peek_empty_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request return MPI_SUCCESS; } -/* GPU pipeline events */ -static int pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) -{ - int mpi_errno = MPI_SUCCESS; - int c; - MPIDI_OFI_gpu_pipeline_request *req; - MPIR_Request *sreq; - void *wc_buf = NULL; - MPIR_FUNC_ENTER; - - req = (MPIDI_OFI_gpu_pipeline_request *) r; - /* get original mpi request */ - sreq = req->parent; - wc_buf = req->buf; - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, wc_buf); - - MPIR_cc_decr(sreq->cc_ptr, &c); - if (c == 0) { - MPIR_Request_free(sreq); - } - MPL_free(r); - - MPIR_FUNC_EXIT; - return mpi_errno; -} - -static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, int event_id) -{ - int mpi_errno = MPI_SUCCESS; - int vci_local, i; - MPIDI_OFI_gpu_pipeline_request *req; - MPIR_Request *rreq; - void *wc_buf = NULL; - int in_use MPL_UNUSED; - MPIDI_OFI_gpu_task_t *task = NULL; - MPL_gpu_engine_type_t engine_type = - (MPL_gpu_engine_type_t) MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE; - - MPIR_FUNC_ENTER; - - req = (MPIDI_OFI_gpu_pipeline_request *) r; - rreq = req->parent; - wc_buf = req->buf; - MPL_free(r); - - void *recv_buf = MPIDI_OFI_REQUEST(rreq, buf); - size_t recv_count = MPIDI_OFI_REQUEST(rreq, count); - MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, datatype); - - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - vci_local = MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local); - - if (event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { - rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, true); - rreq->status.MPI_ERROR = MPIDI_OFI_idata_get_error_bits(wc->data); - rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); - - if (unlikely(MPIDI_OFI_is_tag_sync(wc->tag))) { - MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) 
= true; - } - - uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); - uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data); - if (likely(packed == 0)) { - if (wc->len > 0) { - MPIR_Assert(n_chunks == 0); - /* First chunk arrives. */ - MPI_Aint actual_unpack_bytes; - MPIR_gpu_req yreq; - mpi_errno = - MPIR_Ilocalcopy_gpu(wc_buf, wc->len, MPIR_BYTE_INTERNAL, 0, NULL, recv_buf, - recv_count, datatype, 0, NULL, MPL_GPU_COPY_H2D, - engine_type, 1, &yreq); - MPIR_ERR_CHECK(mpi_errno); - actual_unpack_bytes = wc->len; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, - actual_unpack_bytes, rreq, yreq); - DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; - } else { - /* free this chunk */ - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, wc_buf); - MPIR_Assert(n_chunks > 0); - /* Post recv for remaining chunks. */ - MPIR_cc_dec(rreq->cc_ptr); - for (i = 0; i < n_chunks; i++) { - MPIR_cc_inc(rreq->cc_ptr); - - size_t chunk_sz = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - - char *host_buf = NULL; - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - - MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; - - MPIDI_OFI_gpu_pipeline_request *chunk_req = NULL; - chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); - if (chunk_req == NULL) { - mpi_errno = MPIR_ERR_OTHER; - goto fn_fail; - } - chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; - chunk_req->parent = rreq; - chunk_req->buf = host_buf; - chunk_req->offset = chunk_sz * i; - int ret = 0; - if (!MPIDI_OFI_global.gpu_recv_queue && host_buf) { - ret = fi_trecv - (MPIDI_OFI_global.ctx - [MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx)].rx, - host_buf, chunk_sz, NULL, remote_addr, - MPIDI_OFI_REQUEST(rreq, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, MPIDI_OFI_REQUEST(rreq, - pipeline_info. - mask_bits), - (void *) &chunk_req->context); - } - if (MPIDI_OFI_global.gpu_recv_queue || !host_buf || ret != 0) { - MPIDI_OFI_gpu_pending_recv_t *recv_task = - MPIDI_OFI_create_recv_task(chunk_req, i, n_chunks); - DL_APPEND(MPIDI_OFI_global.gpu_recv_queue, recv_task); - } - } - } - } else { - MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - } - } else { - if (likely(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE)) { - /* FIXME: current design unpacks all bytes from host buffer, overflow check is missing. 
*/ - MPI_Aint actual_unpack_bytes; - MPIR_gpu_req yreq; - mpi_errno = - MPIR_Ilocalcopy_gpu(wc_buf, (MPI_Aint) wc->len, MPIR_BYTE_INTERNAL, 0, NULL, - (char *) recv_buf, (MPI_Aint) recv_count, datatype, - req->offset, NULL, MPL_GPU_COPY_H2D, engine_type, 1, &yreq); - MPIR_ERR_CHECK(mpi_errno); - actual_unpack_bytes = wc->len; - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, actual_unpack_bytes, - rreq, yreq); - DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); - } else { - MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - } - } - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - rreq->status.MPI_ERROR = mpi_errno; - goto fn_exit; -} - -static int send_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq) -{ - int mpi_errno = MPI_SUCCESS; - int c, num_nics; - MPIR_FUNC_ENTER; - - MPIR_cc_decr(sreq->cc_ptr, &c); - - if (c == 0) { - MPIR_Comm *comm; - struct fid_mr **huge_send_mrs; - - comm = sreq->comm; - num_nics = MPIDI_OFI_COMM(comm).enable_striping ? MPIDI_OFI_global.num_nics : 1; - huge_send_mrs = MPIDI_OFI_REQUEST(sreq, huge.send_mrs); - - /* Clean up the memory region */ - for (int i = 0; i < num_nics; i++) { - uint64_t key = fi_mr_key(huge_send_mrs[i]); - MPIDI_OFI_CALL(fi_close(&huge_send_mrs[i]->fid), mr_unreg); - if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { - MPIDI_OFI_mr_key_free(MPIDI_OFI_LOCAL_MR_KEY, key); - } - } - MPL_free(huge_send_mrs); - - if (MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer)) { - MPL_free(MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer)); - } - - if (MPIDI_OFI_REQUEST(sreq, am_req)) { - MPIR_Request *am_sreq = MPIDI_OFI_REQUEST(sreq, am_req); - int handler_id = MPIDI_OFI_REQUEST(sreq, am_handler_id); - mpi_errno = MPIDIG_global.origin_cbs[handler_id] (am_sreq); - } - - MPIDI_CH4_REQUEST_FREE(sreq); - } - /* c != 0, ssend */ - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - static int ssend_ack_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq) { MPIDI_OFI_ack_request_t *req = (MPIDI_OFI_ack_request_t *) sreq; @@ -622,25 +413,12 @@ int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Req } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_AM_READ) { mpi_errno = am_read_event(vci, wc, req); goto fn_exit; - } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_SEND_GPU_PIPELINE) { - mpi_errno = pipeline_send_event(wc, req); - goto fn_exit; - } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { - mpi_errno = pipeline_recv_event(wc, req, MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT); - goto fn_exit; - } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE) { - mpi_errno = pipeline_recv_event(wc, req, MPIDI_OFI_EVENT_RECV_GPU_PIPELINE); - goto fn_exit; } else if (unlikely(1)) { switch (MPIDI_OFI_REQUEST(req, event_id)) { case MPIDI_OFI_EVENT_PEEK: mpi_errno = peek_event(vci, wc, req); break; - case MPIDI_OFI_EVENT_RECV_HUGE: - mpi_errno = MPIDI_OFI_recv_event(vci, wc, req, MPIDI_OFI_EVENT_RECV_HUGE); - break; - case MPIDI_OFI_EVENT_RECV_PACK: mpi_errno = MPIDI_OFI_recv_event(vci, wc, req, MPIDI_OFI_EVENT_RECV_PACK); break; @@ -649,10 +427,6 @@ int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Req mpi_errno = MPIDI_OFI_recv_event(vci, wc, req, MPIDI_OFI_EVENT_RECV_NOPACK); break; - case MPIDI_OFI_EVENT_SEND_HUGE: - 
mpi_errno = send_huge_event(vci, wc, req); - break; - case MPIDI_OFI_EVENT_SEND_PACK: mpi_errno = MPIDI_OFI_send_event(vci, wc, req, MPIDI_OFI_EVENT_SEND_PACK); break; @@ -669,12 +443,44 @@ int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Req mpi_errno = MPIDI_OFI_rndv_cts_event(vci, wc, req); break; - case MPIDI_OFI_EVENT_CHUNK_DONE: - mpi_errno = chunk_done_event(vci, wc, req); + case MPIDI_OFI_EVENT_PIPELINE_RECV_DATASIZE: + mpi_errno = MPIDI_OFI_pipeline_recv_datasize_event(wc, req); + break; + + case MPIDI_OFI_EVENT_PIPELINE_SEND_CHUNK: + mpi_errno = MPIDI_OFI_pipeline_send_chunk_event(wc, req); + break; + + case MPIDI_OFI_EVENT_PIPELINE_RECV_CHUNK: + mpi_errno = MPIDI_OFI_pipeline_recv_chunk_event(wc, req); break; - case MPIDI_OFI_EVENT_HUGE_CHUNK_DONE: - mpi_errno = MPIDI_OFI_huge_chunk_done_event(vci, wc, req); + case MPIDI_OFI_EVENT_RNDVREAD_RECV_MRS: + mpi_errno = MPIDI_OFI_rndvread_recv_mrs_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVREAD_READ_CHUNK: + mpi_errno = MPIDI_OFI_rndvread_read_chunk_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVREAD_ACK: + mpi_errno = MPIDI_OFI_rndvread_ack_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVWRITE_RECV_MRS: + mpi_errno = MPIDI_OFI_rndvwrite_recv_mrs_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVWRITE_WRITE_CHUNK: + mpi_errno = MPIDI_OFI_rndvwrite_write_chunk_event(wc, req); + break; + + case MPIDI_OFI_EVENT_RNDVWRITE_ACK: + mpi_errno = MPIDI_OFI_rndvwrite_ack_event(wc, req); + break; + + case MPIDI_OFI_EVENT_CHUNK_DONE: + mpi_errno = chunk_done_event(vci, wc, req); break; case MPIDI_OFI_EVENT_INJECT_EMU: @@ -759,12 +565,10 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret) case MPIDI_OFI_EVENT_RECV: case MPIDI_OFI_EVENT_RECV_PACK: case MPIDI_OFI_EVENT_RECV_NOPACK: - case MPIDI_OFI_EVENT_RECV_HUGE: MPIR_STATUS_SET_CANCEL_BIT(req->status, TRUE); MPIR_STATUS_SET_COUNT(req->status, 0); MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(req, datatype)); - if ((event_id == MPIDI_OFI_EVENT_RECV_PACK || - event_id == MPIDI_OFI_EVENT_GET_HUGE) && + if ((event_id == MPIDI_OFI_EVENT_RECV_PACK) && MPIDI_OFI_REQUEST(req, noncontig.pack.pack_buffer)) { MPL_free(MPIDI_OFI_REQUEST(req, noncontig.pack.pack_buffer)); } else if (event_id == MPIDI_OFI_EVENT_RECV_NOPACK) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h index 2c6721d7f99..7ca8d300b0d 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.h +++ b/src/mpid/ch4/netmod/ofi/ofi_events.h @@ -11,47 +11,20 @@ #include "ofi_am_events.h" #include "utlist.h" -/* -=== BEGIN_MPI_T_CVAR_INFO_BLOCK === -cvars: - - name : MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE - category : CH4_OFI - type : enum - default : copy_low_latency - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : |- - Specifies GPU engine type for GPU pt2pt on the receiver side. 
- compute - use a compute engine - copy_high_bandwidth - use a high-bandwidth copy engine - copy_low_latency - use a low-latency copy engine - yaksa - use Yaksa - -=== END_MPI_T_CVAR_INFO_BLOCK === -*/ - int MPIDI_OFI_rma_done_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * in_req); int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); int MPIDI_OFI_recv_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); int MPIDI_OFI_peek_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); - -MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_recv_engine_type(void) -{ - if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_compute) { - return MPL_GPU_ENGINE_TYPE_COMPUTE; - } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_high_bandwidth) { - return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; - } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_low_latency) { - return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; - } else { - return MPL_GPU_ENGINE_TYPE_LAST; - } -} +int MPIDI_OFI_pipeline_recv_datasize_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_pipeline_send_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_pipeline_recv_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvread_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvread_read_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvread_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvwrite_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvwrite_write_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_rndvwrite_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_cqe_get_source(struct fi_cq_tagged_entry *wc, bool has_err) { @@ -83,12 +56,24 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_event(int vci, if (MPIDI_OFI_REQUEST(sreq, am_req)) { MPIR_Request *am_sreq = MPIDI_OFI_REQUEST(sreq, am_req); int handler_id = MPIDI_OFI_REQUEST(sreq, am_handler_id); - mpi_errno = MPIDIG_global.origin_cbs[handler_id] (am_sreq); + if (handler_id == -1) { + /* native rndv direct */ + MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(am_sreq); + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(am_sreq); + } else { + mpi_errno = MPIDIG_global.origin_cbs[handler_id] (am_sreq); + MPIR_ERR_CHECK(mpi_errno); + } } MPIDI_Request_complete_fast(sreq); + + fn_exit: MPIR_FUNC_EXIT; return mpi_errno; + fn_fail: + goto fn_exit; } MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_complete(MPIR_Request * rreq, int event_id) @@ -99,7 +84,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_complete(MPIR_Request * rreq, int ev #ifndef MPIDI_CH4_DIRECT_NETMOD MPIDI_anysrc_free_partner(rreq); #endif - if ((event_id == MPIDI_OFI_EVENT_RECV_PACK || event_id == MPIDI_OFI_EVENT_GET_HUGE) && + if ((event_id == MPIDI_OFI_EVENT_RECV_PACK) && (MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer))) { MPI_Aint count = MPIR_STATUS_GET_COUNT(rreq->status); mpi_errno = MPIR_Localcopy_gpu(MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer), count, @@ -130,8 +115,25 @@ 
MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_complete(MPIR_Request * rreq, int ev MPI_Status *status = &rreq->status; MPIR_Request *am_req = MPIDI_OFI_REQUEST(rreq, am_req); int am_recv_id = MPIDI_OFI_REQUEST(rreq, am_handler_id); - mpi_errno = MPIDIG_global.tag_recv_cbs[am_recv_id] (am_req, status); - MPIR_ERR_CHECK(mpi_errno); + if (am_recv_id == -1) { + /* native rndv direct */ + + /* copy COUNT and MPI_ERROR, but skip MPI_SOURCE and MPI_RANK */ + am_req->status.count_lo = rreq->status.count_lo; + am_req->status.count_hi_and_cancelled = rreq->status.count_hi_and_cancelled; + am_req->status.MPI_ERROR = rreq->status.MPI_ERROR; + +#ifndef MPIDI_CH4_DIRECT_NETMOD + MPIDI_anysrc_free_partner(am_req); +#endif + + MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(am_req); + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(am_req); + } else { + mpi_errno = MPIDIG_global.tag_recv_cbs[am_recv_id] (am_req, status); + MPIR_ERR_CHECK(mpi_errno); + } } MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(rreq, datatype)); MPIDI_Request_complete_fast(rreq); @@ -168,9 +170,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_event(int vci, struct fi_cq_tagged_e if (MPIDI_OFI_is_tag_rndv(wc->tag)) { mpi_errno = MPIDI_OFI_recv_rndv_event(vci, wc, rreq); goto fn_exit; - } else if (MPIDI_OFI_is_tag_huge(wc->tag)) { - mpi_errno = MPIDI_OFI_recv_huge_event(vci, wc, rreq); - goto fn_exit; } /* If synchronous, send ack */ @@ -180,12 +179,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_recv_event(int vci, struct fi_cq_tagged_e MPIR_ERR_CHECK(mpi_errno); } - /* If striping is enabled, this data will be counted elsewhere. */ - if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_HUGE || - !MPIDI_OFI_COMM(rreq->comm).enable_striping) { - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[MPIDI_OFI_REQUEST(rreq, nic_num)], - wc->len); - } mpi_errno = MPIDI_OFI_recv_complete(rreq, event_id); fn_exit: diff --git a/src/mpid/ch4/netmod/ofi/ofi_huge.c b/src/mpid/ch4/netmod/ofi/ofi_huge.c deleted file mode 100644 index 160ea7b29ab..00000000000 --- a/src/mpid/ch4/netmod/ofi/ofi_huge.c +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Copyright (C) by Argonne National Laboratory - * See COPYRIGHT in top-level directory - */ - -#include -#include "ofi_impl.h" -#include "ofi_events.h" - -static int get_huge(MPIR_Request * rreq); -static int get_huge_issue_read(MPIR_Request * rreq); -static int get_huge_complete(MPIR_Request * rreq); - -static int get_huge(MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); - - MPI_Aint cur_offset; - if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { - cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - cur_offset = MPIDI_OFI_global.max_msg_size; - } - - MPI_Aint data_sz = MPIDI_OFI_REQUEST(rreq, util.iov.iov_len); - - if (data_sz < info->msgsize) { - rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE; - info->msgsize = data_sz; - } - - if (info->msgsize <= cur_offset) { - /* huge message sent to small recv buffer */ - mpi_errno = get_huge_complete(rreq); - MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - get_huge_issue_read(rreq); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static uintptr_t recv_rbase(MPIDI_OFI_huge_remote_info_t * remote_info) -{ - if (!MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { - return 0; - } else { - return (uintptr_t) remote_info->send_buf; - } -} - -static int get_huge_issue_read(MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - 
MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); - MPIR_Comm *comm = rreq->comm; - MPIR_FUNC_ENTER; - - MPI_Aint cur_offset, bytesLeft; - if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { - cur_offset = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - cur_offset = MPIDI_OFI_global.max_msg_size; - } - bytesLeft = info->msgsize - cur_offset; - - void *recv_buf = MPIDI_OFI_REQUEST(rreq, util.iov.iov_base); - - MPI_Aint chunk_size; - if (MPIDI_OFI_COMM(comm).enable_striping) { - chunk_size = (info->msgsize - MPIDI_OFI_STRIPE_CHUNK_SIZE) / MPIDI_OFI_global.num_nics; - chunk_size = MPL_MIN(chunk_size, MPIDI_OFI_global.max_msg_size); - } else { - chunk_size = MPIDI_OFI_global.max_msg_size; - } - - int num_chunks = MPL_DIV_ROUNDUP(bytesLeft, chunk_size); - - /* note: this is receiver read from sender */ - int vci_remote = info->vci_src; - int vci_local = info->vci_dst; - - /* We'll issue multiple fi_read for every chunks. All the chunks will be tracked by a - * chunks_outstanding counter. */ - /* NOTE: there is a possibility completion happens in between issuing fi_read (due to - * MPIDI_OFI_CALL_RETRY). Thus we need initialize chunks_outstanding before issuing any - * chunk */ - /* allocate and initialize cc_ptr. It will be freed by event completion when it reaches 0 */ - MPIR_cc_t *cc_ptr; - cc_ptr = MPL_malloc(sizeof(MPIR_cc_t), MPL_MEM_OTHER); - MPIR_cc_set(cc_ptr, num_chunks); - - int issued_chunks = 0; - - int nic = 0; - while (bytesLeft > 0) { - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic); - MPIDI_av_entry_t *av = MPIDIU_comm_rank_to_av(comm, info->origin_rank); - fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, nic, vci_remote, nic); - uint64_t remote_key = info->rma_keys[nic]; - - MPI_Aint bytesToGet = MPL_MIN(chunk_size, bytesLeft); - - MPIDI_OFI_read_chunk_t *chunk = MPL_malloc(sizeof(MPIDI_OFI_read_chunk_t), MPL_MEM_OTHER); - chunk->event_id = MPIDI_OFI_EVENT_HUGE_CHUNK_DONE; - chunk->localreq = rreq; - chunk->chunks_outstanding = cc_ptr; - - MPIDI_OFI_cntr_incr(vci_local, nic); - MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx, - (void *) ((char *) recv_buf + cur_offset), - bytesToGet, NULL, addr, recv_rbase(info) + cur_offset, - remote_key, (void *) &chunk->context), - vci_local, rdma_readfrom); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[nic], bytesToGet); - if (MPIDI_OFI_COMM(comm).enable_striping) { - MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_recvd_bytes_count[nic], bytesToGet); - /* round-robin to next nic */ - nic = (nic + 1) % MPIDI_OFI_global.num_nics; - } - - issued_chunks++; - cur_offset += bytesToGet; - bytesLeft -= bytesToGet; - } - - MPIR_Assert(issued_chunks == num_chunks); - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -static int get_huge_complete(MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPIDI_OFI_huge_remote_info_t *info = MPIDI_OFI_REQUEST(rreq, huge.remote_info); - - /* note: it's receiver ack sender */ - int vci_remote = info->vci_src; - int vci_local = info->vci_dst; - - /* important: save comm_ptr because MPIDI_OFI_recv_complete may free the request. 
*/ - MPIR_Comm *comm_ptr = rreq->comm; - MPIR_STATUS_SET_COUNT(rreq->status, info->msgsize); - - mpi_errno = MPIDI_OFI_recv_complete(rreq, MPIDI_OFI_EVENT_GET_HUGE); - - MPIDI_OFI_send_control_t ctrl; - ctrl.type = MPIDI_OFI_CTRL_HUGEACK; - ctrl.u.huge_ack.ackreq = info->ackreq; - mpi_errno = MPIDI_NM_am_send_hdr(info->origin_rank, comm_ptr, - MPIDI_OFI_INTERNAL_HANDLER_CONTROL, - &ctrl, sizeof(ctrl), vci_local, vci_remote); - MPIR_ERR_CHECK(mpi_errno); - - MPL_free(info); - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -/* this function called by recv event of a huge message */ -int MPIDI_OFI_recv_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_Comm *comm_ptr; - MPIR_FUNC_ENTER; - - bool ready_to_get = false; - if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_HUGE) { - /* huge send recved by a small buffer */ - } else if (MPIDI_OFI_COMM(rreq->comm).enable_striping) { - MPIR_Assert(wc->len == MPIDI_OFI_STRIPE_CHUNK_SIZE); - } else { - MPIR_Assert(wc->len == MPIDI_OFI_global.max_msg_size); - } - - comm_ptr = rreq->comm; - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_recvd_bytes_count[MPIDI_OFI_REQUEST(rreq, nic_num)], - wc->len); - if (MPIDI_OFI_REQUEST(rreq, huge.remote_info)) { - /* this is mrecv, we already got remote info */ - ready_to_get = true; - } else { - /* Check for remote control info */ - MPIDI_OFI_huge_recv_list_t *list_ptr; - int comm_id = comm_ptr->recvcontext_id; - int rank = MPIDI_OFI_cqe_get_source(wc, false); - int tag = (MPIDI_OFI_TAG_MASK & wc->tag); - - LL_FOREACH(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, list_ptr) { - if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = list_ptr->u.info; - LL_DELETE(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, - MPIDI_OFI_global.per_vci[vci].huge_ctrl_tail, list_ptr); - MPL_free(list_ptr); - ready_to_get = true; - break; - } - } - } - - if (!ready_to_get) { - MPIDI_OFI_huge_recv_list_t *list_ptr; - - list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*list_ptr), 1, MPL_MEM_BUFFER); - if (!list_ptr) - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - - list_ptr->comm_id = comm_ptr->recvcontext_id; - list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); - list_ptr->tag = (MPIDI_OFI_TAG_MASK & wc->tag); - list_ptr->u.rreq = rreq; - - LL_APPEND(MPIDI_OFI_global.per_vci[vci].huge_recv_head, - MPIDI_OFI_global.per_vci[vci].huge_recv_tail, list_ptr); - /* control handler will finish the recv */ - } else { - /* proceed to get the huge message */ - mpi_errno = get_huge(rreq); - MPIR_ERR_CHECK(mpi_errno); - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -/* This function is called when we receive a huge control message */ -int MPIDI_OFI_recv_huge_control(int vci, int comm_id, int rank, int tag, - MPIDI_OFI_huge_remote_info_t * info_ptr) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPIDI_OFI_huge_recv_list_t *list_ptr; - MPIR_Request *rreq = NULL; - MPIDI_OFI_huge_remote_info_t *info; - - /* need persist the info. It will eventually get freed at recv completion */ - info = MPL_malloc(sizeof(MPIDI_OFI_huge_remote_info_t), MPL_MEM_OTHER); - MPIR_Assert(info); - memcpy(info, info_ptr, sizeof(*info)); - - /* If there has been a posted receive, search through the list of unmatched - * receives to find the one that goes with the incoming message. 
*/ - LL_FOREACH(MPIDI_OFI_global.per_vci[vci].huge_recv_head, list_ptr) { - if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { - rreq = list_ptr->u.rreq; - LL_DELETE(MPIDI_OFI_global.per_vci[vci].huge_recv_head, - MPIDI_OFI_global.per_vci[vci].huge_recv_tail, list_ptr); - MPL_free(list_ptr); - break; - } - } - - if (!rreq) { - list_ptr = (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(MPIDI_OFI_huge_recv_list_t), - 1, MPL_MEM_OTHER); - if (!list_ptr) { - MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem"); - } - list_ptr->comm_id = comm_id; - list_ptr->rank = rank; - list_ptr->tag = tag; - list_ptr->u.info = info; - - LL_APPEND(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, - MPIDI_OFI_global.per_vci[vci].huge_ctrl_tail, list_ptr); - /* let MPIDI_OFI_recv_huge_event finish the recv */ - } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { - /* attach info and finish the mprobe */ - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = info; - MPIR_STATUS_SET_COUNT(rreq->status, info->msgsize); - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, peek_status)), MPIDI_OFI_PEEK_FOUND); - } else { - /* attach info and finish recv */ - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = info; - mpi_errno = get_huge(rreq); - MPIR_ERR_CHECK(mpi_errno); - } - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -int MPIDI_OFI_peek_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - MPI_Aint count = 0; - MPIDI_OFI_huge_recv_list_t *list_ptr; - bool found_msg = false; - - /* If this is a huge message, find the control message on the unexpected list that matches - * with this and return the size in that. */ - LL_FOREACH(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, list_ptr) { - /* FIXME: fix the type of comm_id */ - int comm_id = rreq->comm->recvcontext_id; - int rank = MPIDI_OFI_cqe_get_source(wc, false); - int tag = (int) (MPIDI_OFI_TAG_MASK & wc->tag); - if (list_ptr->comm_id == comm_id && list_ptr->rank == rank && list_ptr->tag == tag) { - count = list_ptr->u.info->msgsize; - found_msg = true; - break; - } - } - if (found_msg) { - if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = list_ptr->u.info; - LL_DELETE(MPIDI_OFI_global.per_vci[vci].huge_ctrl_head, - MPIDI_OFI_global.per_vci[vci].huge_ctrl_tail, list_ptr); - MPL_free(list_ptr); - } - MPIR_STATUS_SET_COUNT(rreq->status, count); - /* peek_status should be the last thing to change in rreq. Reason is - * we use peek_status to indicate peek_event has completed and all the - * relevant values have been copied to rreq. */ - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, peek_status)), MPIDI_OFI_PEEK_FOUND); - } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__probe) { - /* return not found for this probe. User can probe again. */ - MPL_atomic_release_store_int(&(MPIDI_OFI_REQUEST(rreq, peek_status)), - MPIDI_OFI_PEEK_NOT_FOUND); - } else if (MPIDI_OFI_REQUEST(rreq, kind) == MPIDI_OFI_req_kind__mprobe) { - /* fill the status with wc info. 
Count is still missing */ - - /* post the rreq to list and let control handler handle it */ - MPIDI_OFI_huge_recv_list_t *huge_list_ptr; - - huge_list_ptr = - (MPIDI_OFI_huge_recv_list_t *) MPL_calloc(sizeof(*huge_list_ptr), 1, MPL_MEM_COMM); - MPIR_ERR_CHKANDJUMP(huge_list_ptr == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem"); - - huge_list_ptr->comm_id = rreq->comm->recvcontext_id; - huge_list_ptr->rank = MPIDI_OFI_cqe_get_source(wc, false); - huge_list_ptr->tag = MPIDI_OFI_TAG_MASK & wc->tag; - huge_list_ptr->u.rreq = rreq; - - LL_APPEND(MPIDI_OFI_global.per_vci[vci].huge_recv_head, - MPIDI_OFI_global.per_vci[vci].huge_recv_tail, huge_list_ptr); - } - - - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - goto fn_exit; -} - -int MPIDI_OFI_huge_chunk_done_event(int vci, struct fi_cq_tagged_entry *wc, void *req) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_read_chunk_t *chunk_req = (MPIDI_OFI_read_chunk_t *) req; - - int c; - MPIR_cc_decr(chunk_req->chunks_outstanding, &c); - - if (c == 0) { - MPL_free(chunk_req->chunks_outstanding); - mpi_errno = get_huge_complete(chunk_req->localreq); - MPIR_ERR_CHECK(mpi_errno); - } - - MPL_free(chunk_req); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 891cc98e187..8f441c16ca6 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -56,6 +56,8 @@ ATTRIBUTE((unused)); MPIDI_OFI_global.prov_use[nic]->domain_attr->name : "(n/a)") #define MPIDI_OFI_DEFAULT_NIC_NAME (MPIDI_OFI_NIC_NAME(0)) +#define MPIDI_OFI_EAGER_THRESH (MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD == -1 ? MPIDI_OFI_global.max_msg_size : MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD) + int MPIDI_OFI_progress_uninlined(int vci); int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); int MPIDI_OFI_flush_send_queue(void); @@ -293,13 +295,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_mr_bind(struct fi_info *prov, struct fid_ #define MPIDI_OFI_COLL_MR_KEY 1 #define MPIDI_OFI_INVALID_MR_KEY 0xFFFFFFFFFFFFFFFFULL int MPIDI_OFI_retry_progress(int vci, int retry); -int MPIDI_OFI_recv_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -int MPIDI_OFI_recv_huge_control(int vci, int comm_id, int rank, int tag, - MPIDI_OFI_huge_remote_info_t * info); -int MPIDI_OFI_peek_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq); -int MPIDI_OFI_huge_chunk_done_event(int vci, struct fi_cq_tagged_entry *wc, void *req); -int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, - uint32_t attr, MPIR_Request ** req); int MPIDI_OFI_am_rdma_read_ack_handler(void *am_hdr, void *data, MPI_Aint in_data_sz, uint32_t attr, MPIR_Request ** req); int MPIDI_OFI_rndv_info_handler(void *am_hdr, void *data, MPI_Aint data_sz, @@ -486,14 +481,20 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_sync(uint64_t match_bits) return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_SYNC_SEND); } -MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_huge(uint64_t match_bits) +MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_rndv(uint64_t match_bits) { - return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_HUGE_SEND); + return (bool) (match_bits & MPIDI_OFI_RNDV_SEND); } -MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_rndv(uint64_t match_bits) +MPL_STATIC_INLINE_PREFIX bool MPIDI_OFI_is_tag_rndv_pack(uint64_t match_bits) +{ + return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_RNDV_PACK); +} + +MPL_STATIC_INLINE_PREFIX bool 
MPIDI_OFI_rndv_need_pack(int dt_contig, MPL_pointer_attr_t * attr) { - return ((match_bits & MPIDI_OFI_PROTOCOL_MASK) == MPIDI_OFI_RNDV_SEND); + /* assume noncontig data or device data can benefit from pipelined packing/unpacking */ + return (!dt_contig || MPL_gpu_attr_is_dev(attr)); } MPL_STATIC_INLINE_PREFIX uint64_t MPIDI_OFI_init_sendtag(int contextid, int source, int tag) @@ -856,304 +857,35 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr) } } -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_pipeline_chunk_size(size_t data_sz) -{ - int chunk_size = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - if (data_sz <= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ) { - chunk_size = data_sz; - } - return chunk_size; -} - -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_task_t *MPIDI_OFI_create_gpu_task(MPIDI_OFI_pipeline_type_t - type, void *buf, - size_t len, - MPIR_Request * request, - MPIR_gpu_req yreq) -{ - MPIDI_OFI_gpu_task_t *task = - (MPIDI_OFI_gpu_task_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_task_t), MPL_MEM_OTHER); - MPIR_Assert(task != NULL); - task->type = type; - task->status = MPIDI_OFI_PIPELINE_READY; - task->buf = buf; - task->len = len; - task->request = request; - task->yreq = yreq; - task->prev = NULL; - task->next = NULL; - return task; -} - -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_recv_t - * MPIDI_OFI_create_recv_task(MPIDI_OFI_gpu_pipeline_request * req, int idx, int n_chunks) -{ - MPIDI_OFI_gpu_pending_recv_t *task = - (MPIDI_OFI_gpu_pending_recv_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_pending_recv_t), - MPL_MEM_OTHER); - MPIR_Assert(task); - task->req = req; - task->idx = idx; - task->n_chunks = n_chunks; - task->prev = NULL; - task->next = NULL; - return task; -} - -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_send_t *MPIDI_OFI_create_send_task(MPIR_Request * - req, - void *send_buf, - MPI_Aint count, - MPI_Datatype - datatype, - MPL_pointer_attr_t - attr, - MPI_Aint left_sz, - int dt_contig) -{ - MPIDI_OFI_gpu_pending_send_t *task = - (MPIDI_OFI_gpu_pending_send_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_pending_send_t), - MPL_MEM_OTHER); - MPIR_Assert(task); - task->sreq = req; - task->attr = attr; - task->send_buf = send_buf; - task->datatype = datatype; - MPIR_Datatype_add_ref_if_not_builtin(datatype); - task->offset = 0; - task->n_chunks = 0; - task->left_sz = left_sz; - task->count = count; - task->dt_contig = dt_contig; - task->prev = NULL; - task->next = NULL; - return task; -} - -static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int vni); - -static int MPIDI_OFI_gpu_progress_send(void) -{ - int mpi_errno = MPI_SUCCESS; - MPL_gpu_engine_type_t engine_type = - (MPL_gpu_engine_type_t) MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE; - - while (MPIDI_OFI_global.gpu_send_queue) { - char *host_buf = NULL; - MPI_Aint chunk_sz; - int vci_local = -1; - - MPIDI_OFI_gpu_pending_send_t *send_task = MPIDI_OFI_global.gpu_send_queue; - int block_sz = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.chunk_sz); - while (send_task->left_sz > 0) { - MPIDI_OFI_gpu_task_t *task = NULL; - chunk_sz = send_task->left_sz > block_sz ? block_sz : send_task->left_sz; - host_buf = NULL; - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, - (void **) &host_buf); - if (host_buf == NULL) { - goto fn_exit; - } - MPI_Aint actual_pack_bytes; - MPIR_gpu_req yreq; - int commit = send_task->left_sz <= chunk_sz ? 
1 : 0; - if (!commit && - !MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST && - send_task->n_chunks % MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK == - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK - 1) - commit = 1; - mpi_errno = - MPIR_Ilocalcopy_gpu((char *) send_task->send_buf, send_task->count, - send_task->datatype, send_task->offset, &send_task->attr, - host_buf, chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL, - MPL_GPU_COPY_D2H, engine_type, commit, &yreq); - MPIR_ERR_CHECK(mpi_errno); - actual_pack_bytes = chunk_sz; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_SEND, host_buf, actual_pack_bytes, - send_task->sreq, yreq); - send_task->offset += (size_t) actual_pack_bytes; - send_task->left_sz -= (size_t) actual_pack_bytes; - vci_local = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.vci_local); - MPIR_Assert(vci_local < MPIDI_CH4_MAX_VCIS); - DL_APPEND(MPIDI_OFI_global.gpu_send_task_queue[vci_local], task); - send_task->n_chunks++; - /* Increase request completion cnt, cc is 1 more than necessary - * to prevent parent request being freed prematurally. */ - MPIR_cc_inc(send_task->sreq->cc_ptr); - } - /* all done, decrease cc by 1 to allow parent request to be freed - * when complete */ - MPIR_cc_dec(send_task->sreq->cc_ptr); - /* Update correct number of chunks in immediate data. */ - MPIDI_OFI_idata_set_gpuchunk_bits(&MPIDI_OFI_REQUEST - (send_task->sreq, pipeline_info.cq_data), - send_task->n_chunks); - DL_DELETE(MPIDI_OFI_global.gpu_send_queue, send_task); - MPIR_Datatype_release_if_not_builtin(send_task->datatype); - MPL_free(send_task); - - if (vci_local != -1) - MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_send_task_queue, vci_local); - - } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress_recv(void) +MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_send_engine_type(void) { - int mpi_errno = MPI_SUCCESS; - - while (MPIDI_OFI_global.gpu_recv_queue) { - MPIDI_OFI_gpu_pending_recv_t *recv_task = MPIDI_OFI_global.gpu_recv_queue; - MPIDI_OFI_gpu_pipeline_request *chunk_req = recv_task->req; - MPIR_Request *rreq = chunk_req->parent; - void *host_buf = chunk_req->buf; - if (!host_buf) { - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - if (!host_buf) { - break; - } - chunk_req->buf = host_buf; - } - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - - int ret = fi_trecv(MPIDI_OFI_global.ctx[MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx)].rx, - (void *) host_buf, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, NULL, remote_addr, - MPIDI_OFI_REQUEST(rreq, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, - MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits), - (void *) &chunk_req->context); - if (ret == 0) { - DL_DELETE(MPIDI_OFI_global.gpu_recv_queue, recv_task); - MPL_free(recv_task); - } else if (ret == -FI_EAGAIN || ret == -FI_ENOMEM) { - break; - } else { - goto fn_fail; - } + if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_compute) { + return MPL_GPU_ENGINE_TYPE_COMPUTE; + } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_high_bandwidth) { + return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_low_latency) { + return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; + } else { + return 
MPL_GPU_ENGINE_TYPE_LAST; } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; } -static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int vni) +MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_recv_engine_type(void) { - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_gpu_task_t *task = NULL; - MPIDI_OFI_gpu_task_t *tmp; - - DL_FOREACH_SAFE(gpu_queue[vni], task, tmp) { - if (task->status == MPIDI_OFI_PIPELINE_EXEC) { - /* Avoid the deadlock of re-launching an executing OFI task. */ - goto fn_exit; - } - - MPIR_gpu_req *yreq = &task->yreq; - int completed = 0; - if (yreq->type == MPIR_GPU_REQUEST) { - mpi_errno = MPL_gpu_test(&yreq->u.gpu_req, &completed); - MPIR_ERR_CHECK(mpi_errno); - } else if (yreq->type == MPIR_TYPEREP_REQUEST) { - MPIR_Typerep_test(yreq->u.y_req, &completed); - } else { - completed = 1; - } - if (completed == 1) { - /* GPU transfer completes. */ - task->status = MPIDI_OFI_PIPELINE_EXEC; - MPIR_Request *request = task->request; - - if (task->type == MPIDI_OFI_PIPELINE_SEND) { - MPIDI_OFI_gpu_pipeline_request *chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); - MPIR_ERR_CHKANDJUMP1(chunk_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", - "**nomem %s", "GPU pipelining chunk_req alloc"); - chunk_req->parent = request; - chunk_req->event_id = MPIDI_OFI_EVENT_SEND_GPU_PIPELINE; - chunk_req->buf = task->buf; - MPIDI_OFI_CALL(fi_tsenddata - (MPIDI_OFI_global.ctx - [MPIDI_OFI_REQUEST(request, pipeline_info.ctx_idx)].tx, - task->buf, task->len, NULL /* desc */ , - MPIDI_OFI_REQUEST(request, pipeline_info.cq_data), - MPIDI_OFI_REQUEST(request, pipeline_info.remote_addr), - MPIDI_OFI_REQUEST(request, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, (void *) &chunk_req->context), - tsenddata); - DL_DELETE(gpu_queue[vni], task); - MPL_free(task); - } else { - MPIR_Assert(task->type == MPIDI_OFI_PIPELINE_RECV); - int c; - MPIR_cc_decr(request->cc_ptr, &c); - if (c == 0) { - /* If synchronous, send ack */ - if (unlikely(MPIDI_OFI_REQUEST(request, pipeline_info.is_sync))) { - int context_id = MPIDI_OFI_REQUEST(request, context_id); - mpi_errno = MPIDI_OFI_send_ack(request, context_id, NULL, 0); - MPIR_ERR_CHECK(mpi_errno); - } - /* Set number of bytes in status. */ - MPIR_STATUS_SET_COUNT(request->status, - MPIDI_OFI_REQUEST(request, pipeline_info.offset)); - - MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(request, datatype)); - MPIR_Request_free(request); - } - - /* For recv, now task can be deleted from DL. */ - DL_DELETE(gpu_queue[vni], task); - /* Free host buffer, yaksa request and task. 
*/ - if (task->type == MPIDI_OFI_PIPELINE_RECV) - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - task->buf); - else - MPIDI_OFI_gpu_free_pack_buffer(task->buf); - MPL_free(task); - } - } else { - goto fn_exit; - } + if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_compute) { + return MPL_GPU_ENGINE_TYPE_COMPUTE; + } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_high_bandwidth) { + return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + } else if (MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE == + MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_low_latency) { + return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; + } else { + return MPL_GPU_ENGINE_TYPE_LAST; } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress(int vni) -{ - int mpi_errno = MPI_SUCCESS; - - mpi_errno = MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_recv_task_queue, vni); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_send_task_queue, vni); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIDI_OFI_gpu_progress_send(); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIDI_OFI_gpu_progress_recv(); - MPIR_ERR_CHECK(mpi_errno); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; } #endif /* OFI_IMPL_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index f6480e3c57c..3aab17f80a0 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -375,21 +375,6 @@ categories : Specifies the maximum number of iovecs to allocate for RMA operations to/from noncontiguous buffers. - - name : MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE - category : CH4_OFI - type : int - default : -1 - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - This cvar controls the message size at which OFI native path switches from eager to - rendezvous mode. It does not affect the AM path eager limit. Having this gives a way to - reliably test native non-path. - If the number is positive, OFI will init the MPIDI_OFI_global.max_msg_size to the value of - cvar. If the number is negative, OFI will init the MPIDI_OFI_globa.max_msg_size using - whatever provider gives (which might be unlimited for socket provider). - - name : MPIR_CVAR_CH4_OFI_MAX_NICS category : CH4 type : int @@ -470,30 +455,7 @@ categories : description : >- If true, enable OFI triggered ops for MPI collectives. - - name : MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE - category : CH4_OFI - type : boolean - default : false - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - If true, enable pipeline for GPU data transfer. - GPU pipeline does not support non-contiguous datatypes or mixed buffer types - (i.e. GPU send buffer, host recv buffer). If GPU pipeline is enabled, the unsupported - scenarios will cause undefined behavior if encountered. - - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD - category : CH4_OFI - type : int - default : 131072 - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - This is the threshold to start using GPU pipeline. 
- - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ + - name : MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ category : CH4_OFI type : int default : 1048576 @@ -501,9 +463,9 @@ categories : verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL description : >- - Specifies the buffer size (in bytes) for GPU pipeline data transfer. + Specifies the chunk size (in bytes) for pipeline data transfer. - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK + - name : MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS category : CH4_OFI type : int default : 32 @@ -511,40 +473,35 @@ categories : verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL description : >- - Specifies the number of buffers for GPU pipeline data transfer in - each block/chunk of the pool. + Specifies the number of chunk buffers for pipeline data transfer. - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS + - name : MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE category : CH4_OFI - type : int - default : 32 + type : enum + default : copy_low_latency class : none verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL - description : >- - Specifies the total number of buffers for GPU pipeline data transfer - - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE - category : CH4_OFI - type : int - default : 0 - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : >- - Specifies the GPU engine type for GPU pipeline on the sender side, - default is MPL_GPU_ENGINE_TYPE_COMPUTE - - - name : MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE + description : |- + Specifies GPU engine type for GPU pt2pt on the sender side. + compute - use a compute engine + copy_high_bandwidth - use a high-bandwidth copy engine + copy_low_latency - use a low-latency copy engine + yaksa - use Yaksa + + - name : MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE category : CH4_OFI - type : int - default : 0 + type : enum + default : copy_low_latency class : none verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_LOCAL - description : >- - Specifies the GPU engine type for GPU pipeline on the receiver side, - default is MPL_GPU_ENGINE_TYPE_COMPUTE + description : |- + Specifies GPU engine type for GPU pt2pt on the receiver side. 
+ compute - use a compute engine + copy_high_bandwidth - use a high-bandwidth copy engine + copy_low_latency - use a low-latency copy engine + yaksa - use Yaksa === END_MPI_T_CVAR_INFO_BLOCK === */ @@ -646,8 +603,6 @@ int MPIDI_OFI_init_local(int *tag_bits) MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == offsetof(MPIDI_OFI_chunk_request, context)); - MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == - offsetof(MPIDI_OFI_read_chunk_t, context)); MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == offsetof(MPIDI_OFI_am_repost_request_t, context)); MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == @@ -655,7 +610,7 @@ int MPIDI_OFI_init_local(int *tag_bits) MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.netmod) == offsetof(MPIDI_OFI_dynamic_process_request_t, context)); MPL_COMPILE_TIME_ASSERT(offsetof(struct MPIR_Request, dev.ch4.am.netmod_am.ofi.context) == - offsetof(struct MPIR_Request, dev.ch4.netmod.ofi.context)); + offsetof(struct MPIR_Request, dev.ch4.netmod.ofi.direct.context)); MPL_COMPILE_TIME_ASSERT(sizeof(MPIDI_Devreq_t) >= sizeof(MPIDI_OFI_request_t)); int err; @@ -676,28 +631,6 @@ int MPIDI_OFI_init_local(int *tag_bits) /* -------------------------------- */ MPIDIU_map_create(&MPIDI_OFI_global.win_map, MPL_MEM_RMA); - /* Create pack buffer pool for GPU pipeline */ - if (MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE) { - mpi_errno = - MPIDU_genq_private_pool_create(MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS, - host_alloc_registered, - host_free_registered, - &MPIDI_OFI_global.gpu_pipeline_send_pool); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = - MPIDU_genq_private_pool_create(MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS, - host_alloc_registered, - host_free_registered, - &MPIDI_OFI_global.gpu_pipeline_recv_pool); - MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_global.gpu_send_queue = NULL; - MPIDI_OFI_global.gpu_recv_queue = NULL; - } - /* Initialize RMA keys allocator */ MPIDI_OFI_mr_key_allocator_init(); @@ -794,8 +727,7 @@ int MPIDI_OFI_init_local(int *tag_bits) MPIR_Assert(MPIDI_OFI_DEFAULT_SHORT_SEND_SIZE <= MPIR_CVAR_CH4_PACK_BUFFER_SIZE); MPIDI_OFI_global.num_vcis = 1; - MPIDI_OFI_am_init(0); - MPIDI_OFI_am_post_recv(0, 0); + MPIDI_OFI_init_per_vci(0); fn_exit: *tag_bits = MPIDI_OFI_TAG_BITS; @@ -984,6 +916,10 @@ int MPIDI_OFI_mpi_finalize_hook(void) MPIDIU_map_destroy(MPIDI_OFI_global.win_map); + for (int vci = 0; vci < MPIDI_OFI_global.num_vcis; vci++) { + MPIDU_genq_private_pool_destroy(MPIDI_OFI_global.per_vci[vci].pipeline_pool); + } + if (MPIDI_OFI_ENABLE_AM) { for (int vci = 0; vci < MPIDI_OFI_global.num_vcis; vci++) { while (MPIDI_OFI_global.per_vci[vci].am_unordered_msgs) { @@ -1007,11 +943,6 @@ int MPIDI_OFI_mpi_finalize_hook(void) } } - if (MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE) { - MPIDU_genq_private_pool_destroy(MPIDI_OFI_global.gpu_pipeline_send_pool); - MPIDU_genq_private_pool_destroy(MPIDI_OFI_global.gpu_pipeline_recv_pool); - } - int err; MPID_Thread_mutex_destroy(&MPIDI_OFI_THREAD_UTIL_MUTEX, &err); MPIR_Assert(err == 0); @@ -1458,13 +1389,7 @@ static int update_global_limits(struct fi_info *prov) MPIDI_OFI_global.max_buffered_send = prov->tx_attr->inject_size; MPIDI_OFI_global.max_buffered_write = prov->tx_attr->inject_size; - if 
(MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE > 0 &&
- MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE <= prov->ep_attr->max_msg_size) {
- /* Truncate max_msg_size to a user-selected value */
- MPIDI_OFI_global.max_msg_size = MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE;
- } else {
- MPIDI_OFI_global.max_msg_size = MPL_MIN(prov->ep_attr->max_msg_size, MPIR_AINT_MAX);
- }
+ MPIDI_OFI_global.max_msg_size = MPL_MIN(prov->ep_attr->max_msg_size, MPIR_AINT_MAX);
 MPIDI_OFI_global.cq_data_size = prov->domain_attr->cq_data_size;
 MPIDI_OFI_global.stripe_threshold = MPIR_CVAR_CH4_OFI_MULTI_NIC_STRIPING_THRESHOLD;
 if (prov->ep_attr->max_order_raw_size > MPIR_AINT_MAX) {
@@ -1499,11 +1424,6 @@ static int update_global_limits(struct fi_info *prov)
 MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch4|too_many_ranks");
 }
- if (MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE && (prov->domain_attr->cq_data_size < 8 ||
- MPIDI_OFI_GPU_PIPELINE_SEND == 0)) {
- MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch4|too_small_cqdata");
- }
-
 fn_exit:
 return mpi_errno;
 fn_fail:
@@ -1584,7 +1504,35 @@ static void dump_global_settings(void)
 /* static functions for AM */
-int MPIDI_OFI_am_init(int vci)
+static int am_init(int vci);
+static int am_post_recv(int vci, int nic);
+
+int MPIDI_OFI_init_per_vci(int vci)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    /* Create chunk buffer pool (for pipeline etc.) */
+    mpi_errno = MPIDU_genq_private_pool_create(MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ,
+                                               MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS,
+                                               MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS,
+                                               host_alloc_registered,
+                                               host_free_registered,
+                                               &MPIDI_OFI_global.per_vci[vci].pipeline_pool);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    mpi_errno = am_init(vci);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    mpi_errno = am_post_recv(vci, 0);
+    MPIR_ERR_CHECK(mpi_errno);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static int am_init(int vci)
 {
 int mpi_errno = MPI_SUCCESS;
@@ -1621,7 +1569,6 @@ int MPIDI_OFI_am_init(int vci)
 MPIDI_OFI_global.per_vci[vci].am_inflight_rma_send_mrs = 0;
 if (vci == 0) {
- MPIDIG_am_reg_cb(MPIDI_OFI_INTERNAL_HANDLER_CONTROL, NULL, &MPIDI_OFI_control_handler);
 MPIDIG_am_reg_cb(MPIDI_OFI_AM_RDMA_READ_ACK, NULL, &MPIDI_OFI_am_rdma_read_ack_handler);
 MPIDIG_am_reg_cb(MPIDI_OFI_RNDV_INFO, NULL, &MPIDI_OFI_rndv_info_handler);
 }
@@ -1633,7 +1580,7 @@ int MPIDI_OFI_am_init(int vci)
 goto fn_exit;
 }
-int MPIDI_OFI_am_post_recv(int vci, int nic)
+static int am_post_recv(int vci, int nic)
 {
 int mpi_errno = MPI_SUCCESS;
diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.h b/src/mpid/ch4/netmod/ofi/ofi_init.h
index f59e0befc1e..ef8845afa37 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_init.h
+++ b/src/mpid/ch4/netmod/ofi/ofi_init.h
@@ -34,8 +34,7 @@ void MPIDI_OFI_update_global_settings(struct fi_info *prov);
 bool MPIDI_OFI_nic_already_used(const struct fi_info *prov, struct fi_info **others, int nic_count);
 int MPIDI_OFI_create_vci_context(int vci, int nic);
-int MPIDI_OFI_am_init(int vci);
-int MPIDI_OFI_am_post_recv(int vci, int nic);
+int MPIDI_OFI_init_per_vci(int vci);
 bool MPIDI_OFI_nic_is_up(struct fi_info *prov);
diff --git a/src/mpid/ch4/netmod/ofi/ofi_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_pipeline.c
new file mode 100644
index 00000000000..f591bcd01cd
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_pipeline.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpidimpl.h"
+#include "ofi_impl.h"
+#include "ofi_rndv.h"
+
+#define MPIDI_OFI_PIPELINE_INFLY_CHUNKS 10
+
+struct send_chunk_req {
+    char
pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPI_Aint chunk_sz;
+    void *chunk_buf;
+    struct iovec iov;
+    MPIR_Request *sreq;
+};
+
+struct recv_chunk_req {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPI_Aint chunk_sz;
+    void *chunk_buf;
+    MPIR_Request *rreq;
+    MPI_Aint recv_offset;       /* remember where to copy this chunk */
+};
+
+static int pipeline_send_poll(MPIX_Async_thing thing);
+static void spawn_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, MPIR_gpu_req * areq,
+                            int chunk_index, void *chunk_buf, MPI_Aint chunk_sz);
+static int send_copy_poll(MPIX_Async_thing thing);
+static bool send_copy_complete(MPIR_Request * sreq, int chunk_index,
+                               void *chunk_buf, MPI_Aint chunk_sz);
+static void send_chunk_complete(MPIR_Request * sreq, void *chunk_buf, MPI_Aint chunk_sz);
+
+static int pipeline_recv_poll(MPIX_Async_thing thing);
+static void recv_chunk_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                            MPI_Aint offset);
+static void add_recv_copy(MPIR_Request * rreq, MPIR_gpu_req * areq, void *chunk_buf,
+                          MPI_Aint chunk_sz);
+static int recv_copy_poll(MPIX_Async_thing thing);
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz);
+
+/* NOTE: fields cached or duplicated in MPIDI_OFI_pipeline_t for clarity.
+ * We can optimize if we are concerned with the overhead or request header size.
+ */
+int MPIDI_OFI_pipeline_send(MPIR_Request * sreq, int tag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    p->remote_data_sz = MPL_MIN(p->remote_data_sz, p->data_sz);
+    p->remain_sz = p->remote_data_sz;
+    p->chunk_index = 0;
+    p->u.send.copy_offset = 0;
+    p->u.send.copy_infly = 0;   /* control to avoid overwhelming async progress */
+    p->u.send.send_infly = 0;   /* control to avoid overwhelming unexpected recv */
+
+    if (!MPIDI_OFI_CAN_SEND_CQ_DATASIZE(p->data_sz)) {
+        mpi_errno = MPIDI_OFI_RNDV_send_hdr(&p->data_sz, sizeof(MPI_Aint),
+                                            p->av, p->vci_local, p->vci_remote, p->match_bits);
+        MPIR_ERR_CHECK(mpi_errno);
+    }
+
+    mpi_errno = MPIR_Async_things_add(pipeline_send_poll, sreq, NULL);
+    /* poke progress? */
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+int MPIDI_OFI_pipeline_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+
+    p->chunk_index = 0;
+    p->u.recv.recv_offset = 0;
+    p->u.recv.recv_infly = 0;   /* just need enough to match infly send */
+
+    if (p->remote_data_sz != -1) {
+        p->remote_data_sz = MPL_MIN(p->remote_data_sz, p->data_sz);
+        p->remain_sz = p->remote_data_sz;
+    } else {
+        mpi_errno = MPIDI_OFI_RNDV_recv_hdr(rreq, MPIDI_OFI_EVENT_PIPELINE_RECV_DATASIZE,
+                                            sizeof(MPI_Aint), p->av, p->vci_local, p->vci_remote,
+                                            p->match_bits);
+        MPIR_ERR_CHECK(mpi_errno);
+    }
+
+    mpi_errno = MPIR_Async_things_add(pipeline_recv_poll, rreq, NULL);
+    /* poke progress?
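+     * An explicit poke could run pipeline_recv_poll sooner than the next natural
+     * progress call and post the first chunk earlier; left here as a possible
+     * optimization, so currently we just return and let async progress drive it.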
*/
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* callback from MPIDI_OFI_dispatch_function in ofi_events.c */
+
+int MPIDI_OFI_pipeline_recv_datasize_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r);
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+
+    MPI_Aint *hdr_data_sz = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r);
+    MPIDI_OFI_RNDV_update_count(rreq, *hdr_data_sz);
+
+    p->remote_data_sz = MPL_MIN(*hdr_data_sz, p->data_sz);
+    p->remain_sz = p->remote_data_sz;
+
+    MPL_free(r);
+    return mpi_errno;
+}
+
+int MPIDI_OFI_pipeline_send_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    struct send_chunk_req *chunk_req = (void *) r;
+    send_chunk_complete(chunk_req->sreq, chunk_req->chunk_buf, chunk_req->chunk_sz);
+    MPL_free(chunk_req);
+
+    return mpi_errno;
+}
+
+int MPIDI_OFI_pipeline_recv_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    struct recv_chunk_req *chunk_req = (void *) r;
+    recv_chunk_copy(chunk_req->rreq, chunk_req->chunk_buf, chunk_req->chunk_sz,
+                    chunk_req->recv_offset);
+    MPL_free(chunk_req);
+
+    return mpi_errno;
+}
+
+/* ---- static routines for send side ---- */
+
+/* async send chunks until done */
+static int pipeline_send_poll(MPIX_Async_thing thing)
+{
+    int ret = MPIX_ASYNC_NOPROGRESS;
+    MPIR_Request *sreq = MPIR_Async_thing_get_state(thing);
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    /* CS required for genq pool and gpu imemcpy */
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    while (p->u.send.copy_offset < p->remote_data_sz) {
+        /* limit copy_infly so it doesn't overwhelm async progress */
+        if (p->u.send.copy_infly >= MPIDI_OFI_PIPELINE_INFLY_CHUNKS) {
+            goto fn_exit;
+        }
+
+        void *chunk_buf;
+        MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+        if (chunk_sz > p->remote_data_sz - p->u.send.copy_offset) {
+            chunk_sz = p->remote_data_sz - p->u.send.copy_offset;
+        }
+
+        /* alloc a chunk */
+        MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                           &chunk_buf);
+        if (!chunk_buf) {
+            goto fn_exit;
+        }
+
+        /* async copy */
+        MPIR_gpu_req async_req;
+        int mpi_errno;
+        int engine_type = MPIDI_OFI_gpu_get_send_engine_type();
+        int copy_dir = MPL_GPU_COPY_DIRECTION_NONE;
+        mpi_errno = MPIR_Ilocalcopy_gpu(p->buf, p->count, p->datatype,
+                                        p->u.send.copy_offset, &p->attr, chunk_buf, chunk_sz,
+                                        MPIR_BYTE_INTERNAL, 0, NULL, copy_dir, engine_type,
+                                        1, &async_req);
+        MPIR_Assertp(mpi_errno == MPI_SUCCESS);
+        spawn_send_copy(thing, sreq, &async_req, p->chunk_index, chunk_buf, chunk_sz);
+        p->chunk_index++;
+        p->u.send.copy_offset += chunk_sz;
+        p->u.send.copy_infly++;
+    }
+
+    ret = MPIX_ASYNC_DONE;
+  fn_exit:
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    return ret;
+}
+
+/* ---- send_copy ---- */
+struct send_copy {
+    MPIR_Request *sreq;
+    /* async handle */
+    bool copy_done;
+    MPIR_gpu_req async_req;
+    /* for sending data */
+    int chunk_index;
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+};
+
+static void spawn_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, MPIR_gpu_req * areq,
+                            int chunk_index, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    struct send_copy *p;
+    p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER);
+    MPIR_Assert(p);
+
+    p->sreq = sreq;
+    p->copy_done = false;
+    p->async_req = *areq;
+    p->chunk_index = chunk_index;
+    p->chunk_buf = chunk_buf;
+    p->chunk_sz =
chunk_sz;
+
+    MPIR_Async_thing_spawn(thing, send_copy_poll, p, NULL);
+}
+
+static int send_copy_poll(MPIX_Async_thing thing)
+{
+    struct send_copy *p = MPIR_Async_thing_get_state(thing);
+
+    /* this async task consists of two parts: copy and send. The copy_done field allows each part to be pending separately */
+    int is_done = p->copy_done;
+    if (!is_done) {
+        MPIR_async_test(&(p->async_req), &is_done);
+        if (is_done) {
+            MPIDI_OFI_pipeline_t *p_req = &MPIDI_OFI_AMREQ_PIPELINE(p->sreq);
+            p_req->u.send.copy_infly--;
+            p->copy_done = true;
+        }
+    }
+
+    if (!is_done) {
+        return MPIX_ASYNC_NOPROGRESS;
+    } else {
+        if (send_copy_complete(p->sreq, p->chunk_index, p->chunk_buf, p->chunk_sz)) {
+            MPL_free(p);
+            return MPIX_ASYNC_DONE;
+        } else {
+            /* We can't send the chunk for some reason. We'll try again */
+            return MPIX_ASYNC_NOPROGRESS;
+        }
+    }
+}
+
+static bool send_copy_complete(MPIR_Request * sreq, int chunk_index,
+                               void *chunk_buf, MPI_Aint chunk_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    if (p->u.send.send_infly >= MPIDI_OFI_PIPELINE_INFLY_CHUNKS) {
+        return false;
+    }
+
+    struct send_chunk_req *chunk_req = MPL_malloc(sizeof(struct send_chunk_req), MPL_MEM_BUFFER);
+    MPIR_Assertp(chunk_req);
+
+    chunk_req->event_id = MPIDI_OFI_EVENT_PIPELINE_SEND_CHUNK;
+    chunk_req->sreq = sreq;
+    chunk_req->chunk_buf = (void *) chunk_buf;
+    chunk_req->chunk_sz = chunk_sz;
+    chunk_req->iov.iov_base = chunk_buf;
+    chunk_req->iov.iov_len = chunk_sz;
+
+    /* send */
+    int nic = chunk_index % MPIDI_OFI_global.num_nics;
+    int ctx_idx = MPIDI_OFI_get_ctx_index(p->vci_local, nic);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(p->av, p->vci_local, nic, p->vci_remote, nic);
+
+    struct fi_msg_tagged msg;
+    msg.msg_iov = &chunk_req->iov;
+    msg.desc = NULL;
+    msg.iov_count = 1;
+    msg.addr = addr;
+    msg.tag = p->match_bits;
+    msg.context = (void *) &(chunk_req->context);
+    msg.data = 0;
+
+    uint64_t flags = FI_COMPLETION | FI_MATCH_COMPLETE | FI_DELIVERY_COMPLETE;
+
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    MPIDI_OFI_CALL_RETRY(fi_tsendmsg(MPIDI_OFI_global.ctx[ctx_idx].tx, &msg, flags),
+                         p->vci_local, tsendv);
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    p->u.send.send_infly++;
+    /* both the chunk buffer and chunk_req will be freed in MPIDI_OFI_pipeline_send_chunk_event */
+
+    return true;
+  fn_fail:
+    MPIR_Assert(0);
+    return false;
+}
+
+static void send_chunk_complete(MPIR_Request * sreq, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(sreq);
+
+    p->u.send.send_infly--;
+    MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                      chunk_buf);
+
+    p->remain_sz -= chunk_sz;
+    if (p->remain_sz == 0) {
+        MPIR_Datatype_release_if_not_builtin(p->datatype);
+        MPIDI_Request_complete_fast(sreq);
+    }
+}
+
+/* ---- static routines for recv side ---- */
+
+/* async post recv chunks until done */
+static int pipeline_recv_poll(MPIX_Async_thing thing)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIR_Request *rreq = MPIR_Async_thing_get_state(thing);
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+
+    if (p->remote_data_sz == -1) {
+        /* Maybe we can issue 1 chunk anyway?
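+         * remote_data_sz is still unknown (the data-size header has not arrived),
+         * so nothing is posted yet. Speculatively posting one chunk-sized recv here
+         * would overlap the header exchange with the first chunk, at the cost of
+         * reconciling a possibly shorter transfer; not attempted for now.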
*/
+        return MPIX_ASYNC_NOPROGRESS;
+    }
+
+    int ret = MPIX_ASYNC_NOPROGRESS;
+    /* CS required for genq pool and gpu imemcpy */
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    while (p->u.recv.recv_offset < p->remote_data_sz) {
+        /* only need to issue enough recv_infly to match send_infly */
+        if (p->u.recv.recv_infly >= MPIDI_OFI_PIPELINE_INFLY_CHUNKS) {
+            goto fn_exit;
+        }
+
+        void *chunk_buf;
+        MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+        if (chunk_sz > p->remote_data_sz - p->u.recv.recv_offset) {
+            chunk_sz = p->remote_data_sz - p->u.recv.recv_offset;
+        }
+
+        /* alloc a chunk */
+        MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                           &chunk_buf);
+        if (!chunk_buf) {
+            goto fn_exit;
+        }
+
+        struct recv_chunk_req *chunk_req =
+            MPL_malloc(sizeof(struct recv_chunk_req), MPL_MEM_BUFFER);
+        MPIR_Assertp(chunk_req);
+
+        chunk_req->event_id = MPIDI_OFI_EVENT_PIPELINE_RECV_CHUNK;
+        chunk_req->rreq = rreq;
+        chunk_req->chunk_buf = (void *) chunk_buf;
+        chunk_req->chunk_sz = chunk_sz;
+        chunk_req->recv_offset = p->u.recv.recv_offset;
+
+        /* post recv */
+        int nic = p->chunk_index % MPIDI_OFI_global.num_nics;
+        int ctx_idx = MPIDI_OFI_get_ctx_index(p->vci_local, nic);
+        fi_addr_t addr = MPIDI_OFI_av_to_phys(p->av, p->vci_local, nic, p->vci_remote, nic);
+
+        MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx,
+                                      chunk_buf, chunk_sz, NULL, addr, p->match_bits, 0,
+                                      (void *) &chunk_req->context), p->vci_local, trecv);
+        p->chunk_index++;
+        p->u.recv.recv_offset += chunk_sz;
+        p->u.recv.recv_infly++;
+    }
+
+    ret = MPIX_ASYNC_DONE;
+  fn_exit:
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    return ret;
+  fn_fail:
+    MPIR_Assert(0);
+    goto fn_exit;
+}
+
+static void recv_chunk_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                            MPI_Aint offset)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+    p->u.recv.recv_infly--;
+
+    MPIR_gpu_req async_req;
+    int engine_type = MPIDI_OFI_gpu_get_recv_engine_type();
+    int copy_dir = MPL_GPU_COPY_DIRECTION_NONE;
+    mpi_errno = MPIR_Ilocalcopy_gpu(chunk_buf, chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL,
+                                    (void *) p->buf, p->count, p->datatype, offset, &p->attr,
+                                    copy_dir, engine_type, 1, &async_req);
+    MPIR_Assertp(mpi_errno == MPI_SUCCESS);
+
+    add_recv_copy(rreq, &async_req, chunk_buf, chunk_sz);
+}
+
+/* async recv copy */
+struct recv_copy {
+    MPIR_Request *rreq;
+    MPIR_gpu_req async_req;
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+};
+
+static void add_recv_copy(MPIR_Request * rreq, MPIR_gpu_req * areq,
+                          void *chunk_buf, MPI_Aint chunk_sz)
+{
+    struct recv_copy *p;
+    p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER);
+    MPIR_Assert(p);
+
+    p->rreq = rreq;
+    p->async_req = *areq;
+    p->chunk_buf = chunk_buf;
+    p->chunk_sz = chunk_sz;
+
+    MPIR_Async_things_add(recv_copy_poll, p, NULL);
+}
+
+static int recv_copy_poll(MPIX_Async_thing thing)
+{
+    struct recv_copy *p = MPIR_Async_thing_get_state(thing);
+
+    int is_done = 0;
+    MPIR_async_test(&(p->async_req), &is_done);
+
+    if (!is_done) {
+        return MPIX_ASYNC_NOPROGRESS;
+    } else {
+        recv_copy_complete(p->rreq, p->chunk_buf, p->chunk_sz);
+        MPL_free(p);
+        return MPIX_ASYNC_DONE;
+    }
+}
+
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    MPIDI_OFI_pipeline_t *p = &MPIDI_OFI_AMREQ_PIPELINE(rreq);
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+
+    MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+
chunk_buf);
+
+    p->remain_sz -= chunk_sz;
+    if (p->remain_sz == 0) {
+        MPIR_Datatype_release_if_not_builtin(p->datatype);
+        MPIDI_Request_complete_fast(rreq);
+    }
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+}
diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h
index 98d41802bc2..795c8e5c6f5 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_pre.h
+++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h
@@ -185,6 +185,96 @@ typedef struct {
 MPI_Aint data_sz; /* save data_sz to avoid double checking */
 } MPIDI_OFI_am_request_t;
+
+/* define common fields for the next 3 structs and a common macro used to declare these fields.
+ * The macro is "fragile": it must not be used anywhere without understanding the layout it assumes.
+ * note: the missing semicolon on the last line is intentional.
+ */
+
+#define MPIDI_OFI_RNDV_COMMON_FIELDS \
+    const void *buf; \
+    MPI_Aint count; \
+    MPI_Datatype datatype; \
+    /* cached fields */ \
+    bool need_pack; \
+    MPL_pointer_attr_t attr; \
+    MPI_Aint data_sz; \
+    MPI_Aint remote_data_sz; \
+    /* send/recv fields */ \
+    int vci_local; \
+    int vci_remote; \
+    struct MPIDI_av_entry *av; \
+    uint64_t match_bits; \
+    /* only needed by the sender for am_tag_send or for replying to a probe */ \
+    int remote_rank
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+} MPIDI_OFI_rndv_common_t;
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+    union {
+        struct {
+            MPI_Aint copy_offset;
+            int copy_infly;
+            int send_infly;
+        } send;
+        struct {
+            MPI_Aint recv_offset;
+            int recv_infly;
+        } recv;
+    } u;
+    int chunk_index;
+    MPI_Aint remain_sz;
+} MPIDI_OFI_pipeline_t;
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+    MPI_Aint sz_per_nic;
+    union {
+        struct {
+            const void *data;
+            struct fid_mr **mrs;
+        } send;
+        struct {
+            union {
+                void *data;     /* !need_pack */
+                int copy_infly; /* need_pack */
+            } u;
+            uint64_t remote_base;
+            uint64_t *rkeys;
+            MPI_Aint chunks_per_nic;
+            MPI_Aint cur_chunk_index;
+            int num_infly;
+            bool all_issued;
+        } recv;
+    } u;
+} MPIDI_OFI_rndvread_t;
+
+typedef struct {
+    MPIDI_OFI_RNDV_COMMON_FIELDS;
+    MPI_Aint sz_per_nic;
+    union {
+        struct {
+            union {
+                void *data;     /* !need_pack */
+                int copy_infly; /* need_pack */
+            } u;
+            uint64_t remote_base;
+            uint64_t *rkeys;
+            MPI_Aint chunks_per_nic;
+            MPI_Aint cur_chunk_index;
+            int write_infly;
+            MPI_Aint chunks_remain;
+        } send;
+        struct {
+            const void *data;
+            struct fid_mr **mrs;
+        } recv;
+    } u;
+} MPIDI_OFI_rndvwrite_t;
+
 enum MPIDI_OFI_req_kind {
 MPIDI_OFI_req_kind__any,
 MPIDI_OFI_req_kind__probe,
@@ -210,10 +300,6 @@ typedef struct {
 int context_id;
 enum MPIDI_OFI_req_kind kind;
- union {
- struct fid_mr **send_mrs;
- void *remote_info;
- } huge;
 union {
 struct {
 char *pack_buffer;
@@ -226,23 +312,21 @@ typedef struct {
 struct iovec iov;
 void *inject_buf; /* Internal buffer for inject emulation */
 } util;
- struct {
- fi_addr_t remote_addr;
- int ctx_idx;
- int vci_local;
- int chunk_sz;
- bool is_sync;
- uint64_t cq_data;
- uint64_t match_bits;
- uint64_t mask_bits;
- size_t offset;
- size_t data_sz;
- char *pack_recv_buf;
- void *usm_host_buf; /* recv */
- MPIR_Request *req;
- } pipeline_info; /* GPU pipeline */
+} MPIDI_OFI_direct_t;
+
+typedef union {
+    MPIDI_OFI_direct_t direct;
+    MPIDI_OFI_rndv_common_t common;
+    MPIDI_OFI_pipeline_t pipeline;
+    MPIDI_OFI_rndvread_t read;
+    MPIDI_OFI_rndvwrite_t write;
 } MPIDI_OFI_request_t;
+
+#define MPIDI_OFI_AMREQ_COMMON(req) ((req)->dev.ch4.netmod.ofi.common)
+#define MPIDI_OFI_AMREQ_PIPELINE(req) ((req)->dev.ch4.netmod.ofi.pipeline)
+#define
MPIDI_OFI_AMREQ_READ(req) ((req)->dev.ch4.netmod.ofi.read) +#define MPIDI_OFI_AMREQ_WRITE(req) ((req)->dev.ch4.netmod.ofi.write) + typedef struct { int index; } MPIDI_OFI_dt_t; diff --git a/src/mpid/ch4/netmod/ofi/ofi_probe.h b/src/mpid/ch4/netmod/ofi/ofi_probe.h index 8a0d7f98da9..14ad8638740 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_probe.h +++ b/src/mpid/ch4/netmod/ofi/ofi_probe.h @@ -49,7 +49,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_iprobe(int source, } else { MPIDI_OFI_REQUEST(rreq, kind) = MPIDI_OFI_req_kind__probe; } - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = NULL; MPIDI_OFI_REQUEST(rreq, context_id) = comm->recvcontext_id + context_offset; rreq->comm = comm; MPIR_Comm_add_ref(comm); diff --git a/src/mpid/ch4/netmod/ofi/ofi_progress.h b/src/mpid/ch4/netmod/ofi/ofi_progress.h index 1413e34417b..94c3b0a2709 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_progress.h +++ b/src/mpid/ch4/netmod/ofi/ofi_progress.h @@ -82,12 +82,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_progress(int vci, int *made_progress) * to do, so simply return. * NOTE: it is not an error since global progress will poll every vci. */ - return MPI_SUCCESS; + goto fn_exit; } - mpi_errno = MPIDI_OFI_gpu_progress(vci); - MPIR_ERR_CHECK(mpi_errno); - if (unlikely(MPIDI_OFI_has_cq_buffered(vci))) { int num = MPIDI_OFI_get_buffered(vci, wc); mpi_errno = MPIDI_OFI_handle_cq_entries(vci, wc, num); @@ -107,10 +104,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_progress(int vci, int *made_progress) mpi_errno = MPI_SUCCESS; else mpi_errno = MPIDI_OFI_handle_cq_error(vci, nic, ret); + + MPIR_ERR_CHECK(mpi_errno); } if (unlikely(mpi_errno == MPI_SUCCESS && MPIDI_OFI_global.per_vci[vci].deferred_am_isend_q)) { mpi_errno = MPIDI_OFI_handle_deferred_ops(vci); + MPIR_ERR_CHECK(mpi_errno); } } diff --git a/src/mpid/ch4/netmod/ofi/ofi_recv.h b/src/mpid/ch4/netmod/ofi/ofi_recv.h index de2637bdc62..9a78284e0f2 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_recv.h +++ b/src/mpid/ch4/netmod/ofi/ofi_recv.h @@ -165,10 +165,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, MPIR_Comm_add_ref(comm); } - if (!flags) { - MPIDI_OFI_REQUEST(rreq, huge.remote_info) = NULL; /* for huge recv remote info */ - } - /* Calculate the correct NICs. */ receiver_nic = MPIDI_OFI_multx_receiver_nic_index(comm, comm->recvcontext_id, rank, comm->rank, tag); @@ -226,58 +222,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, mpi_errno = MPI_SUCCESS; /* Reset error code */ } - if (force_gpu_pack && MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE && - data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { - /* Pipeline path */ - MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; - /* Only post first recv with pipeline chunk size. */ - char *host_buf = NULL; - MPIDU_genq_private_pool_force_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - MPIR_ERR_CHKANDJUMP1(host_buf == NULL, mpi_errno, - MPI_ERR_OTHER, "**nomem", "**nomem %s", - "Pipeline Init recv alloc"); - - fi_addr_t remote_addr; - if (MPI_ANY_SOURCE == rank) - remote_addr = FI_ADDR_UNSPEC; - else { - int sender_nic = - MPIDI_OFI_multx_sender_nic_index(comm, comm->recvcontext_id, rank, comm->rank, - MPIDI_OFI_init_get_tag(match_bits)); - remote_addr = - MPIDI_OFI_av_to_phys(addr, vci_local, receiver_nic, vci_remote, sender_nic); - } - - /* Save pipeline information. 
*/
- MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) = 0;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = false;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr) = remote_addr;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local) = vci_local;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits) = match_bits;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits) = mask_bits;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.data_sz) = data_sz;
- MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx) = ctx_idx;
-
- /* Save original buf, datatype and count */
- MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) = host_buf;
-
- MPIDI_OFI_gpu_pipeline_request *chunk_req;
- chunk_req = (MPIDI_OFI_gpu_pipeline_request *)
- MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER);
- MPIR_ERR_CHKANDJUMP1(chunk_req == NULL, mpi_errno,
- MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv chunk_req alloc");
- chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT;
- chunk_req->parent = rreq;
- chunk_req->buf = host_buf;
- MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx,
- host_buf,
- MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ,
- NULL, remote_addr, match_bits, mask_bits,
- (void *) &chunk_req->context), vci_local, trecv);
- goto fn_exit;
- }
-
 /* Unpack */
 MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_PACK;
 MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) =
@@ -289,21 +233,18 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf,
 MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) = NULL;
 }
- /* Read ordering unnecessary for context_id, so use relaxed load */
+    if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_PACK)
+        MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV;
+
+    /* Simply posting a large recv buffer can incur an overhead that defeats the benefit of e.g. pipeline.
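+     * (For illustration: with the recommended 1MB eager threshold, a 1GB contiguous
+     * irecv would otherwise be posted to libfabric in full even though the
+     * rendezvous path takes over beyond the eager size.)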
+     * Sender will never directly send a message larger than EAGER_THRESH, so we post at most EAGER_THRESH. */
+    if (data_sz > MPIDI_OFI_EAGER_THRESH && mode != MPIDI_OFI_AM_TAG_RECV) {
+        data_sz = MPIDI_OFI_EAGER_THRESH;
+    }
+
 MPIDI_OFI_REQUEST(rreq, util.iov.iov_base) = recv_buf;
 MPIDI_OFI_REQUEST(rreq, util.iov.iov_len) = data_sz;
- if (unlikely(data_sz >= MPIDI_OFI_global.max_msg_size) && !MPIDI_OFI_COMM(comm).enable_striping) {
- MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_HUGE;
- data_sz = MPIDI_OFI_global.max_msg_size;
- } else if (MPIDI_OFI_COMM(comm).enable_striping &&
- (data_sz >= MPIDI_OFI_global.stripe_threshold)) {
- MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_HUGE;
- /* Receive has to be posted with size MPIDI_OFI_global.stripe_threshold to handle underflow */
- data_sz = MPIDI_OFI_global.stripe_threshold;
- } else if (MPIDI_OFI_REQUEST(rreq, event_id) != MPIDI_OFI_EVENT_RECV_PACK)
- MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV;
-
 if (!flags) {
 fi_addr_t sender_addr;
 if (MPI_ANY_SOURCE == rank) {
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv.c b/src/mpid/ch4/netmod/ofi/ofi_rndv.c
index a3700be5b21..b6fa3b57bd4 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_rndv.c
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv.c
@@ -6,15 +6,64 @@
 #include "mpidimpl.h"
 #include "ofi_am_events.h"
 #include "ofi_events.h"
+#include "ofi_rndv.h"
 #define MPIDI_OFI_CTS_FLAG__NONE 0
 #define MPIDI_OFI_CTS_FLAG__PROBE 1
+#define MPIDI_OFI_CTS_FLAG__NEED_PACK 2
+
+static bool cts_is_probe(int flag)
+{
+    return (flag & MPIDI_OFI_CTS_FLAG__PROBE);
+}
+
+static int get_rndv_protocol(bool send_need_pack, bool recv_need_pack, MPI_Aint recv_data_sz)
+{
+    /* NOTE: some protocols may not work, fall back to auto */
+    switch (MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL) {
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline:
+            return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline;
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read:
+            if (!send_need_pack) {
+                return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read;
+            }
+            break;
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write:
+            if (!recv_need_pack) {
+                return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write;
+            }
+            break;
+        case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct:
+            /* libfabric can't directly send more than max_msg_size. Only the sender knows
+             * both sizes (it receives the CTS), so we use recv_data_sz, which both sides can agree on.
+             * NOTE: psm3 has max_msg_size at 4294963200.
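+             * That is 0x100000000 - 0x1000 (4 GiB minus one 4 KiB page), so e.g. a
+             * 4 GiB message already exceeds it and falls through to the auto selection.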
+             */
+            if (recv_data_sz <= MPIDI_OFI_global.max_msg_size) {
+                return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct;
+            }
+            break;
+    }
+
+    /* auto */
+    if (send_need_pack && recv_need_pack) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline;
+    } else if (send_need_pack) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write;
+    } else if (recv_need_pack) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read;
+    } else if (recv_data_sz < MPIDI_OFI_global.max_msg_size) {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct;
+    } else {
+        return MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read;
+    }
+}
 /* receiver -> sender */
 struct rndv_cts {
 MPIR_Request *rreq;
 int am_tag;
 int flag;
+    MPI_Aint data_sz;
 };
 /* sender -> receiver */
@@ -26,14 +75,22 @@ struct rndv_info_hdr {
 /* ---- receiver side ---- */
-static int rndv_event_common(int vci, MPIR_Request * rreq, int *vci_src_out, int *vci_dst_out)
+int MPIDI_OFI_recv_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
 {
 int mpi_errno = MPI_SUCCESS;
+    MPIR_FUNC_ENTER;
+
+    /* save context_offset for MPIDI_OFI_send_ack */
+    int context_id = MPIDI_OFI_REQUEST(rreq, context_id);
+
+    /* save and free up the OFI request */
+    void *buf = MPIDI_OFI_REQUEST(rreq, buf);
+    MPI_Aint count = MPIDI_OFI_REQUEST(rreq, count);
+    MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, datatype);
 /* if we were expecting an eager send, free the unneeded pack_buffer or iovs array */
 switch (MPIDI_OFI_REQUEST(rreq, event_id)) {
 case MPIDI_OFI_EVENT_RECV_PACK:
- case MPIDI_OFI_EVENT_RECV_HUGE:
 MPL_free(MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer));
 break;
 case MPIDI_OFI_EVENT_RECV_NOPACK:
@@ -41,61 +98,76 @@
 break;
 }
- /* save and free up the OFI request */
- void *buf = MPIDI_OFI_REQUEST(rreq, buf);
- MPI_Aint count = MPIDI_OFI_REQUEST(rreq, count);
- MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, datatype);;
+    int dt_contig;
+    MPIR_Datatype_is_contig(datatype, &dt_contig);
+
+    MPL_pointer_attr_t attr;
+    MPIR_GPU_query_pointer_attr(buf, &attr);
+
+    MPI_Aint data_sz;
+    MPIR_Datatype_get_size_macro(datatype, data_sz);
+    data_sz *= count;
 MPIR_Comm *comm = rreq->comm;
 int src_rank = rreq->status.MPI_SOURCE;
 int tag = rreq->status.MPI_TAG;
 int vci_src = MPIDI_get_vci(SRC_VCI_FROM_RECVER, comm, src_rank, comm->rank, tag);
 int vci_dst = MPIDI_get_vci(DST_VCI_FROM_RECVER, comm, src_rank, comm->rank, tag);
- MPIR_Assert(vci == vci_dst);
-
- /* TODO: optimize - since we are going to use am_tag_recv, we won't need most of the MPIDIG request fields */
- mpi_errno = MPIDIG_request_init_internal(rreq, vci_dst /* local */ , vci_src /* remote */);
- MPIR_ERR_CHECK(mpi_errno);
- MPIDIG_REQUEST(rreq, buffer) = buf;
- MPIDIG_REQUEST(rreq, count) = count;
- MPIDIG_REQUEST(rreq, datatype) = datatype;
-
- fn_exit:
- *vci_src_out = vci_src;
- *vci_dst_out = vci_dst;
- return mpi_errno;
- fn_fail:
- goto fn_exit;
-}
-
-int MPIDI_OFI_recv_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq)
-{
- int mpi_errno = MPI_SUCCESS;
- MPIR_FUNC_ENTER;
+    MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(rreq);
+    p->buf = buf;
+    p->count = count;
+    p->datatype = datatype;
+    p->need_pack = MPIDI_OFI_rndv_need_pack(dt_contig, &attr);
+    p->attr = attr;
+    p->data_sz = data_sz;
+    p->vci_local = vci_dst;
+    p->vci_remote = vci_src;
+    p->av = MPIDIU_comm_rank_to_av(comm, src_rank);
+
+    MPI_Aint
remote_data_sz = MPIDI_OFI_idata_get_size(wc->data); + if (remote_data_sz > 0) { + p->remote_data_sz = remote_data_sz; + MPIDI_OFI_RNDV_update_count(rreq, remote_data_sz); + } else { + /* mark remote_data_sz as unknown */ + p->remote_data_sz = -1; + } - /* save context_offset for MPIDI_OFI_send_ack */ - int context_id = MPIDI_OFI_REQUEST(rreq, context_id); + int am_tag = MPIDIG_get_next_am_tag(comm); + p->match_bits = MPIDI_OFI_init_sendtag(comm->recvcontext_id, 0, am_tag) | MPIDI_OFI_AM_SEND; - /* Convert rreq to an MPIDIG request */ - int vci_src, vci_dst; - mpi_errno = rndv_event_common(vci, rreq, &vci_src, &vci_dst); - MPIR_ERR_CHECK(mpi_errno); + bool send_need_pack = MPIDI_OFI_is_tag_rndv_pack(wc->tag); + bool recv_need_pack = p->need_pack; /* prepare rndv_cts */ struct rndv_cts hdr; hdr.rreq = rreq; - hdr.am_tag = MPIDIG_get_next_am_tag(rreq->comm); + hdr.am_tag = am_tag; hdr.flag = MPIDI_OFI_CTS_FLAG__NONE; + hdr.data_sz = data_sz; + if (recv_need_pack) { + hdr.flag |= MPIDI_OFI_CTS_FLAG__NEED_PACK; + } - /* am_tag_recv */ - mpi_errno = MPIDI_NM_am_tag_recv(rreq->status.MPI_SOURCE, rreq->comm, - MPIDIG_TAG_RECV_COMPLETE, hdr.am_tag, - MPIDIG_REQUEST(rreq, buffer), - MPIDIG_REQUEST(rreq, count), MPIDIG_REQUEST(rreq, datatype), - vci_src, vci_dst, rreq); + switch (get_rndv_protocol(send_need_pack, recv_need_pack, p->data_sz)) { + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline: + mpi_errno = MPIDI_OFI_pipeline_recv(rreq, hdr.am_tag, vci_src, vci_dst); + break; + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read: + mpi_errno = MPIDI_OFI_rndvread_recv(rreq, hdr.am_tag, vci_src, vci_dst); + break; + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write: + mpi_errno = MPIDI_OFI_rndvwrite_recv(rreq, hdr.am_tag, vci_src, vci_dst); + break; + case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct: + /* fall through */ + default: + mpi_errno = MPIDI_NM_am_tag_recv(rreq->status.MPI_SOURCE, rreq->comm, + -1, hdr.am_tag, + (void *) p->buf, p->count, p->datatype, + vci_src, vci_dst, rreq); + } MPIR_ERR_CHECK(mpi_errno); /* send cts */ @@ -118,11 +190,6 @@ int MPIDI_OFI_peek_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Reque /* save context_offset for MPIDI_OFI_send_ack */ int context_id = MPIDI_OFI_REQUEST(rreq, context_id); - /* Convert rreq to an MPIDIG request */ - int vci_src, vci_dst; - mpi_errno = rndv_event_common(vci, rreq, &vci_src, &vci_dst); - MPIR_ERR_CHECK(mpi_errno); - MPI_Aint data_sz; data_sz = MPIDI_OFI_idata_get_size(wc->data); @@ -136,6 +203,7 @@ int MPIDI_OFI_peek_rndv_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Reque hdr.rreq = rreq; hdr.am_tag = -1; /* don't issue am_tag_recv yet */ hdr.flag = MPIDI_OFI_CTS_FLAG__PROBE; + hdr.data_sz = 0; mpi_errno = MPIDI_OFI_send_ack(rreq, context_id, &hdr, sizeof(hdr)); MPIR_ERR_CHECK(mpi_errno); @@ -172,31 +240,57 @@ int MPIDI_OFI_rndv_info_handler(void *am_hdr, void *data, MPI_Aint in_data_sz, u /* ---- sender side ---- */ -int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req) +int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * r) { int mpi_errno = MPI_SUCCESS; MPIR_FUNC_ENTER; - MPIDI_OFI_ack_request_t *ack_req = (MPIDI_OFI_ack_request_t *) req; + MPIDI_OFI_ack_request_t *ack_req = (MPIDI_OFI_ack_request_t *) r; MPIR_Request *sreq = ack_req->signal_req; struct rndv_cts *hdr = ack_req->ack_hdr; - - /* sreq is already an MPIDIG request (ref. ofi_send.h) */ - if (hdr->flag == MPIDI_OFI_CTS_FLAG__NONE) { - /* issue MPIDI_NM_am_tag_send, ref. 
MPIDIG_send_cts_target_msg_cb */
- mpi_errno = MPIDI_NM_am_tag_send(MPIDIG_REQUEST(sreq, u.send.dest), sreq->comm,
- MPIDIG_SEND_DATA, hdr->am_tag,
- MPIDIG_REQUEST(sreq, buffer),
- MPIDIG_REQUEST(sreq, count),
- MPIDIG_REQUEST(sreq, datatype),
- MPIDIG_REQUEST(sreq, req->local_vci),
- MPIDIG_REQUEST(sreq, req->remote_vci), sreq);
-
+    MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(sreq);
+
+    p->remote_data_sz = hdr->data_sz;
+    p->match_bits =
+        MPIDI_OFI_init_sendtag(sreq->comm->context_id, 0, hdr->am_tag) | MPIDI_OFI_AM_SEND;
+
+    if (!cts_is_probe(hdr->flag)) {
+        bool send_need_pack = p->need_pack;
+        bool recv_need_pack = hdr->flag & MPIDI_OFI_CTS_FLAG__NEED_PACK;
+
+        switch (get_rndv_protocol(send_need_pack, recv_need_pack, hdr->data_sz)) {
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_pipeline:
+                mpi_errno = MPIDI_OFI_pipeline_send(sreq, hdr->am_tag);
+                break;
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_read:
+                mpi_errno = MPIDI_OFI_rndvread_send(sreq, hdr->am_tag);
+                break;
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_write:
+                mpi_errno = MPIDI_OFI_rndvwrite_send(sreq, hdr->am_tag);
+                break;
+            case MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL_direct:
+                /* fall through */
+            default:
+                if (p->data_sz < MPIDI_OFI_global.max_msg_size) {
+                    mpi_errno = MPIDI_NM_am_tag_send(p->remote_rank, sreq->comm, -1, hdr->am_tag,
+                                                     p->buf, p->count, p->datatype,
+                                                     p->vci_local, p->vci_remote, sreq);
+                } else {
+                    /* Only contig data here (if this ever changes, FIXME) -
+                     * Send max_msg_size and receiver will get the truncation error. */
+                    MPI_Aint true_extent, true_lb;
+                    MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent);
+                    void *data = MPIR_get_contig_ptr(p->buf, true_lb);
+                    mpi_errno = MPIDI_NM_am_tag_send(p->remote_rank, sreq->comm, -1, hdr->am_tag,
+                                                     data, MPIDI_OFI_global.max_msg_size,
+                                                     MPIR_BYTE_INTERNAL,
+                                                     p->vci_local, p->vci_remote, sreq);
+                }
+        }
 MPL_free(ack_req->ack_hdr);
 MPL_free(ack_req);
 } else {
- MPIR_Assert(hdr->flag == MPIDI_OFI_CTS_FLAG__PROBE);
 /* re-issue the ack recv */
 MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ack_req->ctx_idx].rx,
 ack_req->ack_hdr, ack_req->ack_hdr_sz, NULL,
@@ -207,12 +301,10 @@ int MPIDI_OFI_rndv_cts_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Reques
 struct rndv_info_hdr rndv_info;
 rndv_info.sreq = sreq;
 rndv_info.rreq = hdr->rreq;
- MPIDI_Datatype_check_size(MPIDIG_REQUEST(sreq, datatype), MPIDIG_REQUEST(sreq, count),
- rndv_info.data_sz);
- mpi_errno = MPIDI_NM_am_send_hdr_reply(sreq->comm, MPIDIG_REQUEST(sreq, u.send.dest),
+        rndv_info.data_sz = p->data_sz;
+        mpi_errno = MPIDI_NM_am_send_hdr_reply(sreq->comm, p->remote_rank,
 MPIDI_OFI_RNDV_INFO, &rndv_info, sizeof(rndv_info),
- MPIDIG_REQUEST(sreq, req->local_vci),
- MPIDIG_REQUEST(sreq, req->remote_vci));
+                                               p->vci_local, p->vci_remote);
 }
 fn_exit:
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv.h b/src/mpid/ch4/netmod/ofi/ofi_rndv.h
new file mode 100644
index 00000000000..e36f9eef273
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#ifndef OFI_RNDV_H_INCLUDED
+#define OFI_RNDV_H_INCLUDED
+
+/*
+=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
+
+cvars:
+    - name : MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL
+      category : CH4_OFI
+      type : enum
+      default : auto
+      class : none
+      verbosity : MPI_T_VERBOSITY_USER_BASIC
+      scope : MPI_T_SCOPE_LOCAL
+      description : |-
+        When the message size is greater than MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD,
+        this CVAR specifies the large message protocol.
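+        Like any CVAR it can be set through the environment at launch, e.g.
+        MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read mpiexec -n 2 ./app (launcher and
+        application names are illustrative). The available protocols: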
+        auto - decide protocols based on buffer attributes and datatypes.
+        pipeline - use pipeline protocol (forcing pack and unpack).
+        read - RDMA read.
+        write - RDMA write.
+        direct - send the data directly using libfabric after the RNDV handshake.
+=== END_MPI_T_CVAR_INFO_BLOCK ===
+*/
+
+int MPIDI_OFI_pipeline_send(MPIR_Request * sreq, int tag);
+int MPIDI_OFI_pipeline_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+int MPIDI_OFI_rndvread_send(MPIR_Request * sreq, int tag);
+int MPIDI_OFI_rndvread_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+int MPIDI_OFI_rndvwrite_send(MPIR_Request * sreq, int tag);
+int MPIDI_OFI_rndvwrite_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+
+typedef struct {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPIR_Request *req;  /* sreq or rreq */
+    char hdr[];
+} MPIDI_OFI_RNDV_control_req_t;
+
+MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_RNDV_send_hdr(void *hdr, int hdr_sz, MPIDI_av_entry_t * av,
+                                                     int vci_local, int vci_remote,
+                                                     uint64_t match_bits)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    /* control messages always use nic 0 */
+    int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, 0);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, 0, vci_remote, 0);
+    MPIDI_OFI_CALL_RETRY(fi_tinject(MPIDI_OFI_global.ctx[ctx_idx].tx,
+                                    hdr, hdr_sz, addr, match_bits), vci_local, tinject);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_RNDV_recv_hdr(MPIR_Request * parent_request, int event_id,
+                                                     int hdr_sz, MPIDI_av_entry_t * av,
+                                                     int vci_local, int vci_remote,
+                                                     uint64_t match_bits)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIDI_OFI_RNDV_control_req_t *control;
+    control = MPL_malloc(sizeof(MPIDI_OFI_RNDV_control_req_t) + hdr_sz, MPL_MEM_OTHER);
+    MPIR_Assertp(control);
+
+    control->event_id = event_id;
+    control->req = parent_request;
+
+    /* control messages always use nic 0 */
+    int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, 0);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, 0, vci_remote, 0);
+
+    MPIDI_OFI_CALL_RETRY(fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx,
+                                  control->hdr, hdr_sz, NULL,
+                                  addr, match_bits, 0ULL, (void *) &control->context),
+                         vci_local, trecv);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_RNDV_update_count(MPIR_Request * rreq, MPI_Aint data_sz)
+{
+    MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(rreq);
+
+    MPIR_STATUS_SET_COUNT(rreq->status, data_sz);
+    if (data_sz > p->data_sz) {
+        rreq->status.MPI_ERROR = MPI_ERR_TRUNCATE;
+    }
+}
+
+#define MPIDI_OFI_RNDV_GET_CONTROL_HDR(r) ((void *)((MPIDI_OFI_RNDV_control_req_t *)(r))->hdr)
+#define MPIDI_OFI_RNDV_GET_CONTROL_REQ(r) ((MPIDI_OFI_RNDV_control_req_t *)(r))->req
+
+#endif /* OFI_RNDV_H_INCLUDED */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv_rdma_common.inc b/src/mpid/ch4/netmod/ofi/ofi_rndv_rdma_common.inc
new file mode 100644
index 00000000000..77d4fdfdb6b
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv_rdma_common.inc
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+struct rdma_info {
+    MPI_Aint data_sz;
+    uint64_t base;      /* 0 unless MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS is true */
+    int num_nics;       /* redundant since we assume sender/receiver agree on num_nics */
+    uint64_t rkeys[];
+};
+
+static MPI_Aint get_chunks_per_nic(MPI_Aint data_sz, int num_nics);
+
+static int prepare_rdma_info(const void *buf,
MPI_Datatype datatype, MPI_Aint data_sz, int vci,
+                             int access, struct fid_mr **mrs, struct rdma_info *hdr)
+{
+    int mpi_errno = MPI_SUCCESS;
+    const void *data;
+    MPI_Aint true_extent, true_lb;
+    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
+    data = MPIR_get_contig_ptr(buf, true_lb);
+
+    int num_nics = MPIDI_OFI_global.num_nics;
+    uint64_t rkeys[MPIDI_OFI_MAX_NICS];
+
+    /* prepare mr and rkey */
+    if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) {
+        /* Set up a memory region for the lmt data transfer */
+        for (int i = 0; i < num_nics; i++) {
+            rkeys[i] = MPIDI_OFI_mr_key_alloc(MPIDI_OFI_LOCAL_MR_KEY, MPIDI_OFI_INVALID_MR_KEY);
+            MPIR_ERR_CHKANDJUMP(rkeys[i] == MPIDI_OFI_INVALID_MR_KEY, mpi_errno,
+                                MPI_ERR_OTHER, "**ofid_mr_key");
+        }
+    } else {
+        /* zero them to avoid warnings */
+        for (int i = 0; i < num_nics; i++) {
+            rkeys[i] = 0;
+        }
+    }
+    MPI_Aint chunks_per_nic = get_chunks_per_nic(data_sz, num_nics);
+    MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+    MPI_Aint sz_per_nic = chunks_per_nic * chunk_sz;
+    for (int i = 0; i < num_nics; i++) {
+        MPIDI_OFI_context_t *ctx = &MPIDI_OFI_global.ctx[MPIDI_OFI_get_ctx_index(vci, i)];
+        /* note: fi_mr_reg is expensive; register each nic's slice rather than the whole buffer */
+        void *nic_data = (char *) data + i * sz_per_nic;
+        MPI_Aint sz = (i != num_nics - 1) ? sz_per_nic : (data_sz - i * sz_per_nic);
+        MPIDI_OFI_CALL(fi_mr_reg(ctx->domain, nic_data, sz, access, 0ULL,
+                                 rkeys[i], 0ULL, &mrs[i], NULL), mr_reg);
+        mpi_errno = MPIDI_OFI_mr_bind(MPIDI_OFI_global.prov_use[0], mrs[i], ctx->ep,
+                                      NULL);
+        MPIR_ERR_CHECK(mpi_errno);
+    }
+    if (MPIDI_OFI_ENABLE_MR_PROV_KEY) {
+        for (int i = 0; i < num_nics; i++) {
+            rkeys[i] = fi_mr_key(mrs[i]);
+        }
+    }
+
+    /* prepare rdma_info */
+    hdr->data_sz = data_sz;
+    hdr->base = (uintptr_t) data;
+    for (int i = 0; i < num_nics; i++) {
+        hdr->rkeys[i] = rkeys[i];
+    }
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+/* utility routine for calculating chunks */
+/* Each nic is assigned chunks_per_nic chunks.
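+ * (Worked example with illustrative numbers: 10 chunks over 4 nics gives
+ * chunks_per_nic = ceil(10/4) = 3; nics 0-2 carry 3 chunks each, while nic 3
+ * carries only the one remaining chunk -- its other two slots fall past data_sz
+ * and get_chunk_offsets reports them as zero-size chunks, which callers skip.)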
The last nic may have fewer chunks */
+
+static MPI_Aint get_chunks_per_nic(MPI_Aint data_sz, int num_nics)
+{
+    MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+    MPI_Aint num_chunks = data_sz / chunk_sz;
+    if (chunk_sz * num_chunks < data_sz) {
+        num_chunks++;
+    }
+
+    if (num_nics == 1) {
+        return num_chunks;
+    } else {
+        MPI_Aint chunks_per_nic = num_chunks / num_nics;
+        if (chunks_per_nic * num_nics < num_chunks) {
+            chunks_per_nic++;
+        }
+        return chunks_per_nic;
+    }
+}
+
+static void get_chunk_offsets(MPI_Aint chunk_index, int num_nics, MPI_Aint chunks_per_nic,
+                              MPI_Aint data_sz, MPI_Aint * total_offset_out, int *nic_out,
+                              MPI_Aint * nic_offset_out, MPI_Aint * chunk_sz_out)
+{
+    MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ;
+    if (num_nics == 1) {
+        *nic_out = 0;
+        *nic_offset_out = *total_offset_out = chunk_index * chunk_sz;
+    } else {
+        int nic = chunk_index % num_nics;
+        MPI_Aint nic_chunk_index = chunk_index / num_nics;
+        *total_offset_out = (nic * chunks_per_nic + nic_chunk_index) * chunk_sz;
+        *nic_offset_out = nic_chunk_index * chunk_sz;
+    }
+    if (*total_offset_out + chunk_sz < data_sz) {
+        *chunk_sz_out = chunk_sz;
+    } else {
+        /* incomplete chunks */
+        *chunk_sz_out = MPL_MAX(0, data_sz - *total_offset_out);
+    }
+}
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv_read.c b/src/mpid/ch4/netmod/ofi/ofi_rndv_read.c
new file mode 100644
index 00000000000..b2d0cb5653f
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv_read.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpidimpl.h"
+#include "ofi_impl.h"
+#include "ofi_events.h"
+#include "ofi_rndv.h"
+
+#include "ofi_rndv_rdma_common.inc"
+
+#define MPIDI_OFI_RNDVREAD_INFLY_CHUNKS 10
+
+static int rndvread_read_poll(MPIX_Async_thing thing);
+static int recv_issue_read(MPIR_Request * parent_request, int event_id,
+                           void *buf, MPI_Aint data_sz, MPI_Aint offset,
+                           MPIDI_av_entry_t * av, int vci_local, int vci_remote, int nic,
+                           MPI_Aint remote_disp, uint64_t rkey);
+static int async_recv_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                           void *buf, MPI_Aint count, MPI_Datatype datatype,
+                           MPI_Aint offset, MPL_pointer_attr_t * attr);
+static int recv_copy_poll(MPIX_Async_thing thing);
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz);
+static int check_recv_complete(MPIR_Request * rreq);
+
+/* -- sender side -- */
+
+int MPIDI_OFI_rndvread_send(MPIR_Request * sreq, int tag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(sreq);
+    MPIR_FUNC_ENTER;
+
+    MPIR_Assert(!p->need_pack);
+
+    MPI_Aint true_extent, true_lb;
+    MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent);
+    p->u.send.data = MPIR_get_contig_ptr(p->buf, true_lb);
+
+    int num_nics = MPIDI_OFI_global.num_nics;
+    p->u.send.mrs = MPL_malloc((num_nics * sizeof(struct fid_mr *)), MPL_MEM_OTHER);
+
+    int hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t);
+    struct rdma_info *hdr = MPL_malloc(hdr_sz, MPL_MEM_OTHER);
+    MPIR_Assertp(hdr);
+
+    mpi_errno = prepare_rdma_info((void *) p->buf, p->datatype, p->data_sz, p->vci_local,
+                                  FI_REMOTE_READ, p->u.send.mrs, hdr);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    /* send rdma_info */
+    mpi_errno = MPIDI_OFI_RNDV_send_hdr(hdr, hdr_sz,
+                                        p->av, p->vci_local, p->vci_remote, p->match_bits);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    MPL_free(hdr);
+
+    /* issue recv for ack */
+    mpi_errno = MPIDI_OFI_RNDV_recv_hdr(sreq, MPIDI_OFI_EVENT_RNDVREAD_ACK, 0,
+
p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvread_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Request *sreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(sreq); + + int num_nics = MPIDI_OFI_global.num_nics; + for (int i = 0; i < num_nics; i++) { + uint64_t key = fi_mr_key(p->u.send.mrs[i]); + MPIDI_OFI_CALL(fi_close(&p->u.send.mrs[i]->fid), mr_unreg); + if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { + MPIDI_OFI_mr_key_free(MPIDI_OFI_LOCAL_MR_KEY, key); + } + } + MPL_free(p->u.send.mrs); + MPL_free(r); + + /* complete sreq */ + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(sreq); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +/* -- receiver side -- */ + +int MPIDI_OFI_rndvread_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq); + MPIR_FUNC_ENTER; + + if (!p->need_pack) { + MPI_Aint true_extent, true_lb; + MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent); + p->u.recv.u.data = MPIR_get_contig_ptr(p->buf, true_lb); + } else { + p->u.recv.u.copy_infly = 0; + } + + /* recv the mrs */ + int num_nics = MPIDI_OFI_global.num_nics; + MPI_Aint hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t); + mpi_errno = MPIDI_OFI_RNDV_recv_hdr(rreq, MPIDI_OFI_EVENT_RNDVREAD_RECV_MRS, hdr_sz, + p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvread_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + struct rdma_info *hdr = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r); + MPIR_Request *rreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq); + + MPIDI_OFI_RNDV_update_count(rreq, hdr->data_sz); + + int num_nics = MPIDI_OFI_global.num_nics; + p->remote_data_sz = MPL_MIN(hdr->data_sz, p->data_sz); + p->u.recv.remote_base = hdr->base; + p->u.recv.rkeys = MPL_malloc(num_nics * sizeof(uint64_t), MPL_MEM_OTHER); + for (int i = 0; i < num_nics; i++) { + p->u.recv.rkeys[i] = hdr->rkeys[i]; + } + + MPL_free(r); + + /* setup chunks */ + p->u.recv.chunks_per_nic = get_chunks_per_nic(p->remote_data_sz, num_nics); + + p->u.recv.cur_chunk_index = 0; + p->u.recv.num_infly = 0; + + /* issue fi_read */ + mpi_errno = MPIR_Async_things_add(rndvread_read_poll, rreq, NULL); + + return mpi_errno; +} + +static int rndvread_read_poll(MPIX_Async_thing thing) +{ + int ret = MPIX_ASYNC_NOPROGRESS; + int mpi_errno = MPI_SUCCESS; + MPIR_Request *rreq = MPIR_Async_thing_get_state(thing); + MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq); + + /* CS required for genq pool and gpu imemcpy */ + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local)); + + int num_nics = MPIDI_OFI_global.num_nics; + while (p->u.recv.cur_chunk_index < p->u.recv.chunks_per_nic * num_nics) { + if (p->u.recv.num_infly >= MPIDI_OFI_RNDVREAD_INFLY_CHUNKS) { + goto fn_exit; + } + int nic; + MPI_Aint total_offset, nic_offset, chunk_sz; + get_chunk_offsets(p->u.recv.cur_chunk_index, num_nics, + p->u.recv.chunks_per_nic, p->remote_data_sz, + &total_offset, &nic, &nic_offset, &chunk_sz); + + if (chunk_sz > 0) { + void *read_buf; + if (p->need_pack) { + /* alloc a 
chunk */
+                MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.
+                                                   per_vci[p->vci_local].pipeline_pool, &read_buf);
+                if (!read_buf) {
+                    goto fn_exit;
+                }
+            } else {
+                read_buf = (char *) p->u.recv.u.data + total_offset;
+            }
+            uint64_t disp;
+            if (MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) {
+                disp = p->u.recv.remote_base + total_offset;
+            } else {
+                disp = nic_offset;
+            }
+            mpi_errno = recv_issue_read(rreq, MPIDI_OFI_EVENT_RNDVREAD_READ_CHUNK,
+                                        read_buf, chunk_sz, total_offset,
+                                        p->av, p->vci_local, p->vci_remote, nic, disp,
+                                        p->u.recv.rkeys[nic]);
+            MPIR_ERR_CHECK(mpi_errno);
+            p->u.recv.num_infly++;
+        }
+        p->u.recv.cur_chunk_index++;
+    }
+
+    p->u.recv.all_issued = true;
+    ret = MPIX_ASYNC_DONE;
+
+  fn_exit:
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    return ret;
+  fn_fail:
+    ret = MPIX_ASYNC_NOPROGRESS;
+    goto fn_exit;
+}
+
+struct read_req {
+    char pad[MPIDI_REQUEST_HDR_SIZE];
+    struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS];
+    int event_id;
+    MPIR_Request *rreq;
+    /* In case we need to unpack after the read */
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+    MPI_Aint offset;
+};
+
+static int recv_issue_read(MPIR_Request * parent_request, int event_id,
+                           void *buf, MPI_Aint data_sz, MPI_Aint offset,
+                           MPIDI_av_entry_t * av, int vci_local, int vci_remote, int nic,
+                           MPI_Aint remote_disp, uint64_t rkey)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    struct read_req *r = MPL_malloc(sizeof(struct read_req), MPL_MEM_OTHER);
+    MPIR_Assertp(r);
+
+    r->event_id = event_id;
+    r->rreq = parent_request;
+    r->chunk_buf = buf;
+    r->chunk_sz = data_sz;
+    r->offset = offset;
+
+    /* issue the read on the nic chosen for this chunk */
+    int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic);
+    fi_addr_t addr = MPIDI_OFI_av_to_phys(av, vci_local, nic, vci_remote, nic);
+
+    MPIDI_OFI_CALL_RETRY(fi_read(MPIDI_OFI_global.ctx[ctx_idx].tx,
+                                 buf, data_sz, NULL,
+                                 addr, remote_disp, rkey, (void *) &r->context),
+                         vci_local, rdma_readfrom);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+int MPIDI_OFI_rndvread_read_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r)
+{
+    int mpi_errno = MPI_SUCCESS;
+    struct read_req *t = (void *) r;
+    MPIR_Request *rreq = t->rreq;
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq);
+
+    p->u.recv.num_infly--;
+    if (!p->need_pack) {
+        check_recv_complete(rreq);
+    } else {
+        /* async copy */
+        mpi_errno = async_recv_copy(rreq, t->chunk_buf, t->chunk_sz,
+                                    (void *) p->buf, p->count, p->datatype, t->offset, &p->attr);
+        p->u.recv.u.copy_infly++;
+
+    }
+    MPL_free(r);
+    return mpi_errno;
+}
+
+struct recv_copy {
+    MPIR_Request *rreq;
+    MPIR_gpu_req async_req;
+    void *chunk_buf;
+    MPI_Aint chunk_sz;
+};
+
+static int async_recv_copy(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz,
+                           void *buf, MPI_Aint count, MPI_Datatype datatype,
+                           MPI_Aint offset, MPL_pointer_attr_t * attr)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    MPIR_gpu_req async_req;
+    int engine_type = MPIDI_OFI_gpu_get_recv_engine_type();
+    int copy_dir = MPL_GPU_COPY_DIRECTION_NONE;
+    mpi_errno = MPIR_Ilocalcopy_gpu(chunk_buf, chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL,
+                                    buf, count, datatype, offset, attr,
+                                    copy_dir, engine_type, 1, &async_req);
+    MPIR_ERR_CHECK(mpi_errno);
+
+    struct recv_copy *p = MPL_malloc(sizeof(struct recv_copy), MPL_MEM_OTHER);
+    p->rreq = rreq;
+    p->async_req = async_req;
+    p->chunk_buf = chunk_buf;
+    p->chunk_sz = chunk_sz;
+
+    mpi_errno = MPIR_Async_things_add(recv_copy_poll, p, NULL);
+
+  fn_exit:
+    return mpi_errno;
+  fn_fail:
+    goto fn_exit;
+}
+
+static int
recv_copy_poll(MPIX_Async_thing thing)
+{
+    struct recv_copy *p = MPIR_Async_thing_get_state(thing);
+
+    int is_done = 0;
+    MPIR_async_test(&(p->async_req), &is_done);
+
+    if (!is_done) {
+        return MPIX_ASYNC_NOPROGRESS;
+    } else {
+        recv_copy_complete(p->rreq, p->chunk_buf, p->chunk_sz);
+        MPL_free(p);
+        return MPIX_ASYNC_DONE;
+    }
+}
+
+static void recv_copy_complete(MPIR_Request * rreq, void *chunk_buf, MPI_Aint chunk_sz)
+{
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq);
+
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local));
+    MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool,
+                                      chunk_buf);
+
+    p->u.recv.u.copy_infly--;
+    check_recv_complete(rreq);
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local));
+}
+
+static int check_recv_complete(MPIR_Request * rreq)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_rndvread_t *p = &MPIDI_OFI_AMREQ_READ(rreq);
+    if (p->u.recv.all_issued && p->u.recv.num_infly == 0 &&
+        (!p->need_pack || p->u.recv.u.copy_infly == 0)) {
+        /* done. send ack */
+        mpi_errno = MPIDI_OFI_RNDV_send_hdr(NULL, 0, p->av, p->vci_local, p->vci_remote,
+                                            p->match_bits);
+        /* complete request */
+        MPL_free(p->u.recv.rkeys);
+        MPIR_Datatype_release_if_not_builtin(p->datatype);
+        MPIDI_Request_complete_fast(rreq);
+    }
+    return mpi_errno;
+}
diff --git a/src/mpid/ch4/netmod/ofi/ofi_rndv_write.c b/src/mpid/ch4/netmod/ofi/ofi_rndv_write.c
new file mode 100644
index 00000000000..f7251daa840
--- /dev/null
+++ b/src/mpid/ch4/netmod/ofi/ofi_rndv_write.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) by Argonne National Laboratory
+ *     See COPYRIGHT in top-level directory
+ */
+
+#include "mpidimpl.h"
+#include "ofi_impl.h"
+#include "ofi_events.h"
+#include "ofi_rndv.h"
+
+#include "ofi_rndv_rdma_common.inc"
+
+#define MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS 10
+
+static int rndvwrite_write_poll(MPIX_Async_thing thing);
+static int async_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, int nic, uint64_t disp,
+                           void *chunk_buf, MPI_Aint chunk_sz,
+                           const void *buf, MPI_Aint count, MPI_Datatype datatype,
+                           MPI_Aint offset, MPL_pointer_attr_t * attr);
+static int send_copy_poll(MPIX_Async_thing thing);
+static int send_issue_write(MPIR_Request * sreq, void *buf, MPI_Aint data_sz,
+                            int nic, MPI_Aint disp);
+
+/* export functions:
+ *     int MPIDI_OFI_rndvwrite_send(MPIR_Request * sreq, int tag);
+ *     int MPIDI_OFI_rndvwrite_write_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r);
+ *     int MPIDI_OFI_rndvwrite_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r);
+ *
+ *     int MPIDI_OFI_rndvwrite_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst);
+ *     int MPIDI_OFI_rndvwrite_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r);
+ */
+/* -- sender side -- */
+
+int MPIDI_OFI_rndvwrite_send(MPIR_Request * sreq, int tag)
+{
+    int mpi_errno = MPI_SUCCESS;
+    MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq);
+    MPIR_FUNC_ENTER;
+
+    if (!p->need_pack) {
+        MPI_Aint true_extent, true_lb;
+        MPIR_Type_get_true_extent_impl(p->datatype, &true_lb, &true_extent);
+        p->u.send.u.data = MPIR_get_contig_ptr(p->buf, true_lb);
+    } else {
+        p->u.send.u.copy_infly = 0;
+    }
+
+    /* recv the mrs */
+    int num_nics = MPIDI_OFI_global.num_nics;
+    MPI_Aint hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t);
+    mpi_errno = MPIDI_OFI_RNDV_recv_hdr(sreq, MPIDI_OFI_EVENT_RNDVWRITE_RECV_MRS, hdr_sz,
+                                        p->av, p->vci_local, p->vci_remote, p->match_bits);
+    MPIR_ERR_CHECK(mpi_errno);
+
+  fn_exit:
+    MPIR_FUNC_EXIT;
+    return mpi_errno;
+  fn_fail:
+
goto fn_exit; +} + +int MPIDI_OFI_rndvwrite_recv_mrs_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + struct rdma_info *hdr = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r); + MPIR_Request *sreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + int num_nics = MPIDI_OFI_global.num_nics; + p->remote_data_sz = MPL_MIN(hdr->data_sz, p->data_sz); + p->u.send.remote_base = hdr->base; + p->u.send.rkeys = MPL_malloc(num_nics * sizeof(uint64_t), MPL_MEM_OTHER); + for (int i = 0; i < num_nics; i++) { + p->u.send.rkeys[i] = hdr->rkeys[i]; + } + + MPL_free(r); + + /* setup chunks */ + p->u.send.chunks_per_nic = get_chunks_per_nic(p->remote_data_sz, num_nics); + p->u.send.chunks_remain = p->u.send.chunks_per_nic * num_nics; + + p->u.send.cur_chunk_index = 0; + p->u.send.write_infly = 0; + + /* issue fi_write */ + mpi_errno = MPIR_Async_things_add(rndvwrite_write_poll, sreq, NULL); + + return mpi_errno; +} + +static int rndvwrite_write_poll(MPIX_Async_thing thing) +{ + int ret = MPIX_ASYNC_NOPROGRESS; + int mpi_errno = MPI_SUCCESS; + MPIR_Request *sreq = MPIR_Async_thing_get_state(thing); + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + /* CS required for genq pool and gpu imemcpy */ + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(p->vci_local)); + + int num_nics = MPIDI_OFI_global.num_nics; + while (p->u.send.cur_chunk_index < p->u.send.chunks_per_nic * num_nics) { + int nic; + MPI_Aint total_offset, nic_offset, chunk_sz; + get_chunk_offsets(p->u.send.cur_chunk_index, num_nics, + p->u.send.chunks_per_nic, p->remote_data_sz, + &total_offset, &nic, &nic_offset, &chunk_sz); + + if (chunk_sz <= 0) { + p->u.send.cur_chunk_index++; + p->u.send.chunks_remain--; + continue; + } + + uint64_t disp; + if (MPIDI_OFI_ENABLE_MR_VIRT_ADDRESS) { + disp = p->u.send.remote_base + total_offset; + } else { + disp = nic_offset; + } + + if (p->need_pack) { + if (p->u.send.u.copy_infly >= MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS) { + goto fn_exit; + } + + /* alloc a chunk */ + void *chunk_buf; + MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool, + &chunk_buf); + if (!chunk_buf) { + goto fn_exit; + } + /* issue async copy */ + mpi_errno = async_send_copy(thing, sreq, nic, disp, chunk_buf, chunk_sz, + p->buf, p->count, p->datatype, total_offset, &p->attr); + MPIR_ERR_CHECK(mpi_errno); + } else { + if (p->u.send.write_infly >= MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS) { + goto fn_exit; + } + void *write_buf = (char *) p->u.send.u.data + total_offset; + /* issue rdma write */ + mpi_errno = send_issue_write(sreq, write_buf, chunk_sz, nic, disp); + MPIR_ERR_CHECK(mpi_errno); + } + p->u.send.cur_chunk_index++; + } + + ret = MPIX_ASYNC_DONE; + + fn_exit: + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(p->vci_local)); + return ret; + fn_fail: + goto fn_exit; +} + +struct send_copy { + MPIR_Request *sreq; + MPIR_gpu_req async_req; + void *chunk_buf; + MPI_Aint chunk_sz; + MPI_Aint offset; + int nic; + uint64_t disp; + bool is_done; +}; + +static int async_send_copy(MPIX_Async_thing thing, MPIR_Request * sreq, int nic, uint64_t disp, + void *chunk_buf, MPI_Aint chunk_sz, + const void *buf, MPI_Aint count, MPI_Datatype datatype, + MPI_Aint offset, MPL_pointer_attr_t * attr) +{ + int mpi_errno = MPI_SUCCESS; + + MPIR_gpu_req async_req; + int engine_type = MPIDI_OFI_gpu_get_send_engine_type(); + int copy_dir = MPL_GPU_COPY_DIRECTION_NONE; + mpi_errno = MPIR_Ilocalcopy_gpu((void *) buf, count, datatype, offset, attr, + chunk_buf, 
chunk_sz, MPIR_BYTE_INTERNAL, 0, NULL, + copy_dir, engine_type, 1, &async_req); + MPIR_ERR_CHECK(mpi_errno); + + struct send_copy *p = MPL_malloc(sizeof(struct send_copy), MPL_MEM_OTHER); + p->sreq = sreq; + p->async_req = async_req; + p->chunk_buf = chunk_buf; + p->chunk_sz = chunk_sz; + p->offset = offset; + p->nic = nic; + p->disp = disp; + p->is_done = false; /* flipped by send_copy_poll once the copy completes */ + + /* count this inflight copy; send_copy_poll decrements it on completion */ + MPIDI_OFI_AMREQ_WRITE(sreq).u.send.u.copy_infly++; + + MPIR_Async_thing_spawn(thing, send_copy_poll, p, NULL); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +static int send_copy_poll(MPIX_Async_thing thing) +{ + struct send_copy *p = MPIR_Async_thing_get_state(thing); + MPIDI_OFI_rndvwrite_t *q = &MPIDI_OFI_AMREQ_WRITE(p->sreq); + + if (!p->is_done) { + int is_done = 0; + MPIR_async_test(&(p->async_req), &is_done); + if (is_done) { + p->is_done = true; + q->u.send.u.copy_infly--; + } + } + + if (!p->is_done) { + return MPIX_ASYNC_NOPROGRESS; + } else if (q->u.send.write_infly >= MPIDI_OFI_RNDVWRITE_INFLY_CHUNKS) { + return MPIX_ASYNC_NOPROGRESS; + } else { + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI_LOCK(q->vci_local)); + int mpi_errno = send_issue_write(p->sreq, p->chunk_buf, p->chunk_sz, p->nic, p->disp); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI_LOCK(q->vci_local)); + MPIR_Assertp(mpi_errno == MPI_SUCCESS); + + MPL_free(p); + return MPIX_ASYNC_DONE; + } +} + +struct write_req { + char pad[MPIDI_REQUEST_HDR_SIZE]; + struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; + int event_id; + MPIR_Request *sreq; + /* In case we need to free the chunk after the write */ + void *chunk_buf; +}; + +static int send_issue_write(MPIR_Request * sreq, void *buf, MPI_Aint data_sz, + int nic, MPI_Aint disp) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + struct write_req *t = MPL_malloc(sizeof(struct write_req), MPL_MEM_OTHER); + MPIR_Assertp(t); + + t->event_id = MPIDI_OFI_EVENT_RNDVWRITE_WRITE_CHUNK; + t->sreq = sreq; + if (p->need_pack) { + t->chunk_buf = buf; + } + + /* issue the write on this chunk's nic */ + int ctx_idx = MPIDI_OFI_get_ctx_index(p->vci_local, nic); + fi_addr_t addr = MPIDI_OFI_av_to_phys(p->av, p->vci_local, nic, p->vci_remote, nic); + uint64_t rkey = p->u.send.rkeys[nic]; + + MPIDI_OFI_CALL_RETRY(fi_write(MPIDI_OFI_global.ctx[ctx_idx].tx, + buf, data_sz, NULL, addr, disp, rkey, (void *) &t->context), + p->vci_local, rdma_write); + p->u.send.write_infly++; + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvwrite_write_chunk_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + struct write_req *t = (void *) r; + MPIR_Request *sreq = t->sreq; + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(sreq); + + if (p->need_pack) { + MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.per_vci[p->vci_local].pipeline_pool, + t->chunk_buf); + } + + p->u.send.write_infly--; + p->u.send.chunks_remain--; + if (p->u.send.chunks_remain == 0) { + /* done. send ack. 
Also inform receiver our data_sz */ + mpi_errno = MPIDI_OFI_RNDV_send_hdr(&p->data_sz, sizeof(MPI_Aint), + p->av, p->vci_local, p->vci_remote, p->match_bits); + /* complete request */ + MPL_free(p->u.send.rkeys); + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(sreq); + } + + MPL_free(r); + return mpi_errno; +} + +/* -- receiver side -- */ + +int MPIDI_OFI_rndvwrite_recv(MPIR_Request * rreq, int tag, int vci_src, int vci_dst) +{ + int mpi_errno = MPI_SUCCESS; + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(rreq); + MPIR_FUNC_ENTER; + + MPIR_Assert(!p->need_pack); + + int num_nics = MPIDI_OFI_global.num_nics; + p->u.recv.mrs = MPL_malloc((num_nics * sizeof(struct fid_mr *)), MPL_MEM_OTHER); + + int hdr_sz = sizeof(struct rdma_info) + num_nics * sizeof(uint64_t); + struct rdma_info *hdr = MPL_malloc(hdr_sz, MPL_MEM_OTHER); + MPIR_Assertp(hdr); + + mpi_errno = prepare_rdma_info(p->buf, p->datatype, p->data_sz, p->vci_local, + FI_REMOTE_WRITE, p->u.recv.mrs, hdr); + MPIR_ERR_CHECK(mpi_errno); + + /* send rdma_info */ + mpi_errno = MPIDI_OFI_RNDV_send_hdr(hdr, hdr_sz, + p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + MPL_free(hdr); + + /* issue recv for ack */ + mpi_errno = MPIDI_OFI_RNDV_recv_hdr(rreq, MPIDI_OFI_EVENT_RNDVWRITE_ACK, + sizeof(MPI_Aint) /* remote data_sz */ , + p->av, p->vci_local, p->vci_remote, p->match_bits); + MPIR_ERR_CHECK(mpi_errno); + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + +int MPIDI_OFI_rndvwrite_ack_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + MPIR_Request *rreq = MPIDI_OFI_RNDV_GET_CONTROL_REQ(r); + MPIDI_OFI_rndvwrite_t *p = &MPIDI_OFI_AMREQ_WRITE(rreq); + + /* check sender data_sz */ + MPI_Aint *hdr_data_sz = MPIDI_OFI_RNDV_GET_CONTROL_HDR(r); + MPIDI_OFI_RNDV_update_count(rreq, *hdr_data_sz); + + int num_nics = MPIDI_OFI_global.num_nics; + for (int i = 0; i < num_nics; i++) { + uint64_t key = fi_mr_key(p->u.recv.mrs[i]); + MPIDI_OFI_CALL(fi_close(&p->u.recv.mrs[i]->fid), mr_unreg); + if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { + MPIDI_OFI_mr_key_free(MPIDI_OFI_LOCAL_MR_KEY, key); + } + } + MPL_free(p->u.recv.mrs); + MPL_free(r); + + /* complete rreq */ + MPIR_Datatype_release_if_not_builtin(p->datatype); + MPIDI_Request_complete_fast(rreq); + + fn_exit: + return mpi_errno; + fn_fail: + goto fn_exit; +} diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 56c3a8f5185..ae636e7a784 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -23,20 +23,6 @@ Set MPIR_CVAR_CH4_OFI_ENABLE_INJECT=0 to disable buffered send for small messages. This may help avoid hang due to lack of global progress. - - name : MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE - category : CH4_OFI - type : enum - default : copy_low_latency - class : none - verbosity : MPI_T_VERBOSITY_USER_BASIC - scope : MPI_T_SCOPE_LOCAL - description : |- - Specifies GPU engine type for GPU pt2pt on the sender side. 
- compute - use a compute engine - copy_high_bandwidth - use a high-bandwidth copy engine - copy_low_latency - use a low-latency copy engine - yaksa - use Yaksa - - name : MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD category : CH4_OFI type : int @@ -55,21 +41,6 @@ === END_MPI_T_CVAR_INFO_BLOCK === */ -MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_send_engine_type(void) -{ - if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_compute) { - return MPL_GPU_ENGINE_TYPE_COMPUTE; - } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_high_bandwidth) { - return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; - } else if (MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE == - MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_low_latency) { - return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; - } else { - return MPL_GPU_ENGINE_TYPE_LAST; - } -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_issue_ack_recv(MPIR_Request * sreq, MPIR_Comm * comm, int context_offset, int dst_rank, int tag, MPIDI_av_entry_t * addr, @@ -255,173 +226,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *data, MPI_Aint da goto fn_exit; } -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_huge(const void *data, MPI_Aint data_sz, - uint64_t cq_data, int dst_rank, int tag, - MPIR_Comm * comm, uint64_t match_bits, - MPIDI_av_entry_t * addr, - int vci_local, int vci_remote, - int sender_nic, int receiver_nic, - MPIR_Request * sreq, - MPL_pointer_attr_t attr, bool do_striping) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, sender_nic); - - int num_nics; - uint64_t msg_size; - if (do_striping) { - num_nics = MPIDI_OFI_global.num_nics; - msg_size = MPIDI_OFI_STRIPE_CHUNK_SIZE; - } else { - num_nics = 1; - msg_size = MPIDI_OFI_global.max_msg_size; - } - - uint64_t rma_keys[MPIDI_OFI_MAX_NICS]; - struct fid_mr **huge_send_mrs; - huge_send_mrs = - (struct fid_mr **) MPL_malloc((num_nics * sizeof(struct fid_mr *)), MPL_MEM_BUFFER); - if (!MPIDI_OFI_ENABLE_MR_PROV_KEY) { - /* Set up a memory region for the lmt data transfer */ - for (int i = 0; i < num_nics; i++) { - rma_keys[i] = MPIDI_OFI_mr_key_alloc(MPIDI_OFI_LOCAL_MR_KEY, MPIDI_OFI_INVALID_MR_KEY); - MPIR_ERR_CHKANDJUMP(rma_keys[i] == MPIDI_OFI_INVALID_MR_KEY, mpi_errno, - MPI_ERR_OTHER, "**ofid_mr_key"); - } - } else { - /* zero them to avoid warnings */ - for (int i = 0; i < num_nics; i++) { - rma_keys[i] = 0; - } - } - - for (int i = 0; i < num_nics; i++) { - MPIDI_OFI_context_t *ctx = &MPIDI_OFI_global.ctx[MPIDI_OFI_get_ctx_index(vci_local, i)]; - MPIDI_OFI_CALL(fi_mr_reg(ctx->domain, data, data_sz, FI_REMOTE_READ, 0ULL, rma_keys[i], - 0ULL, &huge_send_mrs[i], NULL), mr_reg); - mpi_errno = MPIDI_OFI_mr_bind(MPIDI_OFI_global.prov_use[0], huge_send_mrs[i], ctx->ep, - NULL); - MPIR_ERR_CHECK(mpi_errno); - } - MPIDI_OFI_REQUEST(sreq, huge.send_mrs) = huge_send_mrs; - if (MPIDI_OFI_ENABLE_MR_PROV_KEY) { - /* MR_BASIC */ - for (int i = 0; i < num_nics; i++) { - rma_keys[i] = fi_mr_key(huge_send_mrs[i]); - } - } - - /* Send the maximum amount of data that we can here to get things - * started, then do the rest using the MR below. 
This can be confirmed - * in the MPIDI_OFI_get_huge code where we start the offset at - * MPIDI_OFI_global.max_msg_size */ - sreq->comm = comm; - MPIR_Comm_add_ref(comm); - - /* send ctrl message first */ - MPIDI_OFI_send_control_t ctrl; - ctrl.type = MPIDI_OFI_CTRL_HUGE; - for (int i = 0; i < num_nics; i++) { - ctrl.u.huge.info.rma_keys[i] = rma_keys[i]; - } - ctrl.u.huge.info.comm_id = comm->context_id; - ctrl.u.huge.info.tag = tag; - ctrl.u.huge.info.origin_rank = comm->rank; - ctrl.u.huge.info.vci_src = vci_local; - ctrl.u.huge.info.vci_dst = vci_remote; - ctrl.u.huge.info.send_buf = (void *) data; - ctrl.u.huge.info.msgsize = data_sz; - ctrl.u.huge.info.ackreq = sreq; - - mpi_errno = MPIDI_NM_am_send_hdr(dst_rank, comm, MPIDI_OFI_INTERNAL_HANDLER_CONTROL, - &ctrl, sizeof(ctrl), vci_local, vci_remote); - MPIR_ERR_CHECK(mpi_errno); - - /* send main native message next */ - MPIR_cc_inc(sreq->cc_ptr); - MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND_HUGE; - - fi_addr_t dest = MPIDI_OFI_av_to_phys(addr, vci_local, sender_nic, vci_remote, receiver_nic); - match_bits |= MPIDI_OFI_HUGE_SEND; /* Add the bit for a huge message */ - MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, - data, msg_size, NULL /* desc */ , - cq_data, dest, match_bits, - (void *) &(MPIDI_OFI_REQUEST(sreq, context))), - vci_local, tsenddata); - /* FIXME: sender_nic may not be the actual nic */ - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], msg_size); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, striped_nic_sent_bytes_count[sender_nic], msg_size); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_pipeline(const void *buf, MPI_Aint count, - MPI_Datatype datatype, - uint64_t cq_data, int dst_rank, int tag, - MPIR_Comm * comm, uint64_t match_bits, - MPIDI_av_entry_t * addr, - int vci_local, int vci_remote, - int sender_nic, int receiver_nic, - MPIR_Request * sreq, - int dt_contig, size_t data_sz, - MPL_pointer_attr_t attr) -{ - int mpi_errno = MPI_SUCCESS; - MPIR_FUNC_ENTER; - - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, sender_nic); - - uint32_t n_chunks = 0; - int chunk_size = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - /* Update correct number of chunks in immediate data. */ - chunk_size = MPIDI_OFI_gpu_pipeline_chunk_size(data_sz); - n_chunks = data_sz / chunk_size; - if (data_sz % chunk_size) - n_chunks++; - MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); - - /* Update sender packed bit if necessary. */ - uint64_t is_packed = datatype == MPI_PACKED ? 1 : 0; - MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); - MPIR_ERR_CHKANDJUMP(is_packed, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - - MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_SEND; - - /* Save pipeline information. 
*/ - MPIDI_OFI_REQUEST(sreq, pipeline_info.chunk_sz) = chunk_size; - MPIDI_OFI_REQUEST(sreq, pipeline_info.cq_data) = cq_data; - MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr) = - MPIDI_OFI_av_to_phys(addr, vci_local, sender_nic, vci_remote, receiver_nic); - MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local) = vci_local; - MPIDI_OFI_REQUEST(sreq, pipeline_info.ctx_idx) = ctx_idx; - MPIDI_OFI_REQUEST(sreq, pipeline_info.match_bits) = match_bits; - MPIDI_OFI_REQUEST(sreq, pipeline_info.data_sz) = data_sz; - - /* send an empty message for tag matching */ - MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, - NULL, - 0, - cq_data, - MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr), - match_bits), vci_local, tinjectdata); - MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], data_sz); - - MPIDI_OFI_gpu_pending_send_t *send_task = - MPIDI_OFI_create_send_task(sreq, (void *) buf, count, datatype, attr, data_sz, dt_contig); - DL_APPEND(MPIDI_OFI_global.gpu_send_queue, send_task); - MPIDI_OFI_gpu_progress_send(); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_fallback(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dst_rank, int tag, @@ -434,6 +238,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_fallback(const void *buf, MPI_Aint c MPIR_FUNC_ENTER; MPIDI_OFI_REQUEST_CREATE(*request, MPIR_REQUEST_KIND__SEND, vci_src); + MPIDI_OFI_REQUEST(*request, am_req) = NULL; MPIR_Request *sreq = *request; @@ -491,8 +296,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_fallback(const void *buf, MPI_Aint c goto fn_exit; } -#define EAGER_THRESH (MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD == -1 ? MPIDI_OFI_global.max_msg_size : MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD) - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dst_rank, int tag, MPIR_Comm * comm, int context_offset, MPIDI_av_entry_t * addr, @@ -516,9 +319,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI bool need_mr = false; bool do_inject = false; bool do_iov = false; - bool do_gpu_pipelining = false; - bool do_striping = false; - bool do_huge = false; /* check gpu */ MPL_pointer_attr_t attr; @@ -549,12 +349,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI } } } - - if (need_pack && MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE && - data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { - do_gpu_pipelining = true; - need_pack = false; - } } if (MPIR_CVAR_CH4_OFI_ENABLE_INJECT && !syncflag && !is_init && @@ -562,22 +356,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI do_inject = true; } - /* check striping path */ - if (MPIDI_OFI_COMM(comm).enable_striping && data_sz >= MPIDI_OFI_global.stripe_threshold) { - syncflag = false; - do_striping = true; - } - - /* check striping path */ - if (!do_striping && data_sz >= MPIDI_OFI_global.max_msg_size) { - syncflag = false; - do_huge = true; - } - /* noncontig? 
try iov or need pack */ if (!need_pack && !dt_contig) { - if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && !do_inject && !do_striping && !do_huge && - !do_gpu_pipelining) { + if (MPIDI_OFI_ENABLE_PT2PT_NOPACK && !do_inject) { MPI_Aint num_contig; MPIR_Typerep_get_iov_len(count, datatype, &num_contig); if (num_contig <= MPIDI_OFI_global.tx_iov_limit) { @@ -626,19 +407,27 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI *request = MPIR_Request_create_complete(MPIR_REQUEST_KIND__SEND); MPIR_ERR_CHECK(mpi_errno); - } else if (!is_am && data_sz > EAGER_THRESH) { - MPIR_Request *sreq = MPIDIG_request_create(MPIR_REQUEST_KIND__SEND, 2, - vci_src /* local */ , vci_dst /* remote */); - MPIR_ERR_CHKANDJUMP(!sreq, mpi_errno, MPI_ERR_OTHER, "**nomemreq"); + } else if (!is_am && data_sz > MPIDI_OFI_EAGER_THRESH) { + /* new pipeline send */ + MPIR_Request *sreq; + MPIDI_OFI_REQUEST_CREATE(sreq, MPIR_REQUEST_KIND__SEND, vci_src); *request = sreq; sreq->comm = comm; MPIR_Comm_add_ref(comm); - MPIDIG_REQUEST(sreq, req->local_vci) = vci_src; - MPIDIG_REQUEST(sreq, req->remote_vci) = vci_dst; - MPIDIG_REQUEST(sreq, buffer) = (void *) buf; - MPIDIG_REQUEST(sreq, count) = count; - MPIDIG_REQUEST(sreq, datatype) = datatype; - MPIDIG_REQUEST(sreq, u.send.dest) = dst_rank; + + MPIDI_OFI_rndv_common_t *p = &MPIDI_OFI_AMREQ_COMMON(sreq); + p->buf = buf; + p->count = count; + p->datatype = datatype; + p->need_pack = MPIDI_OFI_rndv_need_pack(dt_contig, &attr); + p->attr = attr; + p->data_sz = data_sz; + p->vci_local = vci_src; + p->vci_remote = vci_dst; + p->av = addr; + p->remote_rank = dst_rank; + /* match_bits will be set at receiving CTS */ + MPIR_Datatype_add_ref_if_not_builtin(datatype); mpi_errno = MPIDI_OFI_issue_ack_recv(sreq, comm, context_offset, dst_rank, tag, addr, @@ -646,7 +435,11 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI sizeof(MPIDI_OFI_ack_request_t)); MPIR_ERR_CHECK(mpi_errno); /* inject a zero-size message with MPIDI_OFI_RNDV_SEND in match_bits */ - match_bits |= MPIDI_OFI_RNDV_SEND; + if (p->need_pack) { + match_bits |= MPIDI_OFI_RNDV_PACK; + } else { + match_bits |= MPIDI_OFI_RNDV_SEND; + } MPIDI_OFI_idata_set_size(&cq_data, data_sz); /* optionally use cq_data to carry data_sz */ mpi_errno = MPIDI_OFI_send_lightweight(NULL, 0, cq_data, dst_rank, tag, comm, match_bits, addr, @@ -655,6 +448,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI } else { /* normal path */ MPIDI_OFI_REQUEST_CREATE(*request, MPIR_REQUEST_KIND__SEND, vci_src); + MPIDI_OFI_REQUEST(*request, am_req) = NULL; MPIR_Request *sreq = *request; if (syncflag) { @@ -687,31 +481,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI MPIDI_OFI_REQUEST(sreq, noncontig.pack.pack_buffer) = NULL; } - if (do_gpu_pipelining) { - mpi_errno = MPIDI_OFI_send_pipeline(buf, count, datatype, cq_data, dst_rank, tag, comm, - match_bits, addr, vci_src, vci_dst, - sender_nic, receiver_nic, - *request, dt_contig, data_sz, attr); - MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - if (do_huge) { - mpi_errno = MPIDI_OFI_send_huge(data, data_sz, cq_data, dst_rank, tag, comm, - match_bits, addr, vci_src, vci_dst, - sender_nic, receiver_nic, *request, attr, false); - MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - if (do_striping) { - mpi_errno = MPIDI_OFI_send_huge(data, data_sz, cq_data, dst_rank, tag, comm, - match_bits, addr, vci_src, vci_dst, - sender_nic, receiver_nic, *request, attr, true); - 
MPIR_ERR_CHECK(mpi_errno); - goto fn_exit; - } - - /* NOTE: all previous send modes contain sync semantics already */ if (do_iov) { @@ -782,7 +551,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_isend(const void *buf, MPI_Aint count, mpi_errno = MPIDI_OFI_send_fallback(buf, count, datatype, rank, tag, comm, context_offset, addr, vci_src, vci_dst, request); MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_REQUEST(*request, am_req) = NULL; } else { bool syncflag = (bool) MPIR_PT2PT_ATTR_GET_SYNCFLAG(attr); bool is_init = (bool) MPIR_PT2PT_ATTR_GET_INITFLAG(attr); @@ -790,7 +558,6 @@ context_offset, addr, vci_src, vci_dst, request, false /* is_am */ , syncflag, is_init); MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_REQUEST(*request, am_req) = NULL; } fn_exit: diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index e3c3d464ae5..2fe1bdf4cf4 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -98,18 +98,18 @@ static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) return (idata >> MPIDI_OFI_IDATA_GPUCHUNK_OFFSET); } -#define MPIDI_OFI_PROTOCOL_BITS (6) +#define MPIDI_OFI_PROTOCOL_BITS (5) /* define protocol bits without MPIDI_OFI_PROTOCOL_SHIFT */ +/* The first 3 bits define separate tag spaces */ #define MPIDI_OFI_ACK_SEND_0 1ULL #define MPIDI_OFI_DYNPROC_SEND_0 2ULL -#define MPIDI_OFI_GPU_PIPELINE_SEND_0 4ULL -#define MPIDI_OFI_AM_SEND_0 32ULL -/* the above defines separate tag spaces */ +#define MPIDI_OFI_AM_SEND_0 4ULL +/* the next 2 bits define 3 meta values */ #define MPIDI_OFI_SYNC_SEND_0 8ULL -#define MPIDI_OFI_HUGE_SEND_0 16ULL -#define MPIDI_OFI_RNDV_SEND_0 24ULL +#define MPIDI_OFI_RNDV_SEND_0 16ULL +#define MPIDI_OFI_RNDV_PACK_0 24ULL /* sender requires packing, thus only supports pipeline or rndv write */ /* these two are really tag-carried metadata, thus must be masked on receive */ -#define MPIDI_OFI_PROTOCOL_MASK_0 (MPIDI_OFI_SYNC_SEND_0 | MPIDI_OFI_HUGE_SEND_0) +#define MPIDI_OFI_PROTOCOL_MASK_0 (MPIDI_OFI_SYNC_SEND_0 | MPIDI_OFI_RNDV_SEND_0) /* Define constants for default bits allocation. The actual bits are defined in * ofi_capability_sets.h, which may use these defaults or define its own. 
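(Illustrative sketch, not part of the patch: the hunk above packs three mutually exclusive tag spaces into the low 3 protocol bits and three meta values into the top 2 bits. The snippet below checks how the shifted constants compose and why one mask suffices on receive; the shift value 52 is a stand-in, since the real MPIDI_OFI_PROTOCOL_SHIFT depends on the active context/source/tag bit allocation.)

#include <stdint.h>
#include <assert.h>

#define SHIFT 52ULL                    /* stand-in for MPIDI_OFI_PROTOCOL_SHIFT */
#define SYNC_SEND (8ULL << SHIFT)      /* meta: sync send */
#define RNDV_SEND (16ULL << SHIFT)     /* meta: rendezvous */
#define RNDV_PACK (24ULL << SHIFT)     /* meta: rendezvous + sender packs */
#define PROTOCOL_MASK ((8ULL | 16ULL) << SHIFT)

int main(void)
{
    uint64_t tag_bits = 0x1234;                 /* context/source/tag payload */
    uint64_t match_bits = tag_bits | RNDV_PACK; /* a packed rendezvous send */

    /* RNDV_PACK (24) sets both meta bits, so masking SYNC|RNDV strips all
     * three meta values and the receiver matches on the tag payload alone */
    assert(RNDV_PACK == (SYNC_SEND | RNDV_SEND));
    assert((match_bits & ~PROTOCOL_MASK) == tag_bits);
    return 0;
}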
@@ -129,11 +129,10 @@ static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) #define MPIDI_OFI_PROTOCOL_SHIFT (MPIDI_OFI_CONTEXT_BITS + MPIDI_OFI_SOURCE_BITS + MPIDI_OFI_TAG_BITS) #define MPIDI_OFI_ACK_SEND (MPIDI_OFI_ACK_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_DYNPROC_SEND (MPIDI_OFI_DYNPROC_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) -#define MPIDI_OFI_GPU_PIPELINE_SEND (MPIDI_OFI_GPU_PIPELINE_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) -#define MPIDI_OFI_SYNC_SEND (MPIDI_OFI_SYNC_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) -#define MPIDI_OFI_HUGE_SEND (MPIDI_OFI_HUGE_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_AM_SEND (MPIDI_OFI_AM_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) +#define MPIDI_OFI_SYNC_SEND (MPIDI_OFI_SYNC_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_RNDV_SEND (MPIDI_OFI_RNDV_SEND_0 << MPIDI_OFI_PROTOCOL_SHIFT) +#define MPIDI_OFI_RNDV_PACK (MPIDI_OFI_RNDV_PACK_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_PROTOCOL_MASK (MPIDI_OFI_PROTOCOL_MASK_0 << MPIDI_OFI_PROTOCOL_SHIFT) #define MPIDI_OFI_CONTEXT_MASK (((1ULL << MPIDI_OFI_CONTEXT_BITS) - 1) << (MPIDI_OFI_SOURCE_BITS + MPIDI_OFI_TAG_BITS)) @@ -168,7 +167,7 @@ static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) #define MPIDI_OFI_AMREQUEST(req,field) ((req)->dev.ch4.am.netmod_am.ofi.field) #define MPIDI_OFI_AM_SREQ_HDR(req,field) ((req)->dev.ch4.am.netmod_am.ofi.sreq_hdr->field) #define MPIDI_OFI_AM_RREQ_HDR(req,field) ((req)->dev.ch4.am.netmod_am.ofi.rreq_hdr->field) -#define MPIDI_OFI_REQUEST(req,field) ((req)->dev.ch4.netmod.ofi.field) +#define MPIDI_OFI_REQUEST(req,field) ((req)->dev.ch4.netmod.ofi.direct.field) #define MPIDI_OFI_AV(av) ((av)->netmod.ofi) #define MPIDI_OFI_COMM(comm) ((comm)->dev.ch4.netmod.ofi) @@ -188,26 +187,28 @@ enum { MPIDI_OFI_EVENT_ABORT, MPIDI_OFI_EVENT_SEND, MPIDI_OFI_EVENT_RECV, - MPIDI_OFI_EVENT_SEND_GPU_PIPELINE, - MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT, - MPIDI_OFI_EVENT_RECV_GPU_PIPELINE, MPIDI_OFI_EVENT_AM_SEND, MPIDI_OFI_EVENT_AM_SEND_RDMA, MPIDI_OFI_EVENT_AM_SEND_PIPELINE, MPIDI_OFI_EVENT_AM_RECV, MPIDI_OFI_EVENT_AM_READ, MPIDI_OFI_EVENT_PEEK, - MPIDI_OFI_EVENT_RECV_HUGE, MPIDI_OFI_EVENT_RECV_PACK, MPIDI_OFI_EVENT_RECV_NOPACK, - MPIDI_OFI_EVENT_SEND_HUGE, MPIDI_OFI_EVENT_SEND_PACK, MPIDI_OFI_EVENT_SEND_NOPACK, MPIDI_OFI_EVENT_SSEND_ACK, MPIDI_OFI_EVENT_RNDV_CTS, - MPIDI_OFI_EVENT_GET_HUGE, + MPIDI_OFI_EVENT_PIPELINE_SEND_CHUNK, + MPIDI_OFI_EVENT_PIPELINE_RECV_CHUNK, + MPIDI_OFI_EVENT_PIPELINE_RECV_DATASIZE, + MPIDI_OFI_EVENT_RNDVREAD_RECV_MRS, + MPIDI_OFI_EVENT_RNDVREAD_READ_CHUNK, + MPIDI_OFI_EVENT_RNDVREAD_ACK, + MPIDI_OFI_EVENT_RNDVWRITE_RECV_MRS, + MPIDI_OFI_EVENT_RNDVWRITE_WRITE_CHUNK, + MPIDI_OFI_EVENT_RNDVWRITE_ACK, MPIDI_OFI_EVENT_CHUNK_DONE, - MPIDI_OFI_EVENT_HUGE_CHUNK_DONE, MPIDI_OFI_EVENT_INJECT_EMU, MPIDI_OFI_EVENT_DYNPROC_DONE, }; @@ -301,6 +302,7 @@ typedef struct { MPIDI_OFI_am_repost_request_t am_reqs[MPIDI_OFI_MAX_NUM_AM_BUFFERS]; MPIDU_genq_private_pool_t am_hdr_buf_pool; + MPIDU_genq_private_pool_t pipeline_pool; /* Queue to store deferred am sends */ MPIDI_OFI_deferred_am_isend_req_t *deferred_am_isend_q; @@ -325,12 +327,6 @@ typedef struct { int cq_buffered_static_tail; MPIDI_OFI_cq_list_t *cq_buffered_dynamic_head, *cq_buffered_dynamic_tail; - /* queues to matching huge recv and control message */ - struct MPIDI_OFI_huge_recv_list *huge_ctrl_head; - struct MPIDI_OFI_huge_recv_list *huge_ctrl_tail; - struct MPIDI_OFI_huge_recv_list *huge_recv_head; - struct MPIDI_OFI_huge_recv_list *huge_recv_tail; - char pad 
MPL_ATTR_ALIGNED(MPL_CACHELINE_SIZE); } MPIDI_OFI_per_vci_t; @@ -511,14 +507,6 @@ typedef struct { /* stores the maximum of the most recently used regular memory region key */ uint64_t global_max_regular_mr_key; - /* GPU pipeline */ - MPIDU_genq_private_pool_t gpu_pipeline_send_pool; - MPIDU_genq_private_pool_t gpu_pipeline_recv_pool; - MPIDI_OFI_gpu_task_t *gpu_send_task_queue[MPIDI_CH4_MAX_VCIS]; - MPIDI_OFI_gpu_task_t *gpu_recv_task_queue[MPIDI_CH4_MAX_VCIS]; - MPIDI_OFI_gpu_pending_recv_t *gpu_recv_queue; - MPIDI_OFI_gpu_pending_send_t *gpu_send_queue; - int addrnamelen; /* OFI uses the same name length within a provider. */ /* To support dynamic av tables, we need a way to tell which entries are empty. * ch4 av tables are initialized to 0s. Thus we need to know which "0" is valid. */ @@ -530,30 +518,6 @@ typedef struct { #endif } MPIDI_OFI_global_t; -typedef struct { - int comm_id; - int origin_rank; - int tag; - MPIR_Request *ackreq; - void *send_buf; - size_t msgsize; - uint64_t rma_keys[MPIDI_OFI_MAX_NICS]; - int vci_src; - int vci_dst; -} MPIDI_OFI_huge_remote_info_t; - -typedef struct { - int16_t type; - union { - struct { - MPIDI_OFI_huge_remote_info_t info; - } huge; - struct { - MPIR_Request *ackreq; - } huge_ack; - } u; -} MPIDI_OFI_send_control_t; - typedef struct MPIDI_OFI_win_acc_hint { uint64_t dtypes_max_count[MPIDI_OFI_DT_MAX]; /* translate CH4 which_accumulate_ops hints to * atomicity support of all OFI datatypes. A datatype @@ -646,38 +610,17 @@ typedef struct MPIDI_OFI_target_mr { uint64_t mr_key; } MPIDI_OFI_target_mr_t; -typedef struct MPIDI_OFI_read_chunk { - char pad[MPIDI_REQUEST_HDR_SIZE]; - struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ - int event_id; /* fixed field, do not move */ - MPIR_Request *localreq; - MPIR_cc_t *chunks_outstanding; -} MPIDI_OFI_read_chunk_t; - -/* The list of posted huge receives that haven't been matched yet. These need - * to get matched up when handling the control message that starts transferring - * data from the remote memory region and we need a way of matching up the - * control messages with the "real" requests. 
*/ -typedef struct MPIDI_OFI_huge_recv_list { - int comm_id; - int rank; - int tag; - union { - MPIDI_OFI_huge_remote_info_t *info; /* ctrl list */ - MPIR_Request *rreq; /* recv list */ - } u; - struct MPIDI_OFI_huge_recv_list *next; -} MPIDI_OFI_huge_recv_list_t; - /* Externs */ extern MPIDI_OFI_global_t MPIDI_OFI_global; extern MPIDI_OFI_capabilities_t MPIDI_OFI_caps_list[MPIDI_OFI_NUM_SETS]; +#define MPIDI_OFI_CAN_SEND_CQ_DATASIZE(data_sz) (MPIDI_OFI_global.cq_data_size == 8 && (data_sz) <= INT32_MAX) + static inline void MPIDI_OFI_idata_set_size(uint64_t * data_field, MPI_Aint data_sz) { *data_field &= 0xffffffff; - if (MPIDI_OFI_global.cq_data_size == 8 && data_sz <= INT32_MAX) { + if (MPIDI_OFI_CAN_SEND_CQ_DATASIZE(data_sz)) { *data_field |= ((uint64_t) data_sz << 32); } } diff --git a/src/mpid/ch4/netmod/ofi/ofi_vci.c b/src/mpid/ch4/netmod/ofi/ofi_vci.c index 8cc176fbb77..7000054e93f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_vci.c +++ b/src/mpid/ch4/netmod/ofi/ofi_vci.c @@ -65,8 +65,7 @@ int MPIDI_OFI_comm_set_vcis(MPIR_Comm * comm, int num_implicit, int num_reserved MPIR_ERR_CHECK(mpi_errno); for (int vci = 1; vci < MPIDI_OFI_global.num_vcis; vci++) { - MPIDI_OFI_am_init(vci); - MPIDI_OFI_am_post_recv(vci, 0); + MPIDI_OFI_init_per_vci(vci); } if (MPIR_CVAR_DEBUG_SUMMARY && comm->rank == 0) { diff --git a/src/mpid/ch4/netmod/ofi/util.c b/src/mpid/ch4/netmod/ofi/util.c index 115cb63a3b8..5253cc9b9b9 100644 --- a/src/mpid/ch4/netmod/ofi/util.c +++ b/src/mpid/ch4/netmod/ofi/util.c @@ -174,41 +174,6 @@ void MPIDI_OFI_mr_key_allocator_destroy(void) MPID_THREAD_CS_EXIT(VCI, mr_key_allocator_lock); } -int MPIDI_OFI_control_handler(void *am_hdr, void *data, MPI_Aint data_sz, - uint32_t attr, MPIR_Request ** req) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_send_control_t *ctrlsend = (MPIDI_OFI_send_control_t *) am_hdr; - - if (attr & MPIDIG_AM_ATTR__IS_ASYNC) { - *req = NULL; - } - - int local_vci = MPIDIG_AM_ATTR_DST_VCI(attr); - MPIR_AssertDeclValue(int remote_vci, MPIDIG_AM_ATTR_SRC_VCI(attr)); - switch (ctrlsend->type) { - case MPIDI_OFI_CTRL_HUGEACK: - mpi_errno = MPIDI_OFI_dispatch_function(local_vci, NULL, ctrlsend->u.huge_ack.ackreq); - break; - - case MPIDI_OFI_CTRL_HUGE: - MPIR_Assert(local_vci == ctrlsend->u.huge.info.vci_dst); - MPIR_Assert(remote_vci == ctrlsend->u.huge.info.vci_src); - mpi_errno = MPIDI_OFI_recv_huge_control(local_vci, - ctrlsend->u.huge.info.comm_id, - ctrlsend->u.huge.info.origin_rank, - ctrlsend->u.huge.info.tag, - &(ctrlsend->u.huge.info)); - break; - - default: - fprintf(stderr, "Bad control type: 0x%08x %d\n", ctrlsend->type, ctrlsend->type); - MPIR_Assert(0); - } - - return mpi_errno; -} - static bool check_mpi_acc_valid(MPI_Datatype dtype, MPI_Op op) { bool valid_flag = false; diff --git a/src/mpid/ch4/netmod/ucx/ucx_am.h b/src/mpid/ch4/netmod/ucx/ucx_am.h index 57d2404088e..ecfcb295401 100644 --- a/src/mpid/ch4/netmod/ucx/ucx_am.h +++ b/src/mpid/ch4/netmod/ucx/ucx_am.h @@ -272,7 +272,7 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_check_eager(MPI_Aint am_hdr_sz, MPI_Ai #endif } -MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(void) +MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(MPIR_Request * rreq) { return true; } diff --git a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c index 1f23ae06d32..813659a49c7 100644 --- a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c +++ b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c @@ -603,21 +603,7 @@ static int gpu_ipc_async_poll(MPIX_Async_thing thing) int is_done = 0; struct gpu_ipc_async *p = 
MPIR_Async_thing_get_state(thing); - switch (p->yreq.type) { - case MPIR_NULL_REQUEST: - /* a dummy, immediately complete */ - is_done = 1; - break; - case MPIR_TYPEREP_REQUEST: - MPIR_Typerep_test(p->yreq.u.y_req, &is_done); - break; - case MPIR_GPU_REQUEST: - err = MPL_gpu_test(&p->yreq.u.gpu_req, &is_done); - MPIR_Assertp(err == MPL_SUCCESS); - break; - default: - MPIR_Assert(0); - } + MPIR_async_test(&(p->yreq), &is_done); if (is_done) { int vci = MPIDIG_REQUEST(p->req, req->local_vci); diff --git a/src/mpid/ch4/shm/src/shm_am.h b/src/mpid/ch4/shm/src/shm_am.h index 6faf22d6a07..c313f3daafd 100644 --- a/src/mpid/ch4/shm/src/shm_am.h +++ b/src/mpid/ch4/shm/src/shm_am.h @@ -117,7 +117,7 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_SHM_am_check_eager(MPI_Aint am_hdr_sz, MPI_A return (am_hdr_sz + data_sz) <= MPIDI_POSIX_am_eager_limit(); } -MPL_STATIC_INLINE_PREFIX bool MPIDI_SHM_am_can_do_tag(void) +MPL_STATIC_INLINE_PREFIX bool MPIDI_SHM_am_can_do_tag(MPIR_Request * rreq) { return false; } diff --git a/src/mpid/ch4/src/mpidig.h b/src/mpid/ch4/src/mpidig.h index e889502e9ca..850c8c19b04 100644 --- a/src/mpid/ch4/src/mpidig.h +++ b/src/mpid/ch4/src/mpidig.h @@ -62,7 +62,6 @@ enum { MPIDI_IPC_ACK, MPIDI_IPC_WRITE, - MPIDI_OFI_INTERNAL_HANDLER_CONTROL, MPIDI_OFI_AM_RDMA_READ_ACK, MPIDI_OFI_RNDV_INFO, diff --git a/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c b/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c index 3b524639d65..5407c33ac0a 100644 --- a/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c +++ b/src/mpid/ch4/src/mpidig_pt2pt_callbacks.c @@ -13,9 +13,10 @@ static int recv_target_cmpl_cb(MPIR_Request * rreq); static int can_do_tag(MPIR_Request * rreq) { #ifdef MPIDI_CH4_DIRECT_NETMOD - return MPIDI_NM_am_can_do_tag(); + return MPIDI_NM_am_can_do_tag(rreq); #else - return MPIDI_REQUEST(rreq, is_local) ? MPIDI_SHM_am_can_do_tag() : MPIDI_NM_am_can_do_tag(); + return MPIDI_REQUEST(rreq, is_local) ? + MPIDI_SHM_am_can_do_tag(rreq) : MPIDI_NM_am_can_do_tag(rreq); #endif } diff --git a/test/mpi/bench/Makefile.am b/test/mpi/bench/Makefile.am index e2e8d81e567..471acd0ee32 100644 --- a/test/mpi/bench/Makefile.am +++ b/test/mpi/bench/Makefile.am @@ -15,6 +15,7 @@ noinst_PROGRAMS = \ get_bw \ put_bw \ p2p_one \ + p2p_self \ barrier \ bcast diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def index a0567d0f6db..78b2ebd20c5 100644 --- a/test/mpi/bench/macros/bench_frame.def +++ b/test/mpi/bench/macros/bench_frame.def @@ -23,10 +23,8 @@ subcode: bench_frame $global grank, gsize: int MPI_Comm_rank(MPI_COMM_WORLD, &grank); MPI_Comm_size(MPI_COMM_WORLD, &gsize); - $(if:MIN_PROCS) - $if gsize < $(MIN_PROCS) - printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n"); - return 1 + + $call check_launch MPI_Comm comm = MPI_COMM_WORLD; @@ -60,6 +58,22 @@ subcode: bench_frame MTest_Finalize(0); $(else) MPI_Finalize(); + # ----- + subcode: check_launch + $(if:MIN_PROCS) + $if gsize < $(MIN_PROCS) + printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n"); + return 1 + $(if:MEM_TYPES=sendrecv) + $if gsize % 2 == 1 + printf("! 
Test $(_pagename) requires an even number of processes to form even/odd pairs !\n"); + return 1 + $global gsrc, gdst: int + $if grank % 2 == 0 + gsrc = grank + gdst = grank + 1 + $else + gsrc = grank - 1 + gdst = grank macros: use_double: 1 diff --git a/test/mpi/bench/macros/bench_p2p.def b/test/mpi/bench/macros/bench_p2p.def index 08c302cf847..524c7a4eae5 100644 --- a/test/mpi/bench/macros/bench_p2p.def +++ b/test/mpi/bench/macros/bench_p2p.def @@ -15,7 +15,6 @@ */ macros: - MIN_PROCS: 2 MEM_TYPES: sendrecv subcode: _autoload diff --git a/test/mpi/bench/macros/mtest.def b/test/mpi/bench/macros/mtest.def index 6eebb85d09b..757ca7cb84c 100644 --- a/test/mpi/bench/macros/mtest.def +++ b/test/mpi/bench/macros/mtest.def @@ -4,10 +4,10 @@ macros: subcode: mtest_malloc(size) MTestArgList *head = MTestArgListCreate(argc, argv) $(if:MEM_TYPES=sendrecv) - int send_rank = 0, recv_rank = 1; - $(for:a in send,recv) - $if grank == $(a)_rank - $call alloc_mem_dev, $(a)mem, $(a)dev + $if grank == gsrc + $call alloc_mem_dev, sendmem, senddev + $elif grank == gdst + $call alloc_mem_dev, recvmem, recvdev $(else) # all procs allocating the same memory types $call alloc_mem_dev, memtype, device diff --git a/test/mpi/bench/p2p_one.def b/test/mpi/bench/p2p_one.def index af0fbf93cf4..6cf215da888 100644 --- a/test/mpi/bench/p2p_one.def +++ b/test/mpi/bench/p2p_one.def @@ -6,7 +6,6 @@ include: macros/mtest.def */ macros: - MIN_PROCS: 2 MEM_TYPES: sendrecv subcode: _autoload @@ -27,27 +26,28 @@ page: p2p_one, bench_frame MAX_BUFSIZE: 1000000000 data: buf, size, MPI_CHAR - int rank; - MPI_Comm_rank(comm, &rank) - - src = 0 - dst = 1 size = MAX_BUFSIZE /* Repeat a few times to see variations or the need to warm up */ $for iter=0:5 - $if rank == src + $if grank == gsrc tf_start = MPI_Wtime() $call @send_side - MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, dst, TAG, comm, MPI_STATUS_IGNORE) + $call recv_sync, gdst tf_latency = MPI_Wtime() - tf_start $call report_one - $elif rank == dst + $elif grank == gdst $call @recv_side - MPI_Send(NULL, 0, MPI_DATATYPE_NULL, src, TAG, comm) + $call send_sync, gsrc subcode: send_side - MPI_Send($(data), dst, TAG, comm) + MPI_Send($(data), gdst, TAG, comm) subcode: recv_side - MPI_Recv($(data), src, TAG, comm, MPI_STATUS_IGNORE) + MPI_Recv($(data), gsrc, TAG, comm, MPI_STATUS_IGNORE) + + subcode: send_sync(rank) + MPI_Send(NULL, 0, MPI_DATATYPE_NULL, $(rank), TAG, comm) + + subcode: recv_sync(rank) + MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, $(rank), TAG, comm, MPI_STATUS_IGNORE) diff --git a/test/mpi/bench/p2p_self.def b/test/mpi/bench/p2p_self.def new file mode 100644 index 00000000000..03824222ad1 --- /dev/null +++ b/test/mpi/bench/p2p_self.def @@ -0,0 +1,74 @@ +include: macros/bench_frame.def +include: macros/mtest.def + +/* Similar to p2p_one, but measures the bandwidth of self messaging. + * The results should be comparable to a memcpy bandwidth. 
+ */ + +subcode: report_header + printf("%12s %10s %12s\n", "msgsize", "latency(sec)", "bandwidth(GB/s)"); + +subcode: report_one + tf_bw = size / tf_latency / 1e9 + printf("%12d %10.3f %12.3f\n", size, tf_latency, tf_bw); + +page: p2p_self, self_frame + MAX_BUFSIZE: 1000000000 + + $call allocate_buffers + + comm = MPI_COMM_SELF + int tag = 0; + int size = $(MAX_BUFSIZE); + + MPI_Request reqs[2]; + $for iter=0:5 + tf_start = MPI_Wtime() + MPI_Isend(send_buf, size, MPI_CHAR, 0, tag, comm, &reqs[0]) + MPI_Irecv(recv_buf, size, MPI_CHAR, 0, tag, comm, &reqs[1]) + MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE) + tf_latency = MPI_Wtime() - tf_start + $call report_one + +subcode: self_frame + $include stdio + $include stdlib + $(if:HAS_MTEST) + $include mpitest.h + $(else) + $include mpi + + $function main + $(if:HAS_MTEST) + MTest_Init(NULL, NULL); + $(else) + MPI_Init(NULL, NULL); + + printf("TEST $(_pagename):\n") + $call @report_header + $call main + printf("\n") + + $(if:HAS_MTEST) + MTest_Finalize(0); + $(else) + MPI_Finalize(); + +subcode: allocate_buffers + $my send_buf, recv_buf: void * + $(set:size=MAX_BUFSIZE) + $(if:HAS_MTEST) + MTestArgList *head = MTestArgListCreate(argc, argv) + $my mtest_mem_type_e memtype, int device + $(for:send, recv) + memtype = MTestArgListGetMemType(head, "$1mem") + device = MTestArgListGetInt_with_default(head, "$1dev", 0) + MTestMalloc($(size), memtype, NULL, &$1_buf, device) + MTestArgListDestroy(head) + $(else) + send_buf = malloc($(size)) + recv_buf = malloc($(size)) + $if !send_buf || !recv_buf + printf("! Failed to allocate buffers (size=%d)\n", MAX_BUFSIZE) + return 1 + diff --git a/test/mpi/errors/pt2pt/testlist b/test/mpi/errors/pt2pt/testlist index eca0402fe21..89aa8841e16 100644 --- a/test/mpi/errors/pt2pt/testlist +++ b/test/mpi/errors/pt2pt/testlist @@ -1,6 +1,9 @@ proberank 1 truncmsg1 2 -truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +truncmsg1 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write truncmsg2 2 truncmsg_mrecv 2 # multiple completion tests diff --git a/test/mpi/maint/jenkins/xfail.conf b/test/mpi/maint/jenkins/xfail.conf index da0bafb9ede..5184dbd2be0 100644 --- a/test/mpi/maint/jenkins/xfail.conf +++ b/test/mpi/maint/jenkins/xfail.conf @@ -116,8 +116,5 @@ mpich-.*-arm.* * * * * /^reduce 10/ xfail=ticket0 coll/testlist. 
* * * * * /^allgather_gpu/ xfail=ticket6657 coll/testlist.gpu * * * * * /^allgatherv_gpu/ xfail=ticket6657 coll/testlist.gpu -# GPU pipelining requires provider cq_data_sz > 8 which isn't met by psm3 -* * * ch4:ofi * /^.*MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1.*/ xfail=ticket0 pt2pt/testlist.gpu - # F08 profiling test only works with dynamic linking * * noshared * * /^profile1f90/ xfail=ticket0 f08/profile/testlist diff --git a/test/mpi/pt2pt/testlist.gpu b/test/mpi/pt2pt/testlist.gpu index 57640705ac3..103030fd7c7 100644 --- a/test/mpi/pt2pt/testlist.gpu +++ b/test/mpi/pt2pt/testlist.gpu @@ -18,5 +18,5 @@ sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100 sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8,100 arg=-sendmem=shared arg=-recvmem=host sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8,100 arg=-sendmem=shared arg=-recvmem=reg_host sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8,100 arg=-sendmem=shared arg=-recvmem=shared -sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8 arg=-sendmem=device arg=-recvmem=device env=MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 timeLimit=600 -pipeline 2 env=MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=4 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=4 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 env=MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +sendrecv1 2 arg=-typelist=MPI_INT,MPI_INT:4+MPI_DOUBLE:8 arg=-counts=1,17,50,100,512,65530 arg=-seed=200 arg=-testsizes=8 arg=-sendmem=device arg=-recvmem=device env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=100000 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline timeLimit=600 +pipeline 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=131072 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline env=MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS=4 env=MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ=1048576 diff --git a/test/mpi/pt2pt/testlist.in b/test/mpi/pt2pt/testlist.in index fa98c1ba95d..e4043924c06 100644 --- a/test/mpi/pt2pt/testlist.in +++ b/test/mpi/pt2pt/testlist.in @@ -35,10 +35,13 @@ rqstatus 2 rqfree 4 resultTest=TestAllowWarnings env=MPIR_CVAR_FINALIZE_WAIT=1 greq1 1 probe_unexp 4 -probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +probe_unexp 4 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write probe_unexp 4 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true probenull 1 -probenull 1 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +probenull 1 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 probenull 1 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true recvnull 1 # For testing, scancel will run with 1 process as well @@ -57,13 +60,19 @@ waitany_null 1 # perhaps disable in the release tarball large_message 3 mem=6.5 mprobe 2 -mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +mprobe 2 
env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +mprobe 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write mprobe 2 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true big_count_status 1 many_isend 3 manylmt 2 huge_underflow 2 -huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=read +huge_underflow 2 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 env=MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=write huge_underflow 2 env=MPIR_CVAR_CH4_OFI_AM_LONG_FORCE_PIPELINE=true huge_anysrc 2 huge_dupcomm 2 diff --git a/test/mpi/threads/pt2pt/testlist b/test/mpi/threads/pt2pt/testlist index 8bd6ced6531..ad45659662c 100644 --- a/test/mpi/threads/pt2pt/testlist +++ b/test/mpi/threads/pt2pt/testlist @@ -41,15 +41,15 @@ mt_mprobe_sendirecv 2 arg=-iter=1024 arg=-count=1 mt_improbe_isendrecv 2 arg=-iter=1024 arg=-count=1 mt_improbe_sendrecv 2 arg=-iter=1024 arg=-count=1 -mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 - -mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 -mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE=16384 +mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=4096 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 + +mt_probe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_iprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_mprobe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 +mt_improbe_sendrecv_huge 2 arg=-iter=64 arg=-count=65530 env=MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 mt_sendrecv_pers_huge 2 arg=-iter=64 arg=-count=4096 mt_bsendrecv_pers_huge 2 arg=-iter=8 arg=-count=4096
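(Illustrative usage sketch, not part of the patch: the testlist entries above drive ordinary point-to-point traffic through the new rendezvous paths. Any two-rank program sending more than the configured eager threshold exercises them; for example, running the program below with MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD=16384 and MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL set to pipeline, read, or write in the environment selects the corresponding algorithm.)

#include <mpi.h>
#include <stdlib.h>

/* needs 2 ranks; the 65530-byte payload exceeds a 16384-byte eager
 * threshold, so the message takes the RNDV path selected via the CVARs */
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int count = 65530;
    char *buf = malloc(count);
    if (rank == 0) {
        MPI_Send(buf, count, MPI_CHAR, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        MPI_Recv(buf, count, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    free(buf);
    MPI_Finalize();
    return 0;
}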