From 84538b5c72ddc951d7474a8d13042f0f913470fa Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 27 Sep 2024 23:04:49 +0300 Subject: [PATCH 01/27] hoist code out of conditional --- src/vector.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/vector.c b/src/vector.c index 4667aec..d26156c 100644 --- a/src/vector.c +++ b/src/vector.c @@ -130,14 +130,12 @@ int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int coun if (op == ARMCII_OP_ACC) { ARMCII_Acc_type_translate(datatype, &type, &type_size); - type_count = size/type_size; - ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size"); } else { type = MPI_BYTE; MPI_Type_size(type, &type_size); - type_count = size/type_size; - ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size"); } + type_count = size/type_size; + ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size"); // CONSERVATIVE CASE: If remote pointers overlap or remote pointers correspond to // multiple allocations, use the safe implementation to avoid invalid MPI From dc8e3677c58bb16bcbef992246e0811fc735479a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 27 Sep 2024 23:11:05 +0300 Subject: [PATCH 02/27] start working on request-based RMA (again) --- src/armci.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/armci.h b/src/armci.h index 9c0290e..406360d 100644 --- a/src/armci.h +++ b/src/armci.h @@ -67,14 +67,27 @@ int ARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/], typedef struct armci_hdl_s { +#ifdef USE_RMA_REQUESTS + int batch_size; + union request { + MPI_Request single; // used when batch_size=0 (common case) + MPI_Request *array; // used when batch_size>0 + }; +#else int target; /* we do not actually support individual completion */ int aggregate; +#endif } armci_hdl_t; void ARMCI_INIT_HANDLE(armci_hdl_t *hdl); +#ifndef USE_RMA_REQUESTS +// GA does not 
use these. +// Removing them from the header ensures users will +// not be able to use them if they are not implemented. void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t* handle); void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t* handle); +#endif int ARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl); int ARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl); From ad68f253cd8a25d93f57e85f8492aa99b1e5551f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2024 16:44:14 +0300 Subject: [PATCH 03/27] 1) do not set aggregate initially (2) fix warning text --- src/onesided_nb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/onesided_nb.c b/src/onesided_nb.c index f5a58c5..4090287 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -13,7 +13,7 @@ */ void ARMCI_INIT_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { - handle->aggregate = 1; + handle->aggregate = 0; handle->target = -1; } else { ARMCII_Warning("ARMCI_INIT_HANDLE given NULL handle"); @@ -28,7 +28,7 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { handle->aggregate = 1; } else { - ARMCII_Warning("ARMCI_INIT_HANDLE given NULL handle"); + ARMCII_Warning("ARMCI_SET_AGGREGATE_HANDLE given NULL handle"); } return; } @@ -40,7 +40,7 @@ void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { handle->aggregate = 0; } else { - ARMCII_Warning("ARMCI_INIT_HANDLE given NULL handle"); + ARMCII_Warning("ARMCI_UNSET_AGGREGATE_HANDLE given NULL handle"); } return; } From f6bccc9ae002687c57f18345f500716fd8e7b1ad Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2024 19:22:11 +0300 Subject: [PATCH 04/27] small changes to design --- src/armci.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/armci.h b/src/armci.h index 406360d..5185b04 100644 --- a/src/armci.h +++ b/src/armci.h @@ -69,10 +69,8 @@ typedef struct armci_hdl_s { #ifdef USE_RMA_REQUESTS int 
batch_size; - union request { - MPI_Request single; // used when batch_size=0 (common case) - MPI_Request *array; // used when batch_size>0 - }; + MPI_Request single_request; // used when batch_size=0 (common case) + MPI_Request *request_array; // used when batch_size>0 #else int target; /* we do not actually support individual completion */ int aggregate; @@ -81,13 +79,8 @@ typedef struct armci_hdl_s armci_hdl_t; void ARMCI_INIT_HANDLE(armci_hdl_t *hdl); -#ifndef USE_RMA_REQUESTS -// GA does not use these. -// Removing them from the header ensures users will -// not be able to use them if they are not implemented. void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t* handle); void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t* handle); -#endif int ARMCI_NbPut(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl); int ARMCI_NbGet(void *src, void *dst, int bytes, int proc, armci_hdl_t *hdl); From 62f8c39f1e7039bf21f0f8768cc868f8282d7290 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2024 19:38:58 +0300 Subject: [PATCH 05/27] add feature to use request-based RMA in atomics Fetch_and_op or Compare_and_swap plus Flush(_local) might be more expensive so we add an option to use Rget_accumulate (yes, way more arguments) and wait on the resulting request, which might be better in some cases. 
--- src/armci_internals.h | 1 + src/gmr-extras.c | 17 ++++++++++++++++- src/init_finalize.c | 4 ++++ src/rmw.c | 4 ++-- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/armci_internals.h b/src/armci_internals.h index 405db46..aa4103d 100644 --- a/src/armci_internals.h +++ b/src/armci_internals.h @@ -113,6 +113,7 @@ typedef struct { int rma_nocheck; /* Use MPI_MODE_NOCHECK on synchronization calls that take assertion */ int disable_shm_accumulate; /* Set the disable_shm_accumulate window info key to true */ int use_same_op; /* Set accumulate_ops=same_op window info key */ + int use_request_atomics; /* Use request-based RMA for atomic operations */ char rma_ordering[20]; /* Set accumulate_ordering= window info key */ size_t memory_limit; /* upper bound on how much memory ARMCI can allocate */ diff --git a/src/gmr-extras.c b/src/gmr-extras.c index d180b0a..55ac6b0 100644 --- a/src/gmr-extras.c +++ b/src/gmr-extras.c @@ -103,7 +103,22 @@ int gmr_fetch_and_op(gmr_t *mreg, void *src, void *out, void *dst, ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); ARMCII_Assert_msg(disp <= mreg->slices[proc].size, "Transfer is out of range"); - MPI_Fetch_and_op(src, out, type, grp_proc, (MPI_Aint) disp, op, mreg->window); + if (ARMCII_GLOBAL_STATE.use_request_atomics) { + + MPI_Request req; + MPI_Rget_accumulate(src, 1, type, out, 1, type, grp_proc, (MPI_Aint) disp, 1, type, op, mreg->window, &req); + MPI_Wait(&req, MPI_STATUS_IGNORE); + + } else { + + MPI_Fetch_and_op(src, out, type, grp_proc, (MPI_Aint) disp, op, mreg->window); + if (ARMCII_GLOBAL_STATE.end_to_end_flush) { + MPI_Win_flush(grp_proc, mreg->window); + } else { + MPI_Win_flush_local(grp_proc, mreg->window); + } + + } return 0; } diff --git a/src/init_finalize.c b/src/init_finalize.c index f67ebd8..5daae94 100644 --- a/src/init_finalize.c +++ b/src/init_finalize.c @@ -396,6 +396,9 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { /* Use 
MPI_MODE_NOCHECK assertion */ ARMCII_GLOBAL_STATE.rma_nocheck=ARMCII_Getenv_bool("ARMCI_RMA_NOCHECK", 1); + /* Use MPI_MODE_NOCHECK assertion */ + ARMCII_GLOBAL_STATE.use_request_atomics=ARMCII_Getenv_bool("ARMCI_USE_REQUEST_ATOMICS", 0); + /* Setup groups and communicators */ MPI_Comm_dup(comm, &ARMCI_GROUP_WORLD.comm); @@ -543,6 +546,7 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { printf(" NO_FLUSH_LOCAL = %s\n", ARMCII_GLOBAL_STATE.end_to_end_flush ? "TRUE" : "FALSE"); printf(" RMA_NOCHECK = %s\n", ARMCII_GLOBAL_STATE.rma_nocheck ? "TRUE" : "FALSE"); printf(" MSG_BARRIER_SYNCS = %s\n", ARMCII_GLOBAL_STATE.msg_barrier_syncs ? "TRUE" : "FALSE"); + printf(" USE_REQUEST_ATOMICS = %s\n", ARMCII_GLOBAL_STATE.use_request_atomics ? "TRUE" : "FALSE"); /* MPI info set on window */ printf(" USE_ALLOC_SHM = %s\n", ARMCII_GLOBAL_STATE.use_alloc_shm ? "TRUE" : "FALSE"); diff --git a/src/rmw.c b/src/rmw.c index 44ad856..2100bca 100644 --- a/src/rmw.c +++ b/src/rmw.c @@ -76,11 +76,11 @@ int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) { long out_val_l, src_val_l = *((long*)ploc); int out_val_i, src_val_i = *((int*)ploc); + // this is a blocking operation gmr_fetch_and_op(dst_mreg, is_long ? (void*) &src_val_l : (void*) &src_val_i /* src */, is_long ? (void*) &out_val_l : (void*) &out_val_i /* out */, prem /* dst */, type, rop, proc); - gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. flush, local=remote */ if (is_long) *(long*) ploc = out_val_l; else @@ -90,11 +90,11 @@ int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) { long fetch_val_l, add_val_l = value; int fetch_val_i, add_val_i = value; + // this is a blocking operation gmr_fetch_and_op(dst_mreg, is_long ? (void*) &add_val_l : (void*) &add_val_i /* src */, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i /* out */, prem /* dst */, type, rop, proc); - gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. 
flush, local=remote */ if (is_long) *(long*) ploc = fetch_val_l; From 4aa47b270ca61ce7573444ee1761ef4d7f9bbaf7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2024 19:43:47 +0300 Subject: [PATCH 06/27] merge gmr-extras.c into gmr.c Signed-off-by: Jeff Hammond --- Makefile.am | 1 - src/gmr-extras.c | 232 ----------------------------------------------- src/gmr.c | 223 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 220 insertions(+), 236 deletions(-) delete mode 100644 src/gmr-extras.c diff --git a/Makefile.am b/Makefile.am index 6760cb0..8dd1b9d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -18,7 +18,6 @@ libarmci_la_SOURCES = src/buffer.c \ src/internals.c \ src/malloc.c \ src/gmr.c \ - src/gmr-extras.c \ src/message.c \ src/message_gop.c \ src/mutex.c \ diff --git a/src/gmr-extras.c b/src/gmr-extras.c deleted file mode 100644 index 55ac6b0..0000000 --- a/src/gmr-extras.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (C) 2010. See COPYRIGHT in top-level directory. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/** One-sided get-accumulate operation. Source and output buffer must be private. 
- * - * @param[in] mreg Memory region - * @param[in] src Source address (local) - * @param[in] out Result address (local) - * @param[in] dst Destination address (remote) - * @param[in] type MPI type of the given buffers - * @param[in] count Number of elements of the given type to transfer - * @param[in] op MPI_Op to apply at the destination - * @param[in] proc Absolute process id of the target - * @return 0 on success, non-zero on failure - */ -int gmr_get_accumulate(gmr_t *mreg, void *src, void *out, void *dst, int count, MPI_Datatype type, MPI_Op op, int proc) { - ARMCII_Assert_msg(src != NULL && out != NULL, "Invalid local address(es)"); - return gmr_get_accumulate_typed(mreg, src, count, type, out, count, type, dst, count, type, op, proc); -} - -/** One-sided get-accumulate operation with typed arguments. Source and output buffer must be private. - * - * @param[in] mreg Memory region - * @param[in] src Address of source data - * @param[in] src_count Number of elements of the given type at the source - * @param[in] src_type MPI datatype of the source elements - * @param[in] out Address of output buffer (same process as the source) - * @param[in] out_count Number of elements of the given type at the ouput - * @param[in] out_type MPI datatype of the output elements - * @param[in] dst Address of destination buffer - * @param[in] dst_count Number of elements of the given type at the destination - * @param[in] dst_type MPI datatype of the destination elements - * @param[in] size Number of bytes to transfer - * @param[in] op MPI_Op to apply at the destination - * @param[in] proc Absolute process id of target process - * @return 0 on success, non-zero on failure - */ -int gmr_get_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *out, int out_count, MPI_Datatype out_type, - void *dst, int dst_count, MPI_Datatype dst_type, MPI_Op op, int proc) { - - int grp_proc; - gmr_size_t disp; - MPI_Aint lb, extent; - - grp_proc = 
ARMCII_Translate_absolute_to_group(&mreg->group, proc); - ARMCII_Assert(grp_proc >= 0); - ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - - // Calculate displacement from beginning of the window - if (dst == MPI_BOTTOM) - disp = 0; - else - disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); - - // Perform checks - MPI_Type_get_true_extent(dst_type, &lb, &extent); - ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); - ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range"); - - MPI_Get_accumulate(src, src_count, src_type, out, out_count, out_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, op, mreg->window); - - return 0; -} - -/** One-sided fetch-and-op. Source and output buffer must be private. - * - * @param[in] mreg Memory region - * @param[in] src Address of source data - * @param[in] out Address of output buffer (same process as the source) - * @param[in] dst Address of destination buffer - * @param[in] type MPI datatype of the source, output and destination elements - * @param[in] op MPI_Op to apply at the destination - * @param[in] proc Absolute process id of target process - * @return 0 on success, non-zero on failure - */ -int gmr_fetch_and_op(gmr_t *mreg, void *src, void *out, void *dst, - MPI_Datatype type, MPI_Op op, int proc) { - - int grp_proc; - gmr_size_t disp; - - grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc); - ARMCII_Assert(grp_proc >= 0); - ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - - /* built-in types only so no chance of seeing MPI_BOTTOM */ - disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); - - // Perform checks - ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); - ARMCII_Assert_msg(disp <= mreg->slices[proc].size, "Transfer is out of range"); - - if 
(ARMCII_GLOBAL_STATE.use_request_atomics) { - - MPI_Request req; - MPI_Rget_accumulate(src, 1, type, out, 1, type, grp_proc, (MPI_Aint) disp, 1, type, op, mreg->window, &req); - MPI_Wait(&req, MPI_STATUS_IGNORE); - - } else { - - MPI_Fetch_and_op(src, out, type, grp_proc, (MPI_Aint) disp, op, mreg->window); - if (ARMCII_GLOBAL_STATE.end_to_end_flush) { - MPI_Win_flush(grp_proc, mreg->window); - } else { - MPI_Win_flush_local(grp_proc, mreg->window); - } - - } - - return 0; -} - -/** Lock a memory region at all targets so that one-sided operations can be performed. - * - * @param[in] mreg Memory region - * @return 0 on success, non-zero on failure - */ -int gmr_lockall(gmr_t *mreg) { - int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); - - ARMCII_Assert(grp_me >= 0); - ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - - MPI_Win_lock_all((ARMCII_GLOBAL_STATE.rma_nocheck) ? MPI_MODE_NOCHECK : 0, - mreg->window); - - return 0; -} - -/** Unlock a memory region at all targets. - * - * @param[in] mreg Memory region - * @return 0 on success, non-zero on failure - */ -int gmr_unlockall(gmr_t *mreg) { - int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); - - ARMCII_Assert(grp_me >= 0); - ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - - MPI_Win_unlock_all(mreg->window); - - return 0; -} - -/** Flush a memory region for local or remote completion. - * - * @param[in] mreg Memory region - * @param[in] proc Absolute process id of the target - * @param[in] local_only Only flush the operation locally. 
- * @return 0 on success, non-zero on failure - */ -int gmr_flush(gmr_t *mreg, int proc, int local_only) { - int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc); - int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); - - ARMCII_Assert(grp_proc >= 0 && grp_me >= 0); - ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - ARMCII_Assert_msg(grp_proc < mreg->group.size, "grp_proc exceeds group size!"); - - if (!local_only || ARMCII_GLOBAL_STATE.end_to_end_flush) { - MPI_Win_flush(grp_proc, mreg->window); - } else { - MPI_Win_flush_local(grp_proc, mreg->window); - } - - return 0; -} - -/** Flush a memory region for remote completion to all targets. - * - * @param[in] mreg Memory region - * @return 0 on success, non-zero on failure - */ -int gmr_flushall(gmr_t *mreg, int local_only) { - int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); - - ARMCII_Assert(grp_me >= 0); - ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - - if (!local_only || ARMCII_GLOBAL_STATE.end_to_end_flush) { - MPI_Win_flush_all(mreg->window); - } else { - MPI_Win_flush_local_all(mreg->window); - } - - return 0; -} - -/** Sync memory region so that public and private windows are the same. - * - * @param[in] mreg Memory region - * @return 0 on success, non-zero on failure - */ -int gmr_sync(gmr_t *mreg) -{ -#if 0 - // what is the point of this? 
- int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); - ARMCII_Assert(grp_me >= 0); -#endif - ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); - - if (!(mreg->unified)) { - MPI_Win_sync(mreg->window); - } - - return 0; -} - -void gmr_progress(void) -{ - if (ARMCII_GLOBAL_STATE.explicit_nb_progress) { - int flag; - MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ARMCI_GROUP_WORLD.comm, &flag, MPI_STATUS_IGNORE); - } - return; -} - diff --git a/src/gmr.c b/src/gmr.c index bdb2e8d..1befa51 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -484,7 +484,7 @@ int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); // Calculate displacement from beginning of the window - if (dst == MPI_BOTTOM) + if (dst == MPI_BOTTOM) disp = 0; else disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); @@ -546,7 +546,7 @@ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); // Calculate displacement from beginning of the window - if (src == MPI_BOTTOM) + if (src == MPI_BOTTOM) disp = 0; else disp = (gmr_size_t) ((uint8_t*)src - (uint8_t*)mreg->slices[proc].base); @@ -609,7 +609,7 @@ int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); // Calculate displacement from beginning of the window - if (dst == MPI_BOTTOM) + if (dst == MPI_BOTTOM) disp = 0; else disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); @@ -624,3 +624,220 @@ int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src return 0; } +/** One-sided get-accumulate operation. Source and output buffer must be private. 
+ * + * @param[in] mreg Memory region + * @param[in] src Source address (local) + * @param[in] out Result address (local) + * @param[in] dst Destination address (remote) + * @param[in] type MPI type of the given buffers + * @param[in] count Number of elements of the given type to transfer + * @param[in] op MPI_Op to apply at the destination + * @param[in] proc Absolute process id of the target + * @return 0 on success, non-zero on failure + */ +int gmr_get_accumulate(gmr_t *mreg, void *src, void *out, void *dst, int count, MPI_Datatype type, MPI_Op op, int proc) { + ARMCII_Assert_msg(src != NULL && out != NULL, "Invalid local address(es)"); + return gmr_get_accumulate_typed(mreg, src, count, type, out, count, type, dst, count, type, op, proc); +} + +/** One-sided get-accumulate operation with typed arguments. Source and output buffer must be private. + * + * @param[in] mreg Memory region + * @param[in] src Address of source data + * @param[in] src_count Number of elements of the given type at the source + * @param[in] src_type MPI datatype of the source elements + * @param[in] out Address of output buffer (same process as the source) + * @param[in] out_count Number of elements of the given type at the ouput + * @param[in] out_type MPI datatype of the output elements + * @param[in] dst Address of destination buffer + * @param[in] dst_count Number of elements of the given type at the destination + * @param[in] dst_type MPI datatype of the destination elements + * @param[in] size Number of bytes to transfer + * @param[in] op MPI_Op to apply at the destination + * @param[in] proc Absolute process id of target process + * @return 0 on success, non-zero on failure + */ +int gmr_get_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, + void *out, int out_count, MPI_Datatype out_type, + void *dst, int dst_count, MPI_Datatype dst_type, MPI_Op op, int proc) { + + int grp_proc; + gmr_size_t disp; + MPI_Aint lb, extent; + + grp_proc = 
ARMCII_Translate_absolute_to_group(&mreg->group, proc); + ARMCII_Assert(grp_proc >= 0); + ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); + + // Calculate displacement from beginning of the window + if (dst == MPI_BOTTOM) + disp = 0; + else + disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); + + // Perform checks + MPI_Type_get_true_extent(dst_type, &lb, &extent); + ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); + ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range"); + + MPI_Get_accumulate(src, src_count, src_type, out, out_count, out_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, op, mreg->window); + + return 0; +} + +/** One-sided fetch-and-op. Source and output buffer must be private. + * + * @param[in] mreg Memory region + * @param[in] src Address of source data + * @param[in] out Address of output buffer (same process as the source) + * @param[in] dst Address of destination buffer + * @param[in] type MPI datatype of the source, output and destination elements + * @param[in] op MPI_Op to apply at the destination + * @param[in] proc Absolute process id of target process + * @return 0 on success, non-zero on failure + */ +int gmr_fetch_and_op(gmr_t *mreg, void *src, void *out, void *dst, + MPI_Datatype type, MPI_Op op, int proc) { + + int grp_proc; + gmr_size_t disp; + + grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc); + ARMCII_Assert(grp_proc >= 0); + ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); + + /* built-in types only so no chance of seeing MPI_BOTTOM */ + disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); + + // Perform checks + ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); + ARMCII_Assert_msg(disp <= mreg->slices[proc].size, "Transfer is out of range"); + + if 
(ARMCII_GLOBAL_STATE.use_request_atomics) { + + MPI_Request req; + MPI_Rget_accumulate(src, 1, type, out, 1, type, grp_proc, (MPI_Aint) disp, 1, type, op, mreg->window, &req); + MPI_Wait(&req, MPI_STATUS_IGNORE); + + } else { + + MPI_Fetch_and_op(src, out, type, grp_proc, (MPI_Aint) disp, op, mreg->window); + if (ARMCII_GLOBAL_STATE.end_to_end_flush) { + MPI_Win_flush(grp_proc, mreg->window); + } else { + MPI_Win_flush_local(grp_proc, mreg->window); + } + + } + + return 0; +} + +/** Lock a memory region at all targets so that one-sided operations can be performed. + * + * @param[in] mreg Memory region + * @return 0 on success, non-zero on failure + */ +int gmr_lockall(gmr_t *mreg) { + int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); + + ARMCII_Assert(grp_me >= 0); + ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); + + MPI_Win_lock_all((ARMCII_GLOBAL_STATE.rma_nocheck) ? MPI_MODE_NOCHECK : 0, + mreg->window); + + return 0; +} + +/** Unlock a memory region at all targets. + * + * @param[in] mreg Memory region + * @return 0 on success, non-zero on failure + */ +int gmr_unlockall(gmr_t *mreg) { + int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); + + ARMCII_Assert(grp_me >= 0); + ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); + + MPI_Win_unlock_all(mreg->window); + + return 0; +} + +/** Flush a memory region for local or remote completion. + * + * @param[in] mreg Memory region + * @param[in] proc Absolute process id of the target + * @param[in] local_only Only flush the operation locally. 
+ * @return 0 on success, non-zero on failure + */ +int gmr_flush(gmr_t *mreg, int proc, int local_only) { + int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc); + int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); + + ARMCII_Assert(grp_proc >= 0 && grp_me >= 0); + ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); + ARMCII_Assert_msg(grp_proc < mreg->group.size, "grp_proc exceeds group size!"); + + if (!local_only || ARMCII_GLOBAL_STATE.end_to_end_flush) { + MPI_Win_flush(grp_proc, mreg->window); + } else { + MPI_Win_flush_local(grp_proc, mreg->window); + } + + return 0; +} + +/** Flush a memory region for remote completion to all targets. + * + * @param[in] mreg Memory region + * @return 0 on success, non-zero on failure + */ +int gmr_flushall(gmr_t *mreg, int local_only) { + int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); + + ARMCII_Assert(grp_me >= 0); + ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); + + if (!local_only || ARMCII_GLOBAL_STATE.end_to_end_flush) { + MPI_Win_flush_all(mreg->window); + } else { + MPI_Win_flush_local_all(mreg->window); + } + + return 0; +} + +/** Sync memory region so that public and private windows are the same. + * + * @param[in] mreg Memory region + * @return 0 on success, non-zero on failure + */ +int gmr_sync(gmr_t *mreg) +{ +#if 0 + // what is the point of this? 
+ int grp_me = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank); + ARMCII_Assert(grp_me >= 0); +#endif + ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); + + if (!(mreg->unified)) { + MPI_Win_sync(mreg->window); + } + + return 0; +} + +void gmr_progress(void) +{ + if (ARMCII_GLOBAL_STATE.explicit_nb_progress) { + int flag; + MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ARMCI_GROUP_WORLD.comm, &flag, MPI_STATUS_IGNORE); + } + return; +} + From c7b98991ad46a472a15063ce8fbb0292685a9da0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2024 19:48:22 +0300 Subject: [PATCH 07/27] add README for ARMCI_USE_REQUEST_ATOMICS --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 7b8cce2..58c0e42 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,11 @@ Boolean environment variables are enabled when set to a value beginning with Argument to `usleep()` to pause the progress polling loop. +`ARMCI_USE_REQUEST_ATOMICS` (boolean) + + Switch to request-based RMA (with Rget_accumulate) instead of + Fetch_and_op/Compare_and_swap plus a local flush. + ## Noncollective Groups `ARMCI_NONCOLLECTIVE_GROUPS` (boolean) From 97b6809650132811bae903274850a464f41e17cd Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2024 20:05:00 +0300 Subject: [PATCH 08/27] add a nonblocking request handle to all put/get/acc operations in gmr no implementation of request-based RMA yet... 
Signed-off-by: Jeff Hammond --- src/gmr.c | 43 ++++++++++++++++++++++++++----------------- src/gmr.h | 31 +++++++++++++++++++++---------- src/onesided.c | 10 +++++----- src/onesided_nb.c | 16 ++++++++++++---- src/strided.c | 6 +++--- src/strided_nb.c | 6 +++--- src/vector.c | 12 ++++++------ 7 files changed, 76 insertions(+), 48 deletions(-) diff --git a/src/gmr.c b/src/gmr.c index 1befa51..fa94160 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -453,9 +453,10 @@ gmr_t *gmr_lookup(void *ptr, int proc) { * @param[in] proc Absolute process id of target process * @return 0 on success, non-zero on failure */ -int gmr_put(gmr_t *mreg, void *src, void *dst, int size, int proc) { +int gmr_put(gmr_t *mreg, void *src, void *dst, int size, int proc, armci_hdl_t * handle) +{ ARMCII_Assert_msg(src != NULL, "Invalid local address"); - return gmr_put_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc); + return gmr_put_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc, handle); } @@ -473,8 +474,9 @@ int gmr_put(gmr_t *mreg, void *src, void *dst, int size, int proc) { * @return 0 on success, non-zero on failure */ int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *dst, int dst_count, MPI_Datatype dst_type, int proc) { - + void *dst, int dst_count, MPI_Datatype dst_type, + int proc, armci_hdl_t * handle) +{ int grp_proc; gmr_size_t disp; MPI_Aint lb, extent; @@ -515,9 +517,9 @@ int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, * @param[in] proc Absolute process id of target process * @return 0 on success, non-zero on failure */ -int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc) { +int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc, armci_hdl_t * handle) { ARMCII_Assert_msg(dst != NULL, "Invalid local address"); - return gmr_get_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc); + return gmr_get_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc, handle); 
} @@ -535,8 +537,9 @@ int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc) { * @return 0 on success, non-zero on failure */ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *dst, int dst_count, MPI_Datatype dst_type, int proc) { - + void *dst, int dst_count, MPI_Datatype dst_type, + int proc, armci_hdl_t * handle) +{ int grp_proc; gmr_size_t disp; MPI_Aint lb, extent; @@ -578,9 +581,11 @@ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, * @param[in] proc Absolute process id of the target * @return 0 on success, non-zero on failure */ -int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, int proc) { +int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, + int proc, armci_hdl_t * handle) +{ ARMCII_Assert_msg(src != NULL, "Invalid local address"); - return gmr_accumulate_typed(mreg, src, count, type, dst, count, type, proc); + return gmr_accumulate_typed(mreg, src, count, type, dst, count, type, proc, handle); } @@ -598,8 +603,9 @@ int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype ty * @return 0 on success, non-zero on failure */ int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *dst, int dst_count, MPI_Datatype dst_type, int proc) { - + void *dst, int dst_count, MPI_Datatype dst_type, + int proc, armci_hdl_t * handle) +{ int grp_proc; gmr_size_t disp; MPI_Aint lb, extent; @@ -636,9 +642,11 @@ int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src * @param[in] proc Absolute process id of the target * @return 0 on success, non-zero on failure */ -int gmr_get_accumulate(gmr_t *mreg, void *src, void *out, void *dst, int count, MPI_Datatype type, MPI_Op op, int proc) { +int gmr_get_accumulate(gmr_t *mreg, void *src, void *out, void *dst, int count, + MPI_Datatype type, MPI_Op op, int proc, armci_hdl_t * handle) +{ 
ARMCII_Assert_msg(src != NULL && out != NULL, "Invalid local address(es)"); - return gmr_get_accumulate_typed(mreg, src, count, type, out, count, type, dst, count, type, op, proc); + return gmr_get_accumulate_typed(mreg, src, count, type, out, count, type, dst, count, type, op, proc, handle); } /** One-sided get-accumulate operation with typed arguments. Source and output buffer must be private. @@ -659,9 +667,10 @@ int gmr_get_accumulate(gmr_t *mreg, void *src, void *out, void *dst, int count, * @return 0 on success, non-zero on failure */ int gmr_get_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *out, int out_count, MPI_Datatype out_type, - void *dst, int dst_count, MPI_Datatype dst_type, MPI_Op op, int proc) { - + void *out, int out_count, MPI_Datatype out_type, + void *dst, int dst_count, MPI_Datatype dst_type, + MPI_Op op, int proc, armci_hdl_t * handle) +{ int grp_proc; gmr_size_t disp; MPI_Aint lb, extent; diff --git a/src/gmr.h b/src/gmr.h index 5a45317..a86c597 100644 --- a/src/gmr.h +++ b/src/gmr.h @@ -44,28 +44,39 @@ void gmr_destroy(gmr_t *mreg, ARMCI_Group *group); int gmr_destroy_all(void); gmr_t *gmr_lookup(void *ptr, int proc); -int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int target); -int gmr_put(gmr_t *mreg, void *src, void *dst, int size, int target); -int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, int proc); -int gmr_get_accumulate(gmr_t *mreg, void *src, void *out, void *dst, int count, MPI_Datatype type, - MPI_Op op, int proc); +// blocking int gmr_fetch_and_op(gmr_t *mreg, void *src, void *out, void *dst, MPI_Datatype type, MPI_Op op, int proc); +// nonblocking +int gmr_get(gmr_t *mreg, void *src, void *dst, int size, + int target, armci_hdl_t * handle); +int gmr_put(gmr_t *mreg, void *src, void *dst, int size, + int target, armci_hdl_t * handle); +int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, + int proc, armci_hdl_t 
* handle); +int gmr_get_accumulate(gmr_t *mreg, void *src, void *out, void *dst, int count, MPI_Datatype type, + MPI_Op op, int proc, armci_hdl_t * handle); + int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *dst, int dst_count, MPI_Datatype dst_type, int proc); + void *dst, int dst_count, MPI_Datatype dst_type, + int proc, armci_hdl_t * handle); int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *dst, int dst_count, MPI_Datatype dst_type, int proc); + void *dst, int dst_count, MPI_Datatype dst_type, + int proc, armci_hdl_t * handle); int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *dst, int dst_count, MPI_Datatype dst_type, int proc); + void *dst, int dst_count, MPI_Datatype dst_type, + int proc, armci_hdl_t * handle); int gmr_get_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, - void *out, int out_count, MPI_Datatype out_type, void *dst, int dst_count, MPI_Datatype dst_type, - MPI_Op op, int proc); + void *out, int out_count, MPI_Datatype out_type, + void *dst, int dst_count, MPI_Datatype dst_type, + MPI_Op op, int proc, armci_hdl_t * handle); int gmr_lockall(gmr_t *mreg); int gmr_unlockall(gmr_t *mreg); int gmr_flush(gmr_t *mreg, int proc, int local_only); int gmr_flushall(gmr_t *mreg, int local_only); int gmr_sync(gmr_t *mreg); +int gmr_wait(armci_hdl_t * handle); void gmr_progress(void); diff --git a/src/onesided.c b/src/onesided.c index e934a8b..490626a 100644 --- a/src/onesided.c +++ b/src/onesided.c @@ -104,7 +104,7 @@ int PARMCI_Get(void *src, void *dst, int size, int target) { /* Origin buffer is private */ else if (dst_mreg == NULL) { - gmr_get(src_mreg, src, dst, size, target); + gmr_get(src_mreg, src, dst, size, target, NULL /* handle */); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. 
flush, local=remote */ } @@ -117,7 +117,7 @@ int PARMCI_Get(void *src, void *dst, int size, int target) { MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf); ARMCII_Assert(dst_buf != NULL); - gmr_get(src_mreg, src, dst_buf, size, target); + gmr_get(src_mreg, src, dst_buf, size, target, NULL /* handle */); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ ARMCI_Copy(dst_buf, dst, size); @@ -167,7 +167,7 @@ int PARMCI_Put(void *src, void *dst, int size, int target) { /* Origin buffer is private */ else if (src_mreg == NULL) { - gmr_put(dst_mreg, src, dst, size, target); + gmr_put(dst_mreg, src, dst, size, target, NULL /* handle */); gmr_flush(dst_mreg, target, 1); /* flush_local */ } @@ -182,7 +182,7 @@ int PARMCI_Put(void *src, void *dst, int size, int target) { ARMCI_Copy(src, src_buf, size); - gmr_put(dst_mreg, src_buf, dst, size, target); + gmr_put(dst_mreg, src_buf, dst, size, target, NULL /* handle */); gmr_flush(dst_mreg, target, 1); /* flush_local */ MPI_Free_mem(src_buf); @@ -259,7 +259,7 @@ int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int p /* TODO: Support a local accumulate operation more efficiently */ - gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc); + gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc, NULL /* handle */); gmr_flush(dst_mreg, proc, 1); /* flush_local */ if (src_buf != src) diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 4090287..35fc26f 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -76,7 +76,7 @@ int PARMCI_NbPut(void *src, void *dst, int size, int target, armci_hdl_t *handle ARMCI_Copy(src, dst, size); } else { - gmr_put(dst_mreg, src, dst, size, target); + gmr_put(dst_mreg, src, dst, size, target, NULL /* handle */); } if (handle!=NULL) { @@ -120,7 +120,7 @@ int PARMCI_NbGet(void *src, void *dst, int size, int target, armci_hdl_t *handle ARMCI_Copy(src, dst, size); } else { - gmr_get(src_mreg, src, dst, size, target); + gmr_get(src_mreg, 
src, dst, size, target, NULL /* handle */); } if (handle!=NULL) { @@ -192,7 +192,7 @@ int PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int /* TODO: Support a local accumulate operation more efficiently */ - gmr_accumulate(dst_mreg, src_buf, dst, count, type, target); + gmr_accumulate(dst_mreg, src_buf, dst, count, type, target, NULL /* handle */); if (src_buf != src) { /* must wait for local completion to free source buffer */ @@ -223,7 +223,12 @@ int PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int /** Wait for a non-blocking operation to finish. */ -int PARMCI_Wait(armci_hdl_t* handle) { +int PARMCI_Wait(armci_hdl_t* handle) +{ +#ifdef USE_RMA_REQUESTS +#error TODO +#else + gmr_t *cur_mreg = gmr_list; if(handle->aggregate > 0) { @@ -239,6 +244,9 @@ int PARMCI_Wait(armci_hdl_t* handle) { cur_mreg = cur_mreg->next; } } + +#endif + return 0; } diff --git a/src/strided.c b/src/strided.c index 85370d7..2f25954 100644 --- a/src/strided.c +++ b/src/strided.c @@ -158,7 +158,7 @@ int PARMCI_PutS(void *src_ptr, int src_stride_ar[/*stride_levels*/], mreg = gmr_lookup(dst_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_put_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc); + gmr_put_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, NULL /* handle */); gmr_flush(mreg, proc, 1); /* flush_local */ MPI_Type_free(&src_type); @@ -258,7 +258,7 @@ int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], mreg = gmr_lookup(src_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc); + gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc, NULL /* handle */); gmr_flush(mreg, proc, 0); /* COPY: Finish the transfer */ @@ -400,7 +400,7 @@ int PARMCI_AccS(int datatype, void *scale, mreg = gmr_lookup(dst_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared 
pointer"); - gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc); + gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, NULL /* handle */); gmr_flush(mreg, proc, 1); /* flush_local */ MPI_Type_free(&src_type); diff --git a/src/strided_nb.c b/src/strided_nb.c index 7f4f748..2c762bd 100644 --- a/src/strided_nb.c +++ b/src/strided_nb.c @@ -87,7 +87,7 @@ int PARMCI_NbPutS(void *src_ptr, int src_stride_ar[/*stride_levels*/], mreg = gmr_lookup(dst_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_put_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc); + gmr_put_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, NULL /* handle */); MPI_Type_free(&src_type); MPI_Type_free(&dst_type); @@ -194,7 +194,7 @@ int PARMCI_NbGetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], mreg = gmr_lookup(src_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc); + gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc, NULL /* handle */); /* COPY: Finish the transfer */ if (dst_buf != dst_ptr) { @@ -344,7 +344,7 @@ int PARMCI_NbAccS(int datatype, void *scale, mreg = gmr_lookup(dst_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc); + gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, NULL /* handle */); MPI_Type_free(&src_type); MPI_Type_free(&dst_type); diff --git a/src/vector.c b/src/vector.c index d26156c..034ff59 100644 --- a/src/vector.c +++ b/src/vector.c @@ -262,15 +262,15 @@ int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count switch(op) { case ARMCII_OP_PUT: - gmr_put(mreg, src[i], dst[i], elem_count, proc); + gmr_put(mreg, src[i], dst[i], elem_count, proc, NULL /* handle */); flush_local = 1; break; case ARMCII_OP_GET: - 
gmr_get(mreg, src[i], dst[i], elem_count, proc); + gmr_get(mreg, src[i], dst[i], elem_count, proc, NULL /* handle */); flush_local = 0; break; case ARMCII_OP_ACC: - gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc); + gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc, NULL /* handle */); flush_local = 1; break; default: @@ -352,15 +352,15 @@ int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int coun switch(op) { case ARMCII_OP_PUT: - gmr_put_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc); + gmr_put_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc, NULL /* handle */); flush_local = 1; break; case ARMCII_OP_GET: - gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, MPI_BOTTOM, 1, type_loc, proc); + gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, MPI_BOTTOM, 1, type_loc, proc, NULL /* handle */); flush_local = 0; break; case ARMCII_OP_ACC: - gmr_accumulate_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc); + gmr_accumulate_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc, NULL /* handle */); flush_local = 1; break; default: From 97017ce13434a17051d61eed1a0f6b0753570d2c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2024 20:26:17 +0300 Subject: [PATCH 09/27] more request prep Signed-off-by: Jeff Hammond --- src/gmr.c | 50 +++++++++++++++++++++++++++++++++++------------ src/onesided_nb.c | 40 +++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/src/gmr.c b/src/gmr.c index fa94160..4f1a5af 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -486,10 +486,11 @@ int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); // Calculate displacement from beginning of the window - if (dst == MPI_BOTTOM) + if (dst == MPI_BOTTOM) { disp = 0; - else + } else { disp = (gmr_size_t) ((uint8_t*)dst - 
(uint8_t*)mreg->slices[proc].base); + } // Perform checks MPI_Type_get_true_extent(dst_type, &lb, &extent); @@ -504,6 +505,11 @@ int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, (MPI_Aint) disp, dst_count, dst_type, mreg->window); } + if (handle!=NULL) { + /* Regular (not aggregate) handles merely store the target for future flushing. */ + handle->target = grp_proc; + } + return 0; } @@ -517,7 +523,8 @@ int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, * @param[in] proc Absolute process id of target process * @return 0 on success, non-zero on failure */ -int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc, armci_hdl_t * handle) { +int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc, armci_hdl_t * handle) +{ ARMCII_Assert_msg(dst != NULL, "Invalid local address"); return gmr_get_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc, handle); } @@ -549,10 +556,11 @@ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); // Calculate displacement from beginning of the window - if (src == MPI_BOTTOM) + if (src == MPI_BOTTOM) { disp = 0; - else + } else { disp = (gmr_size_t) ((uint8_t*)src - (uint8_t*)mreg->slices[proc].base); + } // Perform checks MPI_Type_get_true_extent(src_type, &lb, &extent); @@ -567,6 +575,11 @@ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, (MPI_Aint) disp, src_count, src_type, mreg->window); } + if (handle!=NULL) { + /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ + handle->target = grp_proc; + } + return 0; } @@ -615,10 +628,11 @@ int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); // Calculate displacement from beginning of the window - if (dst == MPI_BOTTOM) + if (dst == MPI_BOTTOM) { disp = 0; - else + } else { disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); + } // Perform checks MPI_Type_get_true_extent(dst_type, &lb, &extent); @@ -627,6 +641,11 @@ int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, MPI_SUM, mreg->window); + if (handle!=NULL) { + /* Regular (not aggregate) handles merely store the target for future flushing. */ + handle->target = grp_proc; + } + return 0; } @@ -680,17 +699,24 @@ int gmr_get_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype ARMCII_Assert_msg(mreg->window != MPI_WIN_NULL, "A non-null mreg contains a null window."); // Calculate displacement from beginning of the window - if (dst == MPI_BOTTOM) + if (dst == MPI_BOTTOM) { disp = 0; - else + } else { disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base); + } // Perform checks MPI_Type_get_true_extent(dst_type, &lb, &extent); ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range"); - MPI_Get_accumulate(src, src_count, src_type, out, out_count, out_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, op, mreg->window); + MPI_Get_accumulate(src, src_count, src_type, out, out_count, out_type, + grp_proc, (MPI_Aint) disp, dst_count, dst_type, op, mreg->window); + + if (handle!=NULL) { + /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ + handle->target = grp_proc; + } return 0; } @@ -707,8 +733,8 @@ int gmr_get_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype * @return 0 on success, non-zero on failure */ int gmr_fetch_and_op(gmr_t *mreg, void *src, void *out, void *dst, - MPI_Datatype type, MPI_Op op, int proc) { - + MPI_Datatype type, MPI_Op op, int proc) +{ int grp_proc; gmr_size_t disp; diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 35fc26f..089d0b9 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -11,22 +11,33 @@ /** Initialize Non-blocking handle. */ -void ARMCI_INIT_HANDLE(armci_hdl_t *handle) { +void ARMCI_INIT_HANDLE(armci_hdl_t *handle) +{ if (handle!=NULL) { +#ifdef USE_RMA_REQUESTS + handle->batch_size = 0; + handle->single_request = MPI_REQUEST_NULL; + handle->request_array = NULL; +#else handle->aggregate = 0; handle->target = -1; +#endif } else { ARMCII_Warning("ARMCI_INIT_HANDLE given NULL handle"); } return; } - /** Mark a handle as aggregate. */ -void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) { +void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) +{ if (handle!=NULL) { +#ifdef USE_RMA_REQUESTS + ARMCII_Assert_msg(0, "not supported"); +#else handle->aggregate = 1; +#endif } else { ARMCII_Warning("ARMCI_SET_AGGREGATE_HANDLE given NULL handle"); } @@ -38,7 +49,11 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) { */ void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { +#ifdef USE_RMA_REQUESTS + ARMCII_Assert_msg(0, "not supported"); +#else handle->aggregate = 0; +#endif } else { ARMCII_Warning("ARMCI_UNSET_AGGREGATE_HANDLE given NULL handle"); } @@ -79,10 +94,12 @@ int PARMCI_NbPut(void *src, void *dst, int size, int target, armci_hdl_t *handle gmr_put(dst_mreg, src, dst, size, target, NULL /* handle */); } +#if 0 if (handle!=NULL) { /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ handle->target = target; } +#endif gmr_progress(); @@ -123,10 +140,12 @@ int PARMCI_NbGet(void *src, void *dst, int size, int target, armci_hdl_t *handle gmr_get(src_mreg, src, dst, size, target, NULL /* handle */); } +#if 0 if (handle!=NULL) { /* Regular (not aggregate) handles merely store the target for future flushing. */ handle->target = target; } +#endif gmr_progress(); @@ -200,10 +219,12 @@ int PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int MPI_Free_mem(src_buf); } +#if 0 if (handle!=NULL) { /* Regular (not aggregate) handles merely store the target for future flushing. */ handle->target = target; } +#endif gmr_progress(); @@ -263,8 +284,13 @@ int PARMCI_Wait(armci_hdl_t* handle) /** Check if a non-blocking operation has finished. */ -int PARMCI_Test(armci_hdl_t* handle) { +int PARMCI_Test(armci_hdl_t* handle) +{ +#ifdef USE_RMA_REQUESTS +#error TODO +#else return PARMCI_Wait(handle); +#endif } @@ -280,7 +306,8 @@ int PARMCI_Test(armci_hdl_t* handle) { /** Wait for all outstanding non-blocking operations with implicit handles to a particular process to finish. */ -int PARMCI_WaitProc(int proc) { +int PARMCI_WaitProc(int proc) +{ gmr_t *cur_mreg = gmr_list; while (cur_mreg) { @@ -303,7 +330,8 @@ int PARMCI_WaitProc(int proc) { /** Wait for all non-blocking operations with implicit (NULL) handles to finish. */ -int PARMCI_WaitAll(void) { +int PARMCI_WaitAll(void) +{ gmr_t *cur_mreg = gmr_list; while (cur_mreg) { From a578579dde1ff34114d0246e1fb0a04aac89ad97 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 1 Oct 2024 14:42:19 +0200 Subject: [PATCH 10/27] implement request-based RMA this is not working for nonblocking vector ops, which fails in armci-test. all other tests pass, at least in shared-memory. 
Signed-off-by: Jeff Hammond --- src/armci.h | 4 +- src/armci_internals.h | 6 +- src/gmr.c | 154 +++++++++++++++++++++++++++++++++++++++++- src/gmr.h | 1 + src/onesided_nb.c | 71 ++++++++++++++++++- src/strided_nb.c | 21 +----- src/vector.c | 30 ++++---- src/vector_nb.c | 21 +----- 8 files changed, 248 insertions(+), 60 deletions(-) diff --git a/src/armci.h b/src/armci.h index 5185b04..7597cca 100644 --- a/src/armci.h +++ b/src/armci.h @@ -5,6 +5,9 @@ #ifndef _ARMCI_H_ #define _ARMCI_H_ +// TODO add to build system +#define USE_RMA_REQUESTS 1 + #include #define ARMCI_MPI 3 @@ -64,7 +67,6 @@ int ARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/], int count[/*stride_levels+1*/], int stride_levels, int *flag, int value, int proc); - typedef struct armci_hdl_s { #ifdef USE_RMA_REQUESTS diff --git a/src/armci_internals.h b/src/armci_internals.h index aa4103d..b88e4c6 100644 --- a/src/armci_internals.h +++ b/src/armci_internals.h @@ -203,12 +203,12 @@ void ARMCII_Strided_to_dtype(int stride_array[/*stride_levels*/], int count[/*st int stride_levels, MPI_Datatype old_type, MPI_Datatype *new_type); int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size, - int datatype, int overlapping, int same_alloc, int proc, int blocking); + int datatype, int overlapping, int same_alloc, int proc, int blocking, armci_hdl_t * handle); int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count, - MPI_Datatype type, int proc, int consrv /* if 1, batched = safe */, int blocking); + MPI_Datatype type, int proc, int consrv /* if 1, batched = safe */, int blocking, armci_hdl_t * handle); int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count, - MPI_Datatype type, int proc, int blocking); + MPI_Datatype type, int proc, int blocking, armci_hdl_t * handle); armcii_iov_iter_t *ARMCII_Strided_to_iov_iter( void *src_ptr, int src_stride_ar[/*stride_levels*/], diff 
--git a/src/gmr.c b/src/gmr.c index 4f1a5af..4fb12be 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -497,19 +497,48 @@ int gmr_put_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range"); +#ifdef USE_RMA_REQUESTS + + if (handle!=NULL) { + + MPI_Request req = MPI_REQUEST_NULL; + + if (ARMCII_GLOBAL_STATE.rma_atomicity) { + MPI_Raccumulate(src, src_count, src_type, grp_proc, + (MPI_Aint) disp, dst_count, dst_type, + MPI_REPLACE, mreg->window, &req); + } else { + MPI_Rput(src, src_count, src_type, grp_proc, + (MPI_Aint) disp, dst_count, dst_type, + mreg->window, &req); + } + + gmr_handle_add_request(handle, req); + + return 0; + + } + +#endif + if (ARMCII_GLOBAL_STATE.rma_atomicity) { MPI_Accumulate(src, src_count, src_type, grp_proc, - (MPI_Aint) disp, dst_count, dst_type, MPI_REPLACE, mreg->window); + (MPI_Aint) disp, dst_count, dst_type, + MPI_REPLACE, mreg->window); } else { MPI_Put(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, mreg->window); } +#ifndef USE_RMA_REQUESTS + if (handle!=NULL) { /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ handle->target = grp_proc; } +#endif + return 0; } @@ -567,6 +596,30 @@ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); ARMCII_Assert_msg(disp + src_count*extent <= mreg->slices[proc].size, "Transfer is out of range"); +#ifdef USE_RMA_REQUESTS + + if (handle!=NULL) { + + MPI_Request req = MPI_REQUEST_NULL; + + if (ARMCII_GLOBAL_STATE.rma_atomicity) { + MPI_Rget_accumulate(NULL, 0, MPI_BYTE, + dst, dst_count, dst_type, grp_proc, + (MPI_Aint) disp, src_count, src_type, + MPI_NO_OP, mreg->window, &req); + } else { + MPI_Rget(dst, dst_count, dst_type, grp_proc, + (MPI_Aint) disp, src_count, src_type, + mreg->window, &req); + } + + gmr_handle_add_request(handle, req); + + return 0; + } + +#endif + if (ARMCII_GLOBAL_STATE.rma_atomicity) { MPI_Get_accumulate(NULL, 0, MPI_BYTE, dst, dst_count, dst_type, grp_proc, (MPI_Aint) disp, src_count, src_type, MPI_NO_OP, mreg->window); @@ -575,11 +628,15 @@ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, (MPI_Aint) disp, src_count, src_type, mreg->window); } +#ifndef USE_RMA_REQUESTS + if (handle!=NULL) { /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ handle->target = grp_proc; } +#endif + return 0; } @@ -639,13 +696,35 @@ int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range"); +#ifdef USE_RMA_REQUESTS + + if (handle!=NULL) { + + MPI_Request req = MPI_REQUEST_NULL; + + MPI_Raccumulate(src, src_count, src_type, grp_proc, + (MPI_Aint) disp, dst_count, dst_type, + MPI_SUM, mreg->window, &req); + + gmr_handle_add_request(handle, req); + + return 0; + + } + +#endif + MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, MPI_SUM, mreg->window); +#ifndef USE_RMA_REQUESTS + if (handle!=NULL) { /* Regular (not aggregate) handles merely store the target for future flushing. */ handle->target = grp_proc; } +#endif + return 0; } @@ -710,14 +789,39 @@ int gmr_get_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address"); ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range"); - MPI_Get_accumulate(src, src_count, src_type, out, out_count, out_type, - grp_proc, (MPI_Aint) disp, dst_count, dst_type, op, mreg->window); +#ifdef USE_RMA_REQUESTS + + if (handle!=NULL) { + + MPI_Request req = MPI_REQUEST_NULL; + + MPI_Rget_accumulate(src, src_count, src_type, + out, out_count, out_type, + grp_proc, (MPI_Aint) disp, dst_count, dst_type, + op, mreg->window, &req); + + gmr_handle_add_request(handle, req); + + return 0; + + } + +#endif + + MPI_Get_accumulate(src, src_count, src_type, + out, out_count, out_type, + grp_proc, (MPI_Aint) disp, dst_count, dst_type, + op, mreg->window); + +#ifndef USE_RMA_REQUESTS if (handle!=NULL) { /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ + handle->target = grp_proc; + } + +#endif + return 0; } @@ -876,3 +980,54 @@ void gmr_progress(void) return; } +/** Append a request to a nonblocking handle. + * + * @param[in] handle Handle that collects outstanding requests + * @param[in] req Request to append + */ +void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) +{ + ARMCII_Assert_msg(handle->batch_size >= 0, + "handle is corrupt (batch_size < 0)"); + + if (handle->batch_size == 0) { + + ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, + "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); + ARMCII_Assert_msg(handle->request_array == NULL, + "handle is corrupt (request_array is not NULL)"); + + handle->batch_size = 1; + handle->single_request = req; + + } else if (handle->batch_size == 1) { + + ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, + "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); + ARMCII_Assert_msg(handle->request_array == NULL, + "handle is corrupt (request_array is not NULL)"); + + // there is a single request in the handle, so we allocate space for two, + // then copy from the single request to the array and append the new one. + // we nullify the single request to make sure it is not usable. + handle->request_array = malloc( ++handle->batch_size * sizeof(MPI_Request) ); // sized with the incremented count + ARMCII_Assert(handle->request_array != NULL); + handle->request_array[0] = handle->single_request; + handle->request_array[1] = req; + handle->single_request = MPI_REQUEST_NULL; + + } else if (handle->batch_size > 1) { + + ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, + "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); + ARMCII_Assert_msg(handle->request_array != NULL, + "handle is corrupt (request_array is NULL)"); + + // grow the allocation and append the new one. 
+ handle->request_array = realloc( handle->request_array , ++handle->batch_size * sizeof(MPI_Request) ); // sized with the incremented count + ARMCII_Assert(handle->request_array != NULL); + handle->request_array[handle->batch_size-1] = req; + + } + +} diff --git a/src/gmr.h b/src/gmr.h index a86c597..602593b 100644 --- a/src/gmr.h +++ b/src/gmr.h @@ -79,5 +79,6 @@ int gmr_sync(gmr_t *mreg); int gmr_wait(armci_hdl_t * handle); void gmr_progress(void); +void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req); #endif /* HAVE_GMR_H */ diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 089d0b9..2563b8c 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -247,7 +247,42 @@ int PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int int PARMCI_Wait(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS -#error TODO + + ARMCII_Assert_msg(handle->batch_size >= 0, + "handle is corrupt (batch_size < 0)"); + //ARMCII_Assert_msg(handle->batch_size == 0, + // "handle waited on without prior use"); + + if (handle->batch_size == 1) { + + ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, + "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); + ARMCII_Assert_msg(handle->request_array == NULL, + "handle is corrupt (request_array is not NULL)"); + + MPI_Wait( &(handle->single_request), MPI_STATUS_IGNORE ); + handle->batch_size = 0; // MPI_Wait reset single_request to MPI_REQUEST_NULL, so the handle is empty again + + } else if (handle->batch_size > 1) { + + ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, + "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); + ARMCII_Assert_msg(handle->request_array != NULL, + "handle is corrupt (request_array is NULL)"); + + for (int i = 0 ; i < handle->batch_size ; i++ ) { + ARMCII_Assert_msg(handle->request_array[i] != MPI_REQUEST_NULL, + "handle contains MPI_REQUEST_NULL"); + printf("%s %d %s i=%d\n",__FILE__, __LINE__, __func__, i); + MPI_Wait( &(handle->request_array[i]), 
MPI_STATUS_IGNORE ); + } + //MPI_Waitall( handle->batch_size, handle->request_array, MPI_STATUSES_IGNORE ); + free(handle->request_array); + + handle->batch_size = 0; + 
handle->request_array = NULL; + } + #else gmr_t *cur_mreg = gmr_list; @@ -287,7 +321,43 @@ int PARMCI_Wait(armci_hdl_t* handle) int PARMCI_Test(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS -#error TODO + + int flag = 1; // a handle with no pending requests is already complete + + ARMCII_Assert_msg(handle->batch_size >= 0, + "handle is corrupt (batch_size < 0)"); + //ARMCII_Assert_msg(handle->batch_size == 0, + // "handle waited on without prior use"); + + if (handle->batch_size == 1) { + + ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, + "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); + ARMCII_Assert_msg(handle->request_array == NULL, + "handle is corrupt (request_array is not NULL)"); + + MPI_Test( &(handle->single_request), &flag, MPI_STATUS_IGNORE ); + if (flag) handle->batch_size = 0; // completed: MPI_Test reset single_request to MPI_REQUEST_NULL + + } else if (handle->batch_size > 1) { + + ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, + "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); + ARMCII_Assert_msg(handle->request_array != NULL, + "handle is corrupt (request_array is NULL)"); + + MPI_Testall( handle->batch_size, handle->request_array, &flag, MPI_STATUSES_IGNORE ); + // release the requests only once all of them have completed; otherwise keep them for the next test/wait + if (flag) { + free(handle->request_array); + handle->batch_size = 0; + handle->request_array = NULL; + } + } + + // no error codes are supported so we can do this + return (!flag); + #else return PARMCI_Wait(handle); #endif diff --git a/src/strided_nb.c b/src/strided_nb.c index 2c762bd..58f304a 100644 --- a/src/strided_nb.c +++ b/src/strided_nb.c @@ -37,8 +37,8 @@ * @return Zero on success, error code otherwise. 
*/ int PARMCI_NbPutS(void *src_ptr, int src_stride_ar[/*stride_levels*/], - void *dst_ptr, int dst_stride_ar[/*stride_levels*/], - int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t *handle) { + void *dst_ptr, int dst_stride_ar[/*stride_levels*/], + int count[/*stride_levels+1*/], int stride_levels, int proc, armci_hdl_t * handle) { int err; @@ -87,7 +87,7 @@ int PARMCI_NbPutS(void *src_ptr, int src_stride_ar[/*stride_levels*/], mreg = gmr_lookup(dst_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_put_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, NULL /* handle */); + gmr_put_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, handle); MPI_Type_free(&src_type); MPI_Type_free(&dst_type); @@ -98,11 +98,6 @@ int PARMCI_NbPutS(void *src_ptr, int src_stride_ar[/*stride_levels*/], MPI_Free_mem(src_buf); } - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. */ - handle->target = proc; - } - err = 0; } else { @@ -206,11 +201,6 @@ int PARMCI_NbGetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], MPI_Type_free(&src_type); MPI_Type_free(&dst_type); - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. */ - handle->target = proc; - } - err = 0; } else { @@ -355,11 +345,6 @@ int PARMCI_NbAccS(int datatype, void *scale, MPI_Free_mem(src_buf); } - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ - handle->target = proc; - } - err = 0; } else { diff --git a/src/vector.c b/src/vector.c index 034ff59..ebca279 100644 --- a/src/vector.c +++ b/src/vector.c @@ -123,7 +123,7 @@ int ARMCII_Iov_check_same_allocation(void **ptrs, int count, int proc) { * @return Zero on success, error code otherwise */ int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size, - int datatype, int overlapping, int same_alloc, int proc, int blocking) { + int datatype, int overlapping, int same_alloc, int proc, int blocking, armci_hdl_t * handle) { MPI_Datatype type; int type_count, type_size; @@ -148,7 +148,7 @@ int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int coun return ARMCII_Iov_op_safe(op, src, dst, count, type_count, type, proc); #else /* Jeff: We are going to always block when there is buffer overlap. */ - return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc, 1 /* consrv */, 1 /* blocking */); + return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc, 1 /* consrv */, 1 /* blocking */, handle); #endif } @@ -157,10 +157,10 @@ int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int coun else if ( ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_DIRECT || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_AUTO ) { - return ARMCII_Iov_op_datatype(op, src, dst, count, type_count, type, proc, blocking); + return ARMCII_Iov_op_datatype(op, src, dst, count, type_count, type, proc, blocking, handle); } else if (ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_BATCHED) { - return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc, 0 /* not consrv */, blocking); + return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc, 0 /* not consrv */, blocking, handle); } else { ARMCII_Error("unknown iov method (%d)\n", ARMCII_GLOBAL_STATE.iov_method); @@ -226,7 +226,7 @@ int ARMCII_Iov_op_safe(enum ARMCII_Op_e op, void **src, void **dst, int count, i * lock/unlock 
pair. */ int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count, - MPI_Datatype type, int proc, int consrv, int blocking) { + MPI_Datatype type, int proc, int consrv, int blocking, armci_hdl_t * handle) { int i; int flush_local = 1; /* used only for MPI-3 */ @@ -262,15 +262,15 @@ int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count switch(op) { case ARMCII_OP_PUT: - gmr_put(mreg, src[i], dst[i], elem_count, proc, NULL /* handle */); + gmr_put(mreg, src[i], dst[i], elem_count, proc, handle); flush_local = 1; break; case ARMCII_OP_GET: - gmr_get(mreg, src[i], dst[i], elem_count, proc, NULL /* handle */); + gmr_get(mreg, src[i], dst[i], elem_count, proc, handle); flush_local = 0; break; case ARMCII_OP_ACC: - gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc, NULL /* handle */); + gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc, handle); flush_local = 1; break; default: @@ -291,7 +291,7 @@ int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count * datatype to achieve a one-sided gather/scatter. 
*/ int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count, - MPI_Datatype type, int proc, int blocking) { + MPI_Datatype type, int proc, int blocking, armci_hdl_t * handle) { gmr_t *mreg; MPI_Datatype type_loc, type_rem; @@ -352,15 +352,15 @@ int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int coun switch(op) { case ARMCII_OP_PUT: - gmr_put_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc, NULL /* handle */); + gmr_put_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc, handle); flush_local = 1; break; case ARMCII_OP_GET: - gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, MPI_BOTTOM, 1, type_loc, proc, NULL /* handle */); + gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, MPI_BOTTOM, 1, type_loc, proc, handle); flush_local = 0; break; case ARMCII_OP_ACC: - gmr_accumulate_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc, NULL /* handle */); + gmr_accumulate_typed(mreg, MPI_BOTTOM, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc, handle); flush_local = 1; break; default: @@ -411,7 +411,7 @@ int PARMCI_PutV(armci_giov_t *iov, int iov_len, int proc) { ARMCII_Buf_prepare_read_vec(iov[v].src_ptr_array, &src_buf, iov[v].ptr_array_len, iov[v].bytes); ARMCII_Iov_op_dispatch(ARMCII_OP_PUT, src_buf, iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes, 0, - overlapping, same_alloc, proc, 1 /* blocking */); + overlapping, same_alloc, proc, 1 /* blocking */, NULL); ARMCII_Buf_finish_read_vec(iov[v].src_ptr_array, src_buf, iov[v].ptr_array_len, iov[v].bytes); } @@ -452,7 +452,7 @@ int PARMCI_GetV(armci_giov_t *iov, int iov_len, int proc) { ARMCII_Buf_prepare_write_vec(iov[v].dst_ptr_array, &dst_buf, iov[v].ptr_array_len, iov[v].bytes); ARMCII_Iov_op_dispatch(ARMCII_OP_GET, iov[v].src_ptr_array, dst_buf, iov[v].ptr_array_len, iov[v].bytes, 0, - overlapping, same_alloc, proc, 1 /* blocking */); + overlapping, same_alloc, proc, 1 /* blocking */, NULL); 
ARMCII_Buf_finish_write_vec(iov[v].dst_ptr_array, dst_buf, iov[v].ptr_array_len, iov[v].bytes); } @@ -492,7 +492,7 @@ int PARMCI_AccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int p ARMCII_Buf_prepare_acc_vec(iov[v].src_ptr_array, &src_buf, iov[v].ptr_array_len, iov[v].bytes, datatype, scale); ARMCII_Iov_op_dispatch(ARMCII_OP_ACC, src_buf, iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes, datatype, - overlapping, same_alloc, proc, 1 /* blocking */); + overlapping, same_alloc, proc, 1 /* blocking */, NULL); ARMCII_Buf_finish_acc_vec(iov[v].src_ptr_array, src_buf, iov[v].ptr_array_len, iov[v].bytes); } diff --git a/src/vector_nb.c b/src/vector_nb.c index 4f5a98e..0c6a484 100644 --- a/src/vector_nb.c +++ b/src/vector_nb.c @@ -47,15 +47,10 @@ int PARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) ARMCII_Buf_prepare_read_vec(iov[v].src_ptr_array, &src_buf, iov[v].ptr_array_len, iov[v].bytes); ARMCII_Iov_op_dispatch(ARMCII_OP_PUT, src_buf, iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes, 0, - overlapping, same_alloc, proc, blocking); + overlapping, same_alloc, proc, blocking, handle); ARMCII_Buf_finish_read_vec(iov[v].src_ptr_array, src_buf, iov[v].ptr_array_len, iov[v].bytes); } - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ - handle->target = proc; - } - gmr_progress(); return 0; @@ -100,15 +95,10 @@ int PARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) ARMCII_Buf_prepare_write_vec(iov[v].dst_ptr_array, &dst_buf, iov[v].ptr_array_len, iov[v].bytes); ARMCII_Iov_op_dispatch(ARMCII_OP_GET, iov[v].src_ptr_array, dst_buf, iov[v].ptr_array_len, iov[v].bytes, 0, - overlapping, same_alloc, proc, blocking); + overlapping, same_alloc, proc, blocking, handle); ARMCII_Buf_finish_write_vec(iov[v].dst_ptr_array, dst_buf, iov[v].ptr_array_len, iov[v].bytes); } - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. */ - handle->target = proc; - } - gmr_progress(); return 0; @@ -152,15 +142,10 @@ int PARMCI_NbAccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int ARMCII_Buf_prepare_acc_vec(iov[v].src_ptr_array, &src_buf, iov[v].ptr_array_len, iov[v].bytes, datatype, scale); ARMCII_Iov_op_dispatch(ARMCII_OP_ACC, src_buf, iov[v].dst_ptr_array, iov[v].ptr_array_len, iov[v].bytes, datatype, - overlapping, same_alloc, proc, blocking); + overlapping, same_alloc, proc, blocking, handle); ARMCII_Buf_finish_acc_vec(iov[v].src_ptr_array, src_buf, iov[v].ptr_array_len, iov[v].bytes); } - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. */ - handle->target = proc; - } - gmr_progress(); return 0; From b2555a1054b9eda6966c868447efe844abda6d71 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 1 Oct 2024 17:55:33 +0200 Subject: [PATCH 11/27] fix strict aliasing rule violation that offends address sanitizer Signed-off-by: Jeff Hammond --- src/rmw.c | 57 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/src/rmw.c b/src/rmw.c index 2100bca..a45da16 100644 --- a/src/rmw.c +++ b/src/rmw.c @@ -37,8 +37,8 @@ * @param[in] value Value to add to remote location (ignored for swap). 
* @param[in] proc Process rank for the target buffer. */ -int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) { - +int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) +{ int is_swap = 0, is_long = 0; MPI_Datatype type; MPI_Op rop; @@ -73,33 +73,48 @@ int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) { /* We hold the DLA lock if (src_mreg != NULL). */ if (is_swap) { - long out_val_l, src_val_l = *((long*)ploc); - int out_val_i, src_val_i = *((int*)ploc); - - // this is a blocking operation - gmr_fetch_and_op(dst_mreg, - is_long ? (void*) &src_val_l : (void*) &src_val_i /* src */, - is_long ? (void*) &out_val_l : (void*) &out_val_i /* out */, - prem /* dst */, type, rop, proc); - if (is_long) + + if (is_long) { + + long out_val_l, src_val_l = *((long*)ploc); + + // this is a blocking operation + gmr_fetch_and_op(dst_mreg, (void*) &src_val_l, (void*) &out_val_l, prem /* dst */, type, rop, proc); + *(long*) ploc = out_val_l; - else + + } else { + + int out_val_i, src_val_i = *((int*)ploc); + + // this is a blocking operation + gmr_fetch_and_op(dst_mreg, (void*) &src_val_i, (void*) &out_val_i, prem /* dst */, type, rop, proc); + *(int*) ploc = out_val_i; + + } } else /* fetch-and-add */ { - long fetch_val_l, add_val_l = value; - int fetch_val_i, add_val_i = value; - // this is a blocking operation - gmr_fetch_and_op(dst_mreg, - is_long ? (void*) &add_val_l : (void*) &add_val_i /* src */, - is_long ? 
(void*) &fetch_val_l : (void*) &fetch_val_i /* out */, - prem /* dst */, type, rop, proc); + if (is_long) { + + long fetch_val_l, add_val_l = value; + + // this is a blocking operation + gmr_fetch_and_op(dst_mreg, (void*) &add_val_l, (void*) &fetch_val_l, prem /* dst */, type, rop, proc); - if (is_long) *(long*) ploc = fetch_val_l; - else + + } else { + + int fetch_val_i, add_val_i = value; + + // this is a blocking operation + gmr_fetch_and_op(dst_mreg, (void*) &add_val_i, (void*) &fetch_val_i, prem /* dst */, type, rop, proc); + *(int*) ploc = fetch_val_i; + + } } return 0; From d7d2f82485420d5cb87bebf3d48b03d4ede47f0e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 12:26:32 +0200 Subject: [PATCH 12/27] fix request array append Signed-off-by: Jeff Hammond --- src/gmr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gmr.c b/src/gmr.c index 4fb12be..a015f3d 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -1005,7 +1005,8 @@ void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) // there is a single request in the handle, so we allocate space for two, // then copy from the single request to the array and append the new one. // we nullify the single request to make sure it is not usable. - handle->request_array = malloc( handle->batch_size++ * sizeof(MPI_Request) ); + handle->batch_size++; + handle->request_array = malloc( handle->batch_size * sizeof(MPI_Request) ); handle->request_array[0] = handle->single_request; handle->request_array[1] = req; handle->single_request = MPI_REQUEST_NULL; @@ -1018,7 +1019,8 @@ void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) "handle is corrupt (request_array is NULL)"); // grow the allocation and append the new one. 
- handle->request_array = realloc( handle->request_array , handle->batch_size++ * sizeof(MPI_Request) ); + handle->batch_size++; + handle->request_array = realloc( handle->request_array , handle->batch_size * sizeof(MPI_Request) ); handle->request_array[handle->batch_size-1] = req; } From 804771ce57bc1afae46ab90b14d8b41155e52748 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 12:27:52 +0200 Subject: [PATCH 13/27] formatting and C99 loops Signed-off-by: Jeff Hammond --- src/vector_nb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/vector_nb.c b/src/vector_nb.c index 0c6a484..ed41c6c 100644 --- a/src/vector_nb.c +++ b/src/vector_nb.c @@ -27,15 +27,15 @@ * @param[in] proc Target process. * @return Success 0, otherwise non-zero. */ -int PARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) { - int v; +int PARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) +{ int blocking = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { blocking = 1; } - for (v = 0; v < iov_len; v++) { + for (int v = 0; v < iov_len; v++) { void **src_buf; int overlapping, same_alloc; @@ -74,15 +74,15 @@ int PARMCI_NbPutV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) * @param[in] proc Target process. * @return Success 0, otherwise non-zero. */ -int PARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) { - int v; +int PARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) +{ int blocking = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { blocking = 1; } - for (v = 0; v < iov_len; v++) { + for (int v = 0; v < iov_len; v++) { void **dst_buf; int overlapping, same_alloc; @@ -122,15 +122,15 @@ int PARMCI_NbGetV(armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) * @param[in] proc Target process. * @return Success 0, otherwise non-zero. 
*/ -int PARMCI_NbAccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) { - int v; +int PARMCI_NbAccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc, armci_hdl_t* handle) +{ int blocking = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { blocking = 1; } - for (v = 0; v < iov_len; v++) { + for (int v = 0; v < iov_len; v++) { void **src_buf; int overlapping, same_alloc; From cef539131f895b241c7f44c84f2cbe7c5ad33627 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 12:29:03 +0200 Subject: [PATCH 14/27] formatting and C99 loops Signed-off-by: Jeff Hammond --- src/vector.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/vector.c b/src/vector.c index ebca279..71f9964 100644 --- a/src/vector.c +++ b/src/vector.c @@ -123,7 +123,9 @@ int ARMCII_Iov_check_same_allocation(void **ptrs, int count, int proc) { * @return Zero on success, error code otherwise */ int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size, - int datatype, int overlapping, int same_alloc, int proc, int blocking, armci_hdl_t * handle) { + int datatype, int overlapping, int same_alloc, int proc, + int blocking, armci_hdl_t * handle) +{ MPI_Datatype type; int type_count, type_size; @@ -291,7 +293,8 @@ int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count * datatype to achieve a one-sided gather/scatter. */ int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count, - MPI_Datatype type, int proc, int blocking, armci_hdl_t * handle) { + MPI_Datatype type, int proc, int blocking, armci_hdl_t * handle) +{ gmr_t *mreg; MPI_Datatype type_loc, type_rem; @@ -396,10 +399,9 @@ int ARMCII_Iov_op_datatype(enum ARMCII_Op_e op, void **src, void **dst, int coun * @param[in] proc Target process. * @return Success 0, otherwise non-zero. 
*/ -int PARMCI_PutV(armci_giov_t *iov, int iov_len, int proc) { - int v; - - for (v = 0; v < iov_len; v++) { +int PARMCI_PutV(armci_giov_t *iov, int iov_len, int proc) +{ + for (int v = 0; v < iov_len; v++) { void **src_buf; int overlapping, same_alloc; @@ -436,10 +438,9 @@ int PARMCI_PutV(armci_giov_t *iov, int iov_len, int proc) { * @param[in] proc Target process. * @return Success 0, otherwise non-zero. */ -int PARMCI_GetV(armci_giov_t *iov, int iov_len, int proc) { - int v; - - for (v = 0; v < iov_len; v++) { +int PARMCI_GetV(armci_giov_t *iov, int iov_len, int proc) +{ + for (int v = 0; v < iov_len; v++) { void **dst_buf; int overlapping, same_alloc; @@ -477,10 +478,9 @@ int PARMCI_GetV(armci_giov_t *iov, int iov_len, int proc) { * @param[in] proc Target process. * @return Success 0, otherwise non-zero. */ -int PARMCI_AccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc) { - int v; - - for (v = 0; v < iov_len; v++) { +int PARMCI_AccV(int datatype, void *scale, armci_giov_t *iov, int iov_len, int proc) +{ + for (int v = 0; v < iov_len; v++) { void **src_buf; int overlapping, same_alloc; From b0375fa86314793cb365a0732b71e1bdc2443d38 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 12:30:34 +0200 Subject: [PATCH 15/27] add just-flushall code path for when things are weird Signed-off-by: Jeff Hammond --- src/armci.h | 1 + src/onesided_nb.c | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/armci.h b/src/armci.h index 7597cca..9f4a1b9 100644 --- a/src/armci.h +++ b/src/armci.h @@ -70,6 +70,7 @@ int ARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/], typedef struct armci_hdl_s { #ifdef USE_RMA_REQUESTS + int just_flushall; int batch_size; MPI_Request single_request; // used when batch_size=0 (common case) MPI_Request *request_array; // used when batch_size>0 diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 2563b8c..680068a 100644 --- 
a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -15,6 +15,7 @@ void ARMCI_INIT_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { #ifdef USE_RMA_REQUESTS + handle->just_flushall = 1; handle->batch_size = 0; handle->single_request = MPI_REQUEST_NULL; handle->request_array = NULL; @@ -34,7 +35,10 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { #ifdef USE_RMA_REQUESTS - ARMCII_Assert_msg(0, "not supported"); + handle->just_flushall = 1; + handle->batch_size = 0; + handle->single_request = MPI_REQUEST_NULL; + handle->request_array = NULL; #else handle->aggregate = 1; #endif @@ -50,7 +54,10 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { #ifdef USE_RMA_REQUESTS - ARMCII_Assert_msg(0, "not supported"); + handle->just_flushall = 1; + handle->batch_size = 0; + handle->single_request = MPI_REQUEST_NULL; + handle->request_array = NULL; #else handle->aggregate = 0; #endif @@ -73,7 +80,8 @@ void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) { /** Non-blocking put operation. 
*/ -int PARMCI_NbPut(void *src, void *dst, int size, int target, armci_hdl_t *handle) { +int PARMCI_NbPut(void *src, void *dst, int size, int target, armci_hdl_t *handle) +{ gmr_t *src_mreg, *dst_mreg; dst_mreg = gmr_lookup(dst, target); @@ -248,6 +256,18 @@ int PARMCI_Wait(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS + if (handle == NULL || handle->just_flushall || handle->batch_size == 0) { + + gmr_t *cur_mreg = gmr_list; + + while (cur_mreg) { + gmr_flushall(cur_mreg, 1); /* local only */ + cur_mreg = cur_mreg->next; + } + + return 0; + } + ARMCII_Assert_msg(handle->batch_size >= 0, "handle is corrupt (batch_size < 0)"); //ARMCII_Assert_msg(handle->batch_size == 0, @@ -272,7 +292,7 @@ int PARMCI_Wait(armci_hdl_t* handle) for (int i = 0 ; i < handle->batch_size ; i++ ) { ARMCII_Assert_msg(handle->request_array[i] != MPI_REQUEST_NULL, "handle contains MPI_REQUEST_NULL"); - printf("%s %s %s i=%d\n",__FILE__, __LINE__, __func__); + printf("%s %d %s i=%d\n",__FILE__, __LINE__, __func__, i); MPI_Wait( &(handle->request_array[i]), MPI_STATUS_IGNORE ); } //MPI_Waitall( handle->batch_size, handle->request_array, MPI_STATUSES_IGNORE ); @@ -322,6 +342,18 @@ int PARMCI_Test(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS + if (handle == NULL || handle->just_flushall || handle->batch_size == 0) { + + gmr_t *cur_mreg = gmr_list; + + while (cur_mreg) { + gmr_flushall(cur_mreg, 1); /* local only */ + cur_mreg = cur_mreg->next; + } + + return 0; + } + int flag = 0; ARMCII_Assert_msg(handle->batch_size >= 0, From b485a3b32925c6375fd1e19ec5bc95535bed1cea Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 12:41:35 +0200 Subject: [PATCH 16/27] default to request-based atomics; print whether handles use RMA requests Signed-off-by: Jeff Hammond --- src/gmr.c | 10 +++++----- src/init_finalize.c | 9 ++++++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/gmr.c b/src/gmr.c index a015f3d..129b97c 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -244,17 
+244,17 @@ gmr_t *gmr_create(gmr_size_t local_size, void **base_ptrs, ARMCI_Group *group) { unified = false; } #else - int unified = ARMCII_Is_win_unified(mreg->window); - int print = ARMCII_GLOBAL_STATE.verbose; + const int unified = ARMCII_Is_win_unified(mreg->window); + const int print = ARMCII_GLOBAL_STATE.verbose; if (unified == 1) { mreg->unified = true; - if (print) printf("MPI_WIN_MODEL = MPI_WIN_UNIFIED\n"); + if (print > 1) printf("MPI_WIN_MODEL = MPI_WIN_UNIFIED\n"); } else if (unified == 0) { mreg->unified = false; - if (print) printf("MPI_WIN_MODEL = MPI_WIN_SEPARATE\n"); + if (print > 1) printf("MPI_WIN_MODEL = MPI_WIN_SEPARATE\n"); } else { mreg->unified = false; - if (print) printf("MPI_WIN_MODEL not available\n"); + if (print > 1) printf("MPI_WIN_MODEL not available\n"); } #endif if (!(mreg->unified) && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) ) { diff --git a/src/init_finalize.c b/src/init_finalize.c index 5daae94..2db3ac0 100644 --- a/src/init_finalize.c +++ b/src/init_finalize.c @@ -397,7 +397,7 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { ARMCII_GLOBAL_STATE.rma_nocheck=ARMCII_Getenv_bool("ARMCI_RMA_NOCHECK", 1); /* Use MPI_MODE_NOCHECK assertion */ - ARMCII_GLOBAL_STATE.use_request_atomics=ARMCII_Getenv_bool("ARMCI_USE_REQUEST_ATOMICS", 0); + ARMCII_GLOBAL_STATE.use_request_atomics=ARMCII_Getenv_bool("ARMCI_USE_REQUEST_ATOMICS", 1); /* Setup groups and communicators */ @@ -548,6 +548,13 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { printf(" MSG_BARRIER_SYNCS = %s\n", ARMCII_GLOBAL_STATE.msg_barrier_syncs ? "TRUE" : "FALSE"); printf(" USE_REQUEST_ATOMICS = %s\n", ARMCII_GLOBAL_STATE.use_request_atomics ? "TRUE" : "FALSE"); +#ifdef USE_RMA_REQUESTS + const int use_rma_requests = 1; +#else + const int use_rma_requests = 0; +#endif + printf(" USE_RMA_REQUESTS = %s\n", use_rma_requests ? 
"TRUE" : "FALSE"); + /* MPI info set on window */ printf(" USE_ALLOC_SHM = %s\n", ARMCII_GLOBAL_STATE.use_alloc_shm ? "TRUE" : "FALSE"); printf(" DISABLE_SHM_ACC = %s\n", ARMCII_GLOBAL_STATE.disable_shm_accumulate ? "TRUE" : "FALSE"); From 633bcd5e313792874686c0a14698a4cd978b10c9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 13:11:28 +0200 Subject: [PATCH 17/27] turn off just_flushall; use Waitall again Signed-off-by: Jeff Hammond --- src/onesided_nb.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 680068a..6d2ed36 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -15,7 +15,7 @@ void ARMCI_INIT_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { #ifdef USE_RMA_REQUESTS - handle->just_flushall = 1; + handle->just_flushall = 0; handle->batch_size = 0; handle->single_request = MPI_REQUEST_NULL; handle->request_array = NULL; @@ -35,7 +35,7 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { #ifdef USE_RMA_REQUESTS - handle->just_flushall = 1; + handle->just_flushall = 0; handle->batch_size = 0; handle->single_request = MPI_REQUEST_NULL; handle->request_array = NULL; @@ -54,7 +54,7 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) { if (handle!=NULL) { #ifdef USE_RMA_REQUESTS - handle->just_flushall = 1; + handle->just_flushall = 0; handle->batch_size = 0; handle->single_request = MPI_REQUEST_NULL; handle->request_array = NULL; @@ -256,7 +256,7 @@ int PARMCI_Wait(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS - if (handle == NULL || handle->just_flushall || handle->batch_size == 0) { + if (handle == NULL || handle->batch_size == 0 || handle->just_flushall) { gmr_t *cur_mreg = gmr_list; @@ -289,13 +289,15 @@ int PARMCI_Wait(armci_hdl_t* handle) ARMCII_Assert_msg(handle->request_array != NULL, "handle is corrupt (request_array is NULL)"); +#if 0 for (int i = 0 ; i < 
handle->batch_size ; i++ ) { ARMCII_Assert_msg(handle->request_array[i] != MPI_REQUEST_NULL, "handle contains MPI_REQUEST_NULL"); - printf("%s %d %s i=%d\n",__FILE__, __LINE__, __func__, i); MPI_Wait( &(handle->request_array[i]), MPI_STATUS_IGNORE ); } - //MPI_Waitall( handle->batch_size, handle->request_array, MPI_STATUSES_IGNORE ); +#else + MPI_Waitall( handle->batch_size, handle->request_array, MPI_STATUSES_IGNORE ); +#endif free(handle->request_array); handle->batch_size = 0; @@ -342,7 +344,7 @@ int PARMCI_Test(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS - if (handle == NULL || handle->just_flushall || handle->batch_size == 0) { + if (handle == NULL || handle->batch_size == 0 || handle->just_flushall) { gmr_t *cur_mreg = gmr_list; From ac26a2cfeee7ec87bc71ed0e2cb87f381dd4a444 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 13:24:13 +0200 Subject: [PATCH 18/27] remove just_flushall code path; allow inactive handles but warn Signed-off-by: Jeff Hammond --- src/onesided_nb.c | 55 ++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 6d2ed36..e95db43 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -13,9 +13,9 @@ */ void ARMCI_INIT_HANDLE(armci_hdl_t *handle) { - if (handle!=NULL) { + ARMCII_Assert_msg(handle, "handle is NULL"); + if (1 || handle != NULL) { #ifdef USE_RMA_REQUESTS - handle->just_flushall = 0; handle->batch_size = 0; handle->single_request = MPI_REQUEST_NULL; handle->request_array = NULL; @@ -24,7 +24,7 @@ void ARMCI_INIT_HANDLE(armci_hdl_t *handle) handle->target = -1; #endif } else { - ARMCII_Warning("ARMCI_INIT_HANDLE given NULL handle"); + ARMCII_Warning("ARMCI_INIT_HANDLE given NULL handle.\n"); } return; } @@ -33,9 +33,9 @@ void ARMCI_INIT_HANDLE(armci_hdl_t *handle) */ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) { - if (handle!=NULL) { + ARMCII_Assert_msg(handle, "handle is NULL"); + if (1 || handle != 
NULL) { #ifdef USE_RMA_REQUESTS - handle->just_flushall = 0; handle->batch_size = 0; handle->single_request = MPI_REQUEST_NULL; handle->request_array = NULL; @@ -43,7 +43,7 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) handle->aggregate = 1; #endif } else { - ARMCII_Warning("ARMCI_SET_AGGREGATE_HANDLE given NULL handle"); + ARMCII_Warning("ARMCI_SET_AGGREGATE_HANDLE given NULL handle.\n"); } return; } @@ -51,10 +51,11 @@ void ARMCI_SET_AGGREGATE_HANDLE(armci_hdl_t *handle) /** Clear an aggregate handle. */ -void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) { - if (handle!=NULL) { +void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) +{ + ARMCII_Assert_msg(handle, "handle is NULL"); + if (1 || handle != NULL) { #ifdef USE_RMA_REQUESTS - handle->just_flushall = 0; handle->batch_size = 0; handle->single_request = MPI_REQUEST_NULL; handle->request_array = NULL; @@ -62,7 +63,7 @@ void ARMCI_UNSET_AGGREGATE_HANDLE(armci_hdl_t *handle) { handle->aggregate = 0; #endif } else { - ARMCII_Warning("ARMCI_UNSET_AGGREGATE_HANDLE given NULL handle"); + ARMCII_Warning("ARMCI_UNSET_AGGREGATE_HANDLE given NULL handle.\n"); } return; } @@ -256,7 +257,8 @@ int PARMCI_Wait(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS - if (handle == NULL || handle->batch_size == 0 || handle->just_flushall) { +#if 0 + if (handle == NULL || handle->batch_size == 0) { gmr_t *cur_mreg = gmr_list; @@ -267,13 +269,17 @@ int PARMCI_Wait(armci_hdl_t* handle) return 0; } +#endif - ARMCII_Assert_msg(handle->batch_size >= 0, - "handle is corrupt (batch_size < 0)"); - //ARMCII_Assert_msg(handle->batch_size == 0, - // "handle waited on without prior use"); + ARMCII_Assert_msg(handle, "handle is NULL"); + ARMCII_Assert_msg(handle->batch_size >= 0, "handle is corrupt (batch_size < 0)"); + //ARMCII_Assert_msg(handle->batch_size == 0, "handle waited on without prior use"); - if (handle->batch_size == 1) { + if (handle->batch_size == 0) { + + ARMCII_Warning("ARMCI_Wait passed an inactive handle.\n"); 
+ + } else if (handle->batch_size == 1) { ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); @@ -344,7 +350,8 @@ int PARMCI_Test(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS - if (handle == NULL || handle->batch_size == 0 || handle->just_flushall) { +#if 0 + if (handle == NULL || handle->batch_size == 0) { gmr_t *cur_mreg = gmr_list; @@ -355,15 +362,19 @@ int PARMCI_Test(armci_hdl_t* handle) return 0; } +#endif int flag = 0; - ARMCII_Assert_msg(handle->batch_size >= 0, - "handle is corrupt (batch_size < 0)"); - ARMCII_Assert_msg(handle->batch_size == 0, - "handle waited on without prior use"); + ARMCII_Assert_msg(handle, "handle is NULL"); + ARMCII_Assert_msg(handle->batch_size >= 0, "handle is corrupt (batch_size < 0)"); + //ARMCII_Assert_msg(handle->batch_size == 0, "handle waited on without prior use"); + + if (handle->batch_size == 0) { + + ARMCII_Warning("ARMCI_Wait passed an inactive handle.\n"); - if (handle->batch_size == 1) { + } else if (handle->batch_size == 1) { ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); From c4612ffc558d290a857a2db8a242f99199838c2c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 13:36:52 +0200 Subject: [PATCH 19/27] whitespace and code motion Signed-off-by: Jeff Hammond --- src/init_finalize.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/init_finalize.c b/src/init_finalize.c index 2db3ac0..1ec61e4 100644 --- a/src/init_finalize.c +++ b/src/init_finalize.c @@ -238,8 +238,8 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { #ifdef HAVE_PTHREADS /* Check progress thread settings */ - ARMCII_GLOBAL_STATE.progress_thread = ARMCII_Getenv_bool("ARMCI_PROGRESS_THREAD", 0); - ARMCII_GLOBAL_STATE.progress_usleep = ARMCII_Getenv_int("ARMCI_PROGRESS_USLEEP", 0); + 
ARMCII_GLOBAL_STATE.progress_thread = ARMCII_Getenv_bool("ARMCI_PROGRESS_THREAD", 0); + ARMCII_GLOBAL_STATE.progress_usleep = ARMCII_Getenv_int("ARMCI_PROGRESS_USLEEP", 0); if (ARMCII_GLOBAL_STATE.progress_thread && (mpi_thread_level!=MPI_THREAD_MULTIPLE)) { ARMCII_Warning("ARMCI progress thread requires MPI_THREAD_MULTIPLE (%d); progress thread disabled.\n", @@ -258,7 +258,7 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { /* Check for debugging flags */ - ARMCII_GLOBAL_STATE.debug_alloc = ARMCII_Getenv_bool("ARMCI_DEBUG_ALLOC", 0); + ARMCII_GLOBAL_STATE.debug_alloc = ARMCII_Getenv_bool("ARMCI_DEBUG_ALLOC", 0); { int junk; junk = ARMCII_Getenv_bool("ARMCI_FLUSH_BARRIERS", -1); @@ -275,15 +275,15 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { if (ARMCII_Getenv("ARMCI_NONCOLLECTIVE_GROUPS")) { ARMCII_GLOBAL_STATE.noncollective_groups = ARMCII_Getenv_bool("ARMCI_NONCOLLECTIVE_GROUPS", 0); } - ARMCII_GLOBAL_STATE.cache_rank_translation=ARMCII_Getenv_bool("ARMCI_CACHE_RANK_TRANSLATION", 1); + ARMCII_GLOBAL_STATE.cache_rank_translation = ARMCII_Getenv_bool("ARMCI_CACHE_RANK_TRANSLATION", 1); /* Check for IOV flags */ #ifdef NO_SEATBELTS - ARMCII_GLOBAL_STATE.iov_checks = 0; + ARMCII_GLOBAL_STATE.iov_checks = 0; #endif - ARMCII_GLOBAL_STATE.iov_checks = ARMCII_Getenv_bool("ARMCI_IOV_CHECKS", 0); - ARMCII_GLOBAL_STATE.iov_batched_limit = ARMCII_Getenv_int("ARMCI_IOV_BATCHED_LIMIT", 0); + ARMCII_GLOBAL_STATE.iov_checks = ARMCII_Getenv_bool("ARMCI_IOV_CHECKS", 0); + ARMCII_GLOBAL_STATE.iov_batched_limit = ARMCII_Getenv_int("ARMCI_IOV_BATCHED_LIMIT", 0); if (ARMCII_GLOBAL_STATE.iov_batched_limit < 0) { ARMCII_Warning("Ignoring invalid value for ARMCI_IOV_BATCHED_LIMIT (%d)\n", ARMCII_GLOBAL_STATE.iov_batched_limit); @@ -396,9 +396,16 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { /* Use MPI_MODE_NOCHECK assertion */ ARMCII_GLOBAL_STATE.rma_nocheck=ARMCII_Getenv_bool("ARMCI_RMA_NOCHECK", 1); - /* Use 
MPI_MODE_NOCHECK assertion */ + /* Use request-based RMA for atomic operations */ ARMCII_GLOBAL_STATE.use_request_atomics=ARMCII_Getenv_bool("ARMCI_USE_REQUEST_ATOMICS", 1); + /* Use request-based RMA for ARMCI nonblocking with explicit handles */ +#ifdef USE_RMA_REQUESTS + const int use_rma_requests = 1; +#else + const int use_rma_requests = 0; +#endif + /* Setup groups and communicators */ MPI_Comm_dup(comm, &ARMCI_GROUP_WORLD.comm); @@ -547,13 +554,7 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { printf(" RMA_NOCHECK = %s\n", ARMCII_GLOBAL_STATE.rma_nocheck ? "TRUE" : "FALSE"); printf(" MSG_BARRIER_SYNCS = %s\n", ARMCII_GLOBAL_STATE.msg_barrier_syncs ? "TRUE" : "FALSE"); printf(" USE_REQUEST_ATOMICS = %s\n", ARMCII_GLOBAL_STATE.use_request_atomics ? "TRUE" : "FALSE"); - -#ifdef USE_RMA_REQUESTS - const int use_rma_requests = 1; -#else - const int use_rma_requests = 0; -#endif - printf(" USE_RMA_REQUESTS = %s\n", use_rma_requests ? "TRUE" : "FALSE"); + printf(" USE_RMA_REQUESTS = %s\n", use_rma_requests ? "TRUE" : "FALSE"); // compile-time option /* MPI info set on window */ printf(" USE_ALLOC_SHM = %s\n", ARMCII_GLOBAL_STATE.use_alloc_shm ? "TRUE" : "FALSE"); @@ -579,7 +580,7 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { /* Update (Aug. 
2022): it has reappeared in MPICH 4.x, per * https://github.com/pmodels/mpich/issues/6110 */ printf(" Warning: MPI_Win_allocate can lead to correctness issues.\n"); - if ((mpi_implementation == ARMCII_MPICH) && (mpi_impl_major == 4)) { + if ((mpi_implementation == ARMCII_MPICH) && (mpi_impl_major == 4)) { printf(" There is a good chance your implementation is affected!\n"); printf(" See https://github.com/pmodels/mpich/issues/6110 for details.\n"); } From 641ef175386e017d6b2039dde141e1bded1e71b3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 16:13:33 +0300 Subject: [PATCH 20/27] cleanup/finish nonblocking handle stuff Signed-off-by: Jeff Hammond --- src/onesided_nb.c | 27 +++------------------------ src/strided_nb.c | 4 ++-- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/src/onesided_nb.c b/src/onesided_nb.c index e95db43..d9fae59 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -100,16 +100,9 @@ int PARMCI_NbPut(void *src, void *dst, int size, int target, armci_hdl_t *handle ARMCI_Copy(src, dst, size); } else { - gmr_put(dst_mreg, src, dst, size, target, NULL /* handle */); + gmr_put(dst_mreg, src, dst, size, target, handle); } -#if 0 - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. */ - handle->target = target; - } -#endif - gmr_progress(); return 0; @@ -146,15 +139,8 @@ int PARMCI_NbGet(void *src, void *dst, int size, int target, armci_hdl_t *handle ARMCI_Copy(src, dst, size); } else { - gmr_get(src_mreg, src, dst, size, target, NULL /* handle */); - } - -#if 0 - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. 
*/ - handle->target = target; + gmr_get(src_mreg, src, dst, size, target, handle); } -#endif gmr_progress(); @@ -220,7 +206,7 @@ int PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int /* TODO: Support a local accumulate operation more efficiently */ - gmr_accumulate(dst_mreg, src_buf, dst, count, type, target, NULL /* handle */); + gmr_accumulate(dst_mreg, src_buf, dst, count, type, target, handle); if (src_buf != src) { /* must wait for local completion to free source buffer */ @@ -228,13 +214,6 @@ int PARMCI_NbAcc(int datatype, void *scale, void *src, void *dst, int bytes, int MPI_Free_mem(src_buf); } -#if 0 - if (handle!=NULL) { - /* Regular (not aggregate) handles merely store the target for future flushing. */ - handle->target = target; - } -#endif - gmr_progress(); return 0; diff --git a/src/strided_nb.c b/src/strided_nb.c index 58f304a..19e9e59 100644 --- a/src/strided_nb.c +++ b/src/strided_nb.c @@ -189,7 +189,7 @@ int PARMCI_NbGetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], mreg = gmr_lookup(src_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc, NULL /* handle */); + gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc, handle); /* COPY: Finish the transfer */ if (dst_buf != dst_ptr) { @@ -334,7 +334,7 @@ int PARMCI_NbAccS(int datatype, void *scale, mreg = gmr_lookup(dst_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); - gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, NULL /* handle */); + gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc, handle); MPI_Type_free(&src_type); MPI_Type_free(&dst_type); From 87f2b55571ff5267a244e9ec558b602210973344 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 17:57:32 +0300 Subject: [PATCH 21/27] workaround bug Signed-off-by: Jeff Hammond --- src/gmr.c | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/src/gmr.c b/src/gmr.c index 129b97c..6e04dac 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -603,7 +603,9 @@ int gmr_get_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type, MPI_Request req = MPI_REQUEST_NULL; if (ARMCII_GLOBAL_STATE.rma_atomicity) { - MPI_Rget_accumulate(NULL, 0, MPI_BYTE, + // Using the source type instead of MPI_BYTE works around an MPICH bug that appears with + // Intel MPI 2021.10 and Cray MPI 8.1.29 + MPI_Rget_accumulate(NULL, 0, src_type /* MPI_BYTE */, dst, dst_count, dst_type, grp_proc, (MPI_Aint) disp, src_count, src_type, MPI_NO_OP, mreg->window, &req); From 475055f4288a41405167a622a4ddafa21663f867 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 22:11:45 +0300 Subject: [PATCH 22/27] remove just_flushall --- src/armci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/armci.h b/src/armci.h index 9f4a1b9..7597cca 100644 --- a/src/armci.h +++ b/src/armci.h @@ -70,7 +70,6 @@ int ARMCI_PutS_flag(void *src_ptr, int src_stride_ar[/*stride_levels*/], typedef struct armci_hdl_s { #ifdef USE_RMA_REQUESTS - int just_flushall; int batch_size; MPI_Request single_request; // used when batch_size=0 (common case) MPI_Request *request_array; // used when batch_size>0 From 1cbb89405a400630ca502588fb54a3218b9370c4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 Oct 2024 22:11:56 +0300 Subject: [PATCH 23/27] relax assertions in PARMCI_Wait related to uninitialized handles running NWChem generates a huge number of assertions/warnings about bogus handles. it would seem that GA does a bad job of initializing these. 
Signed-off-by: Jeff Hammond --- src/gmr.c | 6 ++++++ src/onesided_nb.c | 28 +++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/gmr.c b/src/gmr.c index 6e04dac..8062e97 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -984,25 +984,31 @@ void gmr_progress(void) void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) { +#if 0 ARMCII_Assert_msg(handle->batch_size >= 0, "handle is corrupt (batch_size < 0)"); +#endif if (handle->batch_size == 0) { +#if 0 ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); ARMCII_Assert_msg(handle->request_array == NULL, "handle is corrupt (request_array is not NULL)"); +#endif handle->batch_size = 1; handle->single_request = req; } else if (handle->batch_size == 1) { +#if 0 ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); ARMCII_Assert_msg(handle->request_array == NULL, "handle is corrupt (request_array is not NULL)"); +#endif // there is a single request in the handle, so we allocate space for two, // then copy from the single request to the array and append the new one. 
diff --git a/src/onesided_nb.c b/src/onesided_nb.c index d9fae59..fb74736 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -251,10 +251,12 @@ int PARMCI_Wait(armci_hdl_t* handle) #endif ARMCII_Assert_msg(handle, "handle is NULL"); - ARMCII_Assert_msg(handle->batch_size >= 0, "handle is corrupt (batch_size < 0)"); - //ARMCII_Assert_msg(handle->batch_size == 0, "handle waited on without prior use"); - if (handle->batch_size == 0) { + if (handle->batch_size < 0) { + + ARMCII_Warning("ARMCI_Wait passed a bogus (uninitialized) handle.\n"); + + } else if (handle->batch_size == 0) { ARMCII_Warning("ARMCI_Wait passed an inactive handle.\n"); @@ -267,12 +269,27 @@ int PARMCI_Wait(armci_hdl_t* handle) MPI_Wait( &(handle->single_request), MPI_STATUS_IGNORE ); + handle->batch_size = 0; + handle->single_request = MPI_REQUEST_NULL; + handle->request_array = NULL; + } else if (handle->batch_size > 1) { +#if 0 ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); ARMCII_Assert_msg(handle->request_array != NULL, "handle is corrupt (request_array is NULL)"); +#else + if (handle->single_request == MPI_REQUEST_NULL) { + printf("handle->batch_size = %d\n"); + ARMCII_Warning("handle is corrupt (single_request_array is not MPI_REQUEST_NULL)\n"); + } + if (handle->request_array != NULL) { + printf("handle->batch_size = %d\n"); + ARMCII_Warning("handle is corrupt (request_array is NULL)\n"); + } +#endif #if 0 for (int i = 0 ; i < handle->batch_size ; i++ ) { @@ -285,8 +302,9 @@ int PARMCI_Wait(armci_hdl_t* handle) #endif free(handle->request_array); - handle->batch_size = 0; - handle->request_array = NULL; + handle->batch_size = 0; + handle->single_request = MPI_REQUEST_NULL; + handle->request_array = NULL; } #else From 34280ee7e4d58eb91bebab92894653bcd7b67691 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 4 Oct 2024 15:02:57 +0300 Subject: [PATCH 24/27] fixed many incorrect/spurious warnings --- 
src/gmr.c | 44 +++++++++--------- src/onesided_nb.c | 112 ++++++++++++++++++---------------------------- 2 files changed, 66 insertions(+), 90 deletions(-) diff --git a/src/gmr.c b/src/gmr.c index 8062e97..34d2427 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -984,31 +984,30 @@ void gmr_progress(void) void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) { -#if 0 - ARMCII_Assert_msg(handle->batch_size >= 0, - "handle is corrupt (batch_size < 0)"); -#endif + if (handle->batch_size < 0) { - if (handle->batch_size == 0) { + ARMCII_Warning("gmr_handle_add_request passed a bogus (uninitialized) handle.\n"); -#if 0 - ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, - "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); - ARMCII_Assert_msg(handle->request_array == NULL, - "handle is corrupt (request_array is not NULL)"); -#endif + } else if (handle->batch_size == 0) { + + if (handle->single_request != MPI_REQUEST_NULL) { + ARMCII_Warning("gmr_handle_add_request: handle is corrupt (single_request_array is not MPI_REQUEST_NULL).\n"); + } + if (handle->request_array != NULL) { + ARMCII_Warning("gmr_handle_add_request: handle is corrupt (request_array is not NULL).\n"); + } handle->batch_size = 1; handle->single_request = req; } else if (handle->batch_size == 1) { -#if 0 - ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, - "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); - ARMCII_Assert_msg(handle->request_array == NULL, - "handle is corrupt (request_array is not NULL)"); -#endif + if (handle->single_request == MPI_REQUEST_NULL) { + ARMCII_Warning("gmr_handle_add_request: handle is corrupt (single_request_array is MPI_REQUEST_NULL).\n"); + } + if (handle->request_array != NULL) { + ARMCII_Warning("gmr_handle_add_request: handle is corrupt (request_array is not NULL).\n"); + } // there is a single request in the handle, so we allocate space for two, // then copy from the single request to the array and append 
the new one. @@ -1021,10 +1020,12 @@ void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) } else if (handle->batch_size > 1) { - ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, - "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); - ARMCII_Assert_msg(handle->request_array != NULL, - "handle is corrupt (request_array is NULL)"); + if (handle->single_request != MPI_REQUEST_NULL) { + ARMCII_Warning("gmr_handle_add_request: handle is corrupt (single_request_array is not MPI_REQUEST_NULL).\n"); + } + if (handle->request_array == NULL) { + ARMCII_Warning("gmr_handle_add_request: handle is corrupt (request_array is NULL).\n"); + } // grow the allocation and append the new one. handle->batch_size++; @@ -1032,5 +1033,4 @@ void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) handle->request_array[handle->batch_size-1] = req; } - } diff --git a/src/onesided_nb.c b/src/onesided_nb.c index fb74736..35c2bc7 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -236,20 +236,6 @@ int PARMCI_Wait(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS -#if 0 - if (handle == NULL || handle->batch_size == 0) { - - gmr_t *cur_mreg = gmr_list; - - while (cur_mreg) { - gmr_flushall(cur_mreg, 1); /* local only */ - cur_mreg = cur_mreg->next; - } - - return 0; - } -#endif - ARMCII_Assert_msg(handle, "handle is NULL"); if (handle->batch_size < 0) { @@ -262,10 +248,12 @@ int PARMCI_Wait(armci_hdl_t* handle) } else if (handle->batch_size == 1) { - ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, - "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); - ARMCII_Assert_msg(handle->request_array == NULL, - "handle is corrupt (request_array is not NULL)"); + if (handle->single_request == MPI_REQUEST_NULL) { + ARMCII_Warning("ARMCI_Wait: handle is corrupt (single_request_array is MPI_REQUEST_NULL)\n"); + } + if (handle->request_array != NULL) { + ARMCII_Warning("ARMCI_Wait: handle is corrupt (request_array is not 
NULL)\n"); + } MPI_Wait( &(handle->single_request), MPI_STATUS_IGNORE ); @@ -275,31 +263,16 @@ int PARMCI_Wait(armci_hdl_t* handle) } else if (handle->batch_size > 1) { -#if 0 - ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, - "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); - ARMCII_Assert_msg(handle->request_array != NULL, - "handle is corrupt (request_array is NULL)"); -#else - if (handle->single_request == MPI_REQUEST_NULL) { - printf("handle->batch_size = %d\n"); - ARMCII_Warning("handle is corrupt (single_request_array is not MPI_REQUEST_NULL)\n"); + if (handle->single_request != MPI_REQUEST_NULL) { + printf("handle->batch_size = %d\n",handle->batch_size); + ARMCII_Warning("ARMCI_Wait: handle is corrupt (single_request_array is not MPI_REQUEST_NULL)\n"); } - if (handle->request_array != NULL) { - printf("handle->batch_size = %d\n"); - ARMCII_Warning("handle is corrupt (request_array is NULL)\n"); + if (handle->request_array == NULL) { + printf("handle->batch_size = %d\n",handle->batch_size); + ARMCII_Warning("ARMCI_Wait: handle is corrupt (request_array is NULL)\n"); } -#endif -#if 0 - for (int i = 0 ; i < handle->batch_size ; i++ ) { - ARMCII_Assert_msg(handle->request_array[i] != MPI_REQUEST_NULL, - "handle contains MPI_REQUEST_NULL"); - MPI_Wait( &(handle->request_array[i]), MPI_STATUS_IGNORE ); - } -#else MPI_Waitall( handle->batch_size, handle->request_array, MPI_STATUSES_IGNORE ); -#endif free(handle->request_array); handle->batch_size = 0; @@ -347,51 +320,54 @@ int PARMCI_Test(armci_hdl_t* handle) { #ifdef USE_RMA_REQUESTS -#if 0 - if (handle == NULL || handle->batch_size == 0) { - - gmr_t *cur_mreg = gmr_list; - - while (cur_mreg) { - gmr_flushall(cur_mreg, 1); /* local only */ - cur_mreg = cur_mreg->next; - } - - return 0; - } -#endif - int flag = 0; ARMCII_Assert_msg(handle, "handle is NULL"); - ARMCII_Assert_msg(handle->batch_size >= 0, "handle is corrupt (batch_size < 0)"); - 
//ARMCII_Assert_msg(handle->batch_size == 0, "handle waited on without prior use"); - if (handle->batch_size == 0) { + if (handle->batch_size < 0) { - ARMCII_Warning("ARMCI_Wait passed an inactive handle.\n"); + ARMCII_Warning("ARMCI_Test passed a bogus (uninitialized) handle.\n"); + + } else if (handle->batch_size == 0) { + + ARMCII_Warning("ARMCI_Test passed an inactive handle.\n"); } else if (handle->batch_size == 1) { - ARMCII_Assert_msg(handle->single_request != MPI_REQUEST_NULL, - "handle is corrupt (single_request_array is MPI_REQUEST_NULL)"); - ARMCII_Assert_msg(handle->request_array == NULL, - "handle is corrupt (request_array is not NULL)"); + if (handle->single_request == MPI_REQUEST_NULL) { + ARMCII_Warning("ARMCI_Test: handle is corrupt (single_request_array is MPI_REQUEST_NULL)\n"); + } + if (handle->request_array != NULL) { + ARMCII_Warning("ARMCI_Test: handle is corrupt (request_array is not NULL)\n"); + } MPI_Test( &(handle->single_request), &flag, MPI_STATUS_IGNORE ); + if (flag) { + handle->batch_size = 0; + handle->single_request = MPI_REQUEST_NULL; + handle->request_array = NULL; + } + } else if (handle->batch_size > 1) { - ARMCII_Assert_msg(handle->single_request == MPI_REQUEST_NULL, - "handle is corrupt (single_request_array is not MPI_REQUEST_NULL)"); - ARMCII_Assert_msg(handle->request_array != NULL, - "handle is corrupt (request_array is NULL)"); + if (handle->single_request != MPI_REQUEST_NULL) { + printf("handle->batch_size = %d\n",handle->batch_size); + ARMCII_Warning("ARMCI_Test: handle is corrupt (single_request_array is not MPI_REQUEST_NULL)\n"); + } + if (handle->request_array == NULL) { + printf("handle->batch_size = %d\n",handle->batch_size); + ARMCII_Warning("ARMCI_Test: handle is corrupt (request_array is NULL)\n"); + } MPI_Testall( handle->batch_size, handle->request_array, &flag, MPI_STATUSES_IGNORE ); - free(handle->request_array); - handle->batch_size = 0; - handle->request_array = NULL; + if (flag) { + 
free(handle->request_array); + handle->batch_size = 0; + handle->single_request = MPI_REQUEST_NULL; + handle->request_array = NULL; + } } // no error codes are supported so we can do this From d85f38732697c9e093f89a0cacdf827d1fd44ac4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 4 Oct 2024 18:28:15 +0300 Subject: [PATCH 25/27] remove printf --- src/onesided_nb.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 35c2bc7..449d80f 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -264,11 +264,9 @@ int PARMCI_Wait(armci_hdl_t* handle) } else if (handle->batch_size > 1) { if (handle->single_request != MPI_REQUEST_NULL) { - printf("handle->batch_size = %d\n",handle->batch_size); ARMCII_Warning("ARMCI_Wait: handle is corrupt (single_request_array is not MPI_REQUEST_NULL)\n"); } if (handle->request_array == NULL) { - printf("handle->batch_size = %d\n",handle->batch_size); ARMCII_Warning("ARMCI_Wait: handle is corrupt (request_array is NULL)\n"); } @@ -352,11 +350,9 @@ int PARMCI_Test(armci_hdl_t* handle) } else if (handle->batch_size > 1) { if (handle->single_request != MPI_REQUEST_NULL) { - printf("handle->batch_size = %d\n",handle->batch_size); ARMCII_Warning("ARMCI_Test: handle is corrupt (single_request_array is not MPI_REQUEST_NULL)\n"); } if (handle->request_array == NULL) { - printf("handle->batch_size = %d\n",handle->batch_size); ARMCII_Warning("ARMCI_Test: handle is corrupt (request_array is NULL)\n"); } From fd8b6c6955b9aa9ce921b6960b824a29c668ff47 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 4 Oct 2024 19:40:21 +0300 Subject: [PATCH 26/27] these warnings appear to generate false positives --- src/gmr.c | 4 ++-- src/onesided_nb.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gmr.c b/src/gmr.c index 34d2427..f18c71a 100644 --- a/src/gmr.c +++ b/src/gmr.c @@ -994,7 +994,7 @@ void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) 
ARMCII_Warning("gmr_handle_add_request: handle is corrupt (single_request_array is not MPI_REQUEST_NULL).\n"); } if (handle->request_array != NULL) { - ARMCII_Warning("gmr_handle_add_request: handle is corrupt (request_array is not NULL).\n"); + //ARMCII_Warning("gmr_handle_add_request: handle is corrupt (request_array is not NULL).\n"); } handle->batch_size = 1; @@ -1006,7 +1006,7 @@ void gmr_handle_add_request(armci_hdl_t * handle, MPI_Request req) ARMCII_Warning("gmr_handle_add_request: handle is corrupt (single_request_array is MPI_REQUEST_NULL).\n"); } if (handle->request_array != NULL) { - ARMCII_Warning("gmr_handle_add_request: handle is corrupt (request_array is not NULL).\n"); + //ARMCII_Warning("gmr_handle_add_request: handle is corrupt (request_array is not NULL).\n"); } // there is a single request in the handle, so we allocate space for two, diff --git a/src/onesided_nb.c b/src/onesided_nb.c index 449d80f..fb3da2b 100644 --- a/src/onesided_nb.c +++ b/src/onesided_nb.c @@ -252,7 +252,7 @@ int PARMCI_Wait(armci_hdl_t* handle) ARMCII_Warning("ARMCI_Wait: handle is corrupt (single_request_array is MPI_REQUEST_NULL)\n"); } if (handle->request_array != NULL) { - ARMCII_Warning("ARMCI_Wait: handle is corrupt (request_array is not NULL)\n"); + //ARMCII_Warning("ARMCI_Wait: handle is corrupt (request_array is not NULL)\n"); } MPI_Wait( &(handle->single_request), MPI_STATUS_IGNORE ); From 87f724d0ce39ccd3698f994036bc3de4e369f5af Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 14 Oct 2024 12:30:31 +0300 Subject: [PATCH 27/27] reorganize init code so Warning works better ARMCII_Warning was called before ARMCI_GROUP_WORLD was initialized, so warnings in init were printed by every rank. 
Signed-off-by: Jeff Hammond --- src/groups.c | 5 +--- src/init_finalize.c | 68 ++++++++++++++++++++++----------------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/src/groups.c b/src/groups.c index 642dae9..fb0b7f5 100644 --- a/src/groups.c +++ b/src/groups.c @@ -11,13 +11,10 @@ #include -/** The ARMCI world group. This is accessed from outside via - * ARMCI_Group_get_world. - */ +/** The ARMCI world group. This is accessed from outside via ARMCI_Group_get_world. */ ARMCI_Group ARMCI_GROUP_WORLD = {0}; ARMCI_Group ARMCI_GROUP_DEFAULT = {0}; - /** Initialize an ARMCI group's remaining fields using the communicator field. */ void ARMCII_Group_init_from_comm(ARMCI_Group *group) { diff --git a/src/init_finalize.c b/src/init_finalize.c index 1ec61e4..7ac5b81 100644 --- a/src/init_finalize.c +++ b/src/init_finalize.c @@ -199,6 +199,22 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { } } + /* Setup groups and communicators - do this before ARMCII_Warning is called! */ + + MPI_Comm_dup(comm, &ARMCI_GROUP_WORLD.comm); + ARMCII_Group_init_from_comm(&ARMCI_GROUP_WORLD); + ARMCI_GROUP_DEFAULT = ARMCI_GROUP_WORLD; + + /* Create GOP operators */ + + MPI_Op_create(ARMCII_Absmin_op, 1 /* commute */, &ARMCI_MPI_ABSMIN_OP); + MPI_Op_create(ARMCII_Absmax_op, 1 /* commute */, &ARMCI_MPI_ABSMAX_OP); + + MPI_Op_create(ARMCII_Msg_sel_min_op, 1 /* commute */, &ARMCI_MPI_SELMIN_OP); + MPI_Op_create(ARMCII_Msg_sel_max_op, 1 /* commute */, &ARMCI_MPI_SELMAX_OP); + + /* Determine environmental settings */ + ARMCII_GLOBAL_STATE.verbose = ARMCII_Getenv_int("ARMCI_VERBOSE", 0); /* Figure out what MPI library we are using, in an attempt to work around bugs. 
*/ @@ -277,7 +293,18 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { } ARMCII_GLOBAL_STATE.cache_rank_translation = ARMCII_Getenv_bool("ARMCI_CACHE_RANK_TRANSLATION", 1); - /* Check for IOV flags */ + /* Check for IOV and Strided flags */ + +#if defined(OPEN_MPI) && defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION < 5) + /* Open-MPI 5 RMA works a lot better than older releases... */ + ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_BATCHED; + ARMCII_GLOBAL_STATE.strided_method = ARMCII_STRIDED_IOV; +#else + /* IOV_DIRECT leads to addr=NULL errors when ARMCI_{GetV,PutV} are used + * Jeff: Is this still true? */ + ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_DIRECT; + ARMCII_GLOBAL_STATE.strided_method = ARMCII_STRIDED_DIRECT; +#endif #ifdef NO_SEATBELTS ARMCII_GLOBAL_STATE.iov_checks = 0; @@ -286,19 +313,11 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { ARMCII_GLOBAL_STATE.iov_batched_limit = ARMCII_Getenv_int("ARMCI_IOV_BATCHED_LIMIT", 0); if (ARMCII_GLOBAL_STATE.iov_batched_limit < 0) { - ARMCII_Warning("Ignoring invalid value for ARMCI_IOV_BATCHED_LIMIT (%d)\n", ARMCII_GLOBAL_STATE.iov_batched_limit); + ARMCII_Warning("Ignoring invalid value for ARMCI_IOV_BATCHED_LIMIT (%d)\n", + ARMCII_GLOBAL_STATE.iov_batched_limit); ARMCII_GLOBAL_STATE.iov_batched_limit = 0; } -#if defined(OPEN_MPI) && defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION < 5) - /* Open-MPI 5 RMA works a lot better than older releases... */ - ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_BATCHED; -#else - /* DIRECT leads to addr=NULL errors when ARMCI_{GetV,PutV} are used - * Jeff: Is this still true? 
*/ - ARMCII_GLOBAL_STATE.iov_method = ARMCII_IOV_DIRECT; -#endif - char *var = ARMCII_Getenv("ARMCI_IOV_METHOD"); if (var != NULL) { if (strcmp(var, "AUTO") == 0) @@ -313,14 +332,6 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { ARMCII_Warning("Ignoring unknown value for ARMCI_IOV_METHOD (%s)\n", var); } - /* Check for Strided flags */ - -#if defined(OPEN_MPI) - ARMCII_GLOBAL_STATE.strided_method = ARMCII_STRIDED_IOV; -#else - ARMCII_GLOBAL_STATE.strided_method = ARMCII_STRIDED_DIRECT; -#endif - var = ARMCII_Getenv("ARMCI_STRIDED_METHOD"); if (var != NULL) { if (strcmp(var, "IOV") == 0) { @@ -334,10 +345,13 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { #ifdef OPEN_MPI if (ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_DIRECT || ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) { + if (ARMCI_GROUP_WORLD.rank == 0) { ARMCII_Warning("MPI Datatypes are broken in RMA in many versions of Open-MPI!\n"); #if defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION == 4) - ARMCII_Warning("Open-MPI 4.0.0 RMA with datatypes is definitely broken. See https://github.com/open-mpi/ompi/issues/6275 for details.\n"); + ARMCII_Warning("Open-MPI 4.0.0 RMA with datatypes is definitely broken." 
+ " See https://github.com/open-mpi/ompi/issues/6275 for details.\n"); #endif + } } #endif @@ -406,20 +420,6 @@ int PARMCI_Init_thread_comm(int armci_requested, MPI_Comm comm) { const int use_rma_requests = 0; #endif - /* Setup groups and communicators */ - - MPI_Comm_dup(comm, &ARMCI_GROUP_WORLD.comm); - ARMCII_Group_init_from_comm(&ARMCI_GROUP_WORLD); - ARMCI_GROUP_DEFAULT = ARMCI_GROUP_WORLD; - - /* Create GOP operators */ - - MPI_Op_create(ARMCII_Absmin_op, 1 /* commute */, &ARMCI_MPI_ABSMIN_OP); - MPI_Op_create(ARMCII_Absmax_op, 1 /* commute */, &ARMCI_MPI_ABSMAX_OP); - - MPI_Op_create(ARMCII_Msg_sel_min_op, 1 /* commute */, &ARMCI_MPI_SELMIN_OP); - MPI_Op_create(ARMCII_Msg_sel_max_op, 1 /* commute */, &ARMCI_MPI_SELMAX_OP); - #ifdef HAVE_MEMKIND_H char * pool_path; if (ARMCII_GLOBAL_STATE.use_win_allocate == ARMCII_MEMKIND_WINDOW_TYPE) {