Skip to content

Commit 957b73a

Browse files
committed
ch4/{ofi,ucx}: Skip collective finalize in sessions model
If there is no MPI_COMM_WORLD, we should skip the global PMI barrier during finalization. In the sessions model it is not guaranteed that every process in the world has initialized MPI (and thus PMI), so a barrier could hang.
1 parent 2f0c8f4 commit 957b73a

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

src/mpid/ch4/netmod/ofi/ofi_init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -933,7 +933,7 @@ int MPIDI_OFI_mpi_finalize_hook(void)
933933
/* sockets provider needs to flush any last lightweight send. */
934934
mpi_errno = flush_send_queue();
935935
MPIR_ERR_CHECK(mpi_errno);
936-
} else if (MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
936+
} else if (!MPIR_Process.comm_world || MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
937937
/* skip collective workarounds */
938938
} else if (strcmp("verbs;ofi_rxm", MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name) == 0
939939
|| strcmp("psm2", MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name) == 0

src/mpid/ch4/netmod/ucx/ucx_init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ int MPIDI_UCX_mpi_finalize_hook(void)
246246
int n = MPIDI_UCX_global.num_vcis;
247247
pending = MPL_malloc(sizeof(ucs_status_ptr_t) * MPIR_Process.size * n * n, MPL_MEM_OTHER);
248248

249-
if (!MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
249+
if (MPIR_Process.comm_world && !MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
250250
/* if some processes are not present, the disconnect may time out and give errors */
251251
mpi_errno = MPIR_pmi_barrier();
252252
MPIR_ERR_CHECK(mpi_errno);

0 commit comments

Comments
 (0)