Skip to content

Commit 4b8a027

Browse files
committed
ch4/{ofi,ucx}: Skip collective finalize in sessions model
If there is no MPI_COMM_WORLD (as can happen in the sessions model), we should skip the global PMI barrier during finalization. In that case it is not guaranteed that all world processes have initialized MPI (and thus PMI), so a global barrier could hang.
1 parent ec6b39f commit 4b8a027

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

src/mpid/ch4/netmod/ofi/ofi_init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -852,7 +852,7 @@ int MPIDI_OFI_mpi_finalize_hook(void)
852852
/* sockets provider need flush any last lightweight send. */
853853
mpi_errno = flush_send_queue();
854854
MPIR_ERR_CHECK(mpi_errno);
855-
} else if (MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
855+
} else if (!MPIR_Process.comm_world || MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
856856
/* skip collective work arounds */
857857
} else if (strcmp("verbs;ofi_rxm", MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name) == 0
858858
|| strcmp("psm2", MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name) == 0

src/mpid/ch4/netmod/ucx/ucx_init.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ int MPIDI_UCX_mpi_finalize_hook(void)
243243
int n = MPIDI_UCX_global.num_vcis;
244244
pending = MPL_malloc(sizeof(ucs_status_ptr_t) * MPIR_Process.size * n * n, MPL_MEM_OTHER);
245245

246-
if (!MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
246+
if (MPIR_Process.comm_world && !MPIR_CVAR_NO_COLLECTIVE_FINALIZE) {
247247
/* if some process are not present, the disconnect may timeout and give errors */
248248
mpi_errno = MPIR_pmi_barrier();
249249
MPIR_ERR_CHECK(mpi_errno);

0 commit comments

Comments
 (0)