diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 1b2fca7d6f1..c72cdd24d44 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -634,7 +634,6 @@ int mca_spml_ucx_clear_put_op_mask(mca_spml_ucx_ctx_t *ctx) int mca_spml_ucx_add_procs(oshmem_group_t* group, size_t nprocs) { int rc = OSHMEM_ERROR; - int my_rank = oshmem_my_proc_id(); size_t ucp_workers = mca_spml_ucx.ucp_workers; unsigned int *wk_roffs = NULL; unsigned int *wk_rsizes = NULL; @@ -644,6 +643,8 @@ int mca_spml_ucx_add_procs(oshmem_group_t* group, size_t nprocs) ucp_address_t **wk_local_addr; unsigned int *wk_addr_len; ucp_ep_params_t ep_params; + int *indices; + int proc_index, swap_index, temp_index; wk_local_addr = calloc(mca_spml_ucx.ucp_workers, sizeof(ucp_address_t *)); wk_addr_len = calloc(mca_spml_ucx.ucp_workers, sizeof(size_t)); @@ -691,23 +692,40 @@ int mca_spml_ucx_add_procs(oshmem_group_t* group, size_t nprocs) } } + indices = malloc(nprocs * sizeof(*indices)); + if (!indices) { + goto error; + } + + for (i = 0; i < nprocs; i++) { + indices[i] = i; + } + + srand((unsigned int)time(NULL)); + /* Get the EP connection requests for all the processes from modex */ - for (n = 0; n < nprocs; ++n) { - i = (my_rank + n) % nprocs; + for (proc_index = nprocs - 1; proc_index >= 0; --proc_index) { + /* Fisher-Yates shuffle algorithm */ + if (proc_index > 0) { + swap_index = rand() % (proc_index + 1); + temp_index = indices[proc_index]; + indices[proc_index] = indices[swap_index]; + indices[swap_index] = temp_index; + } ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - ep_params.address = (ucp_address_t *)mca_spml_ucx.remote_addrs_tbl[0][i]; + ep_params.address = (ucp_address_t *) mca_spml_ucx.remote_addrs_tbl[0][indices[proc_index]]; err = ucp_ep_create(mca_spml_ucx_ctx_default.ucp_worker[0], &ep_params, - &mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn); + &mca_spml_ucx_ctx_default.ucp_peers[indices[proc_index]].ucp_conn); if (UCS_OK != err) { - SPML_UCX_ERROR("ucp_ep_create(proc=%zu/%zu) failed: %s", n, nprocs, + SPML_UCX_ERROR("ucp_ep_create(proc=%d/%zu) failed: %s", proc_index, nprocs, ucs_status_string(err)); goto error2; } /* Initialize mkeys as NULL for all processes */ - mca_spml_ucx_peer_mkey_cache_init(&mca_spml_ucx_ctx_default, i); + mca_spml_ucx_peer_mkey_cache_init(&mca_spml_ucx_ctx_default, indices[proc_index]); } for (i = 0; i < mca_spml_ucx.ucp_workers; i++) { @@ -719,6 +737,7 @@ int mca_spml_ucx_add_procs(oshmem_group_t* group, size_t nprocs) free(wk_roffs); free(wk_addr_len); free(wk_local_addr); + free(indices); SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***"); @@ -753,6 +772,7 @@ int mca_spml_ucx_add_procs(oshmem_group_t* group, size_t nprocs) free(wk_raddrs); free(wk_rsizes); free(wk_roffs); + free(indices); error: free(wk_addr_len); free(wk_local_addr);