Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/mpi/comm/commutil.c
Original file line number Diff line number Diff line change
Expand Up @@ -770,10 +770,10 @@ static int init_comm_seq(MPIR_Comm * comm)
* used, for example, to hash vci.
* Builtin-comm, e.g. MPI_COMM_WORLD, always have seq at 0 */
if (!HANDLE_IS_BUILTIN(comm->handle)) {
static int vci_seq = 0;
vci_seq++;
static MPL_atomic_int_t vci_seq = MPL_ATOMIC_INT_T_INITIALIZER(0);
MPL_atomic_fetch_add_int(&vci_seq, 1);

int tmp = vci_seq;
int tmp = MPL_atomic_load_int(&vci_seq);
/* Bcast seq over vci 0 */
MPIR_Assert(comm->seq == 0);

Expand Down
16 changes: 13 additions & 3 deletions src/mpi/comm/contextid.c
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,8 @@ static int find_and_allocate_context_id(uint32_t local_mask[])
* They are used to avoid deadlock in multi-threaded case. In single-threaded
* case, they are not used.
*/
static volatile int eager_nelem = -1;
static volatile int eager_in_use = 0;
static int eager_nelem = -1;
static int eager_in_use = 0;

/* In multi-threaded case, mask_in_use is used to maintain thread safety. In
* single-threaded case, it is always 0. */
Expand Down Expand Up @@ -681,6 +681,9 @@ static int sched_cb_gcn_allocate_cid(MPIR_Comm * comm, int tag, void *state)
int mpi_errno = MPI_SUCCESS;
struct gcn_state *st = state, *tmp;
int newctxid;

MPID_THREAD_CS_ENTER(VCI, MPIR_THREAD_VCI_CTX_MUTEX);

if (st->own_eager_mask) {
newctxid = find_and_allocate_context_id(st->local_mask);
if (st->ctx0)
Expand Down Expand Up @@ -710,6 +713,8 @@ static int sched_cb_gcn_allocate_cid(MPIR_Comm * comm, int tag, void *state)
}
}

MPID_THREAD_CS_EXIT(VCI, MPIR_THREAD_VCI_CTX_MUTEX);

if (*st->ctx0 == 0) {
if (st->local_mask[ALL_OWN_MASK_FLAG] == 1) {
/* --BEGIN ERROR HANDLING-- */
Expand Down Expand Up @@ -796,6 +801,8 @@ static int sched_cb_gcn_copy_mask(MPIR_Comm * comm, int tag, void *state)
int mpi_errno = MPI_SUCCESS;
struct gcn_state *st = state;

MPID_THREAD_CS_ENTER(VCI, MPIR_THREAD_VCI_CTX_MUTEX);

if (st->first_iter) {
memset(st->local_mask, 0, (MPIR_MAX_CONTEXT_MASK + 1) * sizeof(int));
st->own_eager_mask = 0;
Expand Down Expand Up @@ -828,6 +835,8 @@ static int sched_cb_gcn_copy_mask(MPIR_Comm * comm, int tag, void *state)
}
}

MPID_THREAD_CS_EXIT(VCI, MPIR_THREAD_VCI_CTX_MUTEX);

mpi_errno = MPIR_Iallreduce_intra_sched_auto(MPI_IN_PLACE, st->local_mask,
MPIR_MAX_CONTEXT_MASK + 1,
MPIR_UINT32_T_INTERNAL, MPI_BAND,
Expand Down Expand Up @@ -1110,6 +1119,8 @@ void MPIR_Free_contextid(int context_id)
}
}

MPID_THREAD_CS_ENTER(VCI, MPIR_THREAD_VCI_CTX_MUTEX);

/* --BEGIN ERROR HANDLING-- */
/* Check that this context id has been allocated */
if ((context_mask[idx] & (0x1U << bitpos)) != 0) {
Expand All @@ -1123,7 +1134,6 @@ void MPIR_Free_contextid(int context_id)
}
/* --END ERROR HANDLING-- */

MPID_THREAD_CS_ENTER(VCI, MPIR_THREAD_VCI_CTX_MUTEX);
/* MT: Note that this update must be done atomically in the multithreaedd
* case. In the "one, single lock" implementation, that lock is indeed
* held when this operation is called. */
Expand Down
26 changes: 0 additions & 26 deletions src/mpid/ch4/netmod/ofi/ofi_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ static int update_multi_nic_hints(MPIR_Comm * comm)
if (comm) {
/* If the user set a multi-nic hint, but num_nics = 1, disable multi-nic optimizations. */
if (MPIDI_OFI_global.num_nics > 1) {
int was_enabled_striping = MPIDI_OFI_COMM(comm).enable_striping;

/* Check if we should use striping */
if (HAS_PREF_NIC(comm)) {
/* If the user specified a particular NIC, don't use striping. */
Expand All @@ -29,19 +27,10 @@ static int update_multi_nic_hints(MPIR_Comm * comm)
else
MPIDI_OFI_COMM(comm).enable_striping = MPIR_CVAR_CH4_OFI_ENABLE_MULTI_NIC_STRIPING;

/* If striping was on and we disabled it here, decrement the global counter. */
if (was_enabled_striping > 0 && MPIDI_OFI_COMM(comm).enable_striping == 0)
MPIDI_OFI_global.num_comms_enabled_striping--;
/* If striping was off and we enabled it here, increment the global counter. */
else if (was_enabled_striping <= 0 && MPIDI_OFI_COMM(comm).enable_striping != 0)
MPIDI_OFI_global.num_comms_enabled_striping++;

if (MPIDI_OFI_COMM(comm).enable_striping) {
MPIDI_OFI_global.stripe_threshold = MPIR_CVAR_CH4_OFI_MULTI_NIC_STRIPING_THRESHOLD;
}

int was_enabled_hashing = MPIDI_OFI_COMM(comm).enable_hashing;

/* Check if we should use hashing */
if (HAS_PREF_NIC(comm)) {
/* If the user specified a particular NIC, don't use hashing. */
Expand All @@ -60,13 +49,6 @@ static int update_multi_nic_hints(MPIR_Comm * comm)
!comm->hints[MPIR_COMM_HINT_NO_ANY_SOURCE])) {
MPIDI_OFI_COMM(comm).enable_hashing = 0;
}

/* If hashing was on and we disabled it here, decrement the global counter. */
if (was_enabled_hashing > 0 && MPIDI_OFI_COMM(comm).enable_hashing == 0)
MPIDI_OFI_global.num_comms_enabled_hashing--;
/* If hashing was off and we enabled it here, increment the global counter. */
else if (was_enabled_hashing <= 0 && MPIDI_OFI_COMM(comm).enable_hashing != 0)
MPIDI_OFI_global.num_comms_enabled_hashing++;
}
}

Expand Down Expand Up @@ -145,8 +127,6 @@ int MPIDI_OFI_mpi_comm_commit_pre_hook(MPIR_Comm * comm)
MPIDI_OFI_COMM(comm).conn_id = -1;

/* Initialize the multi-nic optimization values */
MPIDI_OFI_global.num_comms_enabled_striping = 0;
MPIDI_OFI_global.num_comms_enabled_hashing = 0;
MPIDI_OFI_COMM(comm).enable_striping = 0;
MPIDI_OFI_COMM(comm).enable_hashing = 0;
MPIDI_OFI_COMM(comm).pref_nic = NULL;
Expand Down Expand Up @@ -192,12 +172,6 @@ int MPIDI_OFI_mpi_comm_free_hook(MPIR_Comm * comm)
int mpi_errno = MPI_SUCCESS;
MPIR_FUNC_ENTER;

/* If we enabled striping or hashing, decrement the counter. */
MPIDI_OFI_global.num_comms_enabled_striping -=
(MPIDI_OFI_COMM(comm).enable_striping != 0 ? 1 : 0);
MPIDI_OFI_global.num_comms_enabled_hashing -=
(MPIDI_OFI_COMM(comm).enable_hashing != 0 ? 1 : 0);

MPL_free(MPIDI_OFI_COMM(comm).pref_nic);

MPIR_FUNC_EXIT;
Expand Down
3 changes: 0 additions & 3 deletions src/mpid/ch4/netmod/ofi/ofi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -644,9 +644,6 @@ int MPIDI_OFI_init_local(int *tag_bits)
/* Initialize RMA keys allocator */
MPIDI_OFI_mr_key_allocator_init();

MPIDI_OFI_global.num_comms_enabled_striping = 0;
MPIDI_OFI_global.num_comms_enabled_hashing = 0;

mpi_errno = ofi_pvar_init();
MPIR_ERR_CHECK(mpi_errno);

Expand Down
2 changes: 0 additions & 2 deletions src/mpid/ch4/netmod/ofi/ofi_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -489,8 +489,6 @@ typedef struct {
MPIDI_OFI_per_vci_t per_vci[MPIDI_CH4_MAX_VCIS];
int num_vcis;
int num_nics;
int num_comms_enabled_striping; /* Number of active communicators with striping enabled */
int num_comms_enabled_hashing; /* Number of active communicators with hashingenabled */
bool am_bufs_registered; /* whether active message buffers are GPU registered */

/* Window/RMA Globals */
Expand Down
4 changes: 4 additions & 0 deletions src/mpid/ch4/src/ch4_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ int MPIDIU_new_avt(int size, int *avtid)
MPIR_FUNC_ENTER;
MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE, (MPL_DBG_FDEST, " new_avt: size=%d", size));

MPID_THREAD_CS_ENTER(VCI, MPIDIU_THREAD_DYNPROC_MUTEX);

*avtid = get_next_avtid();

/* note: zeroed so is_local default to 0, which is true for *avtid > 0 */
Expand All @@ -102,6 +104,8 @@ int MPIDIU_new_avt(int size, int *avtid)
}
MPIDI_global.avt_mgr.av_tables[*avtid] = new_av_table;

MPID_THREAD_CS_EXIT(VCI, MPIDIU_THREAD_DYNPROC_MUTEX);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grammar error in commit message.

MPIR_FUNC_EXIT;
return mpi_errno;
}
Expand Down
2 changes: 1 addition & 1 deletion src/mpid/common/sched/mpidu_sched.c
Original file line number Diff line number Diff line change
Expand Up @@ -1185,7 +1185,7 @@ int MPIDU_Sched_progress(int vci, int *made_progress)
* For example, with MPI_Comm_idup, sched_cb_gcn_allocate_cid will call MPIR_Allreduce.
* This inner progress should skip Sched progress to avoid recursive situation.
*/
static int in_sched_progress = 0;
static MPL_TLS int in_sched_progress = 0;

if (in_sched_progress) {
return MPI_SUCCESS;
Expand Down
24 changes: 7 additions & 17 deletions src/mpid/common/thread/mpidu_thread_fallback.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,23 +137,13 @@ M*/
#define MPIDUI_THREAD_CS_ENTER(mutex) \
do { \
if (MPIR_ThreadInfo.isThreaded) { \
int equal_ = 0; \
MPL_thread_id_t self_, owner_; \
MPL_thread_self(&self_); \
owner_ = mutex.owner; \
MPL_thread_same(&self_, &owner_, &equal_); \
if (!equal_) { \
int err_ = 0; \
MPL_DBG_MSG_P(MPIR_DBG_THREAD,VERBOSE,"enter MPIDU_Thread_mutex_lock %p", &mutex); \
MPIDU_Thread_mutex_lock(&mutex, &err_, MPL_THREAD_PRIO_HIGH);\
MPL_DBG_MSG_P(MPIR_DBG_THREAD,VERBOSE,"exit MPIDU_Thread_mutex_lock %p", &mutex); \
MPIR_Assert(err_ == 0); \
MPIR_Assert(mutex.count == 0); \
MPL_thread_self(&mutex.owner); \
} else { \
/* assert all recursive usage */ \
MPIR_Assert(0); \
} \
int err_ = 0; \
MPL_DBG_MSG_P(MPIR_DBG_THREAD,VERBOSE,"enter MPIDU_Thread_mutex_lock %p", &mutex); \
MPIDU_Thread_mutex_lock(&mutex, &err_, MPL_THREAD_PRIO_HIGH); \
MPL_DBG_MSG_P(MPIR_DBG_THREAD,VERBOSE,"exit MPIDU_Thread_mutex_lock %p", &mutex); \
MPIR_Assert(err_ == 0); \
MPIR_Assert(mutex.count == 0); \
MPL_thread_self(&mutex.owner); \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The commit message is inaccurate. It is removing the ownership check. The recursive lock check is still on. And the reason for that change is because we no longer support recursive locking and the owner checking is no longer needed. The race condition is a trigger for the removal, but not the reason. If it is the reason, we should fix it rather than removing it.

mutex.count++; \
} \
} while (0)
Expand Down