Skip to content

Commit 4f08552

Browse files
authored
Merge pull request #7258 from raffenet/4.3.x-update
[4.3.x] misc: Fixes for ch4:ofi and prep for 4.3.0rc3 Approved-by: Hui Zhou <[email protected]>
2 parents e12effd + 999d6ce commit 4f08552

File tree

12 files changed

+75
-66
lines changed

12 files changed

+75
-66
lines changed

CHANGES

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@
6565

6666
# Fix compilation issue with g++ in -std=gnu++20 mode
6767

68+
# Fix bug in MPI_ANY_SOURCE handling observed using the libfabric CXI
69+
provider
70+
71+
# Add NIC information to error messages in ch4:ofi netmod
72+
6873
===============================================================================
6974
Changes in 4.2
7075
===============================================================================

maint/extracterrmsgs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -680,12 +680,12 @@ sub ProcessFile
680680
# add longnames since we omit errnames.txt for these
681681
$longnames{"**ofid_$name"} = "OFI call $name failed";
682682
$longnamesDefined{"**ofid_$name"} = "$filename:$linecount";
683-
$longnames{"**ofid_$name %s %d %s %s"} = "OFI call $name failed (%s:%d:%s:%s)";
684-
$longnamesDefined{"**ofid_$name %s %d %s %s"} = "$filename:$linecount";
683+
$longnames{"**ofid_$name %s %s"} = "OFI call $name failed (default nic=%s: %s)";
684+
$longnamesDefined{"**ofid_$name %s %s"} = "$filename:$linecount";
685685
}
686686

687687
$generic_msgs{"**ofid_$name"}++;
688-
$specific_msgs{"**ofid_$name %s %d %s %s"}++;
688+
$specific_msgs{"**ofid_$name %s %s"}++;
689689

690690
next;
691691
}

maint/version.m4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# changing this by playing with diversions, but then we would probably be
1515
# playing with autotools-fire.
1616

17-
m4_define([MPICH_VERSION_m4],[4.3.0rc2])dnl
17+
m4_define([MPICH_VERSION_m4],[4.3.0rc3])dnl
1818
m4_define([MPICH_RELEASE_DATE_m4],[unreleased development copy])dnl
1919

2020
# For libtool ABI versioning rules see:

src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i
7070
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
7171
do {
7272
ret = fi_cntr_wait(rcv_cntr, 1, 1);
73-
MPIR_ERR_CHKANDJUMP4(ret < 0 && ret != -FI_ETIMEDOUT,
73+
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
7474
mpi_errno, MPI_ERR_RMA_RANGE,
75-
"**ofid_cntr_wait", "**ofid_cntr_wait %s %d %s %s",
76-
__SHORT_FILE__, __LINE__, __func__, fi_strerror(-ret));
75+
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
76+
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
7777
MPID_Progress_test(NULL);
7878
} while (ret == -FI_ETIMEDOUT);
7979
} else {
@@ -90,10 +90,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i
9090
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
9191
do {
9292
ret = fi_cntr_wait(snd_cntr, num_children, 1);
93-
MPIR_ERR_CHKANDJUMP4(ret < 0 && ret != -FI_ETIMEDOUT,
93+
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
9494
mpi_errno, MPI_ERR_RMA_RANGE,
95-
"**ofid_cntr_wait", "**ofid_cntr_wait %s %d %s %s",
96-
__SHORT_FILE__, __LINE__, __func__, fi_strerror(-ret));
95+
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
96+
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
9797
MPID_Progress_test(NULL);
9898
} while (ret == -FI_ETIMEDOUT);
9999
} else {

src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer
6969
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
7070
do {
7171
ret = fi_cntr_wait(rcv_cntr, 1, 1);
72-
MPIR_ERR_CHKANDJUMP4(ret < 0 && ret != -FI_ETIMEDOUT,
72+
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
7373
mpi_errno, MPI_ERR_RMA_RANGE,
74-
"**ofid_cntr_wait", "**ofid_cntr_wait %s %d %s %s",
75-
__SHORT_FILE__, __LINE__, __func__, fi_strerror(-ret));
74+
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
75+
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
7676
MPID_Progress_test(NULL);
7777
} while (ret == -FI_ETIMEDOUT);
7878
} else {
@@ -89,10 +89,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer
8989
if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) {
9090
do {
9191
ret = fi_cntr_wait(snd_cntr, num_children, 1);
92-
MPIR_ERR_CHKANDJUMP4(ret < 0 && ret != -FI_ETIMEDOUT,
92+
MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT,
9393
mpi_errno, MPI_ERR_RMA_RANGE,
94-
"**ofid_cntr_wait", "**ofid_cntr_wait %s %d %s %s",
95-
__SHORT_FILE__, __LINE__, __func__, fi_strerror(-ret));
94+
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s",
95+
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret));
9696
MPID_Progress_test(NULL);
9797
} while (ret == -FI_ETIMEDOUT);
9898
} else {

src/mpid/ch4/netmod/ofi/errnames.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010
# Most of the libfabric call error names are generated from MPIDI_OFI_CALL macros,
1111
# some of them are used explicitly via MPIR_ERR_CHKANDJUMP2 or MPIR_ERR_SETFATALANDJUMP2; those need to be listed here.
1212
**ofid_cancel:OFI cancel failed
13-
**ofid_cancel %s %d %s %s:OFI cancel failed (%s:%d:%s:%s)
13+
**ofid_cancel %s %s:OFI cancel failed (default nic=%s: %s)
1414
**ofid_cntr_open:OFI Counter open failed
15-
**ofid_cntr_open %s %d %s %s:OFI OFI Counter open failed (%s:%d:%s:%s)
15+
**ofid_cntr_open %s %s:OFI Counter open failed (default nic=%s: %s)
1616
**ofid_cntr_wait:OFI Counter wait failed
17-
**ofid_cntr_wait %s %d %s %s:OFI OFI Counter wait failed (%s:%d:%s:%s)
17+
**ofid_cntr_wait %s %s:OFI Counter wait failed (default nic=%s: %s)
1818
**ofid_enable_trigger:OFI triggered ops enable failed
19-
**ofid_enable_trigger %s %d %s %s:OFI triggered ops enable failed (%s:%d:%s:%s)
19+
**ofid_enable_trigger %s %s:OFI triggered ops enable failed (default nic=%s: %s)
2020
**ofid_issue_trigger:OFI triggered ops issue failed
21-
**ofid_issue_trigger %s %d %s %s:OFI triggered ops issue failed (%s:%d:%s:%s)
21+
**ofid_issue_trigger %s %s:OFI triggered ops issue failed (default nic=%s: %s)
2222
**ofid_poll:OFI poll failed
23-
**ofid_poll %s %d %s %s:OFI poll failed (%s:%d:%s:%s)
23+
**ofid_poll %s %s:OFI poll failed (default nic=%s: %s)

src/mpid/ch4/netmod/ofi/ofi_events.c

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -737,9 +737,10 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret)
737737
break;
738738

739739
default:
740-
MPIR_ERR_SETFATALANDJUMP4(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
741-
"**ofid_poll %s %d %s %s", __SHORT_FILE__,
742-
__LINE__, __func__, fi_strerror(e.err));
740+
MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
741+
"**ofid_poll %s %s",
742+
MPIDI_OFI_DEFAULT_NIC_NAME,
743+
fi_strerror(e.err));
743744
}
744745

745746
break;
@@ -782,17 +783,17 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret)
782783
break;
783784

784785
default:
785-
MPIR_ERR_SETFATALANDJUMP4(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
786-
"**ofid_poll %s %d %s %s", __SHORT_FILE__,
787-
__LINE__, __func__, fi_strerror(e.err));
786+
MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
787+
"**ofid_poll %s %s",
788+
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(e.err));
788789
}
789790

790791
break;
791792

792793
default:
793-
MPIR_ERR_SETFATALANDJUMP4(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
794-
"**ofid_poll %s %d %s %s", __SHORT_FILE__, __LINE__,
795-
__func__, fi_strerror(errno));
794+
MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll",
795+
"**ofid_poll %s %s",
796+
MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(errno));
796797
}
797798

798799
fn_exit:

src/mpid/ch4/netmod/ofi/ofi_impl.h

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ ATTRIBUTE((unused));
3838

3939
#define MPIDI_OFI_WIN(win) ((win)->dev.netmod.ofi)
4040

41+
#define MPIDI_OFI_NIC_NAME(nic) (MPIDI_OFI_global.prov_use[nic] ? \
42+
MPIDI_OFI_global.prov_use[nic]->domain_attr->name : "(n/a)")
43+
#define MPIDI_OFI_DEFAULT_NIC_NAME (MPIDI_OFI_NIC_NAME(0))
44+
4145
int MPIDI_OFI_progress_uninlined(int vci);
4246
int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
4347

@@ -55,18 +59,16 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
5559
#define MPIDI_OFI_PROGRESS_WHILE(cond, vci) \
5660
while (cond) MPIDI_OFI_PROGRESS(vci)
5761

58-
#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP4
59-
#define MPIDI_OFI_CALL(FUNC,STR) \
62+
#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP2
63+
#define MPIDI_OFI_CALL(FUNC,STR) \
6064
do { \
6165
ssize_t _ret = FUNC; \
6266
MPIDI_OFI_ERR(_ret<0, \
6367
mpi_errno, \
6468
MPI_ERR_OTHER, \
6569
"**ofid_"#STR, \
66-
"**ofid_"#STR" %s %d %s %s", \
67-
__SHORT_FILE__, \
68-
__LINE__, \
69-
__func__, \
70+
"**ofid_"#STR" %s %s", \
71+
MPIDI_OFI_DEFAULT_NIC_NAME, \
7072
fi_strerror(-_ret)); \
7173
} while (0)
7274

@@ -81,10 +83,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
8183
mpi_errno, \
8284
MPI_ERR_OTHER, \
8385
"**ofid_"#STR, \
84-
"**ofid_"#STR" %s %d %s %s", \
85-
__SHORT_FILE__, \
86-
__LINE__, \
87-
__func__, \
86+
"**ofid_"#STR" %s %s", \
87+
MPIDI_OFI_DEFAULT_NIC_NAME, \
8888
fi_strerror(-_ret)); \
8989
if (_retry > 0) { \
9090
_retry--; \
@@ -94,9 +94,7 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
9494
* for recursive locking in more than one lock (currently limited
9595
* to one due to scalar TLS counter), this lock yielding
9696
* operation can be avoided since we are inside a finite loop. */ \
97-
MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vci_); \
98-
mpi_errno = MPIDI_OFI_retry_progress(); \
99-
MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vci_); \
97+
mpi_errno = MPIDI_OFI_retry_progress(vci_, _retry); \
10098
MPIR_ERR_CHECK(mpi_errno); \
10199
} while (1); \
102100
} while (0)
@@ -113,9 +111,7 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
113111
_retry--; \
114112
MPIR_ERR_CHKANDJUMP(_retry == 0, mpi_errno, MPIX_ERR_EAGAIN, "**eagain"); \
115113
} \
116-
MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vci_); \
117-
mpi_errno = MPIDI_OFI_retry_progress(); \
118-
MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vci_); \
114+
mpi_errno = MPIDI_OFI_retry_progress(vci_, _retry); \
119115
} \
120116
} while (0)
121117

@@ -129,10 +125,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
129125
mpi_errno, \
130126
MPI_ERR_OTHER, \
131127
"**ofid_"#STR, \
132-
"**ofid_"#STR" %s %d %s %s", \
133-
__SHORT_FILE__, \
134-
__LINE__, \
135-
__func__, \
128+
"**ofid_"#STR" %s %s", \
129+
MPIDI_OFI_DEFAULT_NIC_NAME, \
136130
fi_strerror(-_ret)); \
137131
mpi_errno = MPIDI_OFI_progress_do_queue(vci_); \
138132
if (mpi_errno != MPI_SUCCESS) \
@@ -176,10 +170,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
176170
mpi_errno, \
177171
MPI_ERR_OTHER, \
178172
"**ofid_"#STR, \
179-
"**ofid_"#STR" %s %d %s %s", \
180-
__SHORT_FILE__, \
181-
__LINE__, \
182-
__func__, \
173+
"**ofid_"#STR" %s %s", \
174+
MPIDI_OFI_DEFAULT_NIC_NAME, \
183175
fi_strerror(-_ret)); \
184176
} while (0)
185177

@@ -299,7 +291,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_mr_bind(struct fi_info *prov, struct fid_
299291
#define MPIDI_OFI_LOCAL_MR_KEY 0
300292
#define MPIDI_OFI_COLL_MR_KEY 1
301293
#define MPIDI_OFI_INVALID_MR_KEY 0xFFFFFFFFFFFFFFFFULL
302-
int MPIDI_OFI_retry_progress(void);
294+
int MPIDI_OFI_retry_progress(int vci, int retry);
303295
int MPIDI_OFI_recv_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq);
304296
int MPIDI_OFI_recv_huge_control(int vci, MPIR_Context_id_t comm_id, int rank, int tag,
305297
MPIDI_OFI_huge_remote_info_t * info);

src/mpid/ch4/netmod/ofi/ofi_send.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
- name : MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD
4141
category : CH4_OFI
4242
type : int
43-
default : 16384
43+
default : -1
4444
class : none
4545
verbosity : MPI_T_VERBOSITY_USER_BASIC
4646
scope : MPI_T_SCOPE_LOCAL

src/mpid/ch4/netmod/ofi/ofi_spawn.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s
5555
int rc;
5656
rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) &req.context);
5757
if (rc && rc != -FI_ENOENT) {
58-
MPIR_ERR_CHKANDJUMP4(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
59-
"**ofid_cancel %s %d %s %s", __SHORT_FILE__, __LINE__, __func__,
58+
MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
59+
"**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME,
6060
fi_strerror(-rc));
6161

6262
}
@@ -112,8 +112,8 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout)
112112
int rc;
113113
rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].rx, (void *) &req.context);
114114
if (rc && rc != -FI_ENOENT) {
115-
MPIR_ERR_CHKANDJUMP4(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
116-
"**ofid_cancel %s %d %s %s", __SHORT_FILE__, __LINE__, __func__,
115+
MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel",
116+
"**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME,
117117
fi_strerror(-rc));
118118

119119
}

0 commit comments

Comments
 (0)