Skip to content

Commit 13b32c7

Browse files
jiaxiyanshijin-aws
authored and committed
prov/efa: differentiate unresponsive receiver errors following rdma-core
Add a new vendor error code, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE, from rdma-core to indicate that the remote is unreachable. Add a new EFA provider error code, UNESTABLISHED_RECV_UNRESP, to distinguish the unresponsive-receiver error that occurs when the peer is reachable by the EFA device but libfabric failed to complete a handshake. Add a unit test for EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE. Signed-off-by: Jessie Yang <[email protected]> (cherry picked from commit 5573b3f)
1 parent 6bf5d90 commit 13b32c7

File tree

6 files changed

+44
-12
lines changed

6 files changed

+44
-12
lines changed

prov/efa/src/efa_errno.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,9 @@
6969
_(10, REMOTE_ERROR_RNR, Destination resource not ready (no work queue entries posted on receive queue)) \
7070
_(11, REMOTE_ERROR_BAD_LENGTH, Remote scatter-gather list too short) \
7171
_(12, REMOTE_ERROR_BAD_STATUS, Unexpected status returned by responder) \
72-
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (detected locally)) \
73-
_(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations))
72+
_(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive remote (was previously responsive)) \
73+
_(14, REMOTE_ERROR_UNKNOWN_PEER, No valid address handle at remote side (required for RDMA operations)) \
74+
_(15, LOCAL_ERROR_UNREACH_REMOTE, Unreachable remote (never received a response))
7475

7576
/**
7677
* @brief EFA provider proprietary error codes
@@ -105,7 +106,8 @@
105106
_(4122, SHM_INTERNAL_ERROR, SHM internal error) \
106107
_(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \
107108
_(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \
108-
_(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON)
109+
_(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON) \
110+
_(4126, UNESTABLISHED_RECV_UNRESP, Unresponsive receiver (reachable by EFA device but handshake failed))
109111

110112
/** @} */
111113

@@ -156,13 +158,15 @@ static inline int to_fi_errno(enum efa_errno err) {
156158
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNSUPPORTED_OP:
157159
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS:
158160
return FI_EINVAL;
159-
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
161+
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE:
160162
return FI_EHOSTUNREACH;
161163
case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH:
162164
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH:
163165
return FI_EMSGSIZE;
164166
case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT:
167+
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
165168
case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
169+
case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP:
166170
return FI_ECONNABORTED;
167171
case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN:
168172
case EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER:

prov/efa/src/efa_strerror.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@ void efa_show_help(enum efa_errno err) {
6767
help = "This error is detected remotely; "
6868
"typically encountered when the peer process is no longer present";
6969
break;
70-
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE:
70+
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE:
7171
help = "This error is detected locally. "
72-
"The connection status is unknown or was never established via "
73-
"handshake. This typically indicates one or more misconfigured "
72+
"The peer is not reachable by the EFA device. "
73+
"This typically indicates one or more misconfigured "
7474
"EC2 instances; most often due to incorrect inbound/outbound "
7575
"security group rules and/or instances placed in different "
7676
"subnets. Refer to the public AWS documentation for EFA for "
@@ -80,8 +80,14 @@ void efa_show_help(enum efa_errno err) {
8080
case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP:
8181
help = "This error is detected locally. "
8282
"The connection was previously established via handshake, "
83-
"which indicates the error is likely due to the peer process no "
84-
"longer being present.";
83+
"which indicates the error is likely due to a hardware failure "
84+
"on the remote peer, or the peer process no longer being present.";
85+
break;
86+
case FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP:
87+
help = "This error is detected locally. "
88+
"The peer is reachable by the EFA device but libfabric failed "
89+
"to complete a handshake, which indicates the error is likely "
90+
"due to the peer process no longer being present.";
8591
break;
8692
case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX:
8793
help = "This error is detected locally. "

prov/efa/src/rdm/efa_rdm_cq.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,9 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct
399399
*
400400
* @todo Currently, this only checks for unresponsive receiver
401401
* (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to
402-
* #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other
402+
* #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP if a handshake was made, or
403+
* #FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP if the handshake failed.
404+
* This should be expanded to handle other
403405
* RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate
404406
* error reporting
405407
*/
@@ -418,8 +420,9 @@ static int efa_rdm_cq_get_prov_errno(struct ibv_cq_ex *ibv_cq_ex) {
418420

419421
switch (vendor_err) {
420422
case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: {
421-
if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)
422-
vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP;
423+
vendor_err = (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) ?
424+
FI_EFA_ERR_ESTABLISHED_RECV_UNRESP :
425+
FI_EFA_ERR_UNESTABLISHED_RECV_UNRESP;
423426
break;
424427
}
425428
default:

prov/efa/test/efa_unit_test_cq.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,23 @@ void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id
227227
EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
228228
}
229229

230+
/**
231+
* @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns
232+
* unreachable remote error for send.
233+
*
234+
* When a send operation fails, fi_cq_read() should return -FI_EAVAIL, which means an error is available.
235+
* The user should then call fi_cq_readerr() to get an error CQ entry that contains the error code.
236+
*
237+
* @param[in] state struct efa_resource that is managed by the framework
238+
*/
239+
void test_rdm_cq_read_bad_send_status_unreachable_receiver(struct efa_resource **state)
240+
{
241+
struct efa_resource *resource = *state;
242+
test_rdm_cq_read_bad_send_status(resource,
243+
0x1234567812345678, 0x8765432187654321,
244+
EFA_IO_COMP_STATUS_LOCAL_ERROR_UNREACH_REMOTE);
245+
}
246+
230247
/**
231248
* @brief test that RDM CQ's fi_cq_read()/fi_cq_readerr() works properly when rdma-core returns
232249
* invalid qpn error for send.

prov/efa/test/efa_unit_tests.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ int main(void)
120120
cmocka_unit_test_setup_teardown(test_rdm_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
121121
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
122122
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
123+
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unreachable_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
123124
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
124125
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_message_too_long, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
125126
cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_bad_recv_status, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),

prov/efa/test/efa_unit_tests.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ void test_ibv_cq_ex_read_failed_poll();
134134
void test_rdm_cq_create_error_handling();
135135
void test_rdm_cq_read_bad_send_status_unresponsive_receiver();
136136
void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id();
137+
void test_rdm_cq_read_bad_send_status_unreachable_receiver();
137138
void test_rdm_cq_read_bad_send_status_invalid_qpn();
138139
void test_rdm_cq_read_bad_send_status_message_too_long();
139140
void test_ibv_cq_ex_read_bad_recv_status();

0 commit comments

Comments
 (0)