-
Notifications
You must be signed in to change notification settings - Fork 456
prov/efa: Improve emulated 1 sided protocol error case behavior #11452
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
b27cfad
af2cdbe
b2ff094
ccaa511
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -124,7 +124,7 @@ void efa_rdm_txe_release(struct efa_rdm_ope *txe) | |
| err = fi_close((struct fid *)txe->mr[i]); | ||
| if (OFI_UNLIKELY(err)) { | ||
| EFA_WARN(FI_LOG_CQ, "mr dereg failed. err=%d\n", err); | ||
| efa_base_ep_write_eq_error(&txe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG); | ||
| efa_base_ep_write_eq_error(&txe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG, true); | ||
| } | ||
|
|
||
| txe->mr[i] = NULL; | ||
|
|
@@ -189,7 +189,7 @@ void efa_rdm_rxe_release_internal(struct efa_rdm_ope *rxe) | |
| err = fi_close((struct fid *)rxe->mr[i]); | ||
| if (OFI_UNLIKELY(err)) { | ||
| EFA_WARN(FI_LOG_CQ, "mr dereg failed. err=%d\n", err); | ||
| efa_base_ep_write_eq_error(&rxe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG); | ||
| efa_base_ep_write_eq_error(&rxe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG, true); | ||
| } | ||
|
|
||
| rxe->mr[i] = NULL; | ||
|
|
@@ -649,9 +649,25 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) | |
| //efa_rdm_rxe_release(rxe); | ||
|
|
||
| if (rxe->internal_flags & EFA_RDM_OPE_INTERNAL) { | ||
| EFA_WARN(FI_LOG_CQ, | ||
| "Writing eq error for rxe from internal operations\n"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| /* | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Like the last commit said, we shouldn't even call efa_base_ep_write_eq_error. |
||
| * Catch all errors from emulated one-sided operations, and ignore the | ||
| * error on the RX side, which shouldn't be involved in a one sided | ||
| * operation anyways. | ||
| * | ||
| * Emulated eager write | ||
| * Emulated long-CTS write | ||
| * Emulated long-read write | ||
| * Emulated DC eager write | ||
| * Emulated DC long-CTS write | ||
| * Emulated short read | ||
| * Emulated long-CTS read | ||
| * Emulated atomic (for DC) | ||
| * Emulated fetch atomic | ||
| * Emulated compare atomic | ||
| */ | ||
| EFA_WARN(FI_LOG_CQ, "Emulated one-sided operation error on RX side\n"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, false); | ||
| /* TODO Add NACK here */ | ||
| return; | ||
| } | ||
|
|
||
|
|
@@ -660,7 +676,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) | |
| if (write_cq_err) { | ||
| EFA_WARN(FI_LOG_CQ, | ||
| "Error writing error cq entry when handling RX error\n"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -755,7 +771,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) | |
| if (txe->internal_flags & EFA_RDM_OPE_INTERNAL) { | ||
| EFA_WARN(FI_LOG_CQ, | ||
| "Writing eq error for txe from internal operations\n"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); | ||
| return; | ||
| } | ||
|
|
||
|
|
@@ -764,7 +780,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) | |
| if (write_cq_err) { | ||
| EFA_WARN(FI_LOG_CQ, | ||
| "Error writing error cq entry when handling TX error\n"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -466,7 +466,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) | |
| "While sending a handshake packet, an error occurred." | ||
| " Our address: %s, peer address: %s\n", | ||
| ep_addr_str, peer_addr_str); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); | ||
| break; | ||
| } | ||
| } | ||
|
|
@@ -524,7 +524,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) | |
| default: | ||
| EFA_WARN(FI_LOG_CQ, "Unknown x_entry type: %d\n", pkt_entry->ope->type); | ||
| assert(0 && "unknown x_entry state"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); | ||
| efa_rdm_pke_release_tx(pkt_entry); | ||
| break; | ||
| } | ||
|
|
@@ -666,7 +666,7 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry) | |
| "invalid control pkt type %d\n", | ||
| efa_rdm_pke_get_base_hdr(pkt_entry)->type); | ||
| assert(0 && "invalid control pkt type"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE, true); | ||
| return; | ||
| } | ||
|
|
||
|
|
@@ -713,7 +713,13 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) | |
| "Packet receive error from non TX/RX packet. Our address: %s\n", | ||
| ep_addr_str); | ||
|
|
||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| if (efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_WRITE_RTA_PKT || | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any rx error is unexpected, and I doubt whether you can really derive the pkt type here, since the rx has failed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need the RX error to be handled for one-sided emulated atomics. |
||
| efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_FETCH_RTA_PKT || | ||
| efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_COMPARE_RTA_PKT) | ||
| /* TODO Send NACK */ | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, false); | ||
| else | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); | ||
| efa_rdm_pke_release_rx(pkt_entry); | ||
| return; | ||
| } | ||
|
|
@@ -726,7 +732,7 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) | |
| EFA_WARN(FI_LOG_CQ, "unknown RDM operation entry type encountered: %d\n", | ||
| pkt_entry->ope->type); | ||
| assert(0 && "unknown x_entry state"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); | ||
| } | ||
|
|
||
| efa_rdm_pke_release_rx(pkt_entry); | ||
|
|
@@ -781,14 +787,14 @@ void efa_rdm_pke_proc_received(struct efa_rdm_pke *pkt_entry) | |
| EFA_WARN(FI_LOG_CQ, | ||
| "Received a RTS packet, which has been retired since protocol version 4\n"); | ||
| assert(0 && "deprecated RTS pakcet received"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE, true); | ||
| efa_rdm_pke_release_rx(pkt_entry); | ||
| return; | ||
| case EFA_RDM_RETIRED_CONNACK_PKT: | ||
| EFA_WARN(FI_LOG_CQ, | ||
| "Received a CONNACK packet, which has been retired since protocol version 4\n"); | ||
| assert(0 && "deprecated CONNACK pakcet received"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE, true); | ||
| efa_rdm_pke_release_rx(pkt_entry); | ||
| return; | ||
| case EFA_RDM_EOR_PKT: | ||
|
|
@@ -859,7 +865,7 @@ void efa_rdm_pke_proc_received(struct efa_rdm_pke *pkt_entry) | |
| "invalid control pkt type %d\n", | ||
| efa_rdm_pke_get_base_hdr(pkt_entry)->type); | ||
| assert(0 && "invalid control pkt type"); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE); | ||
| efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE, true); | ||
| efa_rdm_pke_release_rx(pkt_entry); | ||
| return; | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do not think we need this commit at all; we shouldn't even call this function when the tx error can be ignored.