Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions prov/efa/src/rdm/efa_rdm_ope.c
Original file line number Diff line number Diff line change
Expand Up @@ -649,9 +649,25 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno)
//efa_rdm_rxe_release(rxe);

if (rxe->internal_flags & EFA_RDM_OPE_INTERNAL) {
EFA_WARN(FI_LOG_CQ,
"Writing eq error for rxe from internal operations\n");
efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true);
/*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the last commit said, we shouldn't even call efa_base_ep_write_eq_error

* Catch all errors from emulated one-sided operations, and ignore the
* error on the RX side, which shouldn't be involved in a one sided
* operation anyways.
*
* Emulated eager write
* Emulated long-CTS write
* Emulated long-read write
* Emulated DC eager write
* Emulated DC long-CTS write
* Emulated short read
* Emulated long-CTS read
* Emulated atomic (for DC)
* Emulated fetch atomic
* Emulated compare atomic
*/
EFA_WARN(FI_LOG_CQ, "Emulated one-sided operation error on RX side\n");
efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, false);
/* TODO Add NACK here */
return;
}

Expand Down
8 changes: 7 additions & 1 deletion prov/efa/src/rdm/efa_rdm_pke_cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -713,7 +713,13 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno)
"Packet receive error from non TX/RX packet. Our address: %s\n",
ep_addr_str);

efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true);
if (efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_WRITE_RTA_PKT ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any RX error is unexpected, and I doubt you can reliably derive the packet type here, since the receive itself has failed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need the RX error to be handled for one-sided emulated atomics.

efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_FETCH_RTA_PKT ||
efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_COMPARE_RTA_PKT)
/* TODO Send NACK */
efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, false);
else
efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true);
efa_rdm_pke_release_rx(pkt_entry);
return;
}
Expand Down
20 changes: 18 additions & 2 deletions prov/efa/src/rdm/efa_rdm_pke_rta.c
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ int efa_rdm_pke_proc_dc_write_rta(struct efa_rdm_pke *pkt_entry)

rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic);
if (OFI_UNLIKELY(!rxe)) {
/*
* The RXE pool grows until we run OOM, and if we are OOM, we
* cannot send a NACK, so force a hard failure here.
*/
efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true);
efa_rdm_pke_release_rx(pkt_entry);
return -FI_ENOBUFS;
Expand Down Expand Up @@ -380,6 +384,10 @@ int efa_rdm_pke_proc_fetch_rta(struct efa_rdm_pke *pkt_entry)

rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic_fetch);
if(OFI_UNLIKELY(!rxe)) {
/*
* The RXE pool grows until we run OOM, and if we are OOM, we
* cannot send a NACK, so force a hard failure here.
*/
efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS,
FI_EFA_ERR_RXE_POOL_EXHAUSTED, true);
return -FI_ENOBUFS;
Expand Down Expand Up @@ -511,6 +519,10 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry)

rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic_compare);
if(OFI_UNLIKELY(!rxe)) {
/*
* The RXE pool grows until we run OOM, and if we are OOM, we
* cannot send a NACK, so force a hard failure here.
*/
efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true);
efa_rdm_pke_release_rx(pkt_entry);
return -FI_ENOBUFS;
Expand All @@ -521,7 +533,7 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry)
dt = rxe->atomic_hdr.datatype;
dtsize = ofi_datatype_size(rxe->atomic_hdr.datatype);
if (OFI_UNLIKELY(!dtsize)) {
efa_base_ep_write_eq_error(&ep->base_ep, errno, FI_EFA_ERR_INVALID_DATATYPE, true);
efa_base_ep_write_eq_error(&ep->base_ep, errno, FI_EFA_ERR_INVALID_DATATYPE, false);
efa_rdm_rxe_release(rxe);
efa_rdm_pke_release_rx(pkt_entry);
return -errno;
Expand Down Expand Up @@ -552,7 +564,11 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry)

err = efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_ATOMRSP_PKT);
if (OFI_UNLIKELY(err)) {
efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST, true);
/*
* The atomic response failed to post, attempt to write EQ error
* and do not abort
*/
efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST, false);
ofi_buf_free(rxe->atomrsp_data);
efa_rdm_rxe_release(rxe);
efa_rdm_pke_release_rx(pkt_entry);
Expand Down
14 changes: 12 additions & 2 deletions prov/efa/src/rdm/efa_rdm_pke_rtw.c
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,10 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry)
if (!rxe) {
EFA_WARN(FI_LOG_CQ,
"RX entries exhausted.\n");
/*
* The RXE pool grows until we run OOM, and if we are OOM, we
* cannot send a NACK, so force a hard failure here.
*/
efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep,
FI_ENOBUFS,
FI_EFA_ERR_RXE_POOL_EXHAUSTED, true);
Expand All @@ -548,8 +552,9 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry)
rtw_hdr->rma_iov_count,
FI_REMOTE_WRITE, rxe->iov, rxe->desc);
if (OFI_UNLIKELY(err)) {
/* TODO Add NACK */
EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n");
efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, true);
efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, false);
efa_rdm_rxe_release(rxe);
efa_rdm_pke_release_rx(pkt_entry);
return;
Expand All @@ -574,7 +579,12 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry)
if (OFI_UNLIKELY(err)) {
EFA_WARN(FI_LOG_CQ,
"RDMA post read or queue failed.\n");
efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST, true);

/*
* The READ/QUEUE/NACK failed, in this case log EQ message
* and do not abort.
*/
efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST, false);
efa_rdm_rxe_release(rxe);
}
}