From b27cfad4bb80e1251ad6449925dbb13484244f8d Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Fri, 26 Sep 2025 19:07:29 +0000 Subject: [PATCH 1/4] prov/efa: Fix typo in comment EFA_RDM_LONGREAD_RTA_PKT was supposed to be EFA_RDM_LONGREAD_RTW_PKT in comment. Signed-off-by: Seth Zegelstein --- prov/efa/src/rdm/efa_rdm_pke_rtw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.c b/prov/efa/src/rdm/efa_rdm_pke_rtw.c index 824ee484c6b..d31b06fb5ca 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.c @@ -516,9 +516,9 @@ ssize_t efa_rdm_pke_init_longread_rtw(struct efa_rdm_pke *pkt_entry, } /** - * @brief handle the event that a EFA_RDM_LONGREAD_RTA_PKE has been received + * @brief handle the event that a EFA_RDM_LONGREAD_RTW_PKT has been received * - * @param[in] pkt_entry received EFA_RDM_LONGREAD_RTA_PKT packet entry + * @param[in] pkt_entry received EFA_RDM_LONGREAD_RTW_PKT packet entry */ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) { From af2cdbe74b57f233d6836e7bdda45c65ccd56dca Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Tue, 30 Sep 2025 18:38:35 +0000 Subject: [PATCH 2/4] prov/efa: Slide recv-win on RTM/RTA error In order to continue processing future packets correctly, we need to progress the recv window when a RTM or RTA packet has been successfully delivered. If we fail processing a packet, we need to write the error to the CQ (when applicable), and continue to do work. Returning early here makes our recv window index be off by 1. 
Signed-off-by: Seth Zegelstein --- prov/efa/src/rdm/efa_rdm_pke_rtm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.c b/prov/efa/src/rdm/efa_rdm_pke_rtm.c index 46b63541e8f..bc7a14e64d1 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.c @@ -507,11 +507,12 @@ void efa_rdm_pke_handle_rtm_rta_recv(struct efa_rdm_pke *pkt_entry) /* * efa_rdm_pke_proc_rtm_rta() will write error cq entry if needed, - * thus we do not write error cq entry + * thus we do not write error cq entry. + * + * Even if we hit an error processing the packets contents, we still + * need to slide the recv window so we can continue to do work. */ - ret = efa_rdm_pke_proc_rtm_rta(pkt_entry, peer); - if (OFI_UNLIKELY(ret)) - return; + (void) efa_rdm_pke_proc_rtm_rta(pkt_entry, peer); if (slide_recvwin) { ofi_recvwin_slide((&peer->robuf)); From b2ff0942799bd73b6bdaeb89cc78ace62278c5b6 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Fri, 26 Sep 2025 01:29:08 +0000 Subject: [PATCH 3/4] prov/efa: Abort based on flag when writing to non-bound EQ This commit adds bool should_abort to efa_base_ep_write_eq_error() which allows the caller to dictate whether the function should abort in the case that the user has not bound an EQ. This commit sets should_abort=true everywhere, so it should not be a functional change. 
Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_base_ep.c | 19 +++++++++++-------- prov/efa/src/efa_base_ep.h | 3 ++- prov/efa/src/efa_cq.c | 4 ++-- prov/efa/src/efa_domain.c | 2 +- prov/efa/src/rdm/efa_rdm_cq.c | 12 ++++++------ prov/efa/src/rdm/efa_rdm_ep_utils.c | 4 ++-- prov/efa/src/rdm/efa_rdm_msg.c | 8 ++++---- prov/efa/src/rdm/efa_rdm_ope.c | 12 ++++++------ prov/efa/src/rdm/efa_rdm_peer.c | 2 +- prov/efa/src/rdm/efa_rdm_pke_cmd.c | 16 ++++++++-------- prov/efa/src/rdm/efa_rdm_pke_nonreq.c | 2 +- prov/efa/src/rdm/efa_rdm_pke_rta.c | 10 +++++----- prov/efa/src/rdm/efa_rdm_pke_rtm.c | 12 ++++++------ prov/efa/src/rdm/efa_rdm_pke_rtr.c | 6 +++--- prov/efa/src/rdm/efa_rdm_pke_rtw.c | 24 ++++++++++++------------ prov/efa/src/rdm/efa_rdm_rxe_map.c | 2 +- 16 files changed, 71 insertions(+), 67 deletions(-) diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index e3c95399c09..bb018d56556 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -134,7 +134,7 @@ int efa_base_ep_destruct(struct efa_base_ep *base_ep) if (base_ep->efa_recv_wr_vec) free(base_ep->efa_recv_wr_vec); - + if (base_ep->user_recv_wr_vec) free(base_ep->user_recv_wr_vec); @@ -386,7 +386,7 @@ int efa_base_ep_enable_qp(struct efa_base_ep *base_ep, struct efa_qp *qp) qp->qp_num = qp->ibv_qp->qp_num; base_ep->domain->qp_table[qp->qp_num & base_ep->domain->qp_table_sz_m1] = qp; - + EFA_INFO(FI_LOG_EP_CTRL, "QP enabled! 
qp_n: %d qkey: %d\n", qp->qp_num, qp->qkey); return err; @@ -586,11 +586,12 @@ bool efa_qp_support_op_in_order_aligned_128_bytes(struct efa_qp *qp, enum ibv_wr * If the base_ep is not binded to an EQ, or write to EQ failed, * this function will print the error message to console, then abort() * - * @param[in,out] ep base endpoint - * @param[in] err OFI error code + * @param[in,out] ep base endpoint + * @param[in] err OFI error code * @param[in] prov_errno provider error code + * @param[in] should_abort If true, aborts when no EQ is available */ -void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, ssize_t prov_errno) +void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, ssize_t prov_errno, bool should_abort) { struct fi_eq_err_entry err_entry; int ret = -FI_ENOEQ; @@ -614,11 +615,13 @@ void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, ssize_t pro fprintf(stderr, "Libfabric EFA provider has encountered an internal error:\n\n" "Libfabric error: (%zd) %s\n" - "EFA internal error: (%zd) %s\n\n" - "Your application will now abort().\n", + "EFA internal error: (%zd) %s\n\n", err, fi_strerror(err), prov_errno, efa_strerror(prov_errno)); - abort(); + if (should_abort) { + fprintf(stderr, "Your application will now abort().\n"); + abort(); + } } const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, size_t *buflen) diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index 382bebf4dfb..7f57339cb21 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -126,7 +126,8 @@ bool efa_qp_support_op_in_order_aligned_128_bytes(struct efa_qp *qp, void efa_base_ep_write_eq_error(struct efa_base_ep *ep, ssize_t err, - ssize_t prov_errno); + ssize_t prov_errno, + bool should_abort); const char *efa_base_ep_raw_addr_str(struct efa_base_ep *base_ep, char *buf, size_t *buflen); diff --git a/prov/efa/src/efa_cq.c b/prov/efa/src/efa_cq.c index 1b8790757de..b8a4c7c64e7 100644 --- 
a/prov/efa/src/efa_cq.c +++ b/prov/efa/src/efa_cq.c @@ -123,7 +123,7 @@ static void efa_cq_handle_error(struct efa_base_ep *base_ep, FI_LOG_CQ, "Error writing error cq entry when handling %s error\n", is_tx ? "TX" : "RX"); - efa_base_ep_write_eq_error(base_ep, err, prov_errno); + efa_base_ep_write_eq_error(base_ep, err, prov_errno, true); } } @@ -247,7 +247,7 @@ efa_cq_proc_ibv_recv_rdma_with_imm_completion(struct efa_base_ep *base_ep, "operation: %s\n", fi_strerror(-ret)); efa_base_ep_write_eq_error(base_ep, -ret, - FI_EFA_ERR_WRITE_RECV_COMP); + FI_EFA_ERR_WRITE_RECV_COMP, true); } } diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index 196562a125b..6ae55b9ebcb 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -802,7 +802,7 @@ void efa_domain_progress_rdm_peers_and_queues(struct efa_domain *domain) peer->conn->fi_addr, peer->conn->implicit_fi_addr, fi_strerror(-ret)); - efa_base_ep_write_eq_error(&peer->ep->base_ep, -ret, FI_EFA_ERR_PEER_HANDSHAKE); + efa_base_ep_write_eq_error(&peer->ep->base_ep, -ret, FI_EFA_ERR_PEER_HANDSHAKE, true); continue; } diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index e1e5ac8371c..5d755eccf2d 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -120,7 +120,7 @@ void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( EFA_WARN(FI_LOG_CQ, "Unable to write a cq entry for remote for RECV_RDMA operation: %s\n", fi_strerror(-ret)); - efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_WRITE_RECV_COMP); + efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_WRITE_RECV_COMP, true); } efa_cntr_report_rx_completion(&ep->base_ep.util_ep, flags); @@ -189,7 +189,7 @@ static inline int efa_rdm_cq_populate_src_efa_ep_addr( self_raw_addr_str, base_hdr->version, EFA_RDM_PROTOCOL_VERSION); efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, - FI_EFA_ERR_INVALID_PKT_TYPE); + FI_EFA_ERR_INVALID_PKT_TYPE, true); fprintf(stderr, "Host %s received 
a packet with invalid protocol " "version %d.\n" @@ -399,7 +399,7 @@ efa_rdm_cq_get_peer_for_pkt_entry(struct efa_rdm_ep *ep, 0, NULL, false, true); if (OFI_UNLIKELY(ret != 0)) { efa_base_ep_write_eq_error(&ep->base_ep, ret, - FI_EFA_ERR_AV_INSERT); + FI_EFA_ERR_AV_INSERT, true); return NULL; } assert(implicit_fi_addr != FI_ADDR_NOTAVAIL); @@ -500,7 +500,7 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct base_hdr->type); assert(0 && "invalid REQ packet type"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -518,7 +518,7 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct /* local & peer host-id & ep address will be logged by efa_rdm_write_error_msg */ if (!efa_rdm_write_error_msg(ep, pkt_entry->peer, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, errbuf, &errbuf_len)) EFA_WARN(FI_LOG_CQ, "Error: %s\n", (const char *) errbuf); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -692,7 +692,7 @@ enum ibv_wc_status efa_rdm_cq_process_wc(struct efa_ibv_cq *cq, struct efa_rdm_e if (efa_cq_wc_is_unsolicited(cq)) { EFA_WARN(FI_LOG_CQ, "Receive error %s (%d) for unsolicited write recv", efa_strerror(prov_errno), prov_errno); - efa_base_ep_write_eq_error(&ep->base_ep, to_fi_errno(prov_errno), prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, to_fi_errno(prov_errno), prov_errno, true); break; } assert(pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 8cd3b5129a5..e5de86dfaec 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -754,7 +754,7 @@ void 
efa_rdm_ep_post_handshake_or_queue(struct efa_rdm_ep *ep, struct efa_rdm_pe EFA_WARN(FI_LOG_EP_CTRL, "Failed to post HANDSHAKE to peer fi_addr: %ld implicit fi_addr %ld. %s\n", peer->conn->fi_addr, peer->conn->implicit_fi_addr, fi_strerror(-err)); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PEER_HANDSHAKE); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PEER_HANDSHAKE, true); return; } @@ -1009,7 +1009,7 @@ void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep) err_exit: - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_INTERNAL_RX_BUF_POST); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_INTERNAL_RX_BUF_POST, true); } /** diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index 5825d123f98..bed88ed5325 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -767,7 +767,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep, if (ret == FI_SUCCESS) { /* A matched rxe is found */ rxe = efa_rdm_msg_alloc_matched_rxe_for_rtm(ep, *pkt_entry_ptr, peer_rxe, ofi_op_msg); if (OFI_UNLIKELY(!rxe)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); return NULL; } efa_rdm_tracepoint(msg_match_expected_nontagged, rxe->msg_id, @@ -779,7 +779,7 @@ efa_rdm_msg_alloc_rxe_for_msgrtm(struct efa_rdm_ep *ep, */ rxe = efa_rdm_msg_alloc_unexp_rxe_for_rtm(ep, pkt_entry_ptr, ofi_op_msg); if (OFI_UNLIKELY(!rxe)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); return NULL; } (*pkt_entry_ptr)->ope = rxe; @@ -840,7 +840,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep, if (ret == FI_SUCCESS) { /* A matched rxe is found */ rxe = efa_rdm_msg_alloc_matched_rxe_for_rtm(ep, *pkt_entry_ptr, peer_rxe, ofi_op_tagged); if 
(OFI_UNLIKELY(!rxe)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); return NULL; } efa_rdm_tracepoint(msg_match_expected_tagged, rxe->msg_id, @@ -852,7 +852,7 @@ efa_rdm_msg_alloc_rxe_for_tagrtm(struct efa_rdm_ep *ep, */ rxe = efa_rdm_msg_alloc_unexp_rxe_for_rtm(ep, pkt_entry_ptr, ofi_op_tagged); if (OFI_UNLIKELY(!rxe)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); return NULL; } (*pkt_entry_ptr)->ope = rxe; diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 7b2ff8cf0fb..578749d0533 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -124,7 +124,7 @@ void efa_rdm_txe_release(struct efa_rdm_ope *txe) err = fi_close((struct fid *)txe->mr[i]); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "mr dereg failed. err=%d\n", err); - efa_base_ep_write_eq_error(&txe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG); + efa_base_ep_write_eq_error(&txe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG, true); } txe->mr[i] = NULL; @@ -189,7 +189,7 @@ void efa_rdm_rxe_release_internal(struct efa_rdm_ope *rxe) err = fi_close((struct fid *)rxe->mr[i]); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "mr dereg failed. 
err=%d\n", err); - efa_base_ep_write_eq_error(&rxe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG); + efa_base_ep_write_eq_error(&rxe->ep->base_ep, err, FI_EFA_ERR_MR_DEREG, true); } rxe->mr[i] = NULL; @@ -651,7 +651,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) if (rxe->internal_flags & EFA_RDM_OPE_INTERNAL) { EFA_WARN(FI_LOG_CQ, "Writing eq error for rxe from internal operations\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); return; } @@ -660,7 +660,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) if (write_cq_err) { EFA_WARN(FI_LOG_CQ, "Error writing error cq entry when handling RX error\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); } } @@ -755,7 +755,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) if (txe->internal_flags & EFA_RDM_OPE_INTERNAL) { EFA_WARN(FI_LOG_CQ, "Writing eq error for txe from internal operations\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); return; } @@ -764,7 +764,7 @@ void efa_rdm_txe_handle_error(struct efa_rdm_ope *txe, int err, int prov_errno) if (write_cq_err) { EFA_WARN(FI_LOG_CQ, "Error writing error cq entry when handling TX error\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); } } diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 177b679f7c0..bee83bffb5d 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -275,7 +275,7 @@ void efa_rdm_peer_move_overflow_pke_to_recvwin(struct efa_rdm_peer *peer) /* running out of memory while copy packet */ efa_base_ep_write_eq_error( &(overflow_pkt_entry->ep->base_ep), - FI_ENOBUFS, 
FI_EFA_ERR_OOM); + FI_ENOBUFS, FI_EFA_ERR_OOM, true); return; } dlist_remove(&overflow_pke_list_entry->entry); diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.c b/prov/efa/src/rdm/efa_rdm_pke_cmd.c index 270747e4214..2317fac7264 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.c +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.c @@ -466,7 +466,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) "While sending a handshake packet, an error occurred." " Our address: %s, peer address: %s\n", ep_addr_str, peer_addr_str); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); break; } } @@ -524,7 +524,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) default: EFA_WARN(FI_LOG_CQ, "Unknown x_entry type: %d\n", pkt_entry->ope->type); assert(0 && "unknown x_entry state"); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); efa_rdm_pke_release_tx(pkt_entry); break; } @@ -666,7 +666,7 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry) "invalid control pkt type %d\n", efa_rdm_pke_get_base_hdr(pkt_entry)->type); assert(0 && "invalid control pkt type"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE, true); return; } @@ -713,7 +713,7 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) "Packet receive error from non TX/RX packet. 
Our address: %s\n", ep_addr_str); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -726,7 +726,7 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) EFA_WARN(FI_LOG_CQ, "unknown RDM operation entry type encountered: %d\n", pkt_entry->ope->type); assert(0 && "unknown x_entry state"); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); } efa_rdm_pke_release_rx(pkt_entry); @@ -781,14 +781,14 @@ void efa_rdm_pke_proc_received(struct efa_rdm_pke *pkt_entry) EFA_WARN(FI_LOG_CQ, "Received a RTS packet, which has been retired since protocol version 4\n"); assert(0 && "deprecated RTS pakcet received"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE, true); efa_rdm_pke_release_rx(pkt_entry); return; case EFA_RDM_RETIRED_CONNACK_PKT: EFA_WARN(FI_LOG_CQ, "Received a CONNACK packet, which has been retired since protocol version 4\n"); assert(0 && "deprecated CONNACK pakcet received"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DEPRECATED_PKT_TYPE, true); efa_rdm_pke_release_rx(pkt_entry); return; case EFA_RDM_EOR_PKT: @@ -859,7 +859,7 @@ void efa_rdm_pke_proc_received(struct efa_rdm_pke *pkt_entry) "invalid control pkt type %d\n", efa_rdm_pke_get_base_hdr(pkt_entry)->type); assert(0 && "invalid control pkt type"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE, true); efa_rdm_pke_release_rx(pkt_entry); return; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c index 
88769b713c9..c5a4f0ecb78 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c +++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c @@ -832,7 +832,7 @@ void efa_rdm_pke_handle_atomrsp_recv(struct efa_rdm_pke *pkt_entry) txe->atomic_ex.resp_iov_count, atomrsp_pkt->data, atomrsp_hdr->seg_length); if (OFI_UNLIKELY(ret < 0)) { - efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, -ret, EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH); + efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, -ret, EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH, true); return; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_rta.c b/prov/efa/src/rdm/efa_rdm_pke_rta.c index b0247113414..583014fccf4 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rta.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rta.c @@ -282,7 +282,7 @@ int efa_rdm_pke_proc_dc_write_rta(struct efa_rdm_pke *pkt_entry) rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic); if (OFI_UNLIKELY(!rxe)) { - efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return -FI_ENOBUFS; } @@ -381,7 +381,7 @@ int efa_rdm_pke_proc_fetch_rta(struct efa_rdm_pke *pkt_entry) rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic_fetch); if(OFI_UNLIKELY(!rxe)) { efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, - FI_EFA_ERR_RXE_POOL_EXHAUSTED); + FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); return -FI_ENOBUFS; } @@ -511,7 +511,7 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic_compare); if(OFI_UNLIKELY(!rxe)) { - efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return -FI_ENOBUFS; } @@ -521,7 +521,7 @@ int efa_rdm_pke_proc_compare_rta(struct 
efa_rdm_pke *pkt_entry) dt = rxe->atomic_hdr.datatype; dtsize = ofi_datatype_size(rxe->atomic_hdr.datatype); if (OFI_UNLIKELY(!dtsize)) { - efa_base_ep_write_eq_error(&ep->base_ep, errno, FI_EFA_ERR_INVALID_DATATYPE); + efa_base_ep_write_eq_error(&ep->base_ep, errno, FI_EFA_ERR_INVALID_DATATYPE, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return -errno; @@ -552,7 +552,7 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) err = efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_ATOMRSP_PKT); if (OFI_UNLIKELY(err)) { - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST, true); ofi_buf_free(rxe->atomrsp_data); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.c b/prov/efa/src/rdm/efa_rdm_pke_rtm.c index bc7a14e64d1..6e35bca7b74 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.c @@ -289,7 +289,7 @@ static ssize_t efa_rdm_pke_proc_msgrtm(struct efa_rdm_pke *pkt_entry) if (OFI_UNLIKELY(!rxe)) { efa_base_ep_write_eq_error( &ep->base_ep, FI_ENOBUFS, - FI_EFA_ERR_RXE_POOL_EXHAUSTED); + FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return -FI_ENOBUFS; } @@ -337,7 +337,7 @@ static ssize_t efa_rdm_pke_proc_tagrtm(struct efa_rdm_pke *pkt_entry) if (OFI_UNLIKELY(!rxe)) { efa_base_ep_write_eq_error( &ep->base_ep, FI_ENOBUFS, - FI_EFA_ERR_RXE_POOL_EXHAUSTED); + FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return -FI_ENOBUFS; } @@ -413,7 +413,7 @@ ssize_t efa_rdm_pke_proc_rtm_rta(struct efa_rdm_pke *pkt_entry, struct efa_rdm_p EFA_WARN(FI_LOG_EP_CTRL, "Unknown packet type ID: %d\n", base_hdr->type); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_UNKNOWN_PKT_TYPE); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_UNKNOWN_PKT_TYPE, true); efa_rdm_pke_release_rx(pkt_entry); } @@ -479,21 
+479,21 @@ void efa_rdm_pke_handle_rtm_rta_recv(struct efa_rdm_pke *pkt_entry) "Invalid msg_id: %" PRIu32 " robuf->exp_msg_id: %" PRIu32 "\n", msg_id, peer->robuf.exp_msg_id); - efa_base_ep_write_eq_error(&ep->base_ep, ret, FI_EFA_ERR_PKT_ALREADY_PROCESSED); + efa_base_ep_write_eq_error(&ep->base_ep, ret, FI_EFA_ERR_PKT_ALREADY_PROCESSED, true); efa_rdm_pke_release_rx(pkt_entry); return; } if (OFI_UNLIKELY(ret == -FI_ENOMEM)) { /* running out of memory while copy packet */ - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_OOM); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_OOM, true); return; } EFA_WARN(FI_LOG_EP_CTRL, "Unknown error %d processing REQ packet msg_id: %" PRIu32 "\n", ret, msg_id); - efa_base_ep_write_eq_error(&ep->base_ep, ret, FI_EFA_ERR_OTHER); + efa_base_ep_write_eq_error(&ep->base_ep, ret, FI_EFA_ERR_OTHER, true); return; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtr.c b/prov/efa/src/rdm/efa_rdm_pke_rtr.c index e61dade89f7..d6072df9d9a 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtr.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtr.c @@ -116,7 +116,7 @@ void efa_rdm_pke_handle_rtr_recv(struct efa_rdm_pke *pkt_entry) if (OFI_UNLIKELY(!rxe)) { EFA_WARN(FI_LOG_CQ, "RX entries exhausted.\n"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -126,7 +126,7 @@ void efa_rdm_pke_handle_rtr_recv(struct efa_rdm_pke *pkt_entry) FI_REMOTE_READ, rxe->iov, rxe->desc); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verification failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -140,7 +140,7 @@ void efa_rdm_pke_handle_rtr_recv(struct efa_rdm_pke *pkt_entry) err = 
efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_READRSP_PKT); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "Posting of readrsp packet failed! err=%ld\n", err); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.c b/prov/efa/src/rdm/efa_rdm_pke_rtw.c index d31b06fb5ca..fa945979193 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.c @@ -145,7 +145,7 @@ void efa_rdm_pke_proc_eager_rtw(struct efa_rdm_pke *pkt_entry, if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -161,13 +161,13 @@ void efa_rdm_pke_proc_eager_rtw(struct efa_rdm_pke *pkt_entry, pkt_entry->payload_size, rxe->total_len); EFA_WARN(FI_LOG_CQ, "target buffer: %p length: %ld\n", rxe->iov[0].iov_base, rxe->iov[0].iov_len); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RTM_MISMATCH); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RTM_MISMATCH, true); efa_rdm_pke_release_rx(pkt_entry); efa_rdm_rxe_release(rxe); } else { err = efa_rdm_pke_copy_payload_to_ope(pkt_entry, rxe); if (OFI_UNLIKELY(err)) { - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RXE_COPY); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RXE_COPY, true); efa_rdm_pke_release_rx(pkt_entry); efa_rdm_rxe_release(rxe); } @@ -194,7 +194,7 @@ void efa_rdm_pke_handle_eager_rtw_recv(struct efa_rdm_pke *pkt_entry) if (!rxe) { EFA_WARN(FI_LOG_CQ, "RX entries exhausted.\n"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&ep->base_ep, FI_ENOBUFS, 
FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -255,7 +255,7 @@ void efa_rdm_pke_handle_dc_eager_rtw_recv(struct efa_rdm_pke *pkt_entry) EFA_WARN(FI_LOG_CQ, "RX entries exhausted.\n"); efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, - FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -386,7 +386,7 @@ void efa_rdm_pke_handle_longcts_rtw_recv(struct efa_rdm_pke *pkt_entry) "RX entries exhausted.\n"); efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, - FI_EFA_ERR_RXE_POOL_EXHAUSTED); + FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -401,7 +401,7 @@ void efa_rdm_pke_handle_longcts_rtw_recv(struct efa_rdm_pke *pkt_entry) FI_REMOTE_WRITE, rxe->iov, rxe->desc); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -417,14 +417,14 @@ void efa_rdm_pke_handle_longcts_rtw_recv(struct efa_rdm_pke *pkt_entry) pkt_entry->payload_size, rxe->total_len); EFA_WARN(FI_LOG_CQ, "target buffer: %p length: %ld\n", rxe->iov[0].iov_base, rxe->iov[0].iov_len); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RTM_MISMATCH); + efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RTM_MISMATCH, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; } else { err = efa_rdm_pke_copy_payload_to_ope(pkt_entry, rxe); if (OFI_UNLIKELY(err)) { - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RXE_COPY); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RXE_COPY, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -536,7 +536,7 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct 
efa_rdm_pke *pkt_entry) "RX entries exhausted.\n"); efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, - FI_EFA_ERR_RXE_POOL_EXHAUSTED); + FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -549,7 +549,7 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) FI_REMOTE_WRITE, rxe->iov, rxe->desc); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, true); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -574,7 +574,7 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RDMA post read or queue failed.\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST, true); efa_rdm_rxe_release(rxe); } } diff --git a/prov/efa/src/rdm/efa_rdm_rxe_map.c b/prov/efa/src/rdm/efa_rdm_rxe_map.c index a9dca6d8df3..728066c90e8 100644 --- a/prov/efa/src/rdm/efa_rdm_rxe_map.c +++ b/prov/efa/src/rdm/efa_rdm_rxe_map.c @@ -43,7 +43,7 @@ void efa_rdm_rxe_map_insert(struct efa_rdm_rxe_map *rxe_map, if (OFI_UNLIKELY(!entry)) { EFA_WARN(FI_LOG_CQ, "Map entries for medium size message exhausted.\n"); - efa_base_ep_write_eq_error(&rxe->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED); + efa_base_ep_write_eq_error(&rxe->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); return; } From ccaa5110ffda5be031ea30dc14275d0cb864d1a9 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Tue, 30 Sep 2025 02:15:12 +0000 Subject: [PATCH 4/4] prov/efa: Stop 1-sided emulated protocols from abort on RX error Emulated write, read and atomic are single sided operations. 
They should never cause an abort on an RX side error because the RX side should not be involved in the protocol that we are emulating. We emulate due to the NIC not supporting the protocols, and we need to emulate the functionality as if the NIC did support the protocols. This change modifies the emulated write protocols, emulated read protocols, and emulated atomics to not abort when an error happens on the RX side. Error CQs should only be written to the target CQ if write w/imm is used, otherwise, the operation should fail silently on the RX side and a NACK should be delivered to the TX side to indicate a failure of the operation depending on the TX's requested completion level. Signed-off-by: Seth Zegelstein --- prov/efa/src/rdm/efa_rdm_ope.c | 22 +++++++++++++++++++--- prov/efa/src/rdm/efa_rdm_pke_cmd.c | 8 +++++++- prov/efa/src/rdm/efa_rdm_pke_rta.c | 20 ++++++++++++++++++-- prov/efa/src/rdm/efa_rdm_pke_rtw.c | 14 ++++++++++++-- 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 578749d0533..36387a54bd6 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -649,9 +649,25 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) //efa_rdm_rxe_release(rxe); if (rxe->internal_flags & EFA_RDM_OPE_INTERNAL) { - EFA_WARN(FI_LOG_CQ, - "Writing eq error for rxe from internal operations\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); + /* + * Catch all errors from emulated one-sided operations, and ignore the + * error on the RX side, which shouldn't be involved in a one sided + * operation anyways. 
+ * + * Emulated eager write + * Emulated long-CTS write + * Emulated long-read write + * Emulated DC eager write + * Emulated DC long-CTS write + * Emulated short read + * Emulated long-CTS read + * Emulated atomic (for DC) + * Emulated fetch atomic + * Emulated compare atomic + */ + EFA_WARN(FI_LOG_CQ, "Emulated one-sided operation error on RX side\n"); + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, false); + /* TODO Add NACK here */ return; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.c b/prov/efa/src/rdm/efa_rdm_pke_cmd.c index 2317fac7264..170e028248e 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.c +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.c @@ -713,7 +713,13 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) "Packet receive error from non TX/RX packet. Our address: %s\n", ep_addr_str); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); + if (efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_WRITE_RTA_PKT || + efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_FETCH_RTA_PKT || + efa_rdm_pkt_type_of(pkt_entry) == EFA_RDM_COMPARE_RTA_PKT) + /* TODO Send NACK */ + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, false); + else + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno, true); efa_rdm_pke_release_rx(pkt_entry); return; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_rta.c b/prov/efa/src/rdm/efa_rdm_pke_rta.c index 583014fccf4..a6221d9f588 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rta.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rta.c @@ -282,6 +282,10 @@ int efa_rdm_pke_proc_dc_write_rta(struct efa_rdm_pke *pkt_entry) rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic); if (OFI_UNLIKELY(!rxe)) { + /* + * The RXE pool grows until we run OOM, and if we are OOM, we + * cannot send a NACK, so force a hard failure here. 
+ */ efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return -FI_ENOBUFS; @@ -380,6 +384,10 @@ int efa_rdm_pke_proc_fetch_rta(struct efa_rdm_pke *pkt_entry) rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic_fetch); if(OFI_UNLIKELY(!rxe)) { + /* + * The RXE pool grows until we run OOM, and if we are OOM, we + * cannot send a NACK, so force a hard failure here. + */ efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); return -FI_ENOBUFS; @@ -511,6 +519,10 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) rxe = efa_rdm_pke_alloc_rta_rxe(pkt_entry, ofi_op_atomic_compare); if(OFI_UNLIKELY(!rxe)) { + /* + * The RXE pool grows until we run OOM, and if we are OOM, we + * cannot send a NACK, so force a hard failure here. + */ efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); efa_rdm_pke_release_rx(pkt_entry); return -FI_ENOBUFS; @@ -521,7 +533,7 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) dt = rxe->atomic_hdr.datatype; dtsize = ofi_datatype_size(rxe->atomic_hdr.datatype); if (OFI_UNLIKELY(!dtsize)) { - efa_base_ep_write_eq_error(&ep->base_ep, errno, FI_EFA_ERR_INVALID_DATATYPE, true); + efa_base_ep_write_eq_error(&ep->base_ep, errno, FI_EFA_ERR_INVALID_DATATYPE, false); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return -errno; @@ -552,7 +564,11 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) err = efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_ATOMRSP_PKT); if (OFI_UNLIKELY(err)) { - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST, true); + /* + * The atomic response failed to post, attempt to write EQ error + * and do not abort + */ + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST, false); ofi_buf_free(rxe->atomrsp_data); efa_rdm_rxe_release(rxe); 
efa_rdm_pke_release_rx(pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.c b/prov/efa/src/rdm/efa_rdm_pke_rtw.c index fa945979193..03e449f0628 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.c @@ -534,6 +534,10 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) if (!rxe) { EFA_WARN(FI_LOG_CQ, "RX entries exhausted.\n"); + /* + * The RXE pool grows until we run OOM, and if we are OOM, we + * cannot send a NACK, so force a hard failure here. + */ efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_ENOBUFS, FI_EFA_ERR_RXE_POOL_EXHAUSTED, true); @@ -548,8 +552,9 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) rtw_hdr->rma_iov_count, FI_REMOTE_WRITE, rxe->iov, rxe->desc); if (OFI_UNLIKELY(err)) { + /* TODO Add NACK */ EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, true); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR, false); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -574,7 +579,12 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RDMA post read or queue failed.\n"); - efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST, true); + + /* + * The READ/QUEUE/NACK failed, in this case log EQ message + * and do not abort. + */ + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RDMA_READ_POST, false); efa_rdm_rxe_release(rxe); } }