Skip to content

Commit

Permalink
Merge pull request ofiwg#1003 from hpe/libfabric-update
Browse files Browse the repository at this point in the history
Libfabric Sync: March 04, 2024
  • Loading branch information
ianryanhpe authored and GitHub Enterprise committed Mar 6, 2024
2 parents 5c533ce + 396f62b commit 94310d6
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 34 deletions.
1 change: 1 addition & 0 deletions prov/efa/src/efa.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@


/*
 * Size constants for the EFA provider.
 * NOTE(review): presumably byte counts used as protocol thresholds --
 * confirm against the code that consumes them.
 */
#define EFA_DEFAULT_RUNT_SIZE (307200) /* 300 * 1024 */
#define EFA_NEURON_RUNT_SIZE (131072) /* 128 * 1024 */
#define EFA_DEFAULT_INTER_MAX_MEDIUM_MESSAGE_SIZE (65536) /* 64 * 1024 */
#define EFA_DEFAULT_INTER_MIN_READ_MESSAGE_SIZE (1048576) /* 1024 * 1024 */
#define EFA_DEFAULT_INTER_MIN_READ_WRITE_SIZE (65536) /* 64 * 1024 */
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/efa_hmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_
fi_param_get_size_t(&efa_prov, "inter_min_read_write_size", &info->min_read_write_size);
break;
case FI_HMEM_NEURON:
info->runt_size = 0;
info->runt_size = EFA_NEURON_RUNT_SIZE;
info->max_intra_eager_size = 0;
info->max_medium_msg_size = 0;
info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1;
Expand Down
20 changes: 20 additions & 0 deletions prov/efa/src/rdm/efa_rdm_pke_cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,26 @@ void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int err, int pro
EFA_DBG(FI_LOG_CQ, "Packet receive error: %s (%d)\n",
efa_strerror(prov_errno), prov_errno);

/*
* pkes posted by efa_rdm_ep_bulk_post_internal_rx_pkts
* are not associated with ope before being progressed
*/
if (!pkt_entry->ope) {
char ep_addr_str[OFI_ADDRSTRLEN];
size_t buflen=0;

memset(&ep_addr_str, 0, sizeof(ep_addr_str));
buflen = sizeof(ep_addr_str);
efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &buflen);
EFA_WARN(FI_LOG_CQ,
"Packet receive error from non TX/RX packet. Our address: %s\n",
ep_addr_str);

efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno);
efa_rdm_pke_release_rx(pkt_entry);
return;
}

if (pkt_entry->ope->type == EFA_RDM_TXE) {
efa_rdm_txe_handle_error(pkt_entry->ope, err, prov_errno);
} else if (pkt_entry->ope->type == EFA_RDM_RXE) {
Expand Down
10 changes: 0 additions & 10 deletions prov/efa/src/rdm/efa_rdm_rma.c
Original file line number Diff line number Diff line change
Expand Up @@ -330,16 +330,6 @@ bool efa_rdm_rma_should_write_using_rdma(struct efa_rdm_ep *ep, struct efa_rdm_o
(txe->iov_count > 1 || txe->rma_iov_count > 1))
return false;

/*
* For local write, handshake is not required and
* we just need to check the local ep caps
*/
if (peer->is_self)
return efa_rdm_ep_support_rdma_write(ep);

/* Check for hardware support of RDMA write.
A handshake should have been made before the check. */
assert(peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED);
return efa_both_support_rdma_write(ep, peer);
}

Expand Down
33 changes: 10 additions & 23 deletions prov/efa/test/efa_unit_test_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,28 +274,15 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state)
struct efa_resource *resource = *state;
struct efa_rdm_pke *pkt_entry;
struct fi_cq_data_entry cq_entry;
struct fi_cq_err_entry cq_err_entry = {0};
struct efa_ep_addr raw_addr = {0};
size_t raw_addr_len = sizeof(struct efa_ep_addr);
fi_addr_t peer_addr;
int ret, err, numaddr, err_data_size = 1024;
struct fi_eq_err_entry eq_err_entry;
int ret;


efa_unit_test_resource_construct(resource, FI_EP_RDM);
efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);

pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL);

/* create a fake peer */
err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
assert_int_equal(err, 0);
raw_addr.qpn = 1;
raw_addr.qkey = 0x1234;
numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL);
assert_int_equal(numaddr, 1);

pkt_entry->ope = efa_rdm_ep_alloc_rxe(efa_rdm_ep, peer_addr, ofi_op_msg);

/* A receive completion requires efa rx pkts are posted */
efa_rdm_ep->efa_rx_pkts_posted++;
assert_non_null(pkt_entry);
Expand All @@ -315,16 +302,16 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state)
will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
efa_rdm_ep->ibv_cq_ex->wr_id = (uintptr_t)pkt_entry;
efa_rdm_ep->ibv_cq_ex->status = IBV_WC_GENERAL_ERR;
/* The recv error will not be written to the application cq because it is an EFA internal error
* and is not related to any application recv. Currently we can only read the error from the eq.
*/
ret = fi_cq_read(resource->cq, &cq_entry, 1);
assert_int_equal(ret, -FI_EAVAIL);
assert_int_equal(ret, -FI_EAGAIN);

cq_err_entry.err_data = malloc(err_data_size);
cq_err_entry.err_data_size = err_data_size;
ret = fi_cq_readerr(resource->cq, &cq_err_entry, 0);
assert_int_equal(ret, 1);
assert_int_equal(cq_err_entry.err, FI_EIO);
assert_int_equal(cq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
free(cq_err_entry.err_data);
ret = fi_eq_readerr(resource->eq, &eq_err_entry, 0);
assert_int_equal(ret, sizeof(eq_err_entry));
assert_int_equal(eq_err_entry.err, FI_EIO);
assert_int_equal(eq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
}

/**
Expand Down

0 comments on commit 94310d6

Please sign in to comment.