Skip to content

Commit 741ca49

Browse files
committed
prov/efa: Bypass rdma-core in data path.
This patch allows libfabric to bypass the rdma-core API in the data path to reduce layering overhead and cache misses. It starts by making libfabric call internal ops that are the equivalent of the rdma-core API. Further optimization is needed to get rid of the rdma-core API workflow restrictions and implement the logic directly for the transmission and CQ poll calls. Co-authored-by: Luke Robison <[email protected]> Signed-off-by: Shi Jin <[email protected]>
1 parent 769b9ad commit 741ca49

35 files changed

+3834
-463
lines changed

include/windows/config.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@
4343
/* Indicates if ibv_reg_dmabuf_mr verbs is available */
4444
#define HAVE_EFA_DMABUF_MR 0
4545

46+
/* Indicates if efa data path direct is available */
47+
#define HAVE_EFA_DATA_PATH_DIRECT 0
48+
4649
/* Define to 1 if host_clock_get_service is available. */
4750
/* #undef HAVE_HOST_GET_CLOCK_SERVICE */
4851

man/fi_efa.7.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,11 @@ Setting this environment variable to 0 can disable this feature.
522522
When the number of internal rx pkts to post is lower than this threshold,
523523
the refill will be skipped.
524524

525+
*FI_EFA_USE_DATA_PATH_DIRECT*
526+
527+
: Use the direct data path implementation, which bypasses rdma-core on the data path (including CQ polling and TX/RX submissions), when it is available.
528+
Setting this environment variable to 1 enables this feature (Default: false).
529+
525530
# SEE ALSO
526531

527532
[`fabric`(7)](fabric.7.html),

prov/efa/Makefile.include

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ _efa_files = \
5151
prov/efa/src/efa_rma.c \
5252
prov/efa/src/efa_cq.c \
5353
prov/efa/src/efa_ep.c \
54+
prov/efa/src/efa_data_path_direct.c \
5455
prov/efa/src/rdm/efa_rdm_peer.c \
5556
prov/efa/src/rdm/efa_rdm_cq.c \
5657
prov/efa/src/rdm/efa_rdm_ep_utils.c \
@@ -75,6 +76,10 @@ _efa_files = \
7576
prov/efa/src/rdm/efa_rdm_srx.c \
7677
prov/efa/src/rdm/efa_rdm_util.c
7778

79+
if ENABLE_EFA_UNIT_TEST
80+
_efa_files += prov/efa/test/efa_unit_test_data_path_ops.c
81+
endif ENABLE_EFA_UNIT_TEST
82+
7883
_efa_headers = \
7984
prov/efa/src/efa.h \
8085
prov/efa/src/efa_av.h \
@@ -95,6 +100,13 @@ _efa_headers = \
95100
prov/efa/src/efa_prov.h \
96101
prov/efa/src/efa_env.h \
97102
prov/efa/src/fi_ext_efa.h \
103+
prov/efa/src/efa_data_path_ops.h \
104+
prov/efa/src/efa_io_defs.h \
105+
prov/efa/src/efa_data_path_direct_structs.h \
106+
prov/efa/src/efa_data_path_direct_entry.h \
107+
prov/efa/src/efa_data_path_direct.h \
108+
prov/efa/src/efa_data_path_direct_internal.h \
109+
prov/efa/src/efa_mmio.h \
98110
prov/efa/src/rdm/efa_rdm_peer.h \
99111
prov/efa/src/rdm/efa_rdm_cq.h \
100112
prov/efa/src/rdm/efa_rdm_ep.h \
@@ -166,7 +178,31 @@ prov_efa_test_efa_unit_test_LDFLAGS = $(cmocka_rpath) $(efa_LDFLAGS) $(cmocka_LD
166178
-Wl,--wrap=efa_rdm_pke_read \
167179
-Wl,--wrap=efa_rdm_pke_proc_matched_rtm \
168180
-Wl,--wrap=efa_rdm_ope_post_send \
169-
-Wl,--wrap=efa_device_support_unsolicited_write_recv
181+
-Wl,--wrap=efa_device_support_unsolicited_write_recv \
182+
-Wl,--wrap=efa_qp_post_recv \
183+
-Wl,--wrap=efa_qp_wr_complete \
184+
-Wl,--wrap=efa_qp_wr_send \
185+
-Wl,--wrap=efa_qp_wr_start \
186+
-Wl,--wrap=efa_qp_wr_rdma_read \
187+
-Wl,--wrap=efa_qp_wr_rdma_write \
188+
-Wl,--wrap=efa_qp_wr_rdma_write_imm \
189+
-Wl,--wrap=efa_qp_wr_send_imm \
190+
-Wl,--wrap=efa_qp_wr_set_inline_data_list \
191+
-Wl,--wrap=efa_qp_wr_set_sge_list \
192+
-Wl,--wrap=efa_qp_wr_set_ud_addr \
193+
-Wl,--wrap=efa_ibv_cq_start_poll \
194+
-Wl,--wrap=efa_ibv_cq_next_poll \
195+
-Wl,--wrap=efa_ibv_cq_end_poll \
196+
-Wl,--wrap=efa_ibv_cq_wc_read_opcode \
197+
-Wl,--wrap=efa_ibv_cq_wc_read_qp_num \
198+
-Wl,--wrap=efa_ibv_cq_wc_read_vendor_err \
199+
-Wl,--wrap=efa_ibv_cq_wc_read_src_qp \
200+
-Wl,--wrap=efa_ibv_cq_wc_read_slid \
201+
-Wl,--wrap=efa_ibv_cq_wc_read_byte_len \
202+
-Wl,--wrap=efa_ibv_cq_wc_read_wc_flags \
203+
-Wl,--wrap=efa_ibv_cq_wc_read_imm_data \
204+
-Wl,--wrap=efa_ibv_cq_wc_is_unsolicited \
205+
-Wl,--wrap=efa_ibv_cq_wc_read_sgid
170206

171207
if HAVE_EFADV_CQ_EX
172208
prov_efa_test_efa_unit_test_LDFLAGS += -Wl,--wrap=efadv_create_cq

prov/efa/configure.m4

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,12 @@ AC_DEFUN([FI_EFA_CONFIGURE],[
226226
AC_DEFINE_UNQUOTED([HAVE_EFADV_QUERY_CQ],
227227
[$have_efadv_query_cq],
228228
[Indicates if efadv_query_cq is available])
229+
AS_IF([test "$have_efadv_query_qp_wqs" = "1" -a "$have_efadv_query_cq" = "1"],
230+
[have_efa_data_path_direct=1],
231+
[have_efa_data_path_direct=0])
232+
AC_DEFINE_UNQUOTED([HAVE_EFA_DATA_PATH_DIRECT],
233+
[$have_efa_data_path_direct],
234+
[Indicates if data path direct is available (requires both QUERY_QP_WQS and QUERY_CQ)])
229235
230236
231237
CPPFLAGS=$save_CPPFLAGS

prov/efa/src/efa_base_ep.c

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "efa_cq.h"
88
#include "efa_cntr.h"
99
#include "rdm/efa_rdm_protocol.h"
10+
#include "efa_data_path_direct.h"
1011

1112
int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av)
1213
{
@@ -251,9 +252,12 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex,
251252
}
252253

253254
(*qp)->ibv_qp_ex = ibv_qp_to_qp_ex((*qp)->ibv_qp);
255+
/* Initialize it explicitly for safety */
256+
(*qp)->data_path_direct_enabled = false;
254257
return FI_SUCCESS;
255258
}
256259

260+
static
257261
int efa_base_ep_create_qp(struct efa_base_ep *base_ep,
258262
struct ibv_qp_init_attr_ex *init_attr_ex)
259263
{
@@ -312,6 +316,10 @@ void efa_qp_destruct(struct efa_qp *qp)
312316
err = -ibv_destroy_qp(qp->ibv_qp);
313317
if (err)
314318
EFA_INFO(FI_LOG_CORE, "destroy qp[%u] failed, err: %s\n", qp->qp_num, fi_strerror(-err));
319+
#if HAVE_EFA_DATA_PATH_DIRECT
320+
if (qp->data_path_direct_enabled)
321+
efa_data_path_direct_qp_finalize(qp);
322+
#endif
315323
free(qp);
316324
}
317325

@@ -725,40 +733,51 @@ void efa_base_ep_remove_cntr_ibv_cq_poll_list(struct efa_base_ep *ep)
725733
int efa_base_ep_create_and_enable_qp(struct efa_base_ep *ep, bool create_user_recv_qp)
726734
{
727735
struct ibv_qp_init_attr_ex attr_ex = { 0 };
728-
struct efa_cq *scq, *rcq;
729-
struct ibv_cq_ex *tx_ibv_cq, *rx_ibv_cq;
736+
struct efa_cq *scq, *rcq, *txcq, *rxcq;
730737
int err;
731738

732-
scq = efa_base_ep_get_tx_cq(ep);
733-
rcq = efa_base_ep_get_rx_cq(ep);
739+
txcq = efa_base_ep_get_tx_cq(ep);
740+
rxcq = efa_base_ep_get_rx_cq(ep);
734741

735-
if (!scq && !rcq) {
742+
if (!txcq && !rxcq) {
736743
EFA_WARN(FI_LOG_EP_CTRL,
737744
"Endpoint is not bound to a send or receive completion queue\n");
738745
return -FI_ENOCQ;
739746
}
740747

741-
if (!scq && ofi_needs_tx(ep->info->caps)) {
748+
if (!txcq && ofi_needs_tx(ep->info->caps)) {
742749
EFA_WARN(FI_LOG_EP_CTRL,
743750
"Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n");
744751
return -FI_ENOCQ;
745752
}
746753

747-
if (!rcq && ofi_needs_rx(ep->info->caps)) {
754+
if (!rxcq && ofi_needs_rx(ep->info->caps)) {
748755
EFA_WARN(FI_LOG_EP_CTRL,
749756
"Endpoint is not bound to a receive completion queue when it has receive capabilities enabled. (FI_RECV)\n");
750757
return -FI_ENOCQ;
751758
}
752759

753-
tx_ibv_cq = scq ? scq->ibv_cq.ibv_cq_ex : rcq->ibv_cq.ibv_cq_ex;
754-
rx_ibv_cq = rcq ? rcq->ibv_cq.ibv_cq_ex : scq->ibv_cq.ibv_cq_ex;
760+
scq = txcq ? txcq : rxcq;
761+
rcq = rxcq ? rxcq : txcq;
755762

756-
efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, tx_ibv_cq, rx_ibv_cq);
763+
efa_base_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, scq->ibv_cq.ibv_cq_ex, rcq->ibv_cq.ibv_cq_ex);
757764

758765
err = efa_base_ep_create_qp(ep, &attr_ex);
759766
if (err)
760767
return err;
761768

769+
#if HAVE_EFA_DATA_PATH_DIRECT
770+
/* Only enable direct QP when direct CQ is enabled */
771+
assert(scq->ibv_cq.data_path_direct_enabled == rcq->ibv_cq.data_path_direct_enabled);
772+
if (scq->ibv_cq.data_path_direct_enabled) {
773+
err = efa_data_path_direct_qp_initialize(ep->qp);
774+
if (err) {
775+
efa_base_ep_destruct_qp(ep);
776+
return err;
777+
}
778+
}
779+
#endif
780+
762781
if (create_user_recv_qp) {
763782
err = efa_qp_create(&ep->user_recv_qp, &attr_ex, ep->info->tx_attr->tclass);
764783
if (err) {

prov/efa/src/efa_base_ep.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "ofi_util.h"
1313
#include "efa_av.h"
1414
#include "rdm/efa_rdm_protocol.h"
15+
#include "efa_data_path_direct_structs.h"
1516

1617
#define EFA_QP_DEFAULT_SERVICE_LEVEL 0
1718
#define EFA_QP_LOW_LATENCY_SERVICE_LEVEL 8
@@ -46,9 +47,12 @@ struct efa_qp {
4647
struct efa_base_ep *base_ep;
4748
uint32_t qp_num;
4849
uint32_t qkey;
50+
bool data_path_direct_enabled;
51+
#if HAVE_EFA_DATA_PATH_DIRECT
52+
struct efa_data_path_direct_qp data_path_direct_qp;
53+
#endif
4954
};
5055

51-
5256
struct efa_av;
5357

5458
struct efa_recv_wr {
@@ -111,9 +115,6 @@ int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex,
111115

112116
void efa_qp_destruct(struct efa_qp *qp);
113117

114-
int efa_base_ep_create_qp(struct efa_base_ep *base_ep,
115-
struct ibv_qp_init_attr_ex *init_attr_ex);
116-
117118
void efa_base_ep_close_util_ep(struct efa_base_ep *base_ep);
118119

119120
int efa_base_ep_destruct_qp(struct efa_base_ep *base_ep);

0 commit comments

Comments
 (0)