Skip to content

Commit 7d60749

Browse files
committed
prov/efa: acquire the same lock for qp lifecycle
In efa ep close, qp destroy + qp table update + the cq flush must happen in the same locking block, and the same lock must be acquired by the qp creation + qp table update in the ep enable. This ensure that the cq flush will exactly flush out the stale cqe belongs to the old qp before the new qp that likely has the same qpn is ingested by rdma-core's qp table, which will make rdma-core regard the stale cqe as a valid cqe for the new qp and cause unexpected behaviors. Signed-off-by: Shi Jin <[email protected]>
1 parent ef67305 commit 7d60749

File tree

4 files changed

+29
-35
lines changed

4 files changed

+29
-35
lines changed

prov/efa/src/efa_base_ep.c

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ int efa_base_ep_destruct_qp(struct efa_base_ep *base_ep)
5858
struct efa_qp *qp = base_ep->qp;
5959
struct efa_qp *user_recv_qp = base_ep->user_recv_qp;
6060
uint32_t qp_num;
61+
struct efa_cq *tx_cq, *rx_cq;
62+
int err;
6163

6264
/*
6365
* Acquire the lock to prevent race conditions when CQ polling accesses the qp_table
@@ -79,6 +81,25 @@ int efa_base_ep_destruct_qp(struct efa_base_ep *base_ep)
7981
domain->qp_table[qp_num & domain->qp_table_sz_m1] = NULL;
8082
base_ep->user_recv_qp = NULL;
8183
}
84+
85+
/* Flush the cq to poll out all stale cqes for the destroyed qp */
86+
tx_cq = efa_base_ep_get_tx_cq(base_ep);
87+
rx_cq = efa_base_ep_get_rx_cq(base_ep);
88+
89+
if (tx_cq) {
90+
err = 0;
91+
while (err != ENOENT) {
92+
err = tx_cq->poll_ibv_cq(-1, &tx_cq->ibv_cq);
93+
}
94+
}
95+
96+
if (rx_cq && rx_cq != tx_cq) {
97+
err = 0;
98+
while (err != ENOENT) {
99+
err = rx_cq->poll_ibv_cq(-1, &rx_cq->ibv_cq);
100+
}
101+
}
102+
82103
efa_base_ep_unlock_cq(base_ep);
83104

84105
return 0;
@@ -364,10 +385,7 @@ int efa_base_ep_enable_qp(struct efa_base_ep *base_ep, struct efa_qp *qp)
364385

365386
qp->qp_num = qp->ibv_qp->qp_num;
366387

367-
/* Acquire the lock to prevent race conditions when CQ polling accesses the qp_table */
368-
efa_base_ep_lock_cq(base_ep);
369388
base_ep->domain->qp_table[qp->qp_num & base_ep->domain->qp_table_sz_m1] = qp;
370-
efa_base_ep_unlock_cq(base_ep);
371389

372390
EFA_INFO(FI_LOG_EP_CTRL, "QP enabled! qp_n: %d qkey: %d\n", qp->qp_num, qp->qkey);
373391

@@ -822,37 +840,19 @@ int efa_base_ep_create_and_enable_qp(struct efa_base_ep *ep, bool create_user_re
822840
scq = txcq ? txcq : rxcq;
823841
rcq = rxcq ? rxcq : txcq;
824842

843+
/* Acquire the lock to prevent race conditions when CQ polling accesses the qp_table */
844+
efa_base_ep_lock_cq(ep);
825845
err = efa_base_ep_create_qp(ep, &scq->ibv_cq, &rcq->ibv_cq, create_user_recv_qp);
826846
if (err)
827-
return err;
828-
829-
return efa_base_ep_enable(ep);
830-
}
831-
832-
void efa_base_ep_flush_cq(struct efa_base_ep *base_ep)
833-
{
834-
struct efa_cq *tx_cq, *rx_cq;
835-
int err;
836-
837-
efa_base_ep_lock_cq(base_ep);
838-
tx_cq = efa_base_ep_get_tx_cq(base_ep);
839-
rx_cq = efa_base_ep_get_rx_cq(base_ep);
847+
goto out;
840848

841-
if (tx_cq) {
842-
err = 0;
843-
while (err != ENOENT) {
844-
err = tx_cq->poll_ibv_cq(-1, &tx_cq->ibv_cq);
845-
}
846-
}
849+
err = efa_base_ep_enable(ep);
847850

848-
if (rx_cq && rx_cq != tx_cq) {
849-
err = 0;
850-
while (err != ENOENT) {
851-
err = rx_cq->poll_ibv_cq(-1, &rx_cq->ibv_cq);
852-
}
853-
}
854-
efa_base_ep_unlock_cq(base_ep);
851+
out:
852+
efa_base_ep_unlock_cq(ep);
853+
return err;
855854
}
855+
856856
#if ENABLE_DEBUG
857857
void efa_ep_addr_print(char *prefix, struct efa_ep_addr *addr) {
858858
char raw_gid_str[INET6_ADDRSTRLEN];

prov/efa/src/efa_base_ep.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,6 @@ void efa_base_ep_remove_cntr_ibv_cq_poll_list(struct efa_base_ep *ep);
151151

152152
int efa_base_ep_create_and_enable_qp(struct efa_base_ep *ep, bool create_user_recv_qp);
153153

154-
void efa_base_ep_flush_cq(struct efa_base_ep *base_ep);
155-
156154
#if ENABLE_DEBUG
157155
void efa_ep_addr_print(char *prefix, struct efa_ep_addr *addr);
158156
#endif

prov/efa/src/efa_ep.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,6 @@ static int efa_ep_close(fid_t fid)
208208
EFA_WARN(FI_LOG_EP_CTRL, "Unable to close base endpoint\n");
209209
}
210210

211-
efa_base_ep_flush_cq(ep);
212-
213211
free(ep);
214212

215213
return 0;

prov/efa/src/rdm/efa_rdm_ep_fiops.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,8 +1033,6 @@ static int efa_rdm_ep_close(struct fid *fid)
10331033
retv = ret;
10341034
}
10351035

1036-
efa_base_ep_flush_cq(&efa_rdm_ep->base_ep);
1037-
10381036
ret = efa_rdm_ep_close_shm_ep_resources(efa_rdm_ep);
10391037
if (ret)
10401038
retv = ret;

0 commit comments

Comments
 (0)