Skip to content

Commit 0b45e29

Browse files
authored
Merge pull request #2321 from yosefe/topic/uct-dc-fix-hang-fc-disabled-v1.3.x
UCT/DC: Don't consider fc_wnd if flow control is disabled - v1.3.x
2 parents c9ef2c6 + 647c01a commit 0b45e29

File tree

5 files changed

+125
-21
lines changed

5 files changed

+125
-21
lines changed

src/uct/ib/dc/base/dc_ep.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ static inline int uct_dc_iface_dci_ep_can_send(uct_dc_ep_t *ep)
167167
{
168168
uct_dc_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_iface_t);
169169
return (!(ep->flags & UCT_DC_EP_FLAG_TX_WAIT)) &&
170-
(ep->fc.fc_wnd > 0) &&
170+
uct_rc_fc_has_resources(&iface->super, &ep->fc) &&
171171
uct_dc_iface_dci_has_tx_resources(iface, ep->dci);
172172
}
173173

@@ -176,7 +176,7 @@ void uct_dc_iface_schedule_dci_alloc(uct_dc_iface_t *iface, uct_dc_ep_t *ep)
176176
{
177177
/* If FC window is empty the group will be scheduled when
178178
* grant is received */
179-
if (ep->fc.fc_wnd > 0) {
179+
if (uct_rc_fc_has_resources(&iface->super, &ep->fc)) {
180180
ucs_arbiter_group_schedule(uct_dc_iface_dci_waitq(iface), &ep->arb_group);
181181
}
182182
}

src/uct/ib/rc/base/rc_ep.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ enum {
120120

121121
#define UCT_RC_UPDATE_FC_WND(_iface, _fc) \
122122
{ \
123+
/* For performance reasons, prefer to update fc_wnd unconditionally */ \
123124
(_fc)->fc_wnd--; \
124125
\
125126
if ((_iface)->config.fc_enabled) { \
@@ -304,9 +305,19 @@ static UCS_F_ALWAYS_INLINE void uct_rc_txqp_check(uct_rc_txqp_t *txqp)
304305
txqp->qp->qp_num, txqp->qp->state);
305306
}
306307

308+
static UCS_F_ALWAYS_INLINE
309+
int uct_rc_fc_has_resources(uct_rc_iface_t *iface, uct_rc_fc_t *fc)
310+
{
311+
/* When FC is disabled, fc_wnd may still become 0 because it's decremented
312+
* unconditionally (for performance reasons) */
313+
return (fc->fc_wnd > 0) || !iface->config.fc_enabled;
314+
}
315+
307316
static UCS_F_ALWAYS_INLINE int uct_rc_ep_has_tx_resources(uct_rc_ep_t *ep)
308317
{
309-
return ((ep->txqp.available > 0) && (ep->fc.fc_wnd > 0));
318+
uct_rc_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_rc_iface_t);
319+
320+
return (ep->txqp.available > 0) && uct_rc_fc_has_resources(iface, &ep->fc);
310321
}
311322

312323
static UCS_F_ALWAYS_INLINE void

test/gtest/uct/ib/test_dc.cc

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -371,8 +371,8 @@ class test_dc_flow_control : public test_rc_flow_control {
371371
public:
372372

373373
/* virtual */
374-
uct_rc_fc_t* get_fc_ptr(entity *e) {
375-
return &ucs_derived_of(e->ep(0), uct_dc_ep_t)->fc;
374+
uct_rc_fc_t* get_fc_ptr(entity *e, int ep_idx = 0) {
375+
return &ucs_derived_of(e->ep(ep_idx), uct_dc_ep_t)->fc;
376376
}
377377
};
378378

@@ -395,6 +395,41 @@ UCS_TEST_P(test_dc_flow_control, pending_grant)
395395
flush();
396396
}
397397

398+
UCS_TEST_P(test_dc_flow_control, fc_disabled_flush)
399+
{
400+
test_flush_fc_disabled();
401+
}
402+
403+
UCS_TEST_P(test_dc_flow_control, fc_disabled_pending_no_dci) {
404+
405+
pending_send_request_t pending_req;
406+
pending_req.uct.func = pending_cb;
407+
pending_req.cb_count = 0;
408+
409+
set_fc_disabled(m_e1);
410+
411+
/* Send on new endpoints until out of DCIs */
412+
for (int ep_index = 0; ep_index < 20; ++ep_index) {
413+
m_e1->connect(ep_index, *m_e2, ep_index);
414+
415+
ucs_status_t status = uct_ep_am_short(m_e1->ep(ep_index), 0, 0, NULL, 0);
416+
if (status == UCS_ERR_NO_RESOURCE) {
417+
/* if FC is disabled, it should be OK to set fc_wnd to 0 */
418+
get_fc_ptr(m_e1, ep_index)->fc_wnd = 0;
419+
420+
/* Add to pending */
421+
status = uct_ep_pending_add(m_e1->ep(ep_index), &pending_req.uct);
422+
ASSERT_UCS_OK(status);
423+
424+
wait_for_flag(&pending_req.cb_count);
425+
EXPECT_EQ(1, pending_req.cb_count);
426+
break;
427+
}
428+
429+
ASSERT_UCS_OK(status);
430+
}
431+
}
432+
398433
/* Check that soft request is not handled by DC */
399434
UCS_TEST_P(test_dc_flow_control, soft_request)
400435
{
@@ -487,8 +522,8 @@ class test_dc_flow_control_stats : public test_rc_flow_control_stats {
487522
test_rc_flow_control_stats::init();
488523
}
489524

490-
uct_rc_fc_t* get_fc_ptr(entity *e) {
491-
return &ucs_derived_of(e->ep(0), uct_dc_ep_t)->fc;
525+
uct_rc_fc_t* get_fc_ptr(entity *e, int ep_idx = 0) {
526+
return &ucs_derived_of(e->ep(ep_idx), uct_dc_ep_t)->fc;
492527
}
493528

494529
uct_rc_fc_t* fake_ep_fc_ptr(entity *e) {

test/gtest/uct/ib/test_rc.cc

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ UCS_TEST_P(test_rc_max_wr, send_limit)
6767
UCT_RC_INSTANTIATE_TEST_CASE(test_rc_max_wr)
6868

6969

70-
int test_rc_flow_control::m_req_count = 0;
7170
uint32_t test_rc_flow_control::m_am_rx_count = 0;
7271

7372
void test_rc_flow_control::init()
@@ -78,13 +77,24 @@ void test_rc_flow_control::init()
7877
}
7978
test_rc::init();
8079

80+
ucs_assert(rc_iface(m_e1)->config.fc_enabled);
81+
ucs_assert(rc_iface(m_e2)->config.fc_enabled);
82+
8183
uct_iface_set_am_handler(m_e1->iface(), FLUSH_AM_ID, am_handler,
8284
NULL, UCT_CB_FLAG_SYNC);
8385
uct_iface_set_am_handler(m_e2->iface(), FLUSH_AM_ID, am_handler,
8486
NULL, UCT_CB_FLAG_SYNC);
8587

8688
}
8789

90+
void test_rc_flow_control::cleanup()
91+
{
92+
/* Restore FC state to enabled, so iface cleanup will destroy the grant mpool */
93+
rc_iface(m_e1)->config.fc_enabled = 1;
94+
rc_iface(m_e2)->config.fc_enabled = 1;
95+
test_rc::cleanup();
96+
}
97+
8898
void test_rc_flow_control::send_am_and_flush(entity *e, int num_msg)
8999
{
90100
m_am_rx_count = 0;
@@ -147,24 +157,48 @@ void test_rc_flow_control::test_pending_grant(int wnd)
147157
send_am_messages(m_e1, 1, UCS_OK);
148158
}
149159

160+
void test_rc_flow_control::test_flush_fc_disabled()
161+
{
162+
set_fc_disabled(m_e1);
163+
ucs_status_t status;
164+
165+
/* If FC is disabled, wnd=0 should not prevent the flush */
166+
get_fc_ptr(m_e1)->fc_wnd = 0;
167+
status = uct_ep_flush(m_e1->ep(0), 0, NULL);
168+
EXPECT_EQ(UCS_OK, status);
169+
170+
/* send active message should be OK */
171+
get_fc_ptr(m_e1)->fc_wnd = 1;
172+
status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0);
173+
EXPECT_EQ(UCS_OK, status);
174+
EXPECT_EQ(0, get_fc_ptr(m_e1)->fc_wnd);
175+
176+
/* flush must have resources */
177+
status = uct_ep_flush(m_e1->ep(0), 0, NULL);
178+
EXPECT_FALSE(UCS_STATUS_IS_ERR(status)) << ucs_status_string(status);
179+
}
180+
150181
void test_rc_flow_control::test_pending_purge(int wnd, int num_pend_sends)
151182
{
152-
uct_pending_req_t reqs[num_pend_sends];
183+
pending_send_request_t reqs[num_pend_sends];
153184

154185
disable_entity(m_e2);
155186
set_fc_attributes(m_e1, true, wnd, wnd, 1);
156187

157-
m_req_count = 0;
158188
send_am_and_flush(m_e1, wnd);
159189

160190
/* Now m2 ep should have FC grant message in the pending queue.
161191
* Add some user pending requests as well */
162-
for (int i = 0; i < num_pend_sends; i ++) {
163-
reqs[i].func = NULL; /* make valgrind happy */
164-
EXPECT_EQ(uct_ep_pending_add(m_e2->ep(0), &reqs[i]), UCS_OK);
192+
for (int i = 0; i < num_pend_sends; i++) {
193+
reqs[i].uct.func = NULL; /* make valgrind happy */
194+
reqs[i].purge_count = 0;
195+
EXPECT_EQ(uct_ep_pending_add(m_e2->ep(0), &reqs[i].uct), UCS_OK);
165196
}
166197
uct_ep_pending_purge(m_e2->ep(0), purge_cb, NULL);
167-
EXPECT_EQ(num_pend_sends, m_req_count);
198+
199+
for (int i = 0; i < num_pend_sends; i++) {
200+
EXPECT_EQ(1, reqs[i].purge_count);
201+
}
168202
}
169203

170204

@@ -206,6 +240,11 @@ UCS_TEST_P(test_rc_flow_control, pending_grant)
206240
test_pending_grant(5);
207241
}
208242

243+
UCS_TEST_P(test_rc_flow_control, fc_disabled_flush)
244+
{
245+
test_flush_fc_disabled();
246+
}
247+
209248
UCT_RC_INSTANTIATE_TEST_CASE(test_rc_flow_control)
210249

211250

test/gtest/uct/ib/test_rc.h

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ class test_rc : public uct_test {
2626
return ucs_derived_of(e->iface(), uct_rc_iface_t);
2727
}
2828

29-
uct_rc_ep_t* rc_ep(entity *e) {
30-
return ucs_derived_of(e->ep(0), uct_rc_ep_t);
29+
uct_rc_ep_t* rc_ep(entity *e, int ep_idx = 0) {
30+
return ucs_derived_of(e->ep(ep_idx), uct_rc_ep_t);
3131
}
3232

3333
void send_am_messages(entity *e, int wnd, ucs_status_t expected,
@@ -54,14 +54,16 @@ class test_rc : public uct_test {
5454
class test_rc_flow_control : public test_rc {
5555
public:
5656
typedef struct pending_send_request {
57-
uct_ep_h ep;
5857
uct_pending_req_t uct;
58+
int cb_count;
59+
int purge_count;
5960
} pending_send_request_t;
6061

6162
void init();
63+
void cleanup();
6264

63-
virtual uct_rc_fc_t* get_fc_ptr(entity *e) {
64-
return &rc_ep(e)->fc;
65+
virtual uct_rc_fc_t* get_fc_ptr(entity *e, int ep_idx = 0) {
66+
return &rc_ep(e, ep_idx)->fc;
6567
}
6668

6769
virtual void disable_entity(entity *e) {
@@ -84,6 +86,11 @@ class test_rc_flow_control : public test_rc {
8486

8587
}
8688

89+
void set_fc_disabled(entity *e) {
90+
/* same as default settings in rc_iface_init */
91+
set_fc_attributes(e, false, std::numeric_limits<int16_t>::max(), 0, 0);
92+
}
93+
8794
void send_am_and_flush(entity *e, int num_msg);
8895

8996
void progress_loop(double delta_ms=10.0) {
@@ -102,7 +109,18 @@ class test_rc_flow_control : public test_rc {
102109
}
103110

104111
static void purge_cb(uct_pending_req_t *self, void *arg) {
105-
m_req_count++;
112+
pending_send_request_t *req = ucs_container_of(self,
113+
pending_send_request_t,
114+
uct);
115+
++req->purge_count;
116+
}
117+
118+
static ucs_status_t pending_cb(uct_pending_req_t *self) {
119+
pending_send_request_t *req = ucs_container_of(self,
120+
pending_send_request_t,
121+
uct);
122+
++req->cb_count;
123+
return UCS_OK;
106124
}
107125

108126
void validate_grant(entity *e);
@@ -113,8 +131,9 @@ class test_rc_flow_control : public test_rc {
113131

114132
void test_pending_purge(int wnd, int num_pend_sends);
115133

134+
void test_flush_fc_disabled();
135+
116136
protected:
117-
static int m_req_count;
118137
static const uint8_t FLUSH_AM_ID = 1;
119138
static uint32_t m_am_rx_count;
120139
};

0 commit comments

Comments
 (0)