Skip to content

Commit 110cdbe

Browse files
lsavers authored and j-xiong committed
prov/opx: 0 byte write_data Error
A 0-byte write with CQ data was not being posted because there is no data to send after the RTS/CTS exchange. Updated so that the receiver posts the CQ data when it receives the RTS, and the sender completes when it receives the CTS. Other minor changes: (1) the tail was not set properly in fi_opx_ep_rx_process_header(); (2) added the endpoint pointer to the flight recorder dump; (3) added opx_debug_ep_list_dump() to dump the endpoint list. Signed-off-by: Lindsay Reiser <[email protected]>
1 parent d60532d commit 110cdbe

File tree

6 files changed

+43
-19
lines changed

6 files changed

+43
-19
lines changed

prov/opx/include/rdma/opx/fi_opx_endpoint.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2007,7 +2007,11 @@ void fi_opx_ep_rx_process_header_rma_rts(struct fi_opx_ep *opx_ep, const union o
20072007
fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__);
20082008
abort();
20092009
}
2010-
slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr);
2010+
if (context->len) {
2011+
slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr);
2012+
} else { /* length is 0, there will be no RZV data sent, so post now */
2013+
slist_insert_tail((struct slist_entry *) context, rx->cq_completed_ptr);
2014+
}
20112015

20122016
OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RMA-RTS");
20132017
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
@@ -2831,11 +2835,13 @@ static inline void fi_opx_ep_rx_process_header(struct fid_ep *ep, const union op
28312835
if (prev) {
28322836
prev->next = context->next;
28332837
} else {
2838+
assert(opx_ep->rx->queue[kind].mq.head == (struct slist_entry *) context);
28342839
opx_ep->rx->queue[kind].mq.head = (struct slist_entry *) context->next;
28352840
}
28362841

28372842
if (context->next == NULL) {
2838-
opx_ep->rx->queue[kind].mq.tail = NULL;
2843+
assert(opx_ep->rx->queue[kind].mq.tail == (struct slist_entry *) context);
2844+
opx_ep->rx->queue[kind].mq.tail = (struct slist_entry *) prev;
28392845
}
28402846

28412847
context->next = NULL;

prov/opx/include/rdma/opx/fi_opx_flight_recorder.h

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2021,2024 Cornelis Networks.
2+
* Copyright (C) 2021,2025 Cornelis Networks.
33
*
44
* This software is available to you under a choice of one of two
55
* licenses. You may choose to be licensed under the terms of the GNU
@@ -139,31 +139,34 @@ static inline void flight_recorder_dump_packet_payload(struct flight_recorder_en
139139
}
140140
}
141141

142-
static inline void flight_recorder_dump(struct flight_recorder *fr)
142+
static inline void flight_recorder_dump(struct fi_opx_ep *opx_ep, struct flight_recorder *fr)
143143
{
144144
const unsigned count = fr->count;
145145
struct timespec current_time;
146146
fi_opx_timer_now(fr->now, fr->timer);
147147
if (!fr->timer->cycle_timer.use_cycle_timer) {
148148
current_time = fr->now->tp;
149149
fprintf(stderr,
150+
"#FLIGHT_RECORDER t%d |EP: %p\n"
150151
"#FLIGHT_RECORDER t%d |Last Dump Time: %ld.%ld\n"
151152
"#FLIGHT_RECORDER t%d |Current Time: %ld.%ld\n"
152153
"#FLIGHT_RECORDER t%d |Entry Count : %u\n",
153-
fr->tid, fr->last_dump.tv_sec, fr->last_dump.tv_nsec, fr->tid, current_time.tv_sec,
154-
current_time.tv_nsec, fr->tid, count);
154+
fr->tid, opx_ep, fr->tid, fr->last_dump.tv_sec, fr->last_dump.tv_nsec, fr->tid,
155+
current_time.tv_sec, current_time.tv_nsec, fr->tid, count);
155156
fr->last_dump = current_time;
156157
if (count == 0) {
157158
fr->last_dump = current_time;
158159
return;
159160
}
160161
} else {
161162
fprintf(stderr,
163+
"#FLIGHT_RECORDER t%d |EP: %p\n"
162164
"#FLIGHT_RECORDER t%d |Last Dump Time: %0.9lf\n"
163165
"#FLIGHT_RECORDER t%d |Current Time: %0.9lf\n"
164166
"#FLIGHT_RECORDER t%d |Entry Count : %u\n",
165-
fr->tid, fr->last_dump_cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12, fr->tid,
166-
fr->now->cycle_timer.cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12, fr->tid, count);
167+
fr->tid, opx_ep, fr->tid, fr->last_dump_cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12,
168+
fr->tid, fr->now->cycle_timer.cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12, fr->tid,
169+
count);
167170
fr->last_dump_cycles = fr->now->cycle_timer.cycles;
168171
if (count == 0) {
169172
fr->last_dump_cycles = fr->now->cycle_timer.cycles;
@@ -199,33 +202,33 @@ static inline void flight_recorder_dump(struct flight_recorder *fr)
199202
fr->count = 0;
200203
}
201204

202-
#define FLIGHT_RECORDER_STRING(fr, event_id, format, ...) \
205+
#define FLIGHT_RECORDER_STRING(ep, fr, event_id, format, ...) \
203206
{ \
204207
struct flight_recorder_entry *next = \
205208
flight_recorder_init_next_entry((fr), (event_id), FR_ENTRY_TYPE_STRING); \
206209
int actual_len = snprintf((char *) next->data, FLIGHT_RECORDER_ENTRY_DATA_LEN, format, ##__VA_ARGS__); \
207210
int end_of_string = MIN(actual_len, FLIGHT_RECORDER_ENTRY_DATA_LEN - 1); \
208211
next->data[end_of_string] = 0; \
209212
if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \
210-
flight_recorder_dump((fr)); \
213+
flight_recorder_dump(ep, fr); \
211214
}
212215

213-
#define FLIGHT_RECORDER_PACKET_HDR(fr, event_id, packet_hdr) \
216+
#define FLIGHT_RECORDER_PACKET_HDR(ep, fr, event_id, packet_hdr) \
214217
{ \
215218
struct flight_recorder_entry *next = \
216219
flight_recorder_init_next_entry((fr), (event_id), FR_ENTRY_TYPE_PACKET_HDR); \
217220
memcpy((void *) next->data, (void *) &(packet_hdr), sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); \
218221
if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \
219-
flight_recorder_dump((fr)); \
222+
flight_recorder_dump(ep, fr); \
220223
}
221224

222-
#define FLIGHT_RECORDER_PACKET(fr, event_id, packet) \
225+
#define FLIGHT_RECORDER_PACKET(ep, fr, event_id, packet) \
223226
{ \
224227
struct flight_recorder_entry *next = \
225228
flight_recorder_init_next_entry((fr), (event_id), FR_ENTRY_TYPE_PACKET); \
226229
memcpy((void *) next->data, (void *) &(packet), sizeof(union fi_opx_hfi1_packet_payload)); \
227230
if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \
228-
flight_recorder_dump((fr)); \
231+
flight_recorder_dump(ep, fr); \
229232
}
230233

231234
#define FLIGHT_RECORDER_INIT(fr) \
@@ -235,9 +238,9 @@ static inline void flight_recorder_dump(struct flight_recorder *fr)
235238

236239
#else /* !FLIGHT_RECORDER_ENABLE */
237240

238-
#define FLIGHT_RECORDER_STRING(fr, event_id, format, ...)
239-
#define FLIGHT_RECORDER_PACKET_HDR(fr, event_id, packet_hdr)
240-
#define FLIGHT_RECORDER_PACKET(fr, event_id, packet)
241+
#define FLIGHT_RECORDER_STRING(ep, fr, event_id, format, ...)
242+
#define FLIGHT_RECORDER_PACKET_HDR(ep, fr, event_id, packet_hdr)
243+
#define FLIGHT_RECORDER_PACKET(ep, fr, event_id, packet)
241244

242245
#endif /* #ifdef FLIGHT_RECORDER_ENABLE */
243246

prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, uint64_t *p_rhf_seq, ui
376376
*p_last_egrbfr_index = egrbfr_index;
377377
}
378378

379-
FLIGHT_RECORDER_PACKET_HDR(opx_ep->fr, FR_EVENT_HFI1_POLL_ONCE, hdr);
379+
FLIGHT_RECORDER_PACKET_HDR(opx_ep, opx_ep->fr, FR_EVENT_HFI1_POLL_ONCE, hdr);
380380
}
381381

382382
*p_rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq, hfi1_type);

prov/opx/include/rdma/opx/opx_debug.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ struct opx_debug_ep_entry {
4545

4646
void opx_debug_ep_list_append(void *opx_ep);
4747
void opx_debug_ep_list_free(void *opx_ep);
48+
void opx_debug_ep_list_dump();
4849
void opx_debug_install_handler();
4950

5051
#endif

prov/opx/src/fi_opx_hfi1.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4397,6 +4397,15 @@ fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr
43974397
const enum ofi_reliability_kind reliability, const uint32_t u32_extended_rx,
43984398
const enum opx_hfi1_type hfi1_type)
43994399
{
4400+
if (dput_opcode == FI_OPX_HFI_DPUT_OPCODE_PUT_CQ) {
4401+
struct fi_opx_completion_counter *cc = ((struct fi_opx_rma_request *) rma_request_vaddr)->cc;
4402+
if (cc->byte_counter == 0) {
4403+
OPX_BUF_FREE((struct fi_opx_rma_request *) rma_request_vaddr);
4404+
cc->hit_zero(cc);
4405+
return NULL;
4406+
}
4407+
}
4408+
44004409
union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool);
44014410
struct fi_opx_hfi1_dput_params *params = &work->dput;
44024411

prov/opx/src/opx_debug.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,13 +424,18 @@ void opx_debug_ep_list_free(void *opx_ep)
424424
}
425425
}
426426

427-
static void opx_debug_signal_handler(int signum, siginfo_t *info, void *ucontext)
427+
void opx_debug_ep_list_dump()
428428
{
429429
struct opx_debug_ep_entry *entry = (struct opx_debug_ep_entry *) ep_list.head;
430430
while (entry) {
431431
opx_debug_dump_endpoint((struct fi_opx_ep *) entry->ep);
432432
entry = entry->next;
433433
}
434+
}
435+
436+
static void opx_debug_signal_handler(int signum, siginfo_t *info, void *ucontext)
437+
{
438+
opx_debug_ep_list_dump();
434439

435440
if (prev_sig_handler && prev_sig_handler != SIG_DFL && prev_sig_handler != SIG_IGN) {
436441
prev_sig_handler(signum);

0 commit comments

Comments
 (0)