Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/pr-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,38 @@ jobs:
with:
name: ${{ matrix.os }}-${{ matrix.cc }}-config.log
path: config.log
# gcc15: build check with gcc-15/g++-15 installed and registered as the gcc/g++ alternatives.
gcc15:
permissions:
contents: read
pull-requests: read
runs-on: ubuntu-24.04
steps:
# gcc-15 and g++-15 are installed after adding the ppa:puni070/gcc-noble PPA below.
# NOTE(review): this looks like a third-party PPA - confirm it is a trusted source.
- name: Install dependencies (Linux)
run: |
sudo apt-get update
sudo apt-get install -y ${{ env.APT_PACKAGES }}
sudo add-apt-repository -y ppa:puni070/gcc-noble
sudo apt-get update
sudo apt-get install -y gcc-15 g++-15
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-15 60 --slave /usr/bin/g++ g++ /usr/bin/g++-15
# Checkout action is pinned to a commit SHA (annotated as v5.0.0).
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
# Clones and builds rdma-core at RDMA_CORE_VERSION, then configures, builds with
# -Wall -Werror, installs, runs distcheck, and lists providers with fi_info -l.
- name: Build Check
run: |
set -x
git clone --depth 1 -b ${{ env.RDMA_CORE_VERSION }} https://github.com/linux-rdma/rdma-core.git
pushd rdma-core; bash build.sh; popd
export LD_LIBRARY_PATH="${{ env.RDMA_CORE_PATH }}/lib:$LD_LIBRARY_PATH"
./autogen.sh
./configure --prefix=$PWD/install ${{ env.OFI_PROVIDER_FLAGS }} --with-lttng
make -j 2 AM_CFLAGS="-Wall -Werror"; make install
DISTCHECK_CONFIGURE_FLAGS="${{ env.OFI_PROVIDER_FLAGS }}" make -j 2 distcheck
$PWD/install/bin/fi_info -l
# Runs only when an earlier step failed: uploads config.log as the gcc15-config.log artifact.
- name: Upload build logs
if: failure()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: gcc15-config.log
path: config.log
hmem:
permissions:
contents: read
Expand Down
6 changes: 6 additions & 0 deletions prov/opx/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,12 @@ AC_DEFUN([FI_OPX_CONFIGURE],[
AS_IF([test $opx_happy -eq 1],[
AC_MSG_NOTICE([Appending OPX_HMEM to opx_CPPFLAGS])
opx_CPPFLAGS="-DOPX_HMEM -I/usr/include/uapi"

AS_IF([test $have_rocr -eq 1], [
AC_MSG_NOTICE([Appending -L/opt/rocm/lib -lamdhip64 to opx_LDFLAGS])
opx_LDFLAGS="$opx_LDFLAGS -L/opt/rocm/lib -lamdhip64"
AC_SUBST(opx_LDFLAGS)
])
])
])

Expand Down
10 changes: 8 additions & 2 deletions prov/opx/include/rdma/opx/fi_opx_endpoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -2007,7 +2007,11 @@ void fi_opx_ep_rx_process_header_rma_rts(struct fi_opx_ep *opx_ep, const union o
fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__);
abort();
}
slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr);
if (context->len) {
slist_insert_tail((struct slist_entry *) context, rx->cq_pending_ptr);
} else { /* length is 0, there will be no RZV data sent, so post now */
slist_insert_tail((struct slist_entry *) context, rx->cq_completed_ptr);
}

OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "RECV-RMA-RTS");
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
Expand Down Expand Up @@ -2831,11 +2835,13 @@ static inline void fi_opx_ep_rx_process_header(struct fid_ep *ep, const union op
if (prev) {
prev->next = context->next;
} else {
assert(opx_ep->rx->queue[kind].mq.head == (struct slist_entry *) context);
opx_ep->rx->queue[kind].mq.head = (struct slist_entry *) context->next;
}

if (context->next == NULL) {
opx_ep->rx->queue[kind].mq.tail = NULL;
assert(opx_ep->rx->queue[kind].mq.tail == (struct slist_entry *) context);
opx_ep->rx->queue[kind].mq.tail = (struct slist_entry *) prev;
}

context->next = NULL;
Expand Down
33 changes: 18 additions & 15 deletions prov/opx/include/rdma/opx/fi_opx_flight_recorder.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021,2024 Cornelis Networks.
* Copyright (C) 2021,2025 Cornelis Networks.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
Expand Down Expand Up @@ -139,31 +139,34 @@ static inline void flight_recorder_dump_packet_payload(struct flight_recorder_en
}
}

static inline void flight_recorder_dump(struct flight_recorder *fr)
static inline void flight_recorder_dump(struct fi_opx_ep *opx_ep, struct flight_recorder *fr)
{
const unsigned count = fr->count;
struct timespec current_time;
fi_opx_timer_now(fr->now, fr->timer);
if (!fr->timer->cycle_timer.use_cycle_timer) {
current_time = fr->now->tp;
fprintf(stderr,
"#FLIGHT_RECORDER t%d |EP: %p\n"
"#FLIGHT_RECORDER t%d |Last Dump Time: %ld.%ld\n"
"#FLIGHT_RECORDER t%d |Current Time: %ld.%ld\n"
"#FLIGHT_RECORDER t%d |Entry Count : %u\n",
fr->tid, fr->last_dump.tv_sec, fr->last_dump.tv_nsec, fr->tid, current_time.tv_sec,
current_time.tv_nsec, fr->tid, count);
fr->tid, opx_ep, fr->tid, fr->last_dump.tv_sec, fr->last_dump.tv_nsec, fr->tid,
current_time.tv_sec, current_time.tv_nsec, fr->tid, count);
fr->last_dump = current_time;
if (count == 0) {
fr->last_dump = current_time;
return;
}
} else {
fprintf(stderr,
"#FLIGHT_RECORDER t%d |EP: %p\n"
"#FLIGHT_RECORDER t%d |Last Dump Time: %0.9lf\n"
"#FLIGHT_RECORDER t%d |Current Time: %0.9lf\n"
"#FLIGHT_RECORDER t%d |Entry Count : %u\n",
fr->tid, fr->last_dump_cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12, fr->tid,
fr->now->cycle_timer.cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12, fr->tid, count);
fr->tid, opx_ep, fr->tid, fr->last_dump_cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12,
fr->tid, fr->now->cycle_timer.cycles * fr->timer->cycle_timer.picos_per_cycle / 1e12, fr->tid,
count);
fr->last_dump_cycles = fr->now->cycle_timer.cycles;
if (count == 0) {
fr->last_dump_cycles = fr->now->cycle_timer.cycles;
Expand Down Expand Up @@ -199,33 +202,33 @@ static inline void flight_recorder_dump(struct flight_recorder *fr)
fr->count = 0;
}

#define FLIGHT_RECORDER_STRING(fr, event_id, format, ...) \
#define FLIGHT_RECORDER_STRING(ep, fr, event_id, format, ...) \
{ \
struct flight_recorder_entry *next = \
flight_recorder_init_next_entry((fr), (event_id), FR_ENTRY_TYPE_STRING); \
int actual_len = snprintf((char *) next->data, FLIGHT_RECORDER_ENTRY_DATA_LEN, format, ##__VA_ARGS__); \
int end_of_string = MIN(actual_len, FLIGHT_RECORDER_ENTRY_DATA_LEN - 1); \
next->data[end_of_string] = 0; \
if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \
flight_recorder_dump((fr)); \
flight_recorder_dump(ep, fr); \
}

#define FLIGHT_RECORDER_PACKET_HDR(fr, event_id, packet_hdr) \
#define FLIGHT_RECORDER_PACKET_HDR(ep, fr, event_id, packet_hdr) \
{ \
struct flight_recorder_entry *next = \
flight_recorder_init_next_entry((fr), (event_id), FR_ENTRY_TYPE_PACKET_HDR); \
memcpy((void *) next->data, (void *) &(packet_hdr), sizeof(struct fi_opx_hfi1_stl_packet_hdr_9B)); \
if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \
flight_recorder_dump((fr)); \
flight_recorder_dump(ep, fr); \
}

#define FLIGHT_RECORDER_PACKET(fr, event_id, packet) \
#define FLIGHT_RECORDER_PACKET(ep, fr, event_id, packet) \
{ \
struct flight_recorder_entry *next = \
flight_recorder_init_next_entry((fr), (event_id), FR_ENTRY_TYPE_PACKET); \
memcpy((void *) next->data, (void *) &(packet), sizeof(union fi_opx_hfi1_packet_payload)); \
if ((fr)->count + 1 == FLIGHT_RECORDER_ENTRY_COUNT) \
flight_recorder_dump((fr)); \
flight_recorder_dump(ep, fr); \
}

#define FLIGHT_RECORDER_INIT(fr) \
Expand All @@ -235,9 +238,9 @@ static inline void flight_recorder_dump(struct flight_recorder *fr)

#else /* !FLIGHT_RECORDER_ENABLE */

#define FLIGHT_RECORDER_STRING(fr, event_id, format, ...)
#define FLIGHT_RECORDER_PACKET_HDR(fr, event_id, packet_hdr)
#define FLIGHT_RECORDER_PACKET(fr, event_id, packet)
#define FLIGHT_RECORDER_STRING(ep, fr, event_id, format, ...)
#define FLIGHT_RECORDER_PACKET_HDR(ep, fr, event_id, packet_hdr)
#define FLIGHT_RECORDER_PACKET(ep, fr, event_id, packet)

#endif /* #ifdef FLIGHT_RECORDER_ENABLE */

Expand Down
2 changes: 1 addition & 1 deletion prov/opx/include/rdma/opx/fi_opx_hfi1_progress.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ void fi_opx_hfi1_handle_packet(struct fi_opx_ep *opx_ep, uint64_t *p_rhf_seq, ui
*p_last_egrbfr_index = egrbfr_index;
}

FLIGHT_RECORDER_PACKET_HDR(opx_ep->fr, FR_EVENT_HFI1_POLL_ONCE, hdr);
FLIGHT_RECORDER_PACKET_HDR(opx_ep, opx_ep->fr, FR_EVENT_HFI1_POLL_ONCE, hdr);
}

*p_rhf_seq = OPX_RHF_SEQ_INCREMENT(rhf_seq, hfi1_type);
Expand Down
2 changes: 1 addition & 1 deletion prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ struct fi_opx_hfi1_sdma_work_entry {
uint64_t first_ack_time_ns;

/* ==== CACHELINE 1 ==== */
struct fi_opx_hfi1_sdma_packet packets[OPX_SDMA_MAX_PKTS_BOUNCE_BUF];
struct fi_opx_hfi1_sdma_packet packets[OPX_HFI1_SDMA_MAX_PKTS_TID];

struct {
struct fi_opx_completion_counter cc;
Expand Down
2 changes: 1 addition & 1 deletion prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h
Original file line number Diff line number Diff line change
Expand Up @@ -2210,7 +2210,7 @@ ssize_t opx_hfi1_tx_send_egr_16B(struct fid_ep *ep, const void *buf, size_t len,

/* write one block of PIO non-SOP, either one full block (8 qws) or the partial qws/block */
const size_t first_block_qws =
full_block_credits_needed ? OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS : tail_partial_block_qws;
full_block_credits_needed ? OPX_JKR_16B_PAYLOAD_AFTER_HDR_QWS : payload_qws_total;

#ifndef NDEBUG
credits_consumed +=
Expand Down
7 changes: 6 additions & 1 deletion prov/opx/include/rdma/opx/fi_opx_hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
#include "rdma/opx/opx_tracer.h"
#include "ofi_hmem.h"

#if HAVE_ROCR
#define __HIP_PLATFORM_AMD__ 0
#include <hip/hip_runtime.h>
#endif

#define OPX_HMEM_NO_HANDLE (0)
#define OPX_HMEM_DEV_REG_THRESHOLD_NOT_SET (-1L)

Expand Down Expand Up @@ -639,7 +644,7 @@ enum opx_hmem_return_code opx_hmem_event_query(enum fi_hmem_iface iface, union o
#endif
#if HAVE_ROCR
if (iface == FI_HMEM_ROCR) {
hipError_t r = hipEventQuery(event->hip_event);
hipError_t result = hipEventQuery(event->hip_event);
if (result == hipSuccess) {
return OPX_HMEM_SUCCESS;
} else if (result == hipErrorNotReady) {
Expand Down
1 change: 1 addition & 0 deletions prov/opx/include/rdma/opx/opx_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct opx_debug_ep_entry {

void opx_debug_ep_list_append(void *opx_ep);
void opx_debug_ep_list_free(void *opx_ep);
/* Dump every endpoint on the debug endpoint list, one opx_debug_dump_endpoint() call per entry. */
void opx_debug_ep_list_dump();
void opx_debug_install_handler();

#endif
18 changes: 9 additions & 9 deletions prov/opx/src/fi_opx_atomic.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,10 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, const uint32_t opcode,
params->cc = NULL;
params->user_cc = NULL;
params->src_base_addr = NULL;
params->origin_byte_counter = NULL;
params->payload_bytes_for_iovec = sizeof(struct fi_opx_hfi1_dput_fetch);
params->fetch_vaddr = (void *) fetch_iov->buf;
params->target_byte_counter_vaddr = (const uintptr_t) cc;
params->target_hfi_unit = opx_dst_addr.hfi1_unit;
params->origin_byte_counter = NULL;
params->payload_bytes_for_iovec = sizeof(struct fi_opx_hfi1_dput_fetch);
params->fetch_vaddr = (void *) fetch_iov->buf;
params->target_hfi_unit = opx_dst_addr.hfi1_unit;
params->u32_extended_rx = fi_opx_ep_get_u32_extended_rx(opx_ep, params->is_shm, opx_dst_addr.hfi1_subctxt_rx);

if (compare_iov) {
Expand All @@ -184,10 +183,11 @@ void fi_opx_atomic_op_internal(struct fi_opx_ep *opx_ep, const uint32_t opcode,

struct fi_opx_rma_request *rma_request = ofi_buf_alloc(opx_ep->tx->rma_request_pool);
assert(rma_request != NULL);
rma_request->cc = cc;
rma_request->hmem_iface = fetch_iov->iface;
rma_request->hmem_device = fetch_iov->device;
params->rma_request_vaddr = (uintptr_t) rma_request;
rma_request->cc = cc;
rma_request->hmem_iface = fetch_iov->iface;
rma_request->hmem_device = fetch_iov->device;
params->rma_request_vaddr = (uintptr_t) rma_request;
params->target_byte_counter_vaddr = params->rma_request_vaddr;

fi_opx_ep_rx_poll(&opx_ep->ep_fid, 0, OPX_RELIABILITY, FI_OPX_HDRQ_MASK_RUNTIME, hfi1_type, ctx_sharing);

Expand Down
19 changes: 15 additions & 4 deletions prov/opx/src/fi_opx_hfi1.c
Original file line number Diff line number Diff line change
Expand Up @@ -3789,9 +3789,10 @@ int fi_opx_hfi1_do_dput_sdma(union fi_opx_hfi1_deferred_work *work, const enum o
(!sdma_no_bounce_buf || opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH ||
opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH || need_padding);

const uint64_t max_pkts_per_req =
params->sdma_we->use_bounce_buf ? OPX_HFI1_SDMA_MAX_PKTS : opx_ep->tx->sdma_max_pkts;
packet_count = MIN(packet_count, max_pkts_per_req);
const uint64_t max_pkts_per_req = params->sdma_we->use_bounce_buf ?
OPX_SDMA_MAX_PKTS_BOUNCE_BUF :
opx_ep->tx->sdma_max_pkts;
packet_count = MIN(packet_count, max_pkts_per_req);

int32_t psns_avail = fi_opx_reliability_tx_available_psns(
&opx_ep->ep_fid, opx_ep->reli_service, params->slid, params->origin_rx,
Expand Down Expand Up @@ -4007,7 +4008,8 @@ int fi_opx_hfi1_do_dput_sdma_tid(union fi_opx_hfi1_deferred_work *work, const en
unsigned i;
const void *sbuf_start = params->src_base_addr;
const bool sdma_no_bounce_buf = params->sdma_no_bounce_buf;
const uint64_t max_pkts_per_req = sdma_no_bounce_buf ? opx_ep->tx->sdma_max_pkts_tid : OPX_SDMA_BOUNCE_BUF_MAX;
const uint64_t max_pkts_per_req =
sdma_no_bounce_buf ? opx_ep->tx->sdma_max_pkts_tid : OPX_SDMA_MAX_PKTS_BOUNCE_BUF;

assert(params->ntidpairs != 0);
assert(niov == 1);
Expand Down Expand Up @@ -4395,6 +4397,15 @@ fi_opx_hfi1_rx_rzv_cts(struct fi_opx_ep *opx_ep, const union opx_hfi1_packet_hdr
const enum ofi_reliability_kind reliability, const uint32_t u32_extended_rx,
const enum opx_hfi1_type hfi1_type)
{
if (dput_opcode == FI_OPX_HFI_DPUT_OPCODE_PUT_CQ) {
struct fi_opx_completion_counter *cc = ((struct fi_opx_rma_request *) rma_request_vaddr)->cc;
if (cc->byte_counter == 0) {
OPX_BUF_FREE((struct fi_opx_rma_request *) rma_request_vaddr);
cc->hit_zero(cc);
return NULL;
}
}

union fi_opx_hfi1_deferred_work *work = ofi_buf_alloc(opx_ep->tx->work_pending_pool);
struct fi_opx_hfi1_dput_params *params = &work->dput;

Expand Down
9 changes: 7 additions & 2 deletions prov/opx/src/opx_debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ static void opx_debug_dump_backtrace(FILE *output)
static void opx_debug_dump_endpoint(struct fi_opx_ep *opx_ep)
{
char hostname[HOST_NAME_MAX + 1];
pid_t my_pid = gettid();
pid_t my_pid = getpid();

int rc = gethostname(hostname, HOST_NAME_MAX);
if (rc != 0) {
Expand Down Expand Up @@ -424,13 +424,18 @@ void opx_debug_ep_list_free(void *opx_ep)
}
}

static void opx_debug_signal_handler(int signum, siginfo_t *info, void *ucontext)
void opx_debug_ep_list_dump()
{
struct opx_debug_ep_entry *entry = (struct opx_debug_ep_entry *) ep_list.head;
while (entry) {
opx_debug_dump_endpoint((struct fi_opx_ep *) entry->ep);
entry = entry->next;
}
}

static void opx_debug_signal_handler(int signum, siginfo_t *info, void *ucontext)
{
opx_debug_ep_list_dump();

if (prev_sig_handler && prev_sig_handler != SIG_DFL && prev_sig_handler != SIG_IGN) {
prev_sig_handler(signum);
Expand Down
14 changes: 9 additions & 5 deletions prov/opx/src/opx_hmem_domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,15 @@ int opx_hmem_open_domain(struct opx_hmem_fabric *hmem_fabric, struct fi_info *in
return ret;
}

ret = ofi_ipc_cache_open(&new_hmem_domain->ipc_cache, &new_hmem_domain->util_domain);
if (ret) {
opx_hmem_close_domain(new_hmem_domain, 0);
FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, "Error opening IPC Cache ret=%d (%s)\n", ret, strerror(ret));
return ret;
if ((hmem_ops[FI_HMEM_CUDA].initialized && ofi_hmem_is_ipc_enabled(FI_HMEM_CUDA)) ||
(hmem_ops[FI_HMEM_ROCR].initialized && ofi_hmem_is_ipc_enabled(FI_HMEM_ROCR))) {
ret = ofi_ipc_cache_open(&new_hmem_domain->ipc_cache, &new_hmem_domain->util_domain);
if (ret) {
opx_hmem_close_domain(new_hmem_domain, 0);
FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, "Error opening IPC Cache ret=%d (%s)\n", ret,
strerror(ret));
return ret;
}
}

ret = ofi_bufpool_create(&new_hmem_domain->hmem_stream.event_pool, sizeof(union opx_hmem_event), 0, UINT_MAX,
Expand Down
9 changes: 3 additions & 6 deletions prov/verbs/src/verbs_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,12 +374,9 @@ static int vrb_param_define(const char *param_name, const char *param_str,
switch (type) {
case FI_PARAM_STRING:
if (*(char **)param_default != NULL) {
param_default_sz =
MIN(strlen(*(char **)param_default),
254);
strncpy(param_default_str, *(char **)param_default,
param_default_sz);
param_default_str[param_default_sz + 1] = '\0';
strncpy(param_default_str, *(char **)param_default, 255);
param_default_str[255] = '\0';
param_default_sz = strlen(param_default_str);
}
break;
case FI_PARAM_INT:
Expand Down