diff --git a/AUTHORS b/AUTHORS index 9804584832a..1abca8334ff 100644 --- a/AUTHORS +++ b/AUTHORS @@ -15,9 +15,12 @@ Artem Ryabov Artemy Kovalyov Arun Chandran Aurelien Bouteiller +Benson Muite Bin Lei Boris Karasev Brad Benton +Brian Sheng +Bruno Faccini Changcheng Liu Colin Hirsch Corey J. Nolet @@ -29,10 +32,12 @@ Dmitrii Gabor Dmitry Gladkov Doug Jacobsen Edgar Gabriel +Elad Guttel Elad Persiko Eugene Voronov Evgeny Leksikov Fabian Ruhland +Felix Abecassis Gilbert Lee Gilles Gouaillardet Gonzalo Brito Gadeschi @@ -40,6 +45,7 @@ Graham Lopez Guy Shattah Hessam Mirsadeghi Hiroyuki Sato +Honggang Li Howard Pritchard Huaxiang Fan Hui Zhou @@ -48,13 +54,16 @@ Ilia Yastrebov Ilya Nelkenbaum Itay Alroy Ivan Kochin +JKLiang9714 <1023587725@qq.com> Jakir Kham +Jan Ciesko Jason Gunthorpe Jeff Daily Jianxin Xiong John Snyder Jonas Zhou Joseph Schuchart +Kaidrikov Evgeny Keisuke Fukuda Ken Raffenetti Khaled Hamidouche @@ -66,17 +75,21 @@ Manjunath Gorentla Venkata Marek Schimara Mark Allen Matthew Baker +Matthias Diener Michael Braverman Michal Shalev Mike Dubman Mikhail Brinskii +Mikhail Brinskiy Min Fang Nathan Hjelm Netanel Yosephian +Nysal Jan K.A Ofir Farjon Olly Perks Ovidiu Mara Pak Lui +Pasha (Pavel) Shamis Pavan Balaji Pavel Shamis (Pasha) Peter Andreas Entschev @@ -84,15 +97,18 @@ Peter Rudenko Peter-Jan Gootzen Qiang Yu Raul Akhmetshin +Rob Bradford Robert Dietrich Rohit Zambre Roie Danino Romain Pereira Sam James +Saravanan Vajravel Sasha Kotchubievsky Scott Saulters Sergey Lebedev Sergey Oblomov +Sergey Oblomov sergeyo@mellanox.com () Sergey Shalnov Serguei Sagalovitch Shachar Hasson @@ -104,17 +120,32 @@ Stephen Richmond Swen Boehm Thomas Vegas Tony Curtis +Tooraj Taraz Tzafrir Cohen Valentin Petrov +Vasily Philipov Wenbin Lu +Xiang Gao Xin Zhao Xu Yifeng Yiltan Hassan Temucin Yossi Itigin Yuriy Shestakov +Zhongkai Zhang Zhu Yanjun Zihao Zhao +akolliasAMD +akshay-venkatesh +chenyidu +dmitrygx +esoha-nvidia <69258779+esoha-nvidia@users.noreply.github.com> +lyu lzhang2 +nileshnegi +panda1100 +razor1991 +root +shasson5 <103439971+shasson5@users.noreply.github.com> In addition we would like to acknowledge the following members of UCX community for their participation in annual face-to-face meeting, design discussions, and diff --git a/config/m4/gaudi.m4 b/config/m4/gaudi.m4 new file mode 100644 index 00000000000..140d20ddf7c --- /dev/null +++ b/config/m4/gaudi.m4 @@ -0,0 +1,71 @@ +# +# Copyright (c) Intel Corporation, 2025. ALL RIGHTS RESERVED. +# See file LICENSE for terms. +# + +AC_DEFUN([UCX_CHECK_GAUDI],[ + +AS_IF([test "x$gaudi_checked" != "xyes"], + [ + AC_ARG_WITH([gaudi], + [AS_HELP_STRING([--with-gaudi=(DIR)], [Enable the use of GAUDI (default is guess).])], + [], [with_gaudi=guess]) + + AS_IF([test "x$with_gaudi" = "xno"], + [ + gaudi_happy="no" + ], + [ + save_CPPFLAGS="$CPPFLAGS" + save_LDFLAGS="$LDFLAGS" + save_LIBS="$LIBS" + + GAUDI_CPPFLAGS="" + GAUDI_LDFLAGS="" + GAUDI_LIBS="" + + AS_IF([test ! -z "$with_gaudi" -a "x$with_gaudi" != "xyes" -a "x$with_gaudi" != "xguess"], + [ucx_check_gaudi_dir="$with_gaudi" + ucx_check_gaudi_libdir="$with_gaudi/lib/habanalabs" + GAUDI_CPPFLAGS="-I$with_gaudi/include/habanalabs -I/usr/include/drm -I/usr/include/libdrm" + GAUDI_LDFLAGS="-L$ucx_check_gaudi_libdir"], + [GAUDI_CPPFLAGS="-I/usr/include/habanalabs -I/usr/include/drm -I/usr/include/libdrm" + GAUDI_LDFLAGS="-L/usr/lib/habanalabs"]) + + AS_IF([test ! 
-z "$with_gaudi_libdir" -a "x$with_gaudi_libdir" != "xyes"], + [ucx_check_gaudi_libdir="$with_gaudi_libdir" + GAUDI_LDFLAGS="-L$ucx_check_gaudi_libdir"]) + + CPPFLAGS="$CPPFLAGS $GAUDI_CPPFLAGS" + LDFLAGS="$LDFLAGS $GAUDI_LDFLAGS" + + # Check gaudi header files + AC_CHECK_HEADERS([hlthunk.h], + [gaudi_happy="yes"], [gaudi_happy="no"]) + + # Check gaudi libraries + AS_IF([test "x$gaudi_happy" = "xyes"], + [AC_CHECK_LIB([hl-thunk], [hlthunk_open], + [GAUDI_LIBS="$GAUDI_LIBS -lhl-thunk -lscal -lSynapse -lSynapseMme"], [gaudi_happy="no"])]) + + CPPFLAGS="$save_CPPFLAGS" + LDFLAGS="$save_LDFLAGS" + LIBS="$save_LIBS" + + AS_IF([test "x$gaudi_happy" = "xyes"], + [AC_SUBST([GAUDI_CPPFLAGS], ["$GAUDI_CPPFLAGS"]) + AC_SUBST([GAUDI_LDFLAGS], ["$GAUDI_LDFLAGS"]) + AC_SUBST([GAUDI_LIBS], ["$GAUDI_LIBS"]) + AC_DEFINE([HAVE_GAUDI], 1, [Enable GAUDI support])], + [AS_IF([test "x$with_gaudi" != "xguess"], + [AC_MSG_ERROR([GAUDI support is requested but gaudi packages cannot be found])], + [AC_MSG_WARN([GAUDI not found])])]) + + ]) # "x$with_gaudi" = "xno" + + gaudi_checked=yes + AM_CONDITIONAL([HAVE_GAUDI], [test "x$gaudi_happy" != xno]) + + ]) # "x$gaudi_checked" != "xyes" + +]) # UCX_CHECK_GAUDI diff --git a/configure.ac b/configure.ac index 30926b15c23..e41eb6826d1 100644 --- a/configure.ac +++ b/configure.ac @@ -231,6 +231,7 @@ AS_IF([test "x$with_docs_only" = xyes], AM_CONDITIONAL([HAVE_GCOV], [false]) AM_CONDITIONAL([HAVE_LCOV], [false]) AM_CONDITIONAL([HAVE_ZE], [false]) + AM_CONDITIONAL([HAVE_GAUDI], [false]) ], [ AM_CONDITIONAL([DOCS_ONLY], [false]) @@ -247,6 +248,7 @@ AS_IF([test "x$with_docs_only" = xyes], m4_include([config/m4/gdrcopy.m4]) m4_include([config/m4/mad.m4]) m4_include([config/m4/lcov.m4]) + m4_include([config/m4/gaudi.m4]) m4_include([src/ucm/configure.m4]) m4_include([src/ucs/configure.m4]) m4_include([src/uct/configure.m4]) diff --git a/debian/ucx-gaudi.install b/debian/ucx-gaudi.install new file mode 100644 index 00000000000..9adfc58094c --- /dev/null +++ b/debian/ucx-gaudi.install @@ -0,0 +1,2 @@ +usr/lib/ucx/libucx_perftest_gaudi.* +usr/lib/ucx/libuct_gaudi.* diff --git a/src/ucs/Makefile.am b/src/ucs/Makefile.am index 28b6fa1e805..657d06622df 100644 --- a/src/ucs/Makefile.am +++ b/src/ucs/Makefile.am @@ -90,6 +90,11 @@ nobase_dist_libucs_la_HEADERS = \ arch/rv64/global_opts.h \ arch/global_opts.h +if HAVE_GAUDI + nobase_dist_libucs_la_HEADERS += \ + sys/topo/gaudi/topo.h +endif + noinst_HEADERS = \ arch/aarch64/cpu.h \ arch/generic/cpu.h \ @@ -220,9 +225,14 @@ libucs_la_SOURCES = \ vfs/base/vfs_obj.c \ vfs/base/vfs_cb.c +if HAVE_GAUDI + libucs_la_SOURCES += \ + sys/topo/gaudi/topo.c +endif + if HAVE_AARCH64_THUNDERX2 libucs_la_SOURCES += \ - arch/aarch64/memcpy_thunderx2.S + arch/aarch64/memcpy_thunderx2.S endif if HAVE_STATS diff --git a/src/ucs/memory/memory_type.c b/src/ucs/memory/memory_type.c index 66b0994976d..039089b8f6d 100644 --- a/src/ucs/memory/memory_type.c +++ b/src/ucs/memory/memory_type.c @@ -24,6 +24,7 @@ const char *ucs_memory_type_names[] = { [UCS_MEMORY_TYPE_ZE_HOST] = "ze-host", [UCS_MEMORY_TYPE_ZE_DEVICE] = "ze-device", [UCS_MEMORY_TYPE_ZE_MANAGED] = "ze-managed", + [UCS_MEMORY_TYPE_GAUDI] = "gaudi", [UCS_MEMORY_TYPE_LAST] = "unknown", [UCS_MEMORY_TYPE_LAST + 1] = NULL }; @@ -38,5 +39,6 @@ const char *ucs_memory_type_descs[] = { [UCS_MEMORY_TYPE_ZE_HOST] = "Intel/Ze USM host memory", [UCS_MEMORY_TYPE_ZE_DEVICE] = "Intel/Ze GPU memory", [UCS_MEMORY_TYPE_ZE_MANAGED] = "Intel/Ze GPU managed memory", + [UCS_MEMORY_TYPE_GAUDI] = "HabanaLabs Gaudi memory", 
[UCS_MEMORY_TYPE_LAST] = "unknown" }; diff --git a/src/ucs/memory/memory_type.h b/src/ucs/memory/memory_type.h index c2ff2f70b9f..305ba706434 100644 --- a/src/ucs/memory/memory_type.h +++ b/src/ucs/memory/memory_type.h @@ -45,6 +45,7 @@ typedef enum ucs_memory_type { UCS_MEMORY_TYPE_ZE_HOST, /**< Intel ZE memory (USM host) */ UCS_MEMORY_TYPE_ZE_DEVICE, /**< Intel ZE memory (USM device) */ UCS_MEMORY_TYPE_ZE_MANAGED, /**< Intel ZE managed memory (USM shared) */ + UCS_MEMORY_TYPE_GAUDI, /**< HabanaLabs Gaudi memory */ UCS_MEMORY_TYPE_LAST, UCS_MEMORY_TYPE_UNKNOWN = UCS_MEMORY_TYPE_LAST } ucs_memory_type_t; diff --git a/src/ucs/sys/topo/gaudi/topo.c b/src/ucs/sys/topo/gaudi/topo.c new file mode 100644 index 00000000000..b34ca9b4a5f --- /dev/null +++ b/src/ucs/sys/topo/gaudi/topo.c @@ -0,0 +1,1911 @@ +/** +* Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "topo.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define COMPARE(a, b) ((a) < (b) ? -1 : (a) > (b) ? 1 : 0) +#define UCS_GAUDI_TOPO_ACCEL_PATH "/sys/class/accel/" +#define UCS_GAUDI_TOPO_INFINIBAND_PORT_FMT "/sys/class/infiniband/%s/ports/1/" +#define UCS_GAUDI_TOPO_VENDOR_ID 0x1da3 /* Habana Labs Vendor ID */ +#define UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID 0x15b3 +#define UCS_GAUDI_TOPO_BROADCOM_VENDOR_ID 0x14e4 +#define GAUDI_DEVICE_NAME_LEN 10 + +static pthread_mutex_t gaudi_init_mutex = PTHREAD_MUTEX_INITIALIZER; +/* File-scope one-time control and status */ +static pthread_once_t gaudi_spinlock_once_flag = PTHREAD_ONCE_INIT; +static ucs_status_t gaudi_spinlock_init_status = UCS_OK; + +static const ucs_sys_dev_distance_t gaudi_fallback_node_distance = + {.latency = 100e-9, .bandwidth = 17e9}; /* 100ns, 17 GB/s */ +static const ucs_sys_dev_distance_t gaudi_fallback_sys_distance = + {.latency = 300e-9, .bandwidth = 220e6}; /* 300ns, 220 MB/s */ + +/* Structure to hold Gaudi and HNIC mappings */ +typedef struct { + ucs_sys_device_t gaudi_device; + ucs_sys_device_t hnic_device; + uint16_t hnic_vendor_id; + ucs_sys_dev_distance_t distance; + ucs_numa_node_t common_numa_node; + char gaudi_dev_name[GAUDI_DEVICE_NAME_LEN]; +} ucs_gaudi_connection_t; + +/* Static context for Gaudi topology */ +typedef struct { + ucs_spinlock_t lock; + unsigned initialized; + unsigned provider_added; + ucs_sys_device_t *gaudi_devices; + char (*gaudi_devices_names)[GAUDI_DEVICE_NAME_LEN]; + unsigned num_gaudi_devices; + ucs_sys_device_t *hnic_devices; + uint16_t *hnic_vendor_ids; + unsigned num_hnic_devices; + ucs_gaudi_connection_t *connections; + unsigned num_connections; + ucs_sys_device_t *assigned_hnic_for_gaudi; + int *assigned_port_for_gaudi; + unsigned have_assignment; +} ucs_gaudi_topo_ctx_t; + +static ucs_gaudi_topo_ctx_t ucs_gaudi_topo_ctx = {0}; + +/* Compatible definition of ucs_sys_topo_ops_t (from topo.c layout) */ +typedef struct { + ucs_status_t (*get_distance)(ucs_sys_device_t device1, + ucs_sys_device_t device2, + ucs_sys_dev_distance_t *distance); + void (*get_memory_distance)(ucs_sys_device_t device, + ucs_sys_dev_distance_t *distance); +} compatible_topo_ops_t; + +/* Compatible definition of ucs_sys_topo_provider_t (from topo.c layout) */ +typedef struct { + const char *name; + compatible_topo_ops_t ops; + ucs_list_link_t list; +} compatible_topo_provider_t; + +/* Forward declarations */ +static 
ucs_status_t ucs_gaudi_get_distance(ucs_sys_device_t device1, + ucs_sys_device_t device2, + ucs_sys_dev_distance_t *distance); + +static void ucs_gaudi_get_memory_distance(ucs_sys_device_t device, + ucs_sys_dev_distance_t *distance); + +static ucs_status_t ucs_gaudi_lazy_init(); + +static int +ucs_gaudi_is_hnic_active(ucs_sys_device_t hnic_device, uint16_t hnic_vendor_id); + +/* Gaudi topology provider (compatible structure) */ +static compatible_topo_provider_t ucs_gaudi_topo_provider = { + .name = "gaudi", + .ops = + { + .get_distance = ucs_gaudi_get_distance, + .get_memory_distance = ucs_gaudi_get_memory_distance, + }, + .list = {NULL, NULL}}; + + +/* Helper function to construct sysfs path from ucs_sys_device_t */ +static ucs_status_t ucs_gaudi_sys_dev_to_sysfs_path(ucs_sys_device_t sys_dev, + char *path, size_t max) +{ + ucs_sys_bus_id_t bus_id; + ucs_status_t status; + char *link_path; + const char *prefix = "/sys/bus/pci/devices/"; + size_t prefix_len; + + status = ucs_topo_get_device_bus_id(sys_dev, &bus_id); + if (status != UCS_OK) { + ucs_error("Failed to get bus ID for device %d", sys_dev); + return status; + } + + if (max < PATH_MAX) { + ucs_error("Output buffer too small (%zu < %d)", max, PATH_MAX); + return UCS_ERR_BUFFER_TOO_SMALL; + } + + /* Build path directly in output buffer */ + prefix_len = strlen(prefix); + ucs_strncpy_safe(path, prefix, max); + ucs_snprintf_safe(path + prefix_len, max - prefix_len, "%04x:%02x:%02x.%x", + bus_id.domain, bus_id.bus, bus_id.slot, bus_id.function); + + link_path = realpath(path, NULL); + if (link_path == NULL) { + ucs_error("Failed to resolve realpath for %s: %s", path, + strerror(errno)); + return UCS_ERR_IO_ERROR; + } + + ucs_strncpy_safe(path, link_path, max); + ucs_free(link_path); + + return UCS_OK; +} + +/* Helper function to read PCI vendor ID from sysfs */ +static ucs_status_t +ucs_gaudi_read_vendor_id(ucs_sys_device_t sys_dev, uint16_t *vendor_id) +{ + char *path; + char vendor_str[16]; + ucs_status_t status; + FILE *file; + char *endptr; + unsigned long val; + size_t path_len; + + status = ucs_string_alloc_path_buffer(&path, "gaudi_vendor_path"); + if (status != UCS_OK) { + return status; + } + + status = ucs_gaudi_sys_dev_to_sysfs_path(sys_dev, path, PATH_MAX); + if (status != UCS_OK) { + ucs_debug("Failed to get sysfs path for device %d: %s", sys_dev, + ucs_status_string(status)); + ucs_free(path); + return status; + } + + path_len = strlen(path); + if ((PATH_MAX - path_len) < sizeof("/vendor")) { + ucs_free(path); + return UCS_ERR_BUFFER_TOO_SMALL; + } + ucs_snprintf_safe(path + path_len, PATH_MAX - path_len, "/vendor"); + + file = fopen(path, "r"); + if (!file) { + ucs_debug("Failed to open %s", path); + ucs_free(path); + return UCS_ERR_IO_ERROR; + } + + if (!fgets(vendor_str, sizeof(vendor_str), file)) { + ucs_debug("Failed to read vendor ID from %s", path); + fclose(file); + ucs_free(path); + return UCS_ERR_IO_ERROR; + } + fclose(file); + + errno = 0; + val = strtoul(vendor_str, &endptr, 0); + + /* Skip trailing whitespace */ + while (isspace((unsigned char)*endptr)) { + endptr++; + } + + if (errno != 0 || endptr == vendor_str || *endptr != '\0') { + ucs_debug("Invalid vendor ID '%s' in %s", vendor_str, path); + ucs_free(path); + return UCS_ERR_INVALID_PARAM; + } + + if (val > UINT16_MAX) { + ucs_debug("Vendor ID 0x%lx exceeds maximum value in %s", val, path); + ucs_free(path); + return UCS_ERR_INVALID_PARAM; + } + ucs_free(path); + + *vendor_id = (uint16_t)val; + return UCS_OK; +} + +/* Helper function to read PCI 
address from sysfs */ +static ucs_status_t +ucs_gaudi_read_pci_addr(const char *accel_name, char *pci_addr, size_t max) +{ + char path[PATH_MAX]; + FILE *file; + ucs_status_t status; + size_t len; + + ucs_snprintf_safe(path, PATH_MAX, "%s%s/device/pci_addr", + UCS_GAUDI_TOPO_ACCEL_PATH, accel_name); + + file = fopen(path, "r"); + if (!file) { + ucs_debug("Failed to open %s", path); + return UCS_ERR_IO_ERROR; + } + + if (fgets(pci_addr, max, file) == NULL) { + ucs_debug("Failed to read PCI address from %s", path); + status = UCS_ERR_IO_ERROR; + } else { + /* Remove trailing newline */ + len = strlen(pci_addr); + if (len > 0 && pci_addr[len - 1] == '\n') { + pci_addr[len - 1] = '\0'; + } + status = UCS_OK; + } + + fclose(file); + return status; +} + +/* Helper function to read module ID from sysfs */ +static ucs_status_t +ucs_gaudi_read_module_id(const char *accel_name, uint32_t *module_id) +{ + char path[PATH_MAX]; + FILE *file; + char buffer[16]; + char *endptr; + unsigned long val; + + ucs_snprintf_safe(path, PATH_MAX, "%s%s/device/module_id", + UCS_GAUDI_TOPO_ACCEL_PATH, accel_name); + + file = fopen(path, "r"); + if (!file) { + ucs_debug("Failed to open %s", path); + return UCS_ERR_IO_ERROR; + } + + if (fgets(buffer, sizeof(buffer), file) == NULL) { + ucs_debug("Failed to read module ID from %s", path); + fclose(file); + return UCS_ERR_IO_ERROR; + } + fclose(file); + + errno = 0; + val = strtoul(buffer, &endptr, 10); + while (isspace((unsigned char)*endptr)) { + endptr++; + } + if (errno != 0 || endptr == buffer || *endptr != '\0') { + ucs_debug("Invalid module ID in %s: '%s'", path, buffer); + return UCS_ERR_INVALID_PARAM; + } + + *module_id = (uint32_t)val; + return UCS_OK; +} + +/* Get Gaudi device index from a given module ID. */ +int ucs_gaudi_get_index_from_module_id(uint32_t module_id) +{ + DIR *dir; + struct dirent *entry; + ucs_status_t status; + uint32_t read_module_id; + int device_id; + + dir = opendir(UCS_GAUDI_TOPO_ACCEL_PATH); + if (!dir) { + ucs_error("Failed to open directory %s: %s", UCS_GAUDI_TOPO_ACCEL_PATH, + strerror(errno)); + return -1; + } + + /* Default: not found */ + device_id = -1; + while ((entry = readdir(dir)) != NULL) { + if (strncmp(entry->d_name, "accel", 5) != 0 || + strncmp(entry->d_name, "accel_", 6) == 0) { + continue; + } + + status = ucs_gaudi_read_module_id(entry->d_name, &read_module_id); + if (status != UCS_OK) { + continue; + } + + if (read_module_id == module_id) { + device_id = (int)strtol(entry->d_name + 5, NULL, 10); + break; + } + } + + closedir(dir); + + if (device_id < 0) { + ucs_error("no Gaudi accelerator with module_id %u found", module_id); + } + + return device_id; +} + +/* Enumerate Gaudi devices and HNICs */ +static ucs_status_t ucs_gaudi_enumerate_devices() +{ + ucs_sys_device_t sys_dev; + ucs_sys_bus_id_t bus_id; + ucs_status_t status; + struct stat statbuf; + DIR *dir; + struct dirent *entry; + char *accel_path; + char pci_addr[32]; + uint16_t vendor_id; + unsigned i; + unsigned gaudi_idx; + unsigned hnic_idx; + + dir = opendir(UCS_GAUDI_TOPO_ACCEL_PATH); + if (!dir) { + ucs_error("Failed to open directory %s", UCS_GAUDI_TOPO_ACCEL_PATH); + return UCS_ERR_IO_ERROR; + } + + status = ucs_string_alloc_path_buffer(&accel_path, "accel_path"); + if (status != UCS_OK) { + closedir(dir); + return status; + } + + /* Count Gaudi devices and HNICs */ + ucs_gaudi_topo_ctx.num_gaudi_devices = 0; + while ((entry = readdir(dir)) != NULL) { + if (strncmp(entry->d_name, "accel", 5) != 0 || + strncmp(entry->d_name, "accel_", 6) == 0) { + 
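+ /* Keep only plain "accelN" device nodes; names starting with "accel_" (assumed to be control/auxiliary entries of the accel class rather than Gaudi compute devices) are skipped. */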
continue; + } + + ucs_snprintf_safe(accel_path, PATH_MAX, "%s%s", + UCS_GAUDI_TOPO_ACCEL_PATH, entry->d_name); + + if (stat(accel_path, &statbuf) == 0 && S_ISDIR(statbuf.st_mode)) { + ucs_debug("Found Gaudi device: %s", entry->d_name); + ucs_gaudi_topo_ctx.num_gaudi_devices++; + } + } + + if (ucs_gaudi_topo_ctx.num_gaudi_devices == 0) { + ucs_error("No Gaudi devices found under %s — aborting enumeration", + UCS_GAUDI_TOPO_ACCEL_PATH); + status = UCS_ERR_NO_DEVICE; + goto out; + } + + /* Enumerate HNICs using UCX topo */ + ucs_gaudi_topo_ctx.num_hnic_devices = 0; + for (i = 0; i < ucs_topo_num_devices(); i++) { + if (ucs_topo_get_device_bus_id(i, &bus_id) == UCS_OK) { + if (ucs_gaudi_read_vendor_id(i, &vendor_id) == UCS_OK) { + /* Assume Mellanox and Broadcom devices are HNICs */ + if (vendor_id == UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID || + vendor_id == UCS_GAUDI_TOPO_BROADCOM_VENDOR_ID) { + ucs_debug("Found HNIC device: %d (%s, active: %d)", i, + ucs_topo_sys_device_get_name(i), + ucs_gaudi_is_hnic_active(i, vendor_id)); + ucs_gaudi_topo_ctx.num_hnic_devices++; + } + } + } + } + + if (ucs_gaudi_topo_ctx.num_hnic_devices == 0) { + ucs_error("No HNIC devices found (no Mellanox/Broadcom NICs) — " + "aborting enumeration"); + status = UCS_ERR_NO_DEVICE; + goto out; + } + + /* Allocate arrays */ + ucs_gaudi_topo_ctx.gaudi_devices = + ucs_calloc(ucs_gaudi_topo_ctx.num_gaudi_devices, + sizeof(ucs_sys_device_t), "gaudi_devices"); + ucs_gaudi_topo_ctx.gaudi_devices_names = + ucs_calloc(ucs_gaudi_topo_ctx.num_gaudi_devices, + GAUDI_DEVICE_NAME_LEN, "gaudi_devices_names"); + ucs_gaudi_topo_ctx.hnic_devices = + ucs_calloc(ucs_gaudi_topo_ctx.num_hnic_devices, + sizeof(ucs_sys_device_t), "hnic_devices"); + ucs_gaudi_topo_ctx.hnic_vendor_ids = + ucs_calloc(ucs_gaudi_topo_ctx.num_hnic_devices, sizeof(uint16_t), + "hnic_vendor_ids"); + if (!ucs_gaudi_topo_ctx.gaudi_devices || + !ucs_gaudi_topo_ctx.gaudi_devices_names || + !ucs_gaudi_topo_ctx.hnic_devices || + !ucs_gaudi_topo_ctx.hnic_vendor_ids) { + status = UCS_ERR_NO_MEMORY; + goto out; + } + + /* Populate Gaudi devices */ + gaudi_idx = 0; + rewinddir(dir); + while ((entry = readdir(dir)) != NULL && + gaudi_idx < ucs_gaudi_topo_ctx.num_gaudi_devices) { + if (strncmp(entry->d_name, "accel", 5) != 0 || + strncmp(entry->d_name, "accel_", 6) == 0) { + continue; + } + + ucs_snprintf_safe(accel_path, PATH_MAX, "%s%s", + UCS_GAUDI_TOPO_ACCEL_PATH, entry->d_name); + + if (stat(accel_path, &statbuf) == 0 && S_ISDIR(statbuf.st_mode)) { + status = ucs_gaudi_read_pci_addr(entry->d_name, pci_addr, + sizeof(pci_addr)); + if (status != UCS_OK) { + ucs_debug("Skipping device %s due to PCI address read " + "failure", + entry->d_name); + continue; + } + + status = ucs_topo_find_device_by_bdf_name(pci_addr, &sys_dev); + if (status == UCS_OK) { + ucs_gaudi_topo_ctx.gaudi_devices[gaudi_idx] = sys_dev; + ucs_topo_sys_device_set_name(sys_dev, entry->d_name, 1); + ucs_strncpy_safe( + ucs_gaudi_topo_ctx.gaudi_devices_names[gaudi_idx], + entry->d_name, GAUDI_DEVICE_NAME_LEN); + gaudi_idx++; + } else { + ucs_debug("Failed to find device by BDF %s for %s", pci_addr, + entry->d_name); + } + } + } + + /* Populate HNIC devices */ + hnic_idx = 0; + for (i = 0; i < ucs_topo_num_devices() && + hnic_idx < ucs_gaudi_topo_ctx.num_hnic_devices; + i++) { + if (ucs_topo_get_device_bus_id(i, &bus_id) == UCS_OK) { + if (ucs_gaudi_read_vendor_id(i, &vendor_id) == UCS_OK) { + if (vendor_id == UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID || + vendor_id == UCS_GAUDI_TOPO_BROADCOM_VENDOR_ID) { + 
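+ /* Record the UCX sys-device index together with its PCI vendor ID, so later stages (active-port check, distance lookup, default-port selection) can stay vendor-aware. */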
ucs_gaudi_topo_ctx.hnic_devices[hnic_idx] = i; + ucs_gaudi_topo_ctx.hnic_vendor_ids[hnic_idx++] = vendor_id; + } + } + } + } + + status = UCS_OK; + +out: + /* + * Memory is freed ONLY on allocation failure (UCS_ERR_NO_MEMORY). + * For other errors, no allocations were made, so nothing to free. + */ + if (status == UCS_ERR_NO_MEMORY) { + ucs_free(ucs_gaudi_topo_ctx.gaudi_devices); + ucs_free(ucs_gaudi_topo_ctx.gaudi_devices_names); + ucs_free(ucs_gaudi_topo_ctx.hnic_devices); + ucs_free(ucs_gaudi_topo_ctx.hnic_vendor_ids); + + ucs_gaudi_topo_ctx.gaudi_devices = NULL; + ucs_gaudi_topo_ctx.gaudi_devices_names = NULL; + ucs_gaudi_topo_ctx.hnic_devices = NULL; + ucs_gaudi_topo_ctx.hnic_vendor_ids = NULL; + + ucs_gaudi_topo_ctx.num_gaudi_devices = 0; + ucs_gaudi_topo_ctx.num_hnic_devices = 0; + } + + ucs_free(accel_path); + + closedir(dir); + return status; +} + +/* + * Check if HNIC is active (RoCE) + * + * Both Mellanox and Broadcom vendors are expected to expose + * InfiniBand sysfs state files at: + * /sys/class/infiniband//ports/1/{state,phys_state} + * + * No fallback to generic Port1State is provided because: + * 1. Mellanox: Always uses IB sysfs in production + * 2. Broadcom: bnxt_re driver creates IB sysfs for RoCE + */ +static int +ucs_gaudi_is_hnic_active(ucs_sys_device_t hnic_device, uint16_t hnic_vendor_id) +{ + ucs_status_t status; + char *path; + char state[64]; + const char *dev_name; + size_t len; + + status = ucs_string_alloc_path_buffer(&path, "gaudi_hnic_path"); + if (status != UCS_OK) { + /* Out of memory, assume inactive */ + return 0; + } + + /* RDMA device name for MLX/BNXT, e.g. "mlx5_3" or "bnxt_re0". Never NULL. */ + dev_name = ucs_topo_sys_device_get_name(hnic_device); + + /* Mellanox/Broadcom devices: prefer InfiniBand state file (port 1) */ + ucs_snprintf_safe(path, PATH_MAX, UCS_GAUDI_TOPO_INFINIBAND_PORT_FMT, + dev_name); + + /* ports/1/state first (typically "4: ACTIVE\n" when up) */ + status = ucs_sys_read_sysfs_file(dev_name, path, "state", state, + sizeof(state), UCS_LOG_LEVEL_DEBUG); + if (status == UCS_OK) { + ucs_free(path); + state[sizeof(state) - 1] = '\0'; + len = strlen(state); + while (len > 0 && (state[len - 1] == '\n' || state[len - 1] == '\r')) { + state[--len] = '\0'; + } + return ((len > 0 && state[0] == '4') || + strstr(state, "ACTIVE") != NULL) && + !strstr(state, "INACTIVE"); + } + + /* ports/1/phys_state */ + status = ucs_sys_read_sysfs_file(dev_name, path, "phys_state", state, + sizeof(state), UCS_LOG_LEVEL_DEBUG); + if (status == UCS_OK) { + ucs_free(path); + state[sizeof(state) - 1] = '\0'; + len = strlen(state); + while (len > 0 && (state[len - 1] == '\n' || state[len - 1] == '\r')) { + state[--len] = '\0'; + } + return (strstr(state, "LinkUp") != NULL); + } + + /* Fallback: assume inactive */ + if (hnic_vendor_id == UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID) { + ucs_debug("Mellanox HNIC %s: IB state files absent, assuming inactive", + dev_name); + } else { + ucs_debug("Broadcom HNIC %s: IB state files absent, assuming inactive", + dev_name); + } + + ucs_free(path); + return 0; +} + +/* + * Return PCIe hop count between two sysfs paths and the common ancestor path; + * 255 if they live in separate root complexes. 
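+ * The count is derived from ucs_path_get_common_parent() and ucs_path_calc_distance() applied to the two PCI sysfs paths; the 255 value is used as a soft penalty by ucs_gaudi_estimate_distance() rather than forcing SYS distance.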
+ */ +static inline unsigned ucs_gaudi_count_pcie_hops(const char *path1, + const char *path2, + char *common_path) +{ + /* compute common parent */ + ucs_path_get_common_parent(path1, path2, common_path); + if (common_path[0] == '\0') { + return 255; /* fallback */ + } + + return ucs_path_calc_distance(path1, path2); +} + +/* + * Estimate device-to-device distance based on NUMA and path depth + * - If either device is unknown or identical -> node distance. + * - If both NUMA nodes are valid and differ -> system distance. + * - Otherwise (same NUMA or at least one unknown) -> refine with + * PCIe hop count via sysfs paths. + */ +static ucs_status_t +ucs_gaudi_estimate_distance(ucs_sys_device_t device1, ucs_sys_device_t device2, + ucs_sys_dev_distance_t *distance) +{ + ucs_numa_node_t numa1, numa2; + ucs_status_t status; + const double hop_latency_ns = 10e-9; /* ~10 ns per hop */ + const double hop_bw_penalty = 0.95; /* 5 % loss per hop */ + unsigned hops; + char *path1, *path2, *common_path; + + /* If either device is unknown or they are identical, assume node distance */ + if ((device1 == UCS_SYS_DEVICE_ID_UNKNOWN) || + (device2 == UCS_SYS_DEVICE_ID_UNKNOWN) || (device1 == device2)) { + *distance = (ucs_global_opts.dist.node.bandwidth > 0) ? + ucs_global_opts.dist.node : + gaudi_fallback_node_distance; + return UCS_OK; + } + + /* Default distance */ + *distance = ucs_topo_default_distance; + + /* Step 1: Check NUMA nodes */ + numa1 = ucs_topo_sys_device_get_numa_node(device1); + numa2 = ucs_topo_sys_device_get_numa_node(device2); + + if (numa1 != UCS_NUMA_NODE_UNDEFINED && numa2 != UCS_NUMA_NODE_UNDEFINED) { + /* Different NUMA nodes */ + if (numa1 != numa2) { + *distance = (ucs_global_opts.dist.sys.bandwidth > 0) ? + ucs_global_opts.dist.sys : + gaudi_fallback_sys_distance; + return UCS_OK; + } + /* Same NUMA; continue to PCIe refinement */ + } + + /* Step 2: Same NUMA or NUMA unknown or undefined */ + + /* allocate three scratch buffers */ + status = ucs_string_alloc_path_buffer(&path1, "path1"); + if (status != UCS_OK) { + goto out; + } + + status = ucs_string_alloc_path_buffer(&path2, "path2"); + if (status != UCS_OK) { + goto free_path1; + } + + status = ucs_string_alloc_path_buffer(&common_path, "common_path"); + if (status != UCS_OK) { + goto free_path2; + } + + /* convert devices to sysfs paths */ + status = ucs_gaudi_sys_dev_to_sysfs_path(device1, path1, PATH_MAX); + if (status != UCS_OK) { + goto free_all; + } + + status = ucs_gaudi_sys_dev_to_sysfs_path(device2, path2, PATH_MAX); + if (status != UCS_OK) { + goto free_all; + } + + hops = ucs_gaudi_count_pcie_hops(path1, path2, common_path); + + /* Note: DO NOT treat hops==255 as system distance. In this topology, + * Gaudi/NIC can share NUMA but different root complexes. The 255 hop + * penalty naturally degrades the score without overriding NUMA affinity, + * which is the correct behavior. + */ + distance->latency = (ucs_global_opts.dist.node.bandwidth > 0 ? + ucs_global_opts.dist.node.latency : + gaudi_fallback_node_distance.latency) + + hop_latency_ns * hops; + + distance->bandwidth = (ucs_global_opts.dist.node.bandwidth > 0 ? 
+ ucs_global_opts.dist.node.bandwidth : + gaudi_fallback_node_distance.bandwidth) * + pow(hop_bw_penalty, hops); + + status = UCS_OK; + +free_all: + ucs_free(common_path); +free_path2: + ucs_free(path2); +free_path1: + ucs_free(path1); +out: + return status; +} + +/* Create connection matrix */ +static ucs_status_t ucs_gaudi_create_connection_matrix() +{ + ucs_sys_device_t gaudi, hnic; + ucs_numa_node_t numa1, numa2; + ucs_status_t status; + ucs_gaudi_connection_t *conn; + const char *gaudi_name; + unsigned conn_idx; + unsigned i, j; + unsigned max_num_connections; + uint16_t hnic_vendor_id; + + max_num_connections = ucs_gaudi_topo_ctx.num_gaudi_devices * + ucs_gaudi_topo_ctx.num_hnic_devices; + ucs_gaudi_topo_ctx.connections = ucs_calloc(max_num_connections, + sizeof(ucs_gaudi_connection_t), + "gaudi_connections"); + if (!ucs_gaudi_topo_ctx.connections) { + return UCS_ERR_NO_MEMORY; + } + + conn_idx = 0; + for (i = 0; i < ucs_gaudi_topo_ctx.num_gaudi_devices; i++) { + gaudi = ucs_gaudi_topo_ctx.gaudi_devices[i]; + gaudi_name = ucs_gaudi_topo_ctx.gaudi_devices_names[i]; + numa1 = ucs_topo_sys_device_get_numa_node(gaudi); + + for (j = 0; j < ucs_gaudi_topo_ctx.num_hnic_devices; j++) { + hnic = ucs_gaudi_topo_ctx.hnic_devices[j]; + hnic_vendor_id = ucs_gaudi_topo_ctx.hnic_vendor_ids[j]; + if (!ucs_gaudi_is_hnic_active(hnic, hnic_vendor_id)) { + continue; + } + numa2 = ucs_topo_sys_device_get_numa_node(hnic); + + conn = &ucs_gaudi_topo_ctx.connections[conn_idx]; + conn->gaudi_device = gaudi; + ucs_strncpy_safe(conn->gaudi_dev_name, gaudi_name, + GAUDI_DEVICE_NAME_LEN); + conn->hnic_device = hnic; + conn->hnic_vendor_id = hnic_vendor_id; + conn->common_numa_node = ((numa1 == numa2) && + (numa1 != UCS_NUMA_NODE_UNDEFINED)) ? + numa1 : + UCS_NUMA_NODE_UNDEFINED; + + status = ucs_gaudi_estimate_distance(gaudi, hnic, &conn->distance); + if (status != UCS_OK) { + ucs_debug("Failed to estimate distance between Gaudi %u and " + "HNIC %u", + gaudi, hnic); + conn->distance = ucs_topo_default_distance; + } + conn_idx++; + } + } + + ucs_gaudi_topo_ctx.num_connections = conn_idx; + return UCS_OK; +} + +/* Fetch precomputed distance for (gaudi, hnic) from the connections list */ +static int ucs_gaudi_lookup_distance(ucs_sys_device_t gaudi_device, + ucs_sys_device_t hnic_device, + ucs_sys_dev_distance_t *distance) +{ + const ucs_gaudi_connection_t *conn; + unsigned i; + + /* Lookup connection matrix for distance metric */ + for (i = 0; i < ucs_gaudi_topo_ctx.num_connections; i++) { + conn = &ucs_gaudi_topo_ctx.connections[i]; + if (conn->gaudi_device == gaudi_device && + conn->hnic_device == hnic_device) { + if (distance) { + *distance = conn->distance; + } + return 1; + } + } + return 0; +} + +/* Compare function for sorting connections by distance */ +static int ucs_gaudi_compare_connections(const void *a, const void *b) +{ + const ucs_gaudi_connection_t *conn_a = (const ucs_gaudi_connection_t *)a; + const ucs_gaudi_connection_t *conn_b = (const ucs_gaudi_connection_t *)b; + int cmp; + const char *hnic_a; + const char *hnic_b; + + /* Prefer connections on same NUMA node over undefined NUMA */ + if (conn_a->common_numa_node != UCS_NUMA_NODE_UNDEFINED && + conn_b->common_numa_node == UCS_NUMA_NODE_UNDEFINED) { + return -1; + } + if (conn_a->common_numa_node == UCS_NUMA_NODE_UNDEFINED && + conn_b->common_numa_node != UCS_NUMA_NODE_UNDEFINED) { + return 1; + } + + /* Prefer lower latency */ + cmp = COMPARE(conn_a->distance.latency, conn_b->distance.latency); + if (cmp != 0) { + return cmp; + } + + /* Prefer 
higher bandwidth */ + cmp = COMPARE(conn_b->distance.bandwidth, conn_a->distance.bandwidth); + if (cmp != 0) { + return cmp; + } + + /* Optional: tie-breaker for deterministic sort */ + + /* NIC device name */ + hnic_a = ucs_topo_sys_device_get_name(conn_a->hnic_device); + hnic_b = ucs_topo_sys_device_get_name(conn_b->hnic_device); + cmp = strcmp(hnic_a, hnic_b); + if (cmp != 0) { + return cmp; + } + + /* Gaudi device name */ + cmp = strcmp(conn_a->gaudi_dev_name, conn_b->gaudi_dev_name); + if (cmp != 0) { + return cmp; + } + + /* Numeric fallbacks (should never differ) */ + if (conn_a->hnic_device != conn_b->hnic_device) { + return COMPARE(conn_b->hnic_device, conn_a->hnic_device); + } + return COMPARE(conn_a->gaudi_device, conn_b->gaudi_device); +} + +static void ucs_gaudi_sys_cpuset_for_numa_node(ucs_sys_cpuset_t *cpuset, + ucs_numa_node_t node) +{ + unsigned int cpu; + + CPU_ZERO(cpuset); + for (cpu = 0; cpu < ucs_numa_num_configured_cpus(); cpu++) { + if (ucs_numa_node_of_cpu(cpu) == node) { + CPU_SET(cpu, cpuset); + } + } +} + +/* Check if common_path indicates a PCIe Host Bridge (e.g., root complex) */ +static int ucs_gaudi_is_host_bridge_path(const char *common_path) +{ + const char *last; + char format_check[16]; + unsigned dom, bus; + int n; + + if (!common_path) { + ucs_debug("common_path is NULL"); + return 0; + } + + last = strrchr(common_path, '/'); + last = last ? last + 1 : common_path; + + /* Ensure the segment starts with "pci" and matches "pciXXXX:YY" */ + if (strncmp(last, "pci", 3) != 0) { + /* ucs_debug("common_path %s does not start with 'pci'", common_path); */ + return 0; + } + + n = sscanf(last, "pci%4x:%2x%15s", &dom, &bus, format_check); + if (n != 2) { + /* ucs_debug("common_path %s does not match 'pciXXXX:YY' format", common_path); */ + return 0; + } + + /* Bus 00 typically indicates a PCIe root complex (Host Bridge) */ + if (bus != 0) { + /* ucs_debug("common_path %s has non-zero bus %02x, not a Host Bridge", common_path, bus); */ + return 0; + } + + return 1; +} + +/* Print connection matrix in a format similar to nvidia-smi topo -m */ +static void ucs_gaudi_print_connection_matrix() +{ + ucs_gaudi_connection_t *conn; + ucs_numa_node_t gaudi_numa, hnic_numa; + ucs_sys_device_t gaudi, hnic; + ucs_sys_cpuset_t cpuset; + ucs_status_t status; + const char *gaudi_name, *hnic_name; + char *path1, *path2, *common_path; + char *buffer; + char numa_str[16]; + char module_id_str[16]; + char cpu_affinity[128]; + char connection_type[8]; + uint32_t module_id; + uint16_t hnic_vendor_id; + unsigned i, j, k, hops; + size_t buffer_size; + + /* allocate three scratch buffers */ + status = ucs_string_alloc_path_buffer(&path1, "path1"); + if (status != UCS_OK) { + goto out; + } + + status = ucs_string_alloc_path_buffer(&path2, "path2"); + if (status != UCS_OK) { + goto free_path1; + } + + status = ucs_string_alloc_path_buffer(&common_path, "common_path"); + if (status != UCS_OK) { + goto free_path2; + } + + buffer_size = 256 + (ucs_gaudi_topo_ctx.num_hnic_devices * 20); + buffer = ucs_malloc(buffer_size, "buffer"); + if (buffer == NULL) { + status = UCS_ERR_NO_MEMORY; + goto free_path3; + } + + /* Print header */ + ucs_snprintf_safe(buffer, buffer_size, "%-12s %-15s %-12s", "ModuleID", + "Gaudi", "NUMA ID"); + + for (i = 0; i < ucs_gaudi_topo_ctx.num_hnic_devices; i++) { + hnic = ucs_gaudi_topo_ctx.hnic_devices[i]; + hnic_vendor_id = ucs_gaudi_topo_ctx.hnic_vendor_ids[i]; + if (!ucs_gaudi_is_hnic_active(hnic, hnic_vendor_id)) { + continue; + } + hnic_name = 
ucs_topo_sys_device_get_name(hnic); + ucs_snprintf_safe(buffer + strlen(buffer), buffer_size - strlen(buffer), + " %-15s", hnic_name); + } + ucs_snprintf_safe(buffer + strlen(buffer), buffer_size - strlen(buffer), + " %-20s", "CPU Affinity"); + ucs_info("%s", buffer); + + + /* Print rows for each Gaudi device */ + for (i = 0; i < ucs_gaudi_topo_ctx.num_gaudi_devices; i++) { + gaudi = ucs_gaudi_topo_ctx.gaudi_devices[i]; + gaudi_name = ucs_gaudi_topo_ctx.gaudi_devices_names[i]; + gaudi_numa = ucs_topo_sys_device_get_numa_node(gaudi); + + /* Get module ID */ + status = ucs_gaudi_read_module_id(gaudi_name, &module_id); + if (status != UCS_OK) { + ucs_strncpy_safe(module_id_str, "N/A", sizeof(module_id_str)); + } else { + ucs_snprintf_safe(module_id_str, sizeof(module_id_str), "%u", + module_id); + } + + /* Get NUMA node for Gaudi device (NUMA column) */ + if (gaudi_numa == UCS_NUMA_NODE_UNDEFINED) { + ucs_strncpy_safe(numa_str, "N/A", sizeof(numa_str)); + + ucs_strncpy_safe(cpu_affinity, "N/A", sizeof(cpu_affinity)); + } else { + ucs_snprintf_safe(numa_str, sizeof(numa_str), "%d", gaudi_numa); + + ucs_gaudi_sys_cpuset_for_numa_node(&cpuset, gaudi_numa); + ucs_make_affinity_str(&cpuset, cpu_affinity, sizeof(cpu_affinity)); + } + + /* Start row with Gaudi info */ + ucs_snprintf_safe(buffer, buffer_size, "%-12s %-15s %-12s", + module_id_str, gaudi_name, numa_str); + + /* Fill connection types for each HNIC */ + for (j = 0; j < ucs_gaudi_topo_ctx.num_hnic_devices; j++) { + hnic = ucs_gaudi_topo_ctx.hnic_devices[j]; + hnic_vendor_id = ucs_gaudi_topo_ctx.hnic_vendor_ids[j]; + if (!ucs_gaudi_is_hnic_active(hnic, hnic_vendor_id)) { + continue; + } + hnic_numa = ucs_topo_sys_device_get_numa_node(hnic); + + /* Find connection */ + connection_type[0] = '\0'; + for (k = 0; k < ucs_gaudi_topo_ctx.num_connections; k++) { + conn = &ucs_gaudi_topo_ctx.connections[k]; + if (conn->gaudi_device == gaudi && conn->hnic_device == hnic) { + /* 1. Different NUMA - SYS */ + if (gaudi_numa != UCS_NUMA_NODE_UNDEFINED && + hnic_numa != UCS_NUMA_NODE_UNDEFINED && + gaudi_numa != hnic_numa) { + ucs_strncpy_safe(connection_type, "SYS", + sizeof(connection_type)); + break; + } + + /* 2. 
Same NUMA - based on PCIe hops */ + status = ucs_gaudi_sys_dev_to_sysfs_path(gaudi, path1, + PATH_MAX); + if (status != UCS_OK) { + continue; + } + status = ucs_gaudi_sys_dev_to_sysfs_path(hnic, path2, + PATH_MAX); + if (status != UCS_OK) { + continue; + } + hops = ucs_gaudi_count_pcie_hops(path1, path2, common_path); + + if (hops <= 1) { + ucs_strncpy_safe(connection_type, "PIX", + sizeof(connection_type)); + } else if (hops <= 3) { + ucs_strncpy_safe(connection_type, "PXB", + sizeof(connection_type)); + } else if (ucs_gaudi_is_host_bridge_path(common_path)) { + ucs_strncpy_safe(connection_type, "PHB", + sizeof(connection_type)); + } else if (gaudi_numa != UCS_NUMA_NODE_UNDEFINED && + hnic_numa != UCS_NUMA_NODE_UNDEFINED) { + /* Same NUMA, no shared root complex */ + ucs_strncpy_safe(connection_type, "NODE", + sizeof(connection_type)); + } else { + /* At least one NUMA unknown, or any other fallback */ + ucs_strncpy_safe(connection_type, "SYS", + sizeof(connection_type)); + } + break; + } + } + if (connection_type[0] == '\0') { + ucs_strncpy_safe(connection_type, "SYS", + sizeof(connection_type)); + } + ucs_snprintf_safe(buffer + strlen(buffer), + buffer_size - strlen(buffer), " %-15s", + connection_type); + } + + /* Append CPU affinity */ + ucs_snprintf_safe(buffer + strlen(buffer), buffer_size - strlen(buffer), + " %-20s", cpu_affinity); + ucs_info("%s", buffer); + } + + /* Print NIC Legend with NUMA nodes */ + ucs_info("\nNIC Legend:"); + for (i = 0; i < ucs_gaudi_topo_ctx.num_hnic_devices; i++) { + hnic = ucs_gaudi_topo_ctx.hnic_devices[i]; + hnic_vendor_id = ucs_gaudi_topo_ctx.hnic_vendor_ids[i]; + if (!ucs_gaudi_is_hnic_active(hnic, hnic_vendor_id)) { + continue; + } + + hnic_name = ucs_topo_sys_device_get_name(hnic); + hnic_numa = ucs_topo_sys_device_get_numa_node(hnic); + if (hnic_numa == UCS_NUMA_NODE_UNDEFINED) { + ucs_snprintf_safe(numa_str, sizeof(numa_str), "N/A"); + } else { + ucs_snprintf_safe(numa_str, sizeof(numa_str), "%d", hnic_numa); + } + ucs_info(" NIC%u: %s (NUMA %s)", i, hnic_name, numa_str); + } + + /* Print Connection Legend */ + ucs_info("\nLegend:"); + ucs_info(" SYS = Connection traversing PCIe as well as the SMP " + "interconnect between NUMA nodes (e.g., QPI/UPI)"); + ucs_info(" NODE = Connection traversing PCIe as well as the interconnect " + "between PCIe Host Bridges within a NUMA node"); + ucs_info(" PHB = Connection traversing PCIe as well as a PCIe Host " + "Bridge (typically the CPU)"); + ucs_info(" PXB = Connection traversing multiple PCIe bridges (without " + "traversing the PCIe Host Bridge)"); + ucs_info(" PIX = Connection traversing at most a single PCIe bridge"); + +free_all: + ucs_free(buffer); +free_path3: + ucs_free(common_path); +free_path2: + ucs_free(path2); +free_path1: + ucs_free(path1); +out: + return; +} + +/* Return default UCX port for given NIC vendor ID */ +static int ucs_gaudi_get_default_port(uint16_t vendor_id) +{ + switch (vendor_id) { + case UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID: /* 0x15b3 */ + return 1; /* mlx5_X:1 */ + case UCS_GAUDI_TOPO_BROADCOM_VENDOR_ID: /* 0x14e4 */ + return 1; /* bnxt_X:1 */ + default: + return 1; /* fallback */ + } +} + +/* + * Get and validate the NUMA node ID for a device. + * Returns node 0 if the node is undefined or out of range. 
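+ * Clamping to node 0 keeps the per-NUMA count/offset/cursor arrays in ucs_gaudi_build_assignment_balanced() in bounds; the ucs_warn()/ucs_debug() messages below record which fallback was taken.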
+ */ +static inline unsigned +ucs_gaudi_get_validated_numa_node(ucs_sys_device_t device, + unsigned num_numa_nodes) +{ + ucs_numa_node_t node; + + node = ucs_topo_sys_device_get_numa_node(device); + if (node == UCS_NUMA_NODE_UNDEFINED) { + ucs_warn("Device %d has undefined NUMA node, falling back to 0", + device); + return 0U; + } + if ((unsigned)node >= num_numa_nodes) { + ucs_debug("Device %d NUMA node %d out of range (max %u), " + "falling back to 0", + device, node, num_numa_nodes); + return 0U; + } + return (unsigned)node; +} + +/* Build a balanced, topology-aware assignment. + * + * Pass 1: For each NUMA node that has NICs: + * - Balanced capacity split: floor(G_n / N_n) + remainder distributed 1:1 to first NICs + * - For each local Gaudi, choose among local NICs with remaining capacity: + * 1. Minimum latency (primary) + * 2. Maximum bandwidth (secondary) + * 3. Least assigned Gaudis (tertiary) + * 4. Lowest NIC index (quaternary) + * + * Pass 2: For any unassigned Gaudis (NUMAs w/o NICs or capacity overflow): + * - Search globally across all NICs + * - Tie-breaking: + * 1. Minimum latency + * 2. Maximum bandwidth + * 3. Prefer NICs at/under capacity + * 4. Least assigned Gaudis + * 5. Lowest NIC index + * + * Note: NICs without distance data are silently skipped in both passes. + */ +static ucs_status_t ucs_gaudi_build_assignment_balanced() +{ + ucs_sys_dev_distance_t distance; + ucs_sys_device_t gaudi_device; + ucs_sys_device_t hnic_device; + ucs_status_t status; + unsigned *gaudi_per_numa = NULL; + unsigned *hnic_per_numa = NULL; + unsigned *gaudi_offset = NULL; + unsigned *hnic_offset = NULL; + unsigned *gaudi_cursor = NULL; + unsigned *hnic_cursor = NULL; + unsigned *gaudi_idx_by_numa = NULL; + unsigned *hnic_idx_by_numa = NULL; + unsigned *hnic_usage = NULL; + unsigned *hnic_capacity = NULL; + unsigned assigned_count; + unsigned best_usage; + unsigned numa_node; + unsigned gaudi_start; + unsigned gaudi_end; + unsigned hnic_start; + unsigned hnic_end; + unsigned num_local_gaudis; + unsigned num_local_hnics; + unsigned base_capacity; + unsigned extra_gaudis; + unsigned hidx; + unsigned gidx; + unsigned gi; + unsigned i, j; + uint16_t vendor; + int lat_better; + int lat_eq; + int bw_better; + int bw_eq; + int use_better; + int idx_better; + int best_under_cap; + int this_under_cap; + int cap_better; + double best_latency; + double best_bandwidth; + ssize_t best_hidx; + + /* + * ucs_gaudi_estimate_distance() uses: + * hop_latency_ns = 10e-9 seconds (10 nanoseconds) + * hop_bw_penalty = 0.95 (5% per hop) + * + * Therefore, reasonable epsilons are: + * Latency: 0.1% of a hop = 10e-9 * 0.001 = 1e-11 seconds + * Bandwidth: 0.1% of typical 100GB/s = 1e11 * 0.001 = 1e8 bytes/s + */ + const double LATENCY_EPSILON = 1e-11; /* 10 picoseconds, 0.1% of a hop */ + const double BANDWIDTH_EPSILON = 1e8; /* 100 MB/s, 0.1% of 100GB/s */ + + const unsigned num_numa_nodes = ucs_numa_num_configured_nodes(); + const unsigned num_gaudi_devices = ucs_gaudi_topo_ctx.num_gaudi_devices; + const unsigned num_hnic_devices = ucs_gaudi_topo_ctx.num_hnic_devices; + + status = UCS_OK; + + /* Reset assignments */ + for (i = 0; i < num_gaudi_devices; i++) { + ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi[i] = + UCS_SYS_DEVICE_ID_UNKNOWN; + ucs_gaudi_topo_ctx.assigned_port_for_gaudi[i] = 1; + } + ucs_gaudi_topo_ctx.have_assignment = 0; + + /* Short-circuit if no devices */ + if (num_gaudi_devices == 0 || num_hnic_devices == 0) { + ucs_debug("No devices to assign (Gaudis: %u, NICs: %u)", + num_gaudi_devices, 
num_hnic_devices); + return status; + } + + /* Allocate per-NUMA arrays */ + gaudi_per_numa = ucs_calloc(num_numa_nodes, sizeof(*gaudi_per_numa), + "gaudi_per_numa"); + hnic_per_numa = ucs_calloc(num_numa_nodes, sizeof(*hnic_per_numa), + "hnic_per_numa"); + gaudi_offset = ucs_malloc((num_numa_nodes + 1) * sizeof(*gaudi_offset), + "gaudi_offset"); + hnic_offset = ucs_malloc((num_numa_nodes + 1) * sizeof(*hnic_offset), + "hnic_offset"); + gaudi_cursor = ucs_malloc(num_numa_nodes * sizeof(*gaudi_cursor), + "gaudi_cursor"); + hnic_cursor = ucs_malloc(num_numa_nodes * sizeof(*hnic_cursor), + "hnic_cursor"); + + if (!gaudi_per_numa || !hnic_per_numa || !gaudi_offset || !hnic_offset || + !gaudi_cursor || !hnic_cursor) { + status = UCS_ERR_NO_MEMORY; + goto out; + } + + /* Count devices per NUMA node */ + for (i = 0; i < num_gaudi_devices; i++) { + numa_node = ucs_gaudi_get_validated_numa_node( + ucs_gaudi_topo_ctx.gaudi_devices[i], num_numa_nodes); + gaudi_per_numa[numa_node]++; + } + for (i = 0; i < num_hnic_devices; i++) { + numa_node = ucs_gaudi_get_validated_numa_node( + ucs_gaudi_topo_ctx.hnic_devices[i], num_numa_nodes); + hnic_per_numa[numa_node]++; + } + + /* Build offset arrays */ + gaudi_offset[0] = 0; + hnic_offset[0] = 0; + for (i = 0; i < num_numa_nodes; i++) { + gaudi_offset[i + 1] = gaudi_offset[i] + gaudi_per_numa[i]; + hnic_offset[i + 1] = hnic_offset[i] + hnic_per_numa[i]; + } + + /* Allocate index arrays */ + if (gaudi_offset[num_numa_nodes] > 0) { + gaudi_idx_by_numa = ucs_malloc(gaudi_offset[num_numa_nodes] * + sizeof(*gaudi_idx_by_numa), + "gaudi_idx_by_numa"); + if (!gaudi_idx_by_numa) { + status = UCS_ERR_NO_MEMORY; + goto out; + } + } + if (hnic_offset[num_numa_nodes] > 0) { + hnic_idx_by_numa = ucs_malloc(hnic_offset[num_numa_nodes] * + sizeof(*hnic_idx_by_numa), + "hnic_idx_by_numa"); + if (!hnic_idx_by_numa) { + status = UCS_ERR_NO_MEMORY; + goto out; + } + } + + /* Initialize cursors */ + for (i = 0; i < num_numa_nodes; i++) { + gaudi_cursor[i] = gaudi_offset[i]; + hnic_cursor[i] = hnic_offset[i]; + } + + /* Populate NUMA-indexed arrays */ + for (i = 0; i < num_gaudi_devices; i++) { + numa_node = ucs_gaudi_get_validated_numa_node( + ucs_gaudi_topo_ctx.gaudi_devices[i], num_numa_nodes); + gaudi_idx_by_numa[gaudi_cursor[numa_node]++] = i; + } + for (i = 0; i < num_hnic_devices; i++) { + numa_node = ucs_gaudi_get_validated_numa_node( + ucs_gaudi_topo_ctx.hnic_devices[i], num_numa_nodes); + hnic_idx_by_numa[hnic_cursor[numa_node]++] = i; + } + + /* Allocate usage and capacity tracking */ + hnic_usage = ucs_calloc(num_hnic_devices, sizeof(*hnic_usage), + "hnic_usage"); + hnic_capacity = ucs_malloc(num_hnic_devices * sizeof(*hnic_capacity), + "hnic_capacity"); + if (!hnic_usage || !hnic_capacity) { + status = UCS_ERR_NO_MEMORY; + goto out; + } + + /* Initialize all capacities to "unlimited" for Pass 2 fallback logic */ + for (i = 0; i < num_hnic_devices; i++) { + hnic_capacity[i] = UINT_MAX; + } + + /* Pass 1: Local NUMA-balanced assignment */ + for (i = 0; i < num_numa_nodes; i++) { + gaudi_start = gaudi_offset[i]; + gaudi_end = gaudi_offset[i + 1]; + hnic_start = hnic_offset[i]; + hnic_end = hnic_offset[i + 1]; + num_local_gaudis = gaudi_end - gaudi_start; + num_local_hnics = hnic_end - hnic_start; + if (num_local_gaudis == 0 || num_local_hnics == 0) { + continue; + } + + /* Calculate balanced capacity per NIC */ + base_capacity = num_local_gaudis / num_local_hnics; + extra_gaudis = num_local_gaudis % num_local_hnics; + for (j = 0; j < num_local_hnics; j++) { + hidx = 
hnic_idx_by_numa[hnic_start + j]; + hnic_capacity[hidx] = base_capacity + ((j < extra_gaudis) ? 1 : 0); + } + + /* Assign each Gaudi to best local NIC */ + for (gi = gaudi_start; gi < gaudi_end; gi++) { + gidx = gaudi_idx_by_numa[gi]; + gaudi_device = ucs_gaudi_topo_ctx.gaudi_devices[gidx]; + + best_hidx = -1; + best_latency = DBL_MAX; + best_bandwidth = -1.0; + best_usage = UINT_MAX; + + for (j = 0; j < num_local_hnics; j++) { + hidx = hnic_idx_by_numa[hnic_start + j]; + hnic_device = ucs_gaudi_topo_ctx.hnic_devices[hidx]; + + if (!ucs_gaudi_lookup_distance(gaudi_device, hnic_device, + &distance)) { + continue; + } + + if (hnic_usage[hidx] >= hnic_capacity[hidx]) { + continue; + } + + /* Tie-breaking: latency -> bandwidth -> usage -> index */ + lat_better = (distance.latency + LATENCY_EPSILON < + best_latency); + lat_eq = !lat_better && (fabs(distance.latency - + best_latency) <= LATENCY_EPSILON); + bw_better = lat_eq && (distance.bandwidth > + best_bandwidth + BANDWIDTH_EPSILON); + bw_eq = lat_eq && !bw_better && + (fabs(distance.bandwidth - best_bandwidth) <= + BANDWIDTH_EPSILON); + use_better = bw_eq && (hnic_usage[hidx] < best_usage); + idx_better = bw_eq && !use_better && + ((ssize_t)hidx < best_hidx); + + if (lat_better || bw_better || use_better || idx_better) { + best_latency = distance.latency; + best_bandwidth = distance.bandwidth; + best_usage = hnic_usage[hidx]; + best_hidx = hidx; + } + } + + if (best_hidx >= 0) { + vendor = ucs_gaudi_topo_ctx.hnic_vendor_ids[best_hidx]; + hnic_usage[best_hidx]++; + ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi[gidx] = + ucs_gaudi_topo_ctx.hnic_devices[best_hidx]; + ucs_gaudi_topo_ctx.assigned_port_for_gaudi[gidx] = + ucs_gaudi_get_default_port(vendor); + } + } + } + + /* Pass 2: Global assignment with soft-cap preference */ + for (i = 0; i < num_gaudi_devices; i++) { + if (ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi[i] != + UCS_SYS_DEVICE_ID_UNKNOWN) { + continue; + } + + gaudi_device = ucs_gaudi_topo_ctx.gaudi_devices[i]; + + best_hidx = -1; + best_latency = DBL_MAX; + best_bandwidth = -1.0; + best_usage = UINT_MAX; + best_under_cap = 0; + + for (j = 0; j < num_hnic_devices; j++) { + hnic_device = ucs_gaudi_topo_ctx.hnic_devices[j]; + + if (!ucs_gaudi_lookup_distance(gaudi_device, hnic_device, + &distance)) { + continue; + } + + this_under_cap = (hnic_usage[j] < hnic_capacity[j]); + + /* Tie-breaking: latency -> bandwidth -> under-cap -> usage -> index */ + lat_better = (distance.latency + LATENCY_EPSILON < best_latency); + lat_eq = !lat_better && + (fabs(distance.latency - best_latency) <= LATENCY_EPSILON); + bw_better = lat_eq && (distance.bandwidth > + best_bandwidth + BANDWIDTH_EPSILON); + bw_eq = lat_eq && !bw_better && + (fabs(distance.bandwidth - best_bandwidth) <= + BANDWIDTH_EPSILON); + cap_better = bw_eq && (this_under_cap > best_under_cap); + use_better = bw_eq && !cap_better && (hnic_usage[j] < best_usage); + idx_better = bw_eq && !cap_better && !use_better && + ((ssize_t)j < best_hidx); + + if (lat_better || bw_better || cap_better || use_better || + idx_better) { + best_latency = distance.latency; + best_bandwidth = distance.bandwidth; + best_usage = hnic_usage[j]; + best_under_cap = this_under_cap; + best_hidx = j; + } + } + + if (best_hidx >= 0) { + vendor = ucs_gaudi_topo_ctx.hnic_vendor_ids[best_hidx]; + hnic_usage[best_hidx]++; + ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi[i] = + ucs_gaudi_topo_ctx.hnic_devices[best_hidx]; + ucs_gaudi_topo_ctx.assigned_port_for_gaudi[i] = + ucs_gaudi_get_default_port(vendor); + } + } + + /* 
Finalize and validate assignment */ + assigned_count = 0; + for (i = 0; i < num_gaudi_devices; i++) { + if (ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi[i] != + UCS_SYS_DEVICE_ID_UNKNOWN) { + assigned_count++; + } + } + + ucs_gaudi_topo_ctx.have_assignment = (assigned_count == num_gaudi_devices); + if (!ucs_gaudi_topo_ctx.have_assignment) { + ucs_warn("Gaudi->NIC assignment incomplete: %u of %u devices assigned", + assigned_count, num_gaudi_devices); + } + +out: + ucs_free(hnic_usage); + ucs_free(hnic_capacity); + ucs_free(gaudi_idx_by_numa); + ucs_free(hnic_idx_by_numa); + ucs_free(gaudi_cursor); + ucs_free(hnic_cursor); + ucs_free(gaudi_offset); + ucs_free(hnic_offset); + ucs_free(gaudi_per_numa); + ucs_free(hnic_per_numa); + + return status; +} + +/* Find best HNIC for a given Gaudi device */ +ucs_status_t ucs_gaudi_find_best_connection(const char *accel_name, + ucs_sys_device_t *hnic_device, + int *port_num) +{ + ucs_status_t status; + unsigned i; + + /* Validate parameters */ + if ((accel_name == NULL) || (hnic_device == NULL) || (port_num == NULL)) { + ucs_error("Invalid NULL parameter(s) to " + "ucs_gaudi_find_best_connection()"); + return UCS_ERR_INVALID_PARAM; + } + + /* Perform lazy initialization */ + status = ucs_gaudi_lazy_init(); + if (status != UCS_OK) { + if (status == UCS_ERR_UNSUPPORTED) { + /* Provider disabled: treat like "nothing suitable" */ + return UCS_ERR_NO_ELEM; + } + ucs_error("Failed to initialize Gaudi topology: %s", + ucs_status_string(status)); + return status; + } + + ucs_spin_lock(&ucs_gaudi_topo_ctx.lock); + + /* Return cached balanced assignment instead of searching connections */ + for (i = 0; i < ucs_gaudi_topo_ctx.num_gaudi_devices; i++) { + if (!strcmp(accel_name, ucs_gaudi_topo_ctx.gaudi_devices_names[i])) { + break; + } + } + + if (i < ucs_gaudi_topo_ctx.num_gaudi_devices && + ucs_gaudi_topo_ctx.have_assignment && + ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi[i] != + UCS_SYS_DEVICE_ID_UNKNOWN) { + *hnic_device = ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi[i]; + *port_num = ucs_gaudi_topo_ctx.assigned_port_for_gaudi[i]; + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + ucs_info("Selected HNIC %s:%d for Gaudi %s", + ucs_topo_sys_device_get_name(*hnic_device), *port_num, + accel_name); + return UCS_OK; + } + + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + ucs_error("No suitable HNIC found for Gaudi %s", accel_name); + return UCS_ERR_NO_ELEM; +} + +/* Get device-to-device distance for UCX topology queries */ +static ucs_status_t ucs_gaudi_get_distance(ucs_sys_device_t device1, + ucs_sys_device_t device2, + ucs_sys_dev_distance_t *distance) +{ + ucs_gaudi_connection_t *conn; + ucs_status_t status; + unsigned i; + uint16_t vendor_id1, vendor_id2; + + /* Validate parameters */ + if (distance == NULL) { + ucs_error("distance parameter cannot be NULL"); + return UCS_ERR_INVALID_PARAM; + } + + /* Default distance */ + *distance = ucs_topo_default_distance; + + /* Perform lazy initialization */ + status = ucs_gaudi_lazy_init(); + if (status != UCS_OK) { + if (status == UCS_ERR_UNSUPPORTED) { + /* Provider disabled: fall back to generic estimation */ + goto fallback; + } + ucs_error("Failed to initialize Gaudi topology: %s", + ucs_status_string(status)); + return status; + } + + /* If either device is unknown or they are identical, assume near topology */ + if ((device1 == UCS_SYS_DEVICE_ID_UNKNOWN) || + (device2 == UCS_SYS_DEVICE_ID_UNKNOWN) || (device1 == device2)) { + goto fallback; + } + + /* Check if either device is a Gaudi device or HNIC */ + if 
(ucs_gaudi_read_vendor_id(device1, &vendor_id1) != UCS_OK) { + goto fallback; + } + if (ucs_gaudi_read_vendor_id(device2, &vendor_id2) != UCS_OK) { + goto fallback; + } + + /* If one device is Gaudi or HNIC, check connection matrix */ + if ((vendor_id1 == UCS_GAUDI_TOPO_VENDOR_ID || + vendor_id1 == UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID || + vendor_id1 == UCS_GAUDI_TOPO_BROADCOM_VENDOR_ID) || + (vendor_id2 == UCS_GAUDI_TOPO_VENDOR_ID || + vendor_id2 == UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID || + vendor_id2 == UCS_GAUDI_TOPO_BROADCOM_VENDOR_ID)) { + ucs_spin_lock(&ucs_gaudi_topo_ctx.lock); + for (i = 0; i < ucs_gaudi_topo_ctx.num_connections; i++) { + conn = &ucs_gaudi_topo_ctx.connections[i]; + if ((conn->gaudi_device == device1 && + conn->hnic_device == device2) || + (conn->gaudi_device == device2 && + conn->hnic_device == device1)) { + *distance = conn->distance; + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + return UCS_OK; + } + } + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + } + +fallback: + /* Fallback to estimate_distance for other device pairs */ + return ucs_gaudi_estimate_distance(device1, device2, distance); +} + +/* Get device-to-memory distance for UCX topology queries */ +static void ucs_gaudi_get_memory_distance(ucs_sys_device_t device, + ucs_sys_dev_distance_t *distance) +{ + ucs_status_t status; + ucs_sys_cpuset_t thread_cpuset; + ucs_numa_node_t device_numa; + unsigned cpu, num_cpus, cpuset_size; + double total_distance; + int full_affinity; + uint16_t vendor_id; + + /* Validate parameters */ + if (distance == NULL) { + ucs_error("distance parameter cannot be NULL"); + return; + } + + *distance = ucs_topo_default_distance; + + /* If device is unknown, return default distance */ + if (device == UCS_SYS_DEVICE_ID_UNKNOWN) { + return; + } + + /* Check if device is Gaudi or HNIC */ + status = ucs_gaudi_read_vendor_id(device, &vendor_id); + if (status != UCS_OK || (vendor_id != UCS_GAUDI_TOPO_VENDOR_ID && + vendor_id != UCS_GAUDI_TOPO_MELLANOX_VENDOR_ID && + vendor_id != UCS_GAUDI_TOPO_BROADCOM_VENDOR_ID)) { + return; + } + + /* Get thread CPU affinity */ + status = ucs_sys_pthread_getaffinity(&thread_cpuset); + if (status != UCS_OK) { + /* Assume full CPU affinity if getting affinity fails */ + full_affinity = 1; + } else { + full_affinity = 0; + } + + /* Get device NUMA node */ + device_numa = ucs_topo_sys_device_get_numa_node(device); + if (device_numa == UCS_NUMA_NODE_UNDEFINED) { + device_numa = UCS_NUMA_NODE_DEFAULT; + } + + /* Sum NUMA distances for CPUs in affinity set */ + num_cpus = ucs_numa_num_configured_cpus(); + total_distance = 0; + for (cpu = 0; cpu < num_cpus; cpu++) { + if (!full_affinity && !CPU_ISSET(cpu, &thread_cpuset)) { + continue; + } + total_distance += ucs_numa_distance(device_numa, + ucs_numa_node_of_cpu(cpu)); + } + + /* Set distance: bandwidth from default, latency from average NUMA distance */ + distance->bandwidth = ucs_topo_default_distance.bandwidth; + cpuset_size = full_affinity ? num_cpus : CPU_COUNT(&thread_cpuset); + + /* Guard against misconfigured affinity resulting in empty cpuset. + * This can occur if thread CPU affinity is set to an empty mask. 
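+ * Without this check, cpuset_size could be zero and the average-latency computation below would divide by zero.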
+ */ + if (cpuset_size == 0) { + /* Return default distance; cannot compute meaningful average */ + return; + } + + /* Logic and baseline taken from UCX: ucs_topo_sysfs_numa_distance_to_latency + * NUMA distance normalized to 10, latency formula assumes ~10ns per unit, + */ + distance->latency = (total_distance / cpuset_size) * 10e-9; +} + +/* Initialize spinlock exactly once */ +static void ucs_gaudi_spinlock_once_init() +{ + gaudi_spinlock_init_status = ucs_spinlock_init(&ucs_gaudi_topo_ctx.lock, 0); +} + +/* Initialization function */ +void ucs_gaudi_topo_init() +{ + const char *disable; + + disable = getenv("UCS_GAUDI_TOPO_DISABLE"); + if (disable && strcmp(disable, "0") != 0) { + ucs_debug("Gaudi topology provider disabled by UCS_GAUDI_TOPO_DISABLE"); + return; + } + + /* Prevent double registration */ + if (ucs_gaudi_topo_ctx.provider_added) { + ucs_debug("Gaudi topology provider already registered"); + return; + } + + /* Ensure spinlock exists even if lazy init is first */ + pthread_once(&gaudi_spinlock_once_flag, ucs_gaudi_spinlock_once_init); + if (gaudi_spinlock_init_status != UCS_OK) { + ucs_error("Failed to initialize spinlock: %s", + ucs_status_string(gaudi_spinlock_init_status)); + return; + } + + pthread_mutex_lock(&gaudi_init_mutex); + if (!ucs_gaudi_topo_ctx.provider_added) { + ucs_debug("Registering Gaudi topology provider"); + ucs_list_add_head(&ucs_sys_topo_providers_list, + &ucs_gaudi_topo_provider.list); + ucs_gaudi_topo_ctx.provider_added = 1; + ucs_debug("Gaudi topology provider registered"); + } else { + ucs_debug("Gaudi topology provider already registered (raced)"); + } + pthread_mutex_unlock(&gaudi_init_mutex); +} + +static ucs_status_t ucs_gaudi_lazy_init() +{ + ucs_status_t status; + const char *disable; + + disable = getenv("UCS_GAUDI_TOPO_DISABLE"); + if (disable && strcmp(disable, "0") != 0) { + ucs_debug("Gaudi topology provider disabled by UCS_GAUDI_TOPO_DISABLE"); + return UCS_ERR_UNSUPPORTED; + } + + /* Ensure spinlock exists */ + pthread_once(&gaudi_spinlock_once_flag, ucs_gaudi_spinlock_once_init); + if (gaudi_spinlock_init_status != UCS_OK) { + ucs_error("Failed to initialize spinlock: %s", + ucs_status_string(gaudi_spinlock_init_status)); + return gaudi_spinlock_init_status; + } + + ucs_spin_lock(&ucs_gaudi_topo_ctx.lock); + + if (ucs_gaudi_topo_ctx.initialized) { + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + return UCS_OK; + } + + ucs_debug("Performing lazy initialization of Gaudi topology"); + + status = ucs_gaudi_enumerate_devices(); + if (status != UCS_OK) { + ucs_error("Failed to enumerate Gaudi devices: %s", + ucs_status_string(status)); + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + return status; + } + + status = ucs_gaudi_create_connection_matrix(); + if (status != UCS_OK) { + ucs_error("Failed to create connection matrix: %s", + ucs_status_string(status)); + goto out; + } + + qsort(ucs_gaudi_topo_ctx.connections, ucs_gaudi_topo_ctx.num_connections, + sizeof(ucs_gaudi_connection_t), ucs_gaudi_compare_connections); + + ucs_free(ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi); + ucs_free(ucs_gaudi_topo_ctx.assigned_port_for_gaudi); + ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi = + ucs_calloc(ucs_gaudi_topo_ctx.num_gaudi_devices, + sizeof(*ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi), + "assigned_hnic_for_gaudi"); + ucs_gaudi_topo_ctx.assigned_port_for_gaudi = + ucs_calloc(ucs_gaudi_topo_ctx.num_gaudi_devices, + sizeof(*ucs_gaudi_topo_ctx.assigned_port_for_gaudi), + "assigned_port_for_gaudi"); + if (!ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi || + 
!ucs_gaudi_topo_ctx.assigned_port_for_gaudi) { + status = UCS_ERR_NO_MEMORY; + goto out; + } + + /* Build a single, balanced assignment */ + status = ucs_gaudi_build_assignment_balanced(); + if (status != UCS_OK) { + goto out; + } + + if (ucs_global_opts.log_component.log_level >= UCS_LOG_LEVEL_DEBUG) { + ucs_gaudi_print_connection_matrix(); + } + + ucs_gaudi_topo_ctx.initialized = 1; + ucs_debug("Gaudi topology initialized"); + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + return UCS_OK; + +out: + /* Free allocations that happened before error */ + ucs_free(ucs_gaudi_topo_ctx.gaudi_devices); + ucs_free(ucs_gaudi_topo_ctx.gaudi_devices_names); + ucs_free(ucs_gaudi_topo_ctx.hnic_devices); + ucs_free(ucs_gaudi_topo_ctx.hnic_vendor_ids); + ucs_free(ucs_gaudi_topo_ctx.connections); + ucs_free(ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi); + ucs_free(ucs_gaudi_topo_ctx.assigned_port_for_gaudi); + + ucs_gaudi_topo_ctx.gaudi_devices = NULL; + ucs_gaudi_topo_ctx.gaudi_devices_names = NULL; + ucs_gaudi_topo_ctx.hnic_devices = NULL; + ucs_gaudi_topo_ctx.hnic_vendor_ids = NULL; + ucs_gaudi_topo_ctx.connections = NULL; + ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi = NULL; + ucs_gaudi_topo_ctx.assigned_port_for_gaudi = NULL; + + ucs_gaudi_topo_ctx.num_gaudi_devices = 0; + ucs_gaudi_topo_ctx.num_hnic_devices = 0; + ucs_gaudi_topo_ctx.num_connections = 0; + ucs_gaudi_topo_ctx.have_assignment = 0; + + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + return status; +} + +/* Cleanup function */ +void ucs_gaudi_topo_cleanup() +{ + pthread_mutex_lock(&gaudi_init_mutex); + + /* If provider was never added, nothing to clean up */ + if (!ucs_gaudi_topo_ctx.provider_added) { + pthread_mutex_unlock(&gaudi_init_mutex); + return; + } + + /* Provider was added, so spinlock is guaranteed initialized */ + ucs_spin_lock(&ucs_gaudi_topo_ctx.lock); + + /* Remove the Gaudi topology provider from the list */ + ucs_list_del(&ucs_gaudi_topo_provider.list); + + ucs_free(ucs_gaudi_topo_ctx.gaudi_devices); + ucs_free(ucs_gaudi_topo_ctx.gaudi_devices_names); + ucs_free(ucs_gaudi_topo_ctx.hnic_devices); + ucs_free(ucs_gaudi_topo_ctx.hnic_vendor_ids); + ucs_free(ucs_gaudi_topo_ctx.connections); + ucs_free(ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi); + ucs_free(ucs_gaudi_topo_ctx.assigned_port_for_gaudi); + + /* Reset pointers */ + ucs_gaudi_topo_ctx.gaudi_devices = NULL; + ucs_gaudi_topo_ctx.gaudi_devices_names = NULL; + ucs_gaudi_topo_ctx.hnic_devices = NULL; + ucs_gaudi_topo_ctx.hnic_vendor_ids = NULL; + ucs_gaudi_topo_ctx.connections = NULL; + ucs_gaudi_topo_ctx.assigned_hnic_for_gaudi = NULL; + ucs_gaudi_topo_ctx.assigned_port_for_gaudi = NULL; + + /* Reset counters/flags */ + ucs_gaudi_topo_ctx.num_gaudi_devices = 0; + ucs_gaudi_topo_ctx.num_hnic_devices = 0; + ucs_gaudi_topo_ctx.num_connections = 0; + ucs_gaudi_topo_ctx.initialized = 0; + ucs_gaudi_topo_ctx.provider_added = 0; + ucs_gaudi_topo_ctx.have_assignment = 0; + + ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock); + pthread_mutex_unlock(&gaudi_init_mutex); + ucs_debug("Gaudi topology cleaned up"); +} diff --git a/src/ucs/sys/topo/gaudi/topo.h b/src/ucs/sys/topo/gaudi/topo.h new file mode 100644 index 00000000000..52ac054be48 --- /dev/null +++ b/src/ucs/sys/topo/gaudi/topo.h @@ -0,0 +1,64 @@ +/** +* Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_GAUDI_TOPO_H +#define UCS_GAUDI_TOPO_H + +#include +#include + +BEGIN_C_DECLS + +/** + * Initialize Gaudi topology provider. 
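+ *
+ * Registers the Gaudi provider with the UCS system-topology framework;
+ * device enumeration and the Gaudi-to-HNIC assignment are deferred to the
+ * first topology query (lazy initialization). The provider can be disabled
+ * at runtime via the UCS_GAUDI_TOPO_DISABLE environment variable.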
+ *
+ * @note Must be paired with ucs_gaudi_topo_cleanup().
+ */
+void ucs_gaudi_topo_init(void);
+
+/**
+ * Clean up Gaudi topology provider.
+ *
+ * @note Safe to call if topology is not initialized.
+ */
+void ucs_gaudi_topo_cleanup(void);
+
+/**
+ * Get Gaudi device index from a given module ID.
+ *
+ * Searches /sys/class/accel for the accelN entry whose module_id attribute
+ * matches the supplied value and returns its numeric index.
+ *
+ * @param [in] module_id Gaudi module identifier to query.
+ *
+ * @return Non-negative device index on success, -1 on failure.
+ *
+ * @note The returned value is a zero-based index parsed from the accelN
+ *       directory name. On failure, the error is logged via ucs_error().
+ */
+int ucs_gaudi_get_index_from_module_id(uint32_t module_id);
+
+/**
+ * Find the best HNIC for a given Gaudi device based on topology distance.
+ *
+ * @param [in]  accel_name  Name of the Gaudi device (e.g., "accel0").
+ * @param [out] hnic_device Filled with the selected HNIC device ID.
+ * @param [out] port_num    Filled with the default UCX port for that NIC.
+ *
+ * @return UCS_OK on success;
+ *         UCS_ERR_INVALID_PARAM if any parameter is NULL;
+ *         UCS_ERR_NO_ELEM if no suitable HNIC is found;
+ *         another UCX error code if initialization fails.
+ */
+ucs_status_t ucs_gaudi_find_best_connection(const char *accel_name,
+                                            ucs_sys_device_t *hnic_device,
+                                            int *port_num);
+
+END_C_DECLS
+
+#endif /* UCS_GAUDI_TOPO_H */
diff --git a/src/uct/Makefile.am b/src/uct/Makefile.am
index ecf2b6fbe89..66a19b22df1 100644
--- a/src/uct/Makefile.am
+++ b/src/uct/Makefile.am
@@ -7,7 +7,7 @@
 # See file LICENSE for terms.
 #
 
-SUBDIRS = . cuda ib rocm sm ugni ze
+SUBDIRS = . gaudi cuda ib rocm sm ugni ze
 
 lib_LTLIBRARIES = libuct.la
 libuct_la_CFLAGS = $(BASE_CFLAGS) $(LT_CFLAGS)
diff --git a/src/uct/configure.m4 b/src/uct/configure.m4
index ea9e6e70eb8..f1a90d739b5 100644
--- a/src/uct/configure.m4
+++ b/src/uct/configure.m4
@@ -11,6 +11,7 @@ m4_include([src/uct/rocm/configure.m4])
 m4_include([src/uct/sm/configure.m4])
 m4_include([src/uct/ugni/configure.m4])
 m4_include([src/uct/ze/configure.m4])
+m4_include([src/uct/gaudi/configure.m4])
 
 AC_DEFINE_UNQUOTED([uct_MODULES], ["${uct_modules}"], [UCT loadable modules])
diff --git a/src/uct/gaudi/Makefile.am b/src/uct/gaudi/Makefile.am
new file mode 100644
index 00000000000..e8a4c4c06ae
--- /dev/null
+++ b/src/uct/gaudi/Makefile.am
@@ -0,0 +1,38 @@
+#
+# Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED.
+# See file LICENSE for terms.
+#
+
+if HAVE_GAUDI
+
+SUBDIRS = .
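+
+# The Gaudi support is built as a loadable UCT module (libuct_gaudi). It
+# contains the common device helpers under base/ and the gaudi_gdr memory
+# domain/interface, which exposes Gaudi HBM to other transports through
+# dmabuf file descriptors.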
+
+module_LTLIBRARIES = libuct_gaudi.la
+libuct_gaudi_la_CPPFLAGS = $(BASE_CPPFLAGS) $(GAUDI_CPPFLAGS)
+libuct_gaudi_la_CFLAGS = $(BASE_CFLAGS) $(LT_CFLAGS)
+libuct_gaudi_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \
+                         $(top_builddir)/src/uct/libuct.la
+libuct_gaudi_la_LDFLAGS = $(GAUDI_LDFLAGS) $(GAUDI_LIBS) -version-info $(SOVERSION)
+
+noinst_HEADERS = \
+	base/scal.h \
+	base/gaudi_base.h
+
+libuct_gaudi_la_SOURCES = \
+	base/gaudi_base.c
+
+noinst_HEADERS += \
+	gaudi_gdr/gaudi_gdr_md.h \
+	gaudi_gdr/gaudi_gdr_iface.h
+
+libuct_gaudi_la_SOURCES += \
+	gaudi_gdr/gaudi_gdr_md.c \
+	gaudi_gdr/gaudi_gdr_iface.c
+
+PKG_CONFIG_NAME=gaudi
+
+include $(top_srcdir)/config/module.am
+# TODO: enable pkg-config processing when module static build is enabled
+# include $(top_srcdir)/config/module-pkg-config.am
+
+endif
diff --git a/src/uct/gaudi/base/gaudi_base.c b/src/uct/gaudi/base/gaudi_base.c
new file mode 100644
index 00000000000..d13c587f914
--- /dev/null
+++ b/src/uct/gaudi/base/gaudi_base.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "gaudi_base.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+int uct_gaudi_base_get_fd(int device_id, bool *fd_created) {
+    synDeviceInfo deviceInfo;
+
+    if (synDeviceGetInfo(-1, &deviceInfo) != synSuccess) {
+        int fd = hlthunk_open_by_module_id(device_id);
+        if (fd < 0) {
+            ucs_info("Failed to get device fd via hlthunk_open_by_module_id, id %d", device_id);
+            fd = hlthunk_open(HLTHUNK_DEVICE_DONT_CARE, NULL);
+        }
+
+        if (fd >= 0 && fd_created != NULL) {
+            *fd_created = true;
+        }
+        return fd;
+    }
+
+    return deviceInfo.fd;
+}
+
+void uct_gaudi_base_close_fd(int fd, bool fd_created) {
+    if (fd_created && fd >= 0) {
+        hlthunk_close(fd);
+    }
+}
+
+void uct_gaudi_base_close_dmabuf_fd(int fd) {
+    if (fd >= 0) {
+        close(fd);
+    }
+}
+
+ucs_status_t uct_gaudi_base_get_sysdev(int fd, ucs_sys_device_t* sys_dev) {
+    ucs_status_t status;
+    char pci_bus_id[13];
+    int rc = hlthunk_get_pci_bus_id_from_fd(fd, pci_bus_id, sizeof(pci_bus_id));
+    if (rc != 0) {
+        ucs_error("Failed to get pci_bus_id via hlthunk_get_pci_bus_id_from_fd");
+        return UCS_ERR_IO_ERROR;
+    }
+
+    status = ucs_topo_find_device_by_bdf_name(pci_bus_id, sys_dev);
+    if (status != UCS_OK) {
+        ucs_error("Failed to get sys device");
+        return status;
+    }
+
+    return UCS_OK;
+}
+
+ucs_status_t uct_gaudi_base_get_info(int fd, uint64_t *device_base_allocated_address, uint64_t *device_base_address,
+                                     uint64_t *totalSize, int *dmabuf_fd)
+{
+    uint64_t addr, hbm_pool_start, size, offset;
+    scal_handle_t scal_handle;
+    scal_pool_handle_t scal_pool_handle;
+    scal_memory_pool_infoV2 scal_mem_pool_info;
+
+    int rc = scal_get_handle_from_fd(fd, &scal_handle);
+    if (rc != SCAL_SUCCESS) {
+        /*
+         * If scal_get_handle_from_fd() succeeded, the device was opened via
+         * synDeviceAcquireByModuleId() and a SCAL handle already exists.
+         * Otherwise the device was opened via hlthunk_open_by_module_id(),
+         * so initialize SCAL explicitly here.
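+         * Passing an empty configuration path to scal_init() is assumed to
+         * load the default SCAL configuration for the device.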
+ */ + rc = scal_init(fd, "", &scal_handle, NULL); + } + if (rc != SCAL_SUCCESS) { + ucs_error("Failed to get scal handle"); + return UCS_ERR_IO_ERROR; + } + rc = scal_get_pool_handle_by_name(scal_handle, "global_hbm", &scal_pool_handle); + if (rc != SCAL_SUCCESS) { + ucs_error("Failed to get scal pool"); + return UCS_ERR_INVALID_ADDR; + } + rc = scal_pool_get_infoV2(scal_pool_handle, &scal_mem_pool_info); + if (rc != SCAL_SUCCESS) { + ucs_error("Failed to get scal pool info"); + return UCS_ERR_INVALID_ADDR; + } + addr = scal_mem_pool_info.device_base_allocated_address; + hbm_pool_start = scal_mem_pool_info.device_base_address; + size = scal_mem_pool_info.totalSize; + offset = hbm_pool_start - addr; + *dmabuf_fd = hlthunk_device_mapped_memory_export_dmabuf_fd(fd, addr, size, offset, (O_RDWR | O_CLOEXEC)); + if (*dmabuf_fd < 0) { + ucs_error("Failed to get dmabuf fd"); + return UCS_ERR_INVALID_ADDR; + } + + *device_base_allocated_address = addr; + *device_base_address = hbm_pool_start; + *totalSize = size; + return UCS_OK; +} + +ucs_status_t uct_gaudi_base_query_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) +{ + uct_gaudi_md_t *gaudi_md = ucs_derived_of(md, uct_gaudi_md_t); + ucs_sys_device_t sys_dev; + + ucs_status_t status = uct_gaudi_base_get_sysdev(gaudi_md->fd, &sys_dev); + if (status != UCS_OK) { + return status; + } + return uct_single_device_resource(md, md->component->name, + UCT_DEVICE_TYPE_ACC, + sys_dev, tl_devices_p, + num_tl_devices_p); +} + +UCS_MODULE_INIT() +{ + return UCS_OK; +} diff --git a/src/uct/gaudi/base/gaudi_base.h b/src/uct/gaudi/base/gaudi_base.h new file mode 100644 index 00000000000..a8779a089b2 --- /dev/null +++ b/src/uct/gaudi/base/gaudi_base.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifndef GAUDI_BASE_H_ +#define GAUDI_BASE_H_ + +#include +#include +#include +#include "scal.h" + +int uct_gaudi_base_get_fd(int device_id, bool *fd_created); +void uct_gaudi_base_close_fd(int fd, bool fd_created); +void uct_gaudi_base_close_dmabuf_fd(int fd); +ucs_status_t uct_gaudi_base_get_sysdev(int fd, ucs_sys_device_t* sys_dev); +ucs_status_t uct_gaudi_base_get_info(int fd, uint64_t *device_base_allocated_address, uint64_t *device_base_address, + uint64_t *totalSize, int *dmabuf_fd); +ucs_status_t uct_gaudi_base_query_devices(uct_md_h md, + uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p); +#endif diff --git a/src/uct/gaudi/base/scal.h b/src/uct/gaudi/base/scal.h new file mode 100644 index 00000000000..6e6e1450e62 --- /dev/null +++ b/src/uct/gaudi/base/scal.h @@ -0,0 +1,31 @@ +#ifndef SCAL_H_ +#define SCAL_H_ + +#include + +#define SCAL_SUCCESS 0 +#define DECLARE_HANDLE(name) struct name##__ { int unused; }; \ + typedef struct name##__ *name + +DECLARE_HANDLE(scal_handle_t); +DECLARE_HANDLE(scal_pool_handle_t); +DECLARE_HANDLE(scal_arc_fw_config_handle_t); + +typedef struct _scal_memory_pool_infoV2 +{ + scal_handle_t scal; + const char * name; + unsigned idx; + uint64_t device_base_address; + void *host_base_address; + uint32_t core_base_address; // 0 when the pool is not mapped to the cores + uint64_t totalSize; + uint64_t freeSize; + uint64_t device_base_allocated_address; +} scal_memory_pool_infoV2; + +int scal_init(int fd, const char * config_file_path, scal_handle_t * scal, scal_arc_fw_config_handle_t fwCfg); +int scal_get_handle_from_fd(int fd, scal_handle_t* scal); +int scal_get_pool_handle_by_name(const scal_handle_t scal, const char *pool_name, scal_pool_handle_t *pool); +int scal_pool_get_infoV2(const scal_pool_handle_t pool, scal_memory_pool_infoV2 *info); +#endif diff --git a/src/uct/gaudi/configure.m4 b/src/uct/gaudi/configure.m4 new file mode 100644 index 00000000000..b2db3554252 --- /dev/null +++ b/src/uct/gaudi/configure.m4 @@ -0,0 +1,11 @@ +# +# Copyright (C) Intel Corporation, 2025. All rights reserved. +# +# See file LICENSE for terms. +# + +UCX_CHECK_GAUDI + +AS_IF([test "x$gaudi_happy" = "xyes"], [uct_modules="${uct_modules}:gaudi"]) +AC_CONFIG_FILES([src/uct/gaudi/Makefile + src/uct/gaudi/ucx-gaudi.pc]) diff --git a/src/uct/gaudi/gaudi_gdr/gaudi_gdr_iface.c b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_iface.c new file mode 100644 index 00000000000..ca93253696a --- /dev/null +++ b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_iface.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) Advanced Micro Devices, Inc. 2019-2023. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "gaudi_gdr_iface.h" +#include "gaudi_gdr_md.h" + +#include +#include + +static ucs_status_t uct_gaudi_gdr_iface_query(uct_iface_h tl_iface, + uct_iface_attr_t *iface_attr) +{ + uct_gaudi_gdr_iface_t *iface = ucs_derived_of(tl_iface, uct_gaudi_gdr_iface_t); + + uct_base_iface_query(&iface->super, iface_attr); + iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE; + iface_attr->bandwidth.dedicated = 0.0001; // DBL_MIN will be round down to 0 when packing address; + iface_attr->bandwidth.shared = 0; + iface_attr->max_num_eps = 0; + + return UCS_OK; +} + +static uct_iface_ops_t uct_gaudi_gdr_iface_ops = { + .ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function, + .ep_connect = (uct_ep_connect_func_t)ucs_empty_function_return_success, + .ep_disconnect = (uct_ep_disconnect_func_t)ucs_empty_function_return_success, + .cm_ep_conn_notify = (uct_cm_ep_conn_notify_func_t)ucs_empty_function_return_unsupported, + .ep_destroy = (uct_ep_destroy_func_t)ucs_empty_function_return_unsupported, + .ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_unsupported, + .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_unsupported, + .ep_am_short_iov = (uct_ep_am_short_iov_func_t)ucs_empty_function_return_unsupported, + .ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_unsupported, + .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_unsupported, + .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_unsupported, + .ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_unsupported, + .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_unsupported, + .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_unsupported, + .ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_unsupported, + .ep_flush = (uct_ep_flush_func_t)ucs_empty_function_return_success, + .ep_fence = (uct_ep_fence_func_t)ucs_empty_function_return_unsupported, + .ep_check = (uct_ep_check_func_t)ucs_empty_function_return_unsupported, + .ep_create = (uct_ep_create_func_t)ucs_empty_function_return_unsupported, + .iface_flush = (uct_iface_flush_func_t)ucs_empty_function_return_unsupported, + .iface_fence = (uct_iface_fence_func_t)ucs_empty_function_return_unsupported, + .iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function, + .iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function, + .iface_progress = (uct_iface_progress_func_t)ucs_empty_function_return_zero, + .iface_event_fd_get = (uct_iface_event_fd_get_func_t)ucs_empty_function_return_unsupported, + .iface_event_arm = (uct_iface_event_arm_func_t)ucs_empty_function_return_unsupported, + .iface_close = (uct_iface_close_func_t)ucs_empty_function, + .iface_query = uct_gaudi_gdr_iface_query, + .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, + .iface_get_address = (uct_iface_get_address_func_t)ucs_empty_function_return_success, + .iface_is_reachable = uct_base_iface_is_reachable +}; + +static uct_iface_internal_ops_t uct_gaudi_gdr_iface_internal_ops = { + .iface_query_v2 = uct_iface_base_query_v2, 
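+    /* The remaining internal ops are no-op stubs: gaudi_gdr creates no
+     * endpoints and has no data path; the transport only reports Gaudi
+     * memory attributes (type, base address, dmabuf fd). */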
+ .iface_estimate_perf = (uct_iface_estimate_perf_func_t)ucs_empty_function_return_unsupported, + .iface_vfs_refresh = (uct_iface_vfs_refresh_func_t)ucs_empty_function, + .ep_query = (uct_ep_query_func_t)ucs_empty_function_return_unsupported, + .ep_invalidate = (uct_ep_invalidate_func_t)ucs_empty_function_return_unsupported, + .ep_connect_to_ep_v2 = (uct_ep_connect_to_ep_v2_func_t)ucs_empty_function_return_unsupported, + .iface_is_reachable_v2 = (uct_iface_is_reachable_v2_func_t)ucs_empty_function_return_zero, + .ep_is_connected = (uct_ep_is_connected_func_t)ucs_empty_function_return_zero_int +}; + +static UCS_CLASS_INIT_FUNC(uct_gaudi_gdr_iface_t, uct_md_h md, + uct_worker_h worker, + const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_gaudi_gdr_iface_ops, + &uct_gaudi_gdr_iface_internal_ops, md, worker, + params, + tl_config UCS_STATS_ARG(params->stats_root) + UCS_STATS_ARG(UCT_GAUDI_GDR_TL_NAME)); + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_gaudi_gdr_iface_t) +{ +} + +UCS_CLASS_DEFINE(uct_gaudi_gdr_iface_t, uct_base_iface_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_gaudi_gdr_iface_t, uct_iface_t, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); + +UCT_TL_DEFINE(&uct_gaudi_gdr_component, gaudi_gdr, + uct_gaudi_base_query_devices, uct_gaudi_gdr_iface_t, + "GAUDI_GDR_", uct_iface_config_table, + uct_iface_config_t); diff --git a/src/uct/gaudi/gaudi_gdr/gaudi_gdr_iface.h b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_iface.h new file mode 100644 index 00000000000..39ff284470e --- /dev/null +++ b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_iface.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_GAUDI_GDR_IFACE_H +#define UCT_GAUDI_GDR_IFACE_H + +#include + +#define UCT_GAUDI_GDR_TL_NAME "gaudi_gdr" + +typedef struct uct_gaudi_gdr_iface { + uct_base_iface_t super; +} uct_gaudi_gdr_iface_t; + +#endif diff --git a/src/uct/gaudi/gaudi_gdr/gaudi_gdr_md.c b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_md.c new file mode 100644 index 00000000000..4ecb3fd3df8 --- /dev/null +++ b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_md.c @@ -0,0 +1,234 @@ +/* + * Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include "gaudi_gdr_md.h" +#include +#include + +#include +#include +#include + +static ucs_config_field_t uct_gaudi_md_config_table[] = { + {"", "", NULL, ucs_offsetof(uct_gaudi_md_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + + {"DEVICE_ID", "0", + "Index of the HPU devices to query memory from.", + ucs_offsetof(uct_gaudi_md_config_t, device_id), + UCS_CONFIG_TYPE_INT}, + + {NULL} +}; + +static ucs_status_t uct_gaudi_md_query(uct_md_h md, uct_md_attr_v2_t *attr) +{ + uct_md_base_md_query(attr); + attr->detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_GAUDI); + attr->dmabuf_mem_types = UCS_BIT(UCS_MEMORY_TYPE_GAUDI); + return UCS_OK; +} + +static void uct_gaudi_md_close(uct_md_h uct_md) +{ + uct_gaudi_md_t *md = ucs_derived_of(uct_md, uct_gaudi_md_t); + uct_gaudi_base_close_dmabuf_fd(md->dmabuf_fd); + uct_gaudi_base_close_fd(md->fd, md->fd_created); + ucs_free(md); +} + +static ucs_status_t +uct_gaudi_md_query_attributes(uct_md_h md, const void *addr, size_t length, + ucs_memory_info_t *mem_info, int *dmabuf_fd) +{ + uct_gaudi_md_t *gaudi_md = ucs_derived_of(md, uct_gaudi_md_t); + + void *begin = (void *)gaudi_md->device_base_address; + void *end = (uint8_t *)begin + gaudi_md->totalSize; + + if ((addr < begin) || (addr >= end)) { + mem_info->type = UCS_MEMORY_TYPE_LAST; + return UCS_ERR_OUT_OF_RANGE; + } + + *dmabuf_fd = gaudi_md->dmabuf_fd; + mem_info->type = UCS_MEMORY_TYPE_GAUDI; + mem_info->base_address = (void *)gaudi_md->device_base_address; + mem_info->alloc_length = (size_t)gaudi_md->totalSize; + mem_info->sys_dev = gaudi_md->sys_dev; + return UCS_OK; +} + +static ucs_status_t uct_gaudi_md_mem_query(uct_md_h md, const void *addr, + const size_t length, + uct_md_mem_attr_t *mem_attr_p) +{ + int dmabuf_fd = UCT_DMABUF_FD_INVALID; + ucs_status_t status; + ucs_memory_info_t mem_info; + + status = uct_gaudi_md_query_attributes(md, addr, length, &mem_info, + &dmabuf_fd); + if (status != UCS_OK) { + return status; + } + + ucs_memtype_cache_update(mem_info.base_address, mem_info.alloc_length, + mem_info.type, mem_info.sys_dev); + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_MEM_TYPE) { + mem_attr_p->mem_type = mem_info.type; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_SYS_DEV) { + mem_attr_p->sys_dev = mem_info.sys_dev; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS) { + mem_attr_p->base_address = mem_info.base_address; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_ALLOC_LENGTH) { + mem_attr_p->alloc_length = mem_info.alloc_length; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_DMABUF_FD) { + int dup_fd = dup(dmabuf_fd); + if (dup_fd < 0) { + return UCS_ERR_IO_ERROR; + } + mem_attr_p->dmabuf_fd = dup_fd; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_DMABUF_OFFSET) { + mem_attr_p->dmabuf_offset = UCS_PTR_BYTE_DIFF(mem_info.base_address, + addr); + } + return UCS_OK; +} + +static ucs_status_t +uct_gaudi_md_detect_memory_type(uct_md_h md, const void *addr, size_t length, + ucs_memory_type_t *mem_type_p) +{ + uct_md_mem_attr_t mem_attr; + ucs_status_t status; + + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE; + status = uct_gaudi_md_mem_query(md, addr, length, &mem_attr); + if (status != UCS_OK) { + return status; + } + + *mem_type_p = mem_attr.mem_type; + return UCS_OK; +} + +static uct_md_ops_t md_ops = { + .close = uct_gaudi_md_close, + .query = uct_gaudi_md_query, + .mem_alloc = 
(uct_md_mem_alloc_func_t)ucs_empty_function_return_unsupported, + .mem_free = (uct_md_mem_free_func_t)ucs_empty_function_return_unsupported, + .mem_advise = (uct_md_mem_advise_func_t)ucs_empty_function_return_unsupported, + .mem_reg = (uct_md_mem_reg_func_t)ucs_empty_function_return_unsupported, + .mem_dereg = (uct_md_mem_dereg_func_t)ucs_empty_function_return_unsupported, + .mem_query = uct_gaudi_md_mem_query, + .mkey_pack = (uct_md_mkey_pack_func_t)ucs_empty_function_return_unsupported, + .mem_attach = (uct_md_mem_attach_func_t)ucs_empty_function_return_unsupported, + .detect_memory_type = uct_gaudi_md_detect_memory_type, +}; + +static ucs_status_t +uct_gaudi_md_open(uct_component_h component, const char *md_name, + const uct_md_config_t *md_config, uct_md_h *md_p) +{ + uct_gaudi_md_config_t *config = ucs_derived_of(md_config, + uct_gaudi_md_config_t); + uct_gaudi_md_t *md; + ucs_status_t status; + bool fd_created = false; + int fd; + + md = ucs_malloc(sizeof(uct_gaudi_md_t), "uct_gaudi_md_t"); + if (NULL == md) { + ucs_error("Failed to allocate memory for uct_gaudi_md_t"); + return UCS_ERR_NO_MEMORY; + } + + fd = uct_gaudi_base_get_fd(config->device_id, &fd_created); + if (fd <0) { + ucs_error("Failed to get device fd"); + status = UCS_ERR_NO_DEVICE; + goto err_free_md; + } + + status = uct_gaudi_base_get_info(fd, &md->device_base_allocated_address, + &md->device_base_address, + &md->totalSize, + &md->dmabuf_fd); + + if (status != UCS_OK) { + ucs_error("Failed to get dmabuf information\n"); + goto err_close_fd; + } + + status = uct_gaudi_base_get_sysdev(fd, &md->sys_dev); + if (status != UCS_OK) { + ucs_error("Failed to get sys dev"); + goto err_close_dmabuf; + } + + md->fd = fd; + md->fd_created = fd_created; + md->super.ops = &md_ops; + md->super.component = &uct_gaudi_gdr_component; + + *md_p = (uct_md_h)md; + return UCS_OK; + +err_close_dmabuf: + uct_gaudi_base_close_dmabuf_fd(md->dmabuf_fd); +err_close_fd: + uct_gaudi_base_close_fd(fd, fd_created); +err_free_md: + ucs_free(md); + return status; +} + +ucs_status_t +uct_gaudi_query_md_resources(uct_component_h component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + return uct_md_query_single_md_resource(component, resources_p, + num_resources_p); +} + +uct_component_t uct_gaudi_gdr_component = { + .query_md_resources = uct_gaudi_query_md_resources, + .md_open = uct_gaudi_md_open, + .cm_open = (uct_component_cm_open_func_t)ucs_empty_function_return_unsupported, + .rkey_unpack = uct_md_stub_rkey_unpack, + .rkey_ptr = (uct_component_rkey_ptr_func_t)ucs_empty_function_return_unsupported, + .rkey_release = (uct_component_rkey_release_func_t)ucs_empty_function_return_success, + .name = "gaudi_gdr", + .md_config = { + .name = "gaudi-gdr memory domain", + .prefix = "GAUDI_GDR_", + .table = uct_gaudi_md_config_table, + .size = sizeof(uct_gaudi_md_config_t), + }, + .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_gaudi_gdr_component), + .flags = 0, + .md_vfs_init = (uct_component_md_vfs_init_func_t)ucs_empty_function +}; +UCT_COMPONENT_REGISTER(&uct_gaudi_gdr_component); diff --git a/src/uct/gaudi/gaudi_gdr/gaudi_gdr_md.h b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_md.h new file mode 100644 index 00000000000..eadc344471a --- /dev/null +++ b/src/uct/gaudi/gaudi_gdr/gaudi_gdr_md.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) Intel Corporation, 2025. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#ifndef GAUDI_MD_H +#define GAUDI_MD_H + +#include +#include +#include + +extern uct_component_t uct_gaudi_gdr_component; + +typedef struct uct_gaudi_md { + uct_md_t super; + int fd; + bool fd_created; + uint64_t device_base_allocated_address; + uint64_t device_base_address; + uint64_t totalSize; + int dmabuf_fd; + ucs_sys_device_t sys_dev; +} uct_gaudi_md_t; + +typedef struct uct_gaudi_md_config { + uct_md_config_t super; + int device_id; +} uct_gaudi_md_config_t; + +#endif diff --git a/src/uct/gaudi/ucx-gaudi.pc.in b/src/uct/gaudi/ucx-gaudi.pc.in new file mode 100644 index 00000000000..7cd8ed40e0d --- /dev/null +++ b/src/uct/gaudi/ucx-gaudi.pc.in @@ -0,0 +1,12 @@ +# +# Copyright (C) Intel Corporation, 2025. All rights reserved. +# +# See file LICENSE for terms. +# + +Name: @PACKAGE@-gaudi +Description: Unified Communication X Library GAUDI module +Version: @VERSION@ +Libs: +Libs.private: + diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 265d53e0a87..2e5f50b30d0 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -1341,6 +1341,12 @@ ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md, /* check if ROCM KFD driver is loaded */ uct_ib_check_gpudirect_driver(md, "/dev/kfd", UCS_MEMORY_TYPE_ROCM); +#ifdef HAVE_GAUDI + /* Check for HabanaLabs Gaudi DMABuf support */ + uct_ib_check_gpudirect_driver(md, "/dev/accel/accel0", UCS_MEMORY_TYPE_GAUDI); + uct_ib_check_gpudirect_driver(md, "/dev/hl0", UCS_MEMORY_TYPE_GAUDI); +#endif + /* Check for dma-buf support */ uct_ib_md_check_dmabuf(md); }