Skip to content

Commit ed3cebd

Browse files
authored
Merge pull request #10391 from rakhmets/topic/nvml-fabric-info-v1
UCT/CUDA/CUDA_IPC: Switched to nvmlGpuFabricInfo v1 - v1.18.x
2 parents 0dc692c + a1a097a commit ed3cebd

File tree

2 files changed

+11 -13 lines changed

2 files changed

+11 -13 lines changed

config/m4/cuda.m4

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -66,10 +66,10 @@ AS_IF([test "x$cuda_checked" != "xyes"],
6666
[AC_MSG_ERROR([libnvidia-ml not found. Install appropriate nvidia-driver package])])
6767
cuda_happy="no"])])
6868
69-
# Check for nvmlDeviceGetGpuFabricInfoV
70-
AC_CHECK_DECLS([nvmlDeviceGetGpuFabricInfoV],
69+
# Check for nvmlDeviceGetGpuFabricInfo
70+
AC_CHECK_DECLS([nvmlDeviceGetGpuFabricInfo],
7171
[AC_DEFINE([HAVE_NVML_FABRIC_INFO], 1, [Enable NVML GPU fabric info support])],
72-
[AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfoV function not found in libnvidia-ml. MNNVL support will be disabled.])],
72+
[AC_MSG_NOTICE([nvmlDeviceGetGpuFabricInfo function not found in libnvidia-ml. MNNVL support will be disabled.])],
7373
[[#include <nvml.h>]])
7474
7575

src/uct/cuda/cuda_ipc/cuda_ipc_md.c

Lines changed: 8 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -416,7 +416,7 @@ uct_cuda_ipc_md_check_fabric_info(uct_cuda_ipc_md_t *md,
416416
static int mnnvl_supported = 0;
417417
#else
418418
static int mnnvl_supported = -1;
419-
nvmlGpuFabricInfoV_t fabric_info;
419+
nvmlGpuFabricInfo_t fabric_info;
420420
nvmlDevice_t device;
421421
ucs_status_t status;
422422
char buf[64];
@@ -436,19 +436,17 @@ uct_cuda_ipc_md_check_fabric_info(uct_cuda_ipc_md_t *md,
436436
goto out_not_supported;
437437
}
438438

439-
fabric_info.version = nvmlGpuFabricInfo_v2;
440-
status = UCT_NVML_FUNC_LOG_ERR(
441-
nvmlDeviceGetGpuFabricInfoV(device, &fabric_info));
439+
status = UCT_NVML_FUNC_LOG_ERR(
440+
nvmlDeviceGetGpuFabricInfo(device, &fabric_info));
442441
if (status != UCS_OK) {
443442
goto out_not_supported;
444443
}
445444

446-
ucs_debug("fabric_info: healthmask=%u state=%u status=%u clique=%u uuid=%s",
447-
fabric_info.healthMask, fabric_info.state, fabric_info.status,
448-
fabric_info.cliqueId,
449-
ucs_str_dump_hex(
450-
fabric_info.clusterUuid, NVML_GPU_FABRIC_UUID_LEN, buf,
451-
sizeof(buf), SIZE_MAX));
445+
ucs_debug("fabric_info: state=%u status=%u uuid=%s", fabric_info.state,
446+
fabric_info.status,
447+
ucs_str_dump_hex(fabric_info.clusterUuid,
448+
NVML_GPU_FABRIC_UUID_LEN, buf, sizeof(buf),
449+
SIZE_MAX));
452450

453451
if ((fabric_info.state == NVML_GPU_FABRIC_STATE_COMPLETED) &&
454452
(fabric_info.status == NVML_SUCCESS)) {

0 commit comments

Comments
 (0)