Skip to content

Commit 591b065

Browse files
nmazzilli3 authored and j-xiong committed
src/hmem_cuda_gdrcopy: Adding more robust libgdrapi libpaths
feat: Creating multiple libgdrapi libpath open attempts

Problem: Intermittent failure on CI for P5en failing on gdrcopy open failure.

Solution:
- Add array of lib_paths and required_symbols to validate dlopen against
- Refactored error checks for each required_symbols into cuda_gdrcopy_load_symbol()

Testing:
- Validated cuda_gdrcopy_load_symbol() with valid and invalid symbols
  - libfabric:1356524:1760465932::core:core:cuda_gdrcopy_load_symbol():89<warn> Failed to find nonexistent_symbol: /lib64/libgdrapi.so.2: undefined symbol: nonexistent_symbol
- Validated cuda_gdrcopy_dl_hmem_init() with libgdrapi.so.2 and libgdrapi.so path

Signed-off-by: Nick Mazzilli <[email protected]>
1 parent 4fd9580 commit 591b065

File tree

1 file changed

+47
-65
lines changed

1 file changed

+47
-65
lines changed

src/hmem_cuda_gdrcopy.c

Lines changed: 47 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -81,75 +81,57 @@ static pthread_spinlock_t global_gdr_lock;
8181
static void *gdrapi_handle;
8282
static struct gdrcopy_ops global_gdrcopy_ops;
8383

84-
static int cuda_gdrcopy_dl_hmem_init(void)
84+
static int cuda_gdrcopy_load_symbol(void **func_ptr, const char *symbol_name)
8585
{
86-
gdrapi_handle = dlopen("libgdrapi.so", RTLD_NOW);
87-
if (!gdrapi_handle) {
88-
FI_INFO(&core_prov, FI_LOG_CORE,
89-
"Failed to dlopen libgdrapi.so\n");
90-
return -FI_ENOSYS;
91-
}
92-
93-
global_gdrcopy_ops.gdr_open = dlsym(gdrapi_handle, "gdr_open");
94-
if (!global_gdrcopy_ops.gdr_open) {
95-
FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find gdr_open\n");
96-
goto err_dlclose_gdrapi;
97-
}
98-
99-
global_gdrcopy_ops.gdr_close = dlsym(gdrapi_handle, "gdr_close");
100-
if (!global_gdrcopy_ops.gdr_close) {
101-
FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find gdr_close\n");
102-
goto err_dlclose_gdrapi;
103-
}
104-
105-
global_gdrcopy_ops.gdr_pin_buffer = dlsym(gdrapi_handle, "gdr_pin_buffer");
106-
if (!global_gdrcopy_ops.gdr_pin_buffer) {
107-
FI_WARN(&core_prov, FI_LOG_CORE,
108-
"Failed to find gdr_pin_buffer\n");
109-
goto err_dlclose_gdrapi;
110-
}
111-
112-
global_gdrcopy_ops.gdr_unpin_buffer = dlsym(gdrapi_handle, "gdr_unpin_buffer");
113-
if (!global_gdrcopy_ops.gdr_unpin_buffer) {
114-
FI_WARN(&core_prov, FI_LOG_CORE,
115-
"Failed to find gdr_unpin_buffer\n");
116-
goto err_dlclose_gdrapi;
117-
}
118-
119-
global_gdrcopy_ops.gdr_map = dlsym(gdrapi_handle, "gdr_map");
120-
if (!global_gdrcopy_ops.gdr_map) {
121-
FI_WARN(&core_prov, FI_LOG_CORE,
122-
"Failed to find gdr_map\n");
123-
goto err_dlclose_gdrapi;
86+
dlerror();
87+
*func_ptr = dlsym(gdrapi_handle, symbol_name);
88+
if (!*func_ptr) {
89+
FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find %s: %s\n", symbol_name, dlerror());
90+
return -FI_ENODATA;
12491
}
125-
126-
global_gdrcopy_ops.gdr_unmap = dlsym(gdrapi_handle, "gdr_unmap");
127-
if (!global_gdrcopy_ops.gdr_unmap) {
128-
FI_WARN(&core_prov, FI_LOG_CORE,
129-
"Failed to find gdr_unmap\n");
130-
goto err_dlclose_gdrapi;
131-
}
132-
133-
global_gdrcopy_ops.gdr_copy_to_mapping = dlsym(gdrapi_handle, "gdr_copy_to_mapping");
134-
if (!global_gdrcopy_ops.gdr_copy_to_mapping) {
135-
FI_WARN(&core_prov, FI_LOG_CORE,
136-
"Failed to find gdr_copy_to_mapping\n");
137-
goto err_dlclose_gdrapi;
138-
}
139-
140-
global_gdrcopy_ops.gdr_copy_from_mapping = dlsym(gdrapi_handle, "gdr_copy_from_mapping");
141-
if (!global_gdrcopy_ops.gdr_copy_from_mapping) {
142-
FI_WARN(&core_prov, FI_LOG_CORE,
143-
"Failed to find gdr_copy_from_mapping\n");
144-
goto err_dlclose_gdrapi;
145-
}
146-
14792
return FI_SUCCESS;
93+
}
14894

149-
err_dlclose_gdrapi:
150-
memset(&global_gdrcopy_ops, 0, sizeof(global_gdrcopy_ops));
151-
dlclose(gdrapi_handle);
152-
return -FI_ENODATA;
95+
static int cuda_gdrcopy_dl_hmem_init(void)
96+
{
97+
const char *lib_paths[] = {
98+
"libgdrapi.so",
99+
"libgdrapi.so.2",
100+
NULL
101+
};
102+
Dl_info info;
103+
104+
for (int i = 0; lib_paths[i]; i++) {
105+
FI_INFO(&core_prov, FI_LOG_CORE, "Trying to load: %s\n", lib_paths[i]);
106+
void *handle = dlopen(lib_paths[i], RTLD_NOW);
107+
if (!handle) continue;
108+
109+
gdrapi_handle = handle;
110+
111+
/* Try to load all symbols - if any fail, cleanup and try next library */
112+
if (cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_open, "gdr_open") ||
113+
cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_close, "gdr_close") ||
114+
cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_pin_buffer, "gdr_pin_buffer") ||
115+
cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_unpin_buffer, "gdr_unpin_buffer") ||
116+
cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_map, "gdr_map") ||
117+
cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_unmap, "gdr_unmap") ||
118+
cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_copy_to_mapping, "gdr_copy_to_mapping") ||
119+
cuda_gdrcopy_load_symbol((void**)&global_gdrcopy_ops.gdr_copy_from_mapping, "gdr_copy_from_mapping")) {
120+
memset(&global_gdrcopy_ops, 0, sizeof(global_gdrcopy_ops));
121+
dlclose(handle);
122+
gdrapi_handle = NULL;
123+
continue;
124+
}
125+
126+
/* All symbols loaded successfully */
127+
if (dladdr(global_gdrcopy_ops.gdr_open, &info)) {
128+
FI_INFO(&core_prov, FI_LOG_CORE, "Loaded GDRCopy library: %s\n", info.dli_fname);
129+
}
130+
return FI_SUCCESS;
131+
}
132+
133+
FI_INFO(&core_prov, FI_LOG_CORE, "Failed to find usable libgdrapi.so\n");
134+
return -FI_ENOSYS;
153135
}
154136

155137
static int cuda_gdrcopy_dl_hmem_cleanup(void)

0 commit comments

Comments
 (0)