Skip to content

Commit 94a58ed

Browse files
committed
Fixed the issue from comments
1 parent 0600a8e commit 94a58ed

File tree

2 files changed

+33
-18
lines changed

2 files changed

+33
-18
lines changed

src/ucs/sys/topo/gaudi/topo.c

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@
3838
#define GAUDI_DEVICE_NAME_LEN 10
3939

4040
static pthread_mutex_t gaudi_init_mutex = PTHREAD_MUTEX_INITIALIZER;
41+
/* One-time control for initializing ucs_gaudi_topo_ctx.lock, and the status returned by that initialization */
42+
static pthread_once_t gaudi_spinlock_once_flag = PTHREAD_ONCE_INIT;
43+
static ucs_status_t gaudi_spinlock_init_status = UCS_OK;
4144

4245
static const ucs_sys_dev_distance_t gaudi_fallback_node_distance =
4346
{.latency = 100e-9, .bandwidth = 17e9}; /* 100ns, 17 GB/s */
@@ -1272,12 +1275,12 @@ static ucs_status_t ucs_gaudi_build_assignment_balanced()
12721275
}
12731276

12741277
/* Count devices per NUMA node */
1275-
for (i = 0; i < num_gaudi_devices; ++i) {
1278+
for (i = 0; i < num_gaudi_devices; i++) {
12761279
numa_node = ucs_gaudi_get_validated_numa_node(
12771280
ucs_gaudi_topo_ctx.gaudi_devices[i], num_numa_nodes);
12781281
gaudi_per_numa[numa_node]++;
12791282
}
1280-
for (i = 0; i < num_hnic_devices; ++i) {
1283+
for (i = 0; i < num_hnic_devices; i++) {
12811284
numa_node = ucs_gaudi_get_validated_numa_node(
12821285
ucs_gaudi_topo_ctx.hnic_devices[i], num_numa_nodes);
12831286
hnic_per_numa[numa_node]++;
@@ -1539,7 +1542,7 @@ ucs_status_t ucs_gaudi_find_best_connection(const char *accel_name,
15391542
ucs_spin_lock(&ucs_gaudi_topo_ctx.lock);
15401543

15411544
/* Return cached balanced assignment instead of searching connections */
1542-
for (i = 0; i < ucs_gaudi_topo_ctx.num_gaudi_devices; ++i) {
1545+
for (i = 0; i < ucs_gaudi_topo_ctx.num_gaudi_devices; i++) {
15431546
if (!strcmp(accel_name, ucs_gaudi_topo_ctx.gaudi_devices_names[i])) {
15441547
break;
15451548
}
@@ -1686,7 +1689,7 @@ static void ucs_gaudi_get_memory_distance(ucs_sys_device_t device,
16861689
/* Sum NUMA distances for CPUs in affinity set */
16871690
num_cpus = ucs_numa_num_configured_cpus();
16881691
total_distance = 0;
1689-
for (cpu = 0; cpu < num_cpus; ++cpu) {
1692+
for (cpu = 0; cpu < num_cpus; cpu++) {
16901693
if (!full_affinity && !CPU_ISSET(cpu, &thread_cpuset)) {
16911694
continue;
16921695
}
@@ -1712,10 +1715,15 @@ static void ucs_gaudi_get_memory_distance(ucs_sys_device_t device,
17121715
distance->latency = (total_distance / cpuset_size) * 10e-9;
17131716
}
17141717

1718+
/* pthread_once() callback: initialize ucs_gaudi_topo_ctx.lock once and record the result in gaudi_spinlock_init_status */
1719+
static void ucs_gaudi_spinlock_once_init()
1720+
{
1721+
gaudi_spinlock_init_status = ucs_spinlock_init(&ucs_gaudi_topo_ctx.lock, 0);
1722+
}
1723+
17151724
/* Initialization function */
17161725
void ucs_gaudi_topo_init()
17171726
{
1718-
ucs_status_t status;
17191727
const char *disable;
17201728

17211729
disable = getenv("UCS_GAUDI_TOPO_DISABLE");
@@ -1730,17 +1738,16 @@ void ucs_gaudi_topo_init()
17301738
return;
17311739
}
17321740

1741+
/* Ensure the spinlock is initialized exactly once, whether this function or ucs_gaudi_lazy_init() runs first */
1742+
pthread_once(&gaudi_spinlock_once_flag, ucs_gaudi_spinlock_once_init);
1743+
if (gaudi_spinlock_init_status != UCS_OK) {
1744+
ucs_error("Failed to initialize spinlock: %s",
1745+
ucs_status_string(gaudi_spinlock_init_status));
1746+
return;
1747+
}
1748+
17331749
pthread_mutex_lock(&gaudi_init_mutex);
17341750
if (!ucs_gaudi_topo_ctx.provider_added) {
1735-
/* Initialize spinlock first */
1736-
status = ucs_spinlock_init(&ucs_gaudi_topo_ctx.lock, 0);
1737-
if (status != UCS_OK) {
1738-
pthread_mutex_unlock(&gaudi_init_mutex);
1739-
ucs_error("Failed to initialize spinlock: %s",
1740-
ucs_status_string(status));
1741-
return;
1742-
}
1743-
17441751
ucs_debug("Registering Gaudi topology provider");
17451752
ucs_list_add_head(&ucs_sys_topo_providers_list,
17461753
&ucs_gaudi_topo_provider.list);
@@ -1763,6 +1770,14 @@ static ucs_status_t ucs_gaudi_lazy_init()
17631770
return UCS_ERR_UNSUPPORTED;
17641771
}
17651772

1773+
/* Ensure spinlock exists */
1774+
pthread_once(&gaudi_spinlock_once_flag, ucs_gaudi_spinlock_once_init);
1775+
if (gaudi_spinlock_init_status != UCS_OK) {
1776+
ucs_error("Failed to initialize spinlock: %s",
1777+
ucs_status_string(gaudi_spinlock_init_status));
1778+
return gaudi_spinlock_init_status;
1779+
}
1780+
17661781
ucs_spin_lock(&ucs_gaudi_topo_ctx.lock);
17671782

17681783
if (ucs_gaudi_topo_ctx.initialized) {
@@ -1891,9 +1906,6 @@ void ucs_gaudi_topo_cleanup()
18911906
ucs_gaudi_topo_ctx.have_assignment = 0;
18921907

18931908
ucs_spin_unlock(&ucs_gaudi_topo_ctx.lock);
1894-
ucs_spinlock_destroy(&ucs_gaudi_topo_ctx.lock);
1895-
18961909
pthread_mutex_unlock(&gaudi_init_mutex);
1897-
18981910
ucs_debug("Gaudi topology cleaned up");
18991911
}

src/uct/gaudi/gaudi_gdr/gaudi_gdr_md.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ static void uct_gaudi_md_close(uct_md_h uct_md)
4545
if (md->dmabuf_fd >= 0) {
4646
close(md->dmabuf_fd);
4747
}
48+
if (md->fd >= 0) {
49+
close(md->fd);
50+
}
4851
ucs_free(md);
4952
}
5053

@@ -184,7 +187,7 @@ uct_gaudi_md_open(uct_component_h component, const char *md_name,
184187
goto err_close_dmabuf;
185188
}
186189

187-
md->fd = fd;
190+
md->fd = dup(fd);
188191
md->super.ops = &md_ops;
189192
md->super.component = &uct_gaudi_gdr_component;
190193

0 commit comments

Comments
 (0)