Skip to content

Commit eab5668

Browse files
Merge remote-tracking branch 'origin/master' into topic/gda_channels
2 parents 390d626 + f42271b commit eab5668

File tree

21 files changed

+186
-94
lines changed

21 files changed

+186
-94
lines changed

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ Michal Shalev <[email protected]>
7171
Mike Dubman <[email protected]>
7272
Mikhail Brinskii <[email protected]>
7373
74+
Nathan Bellalou <[email protected]>
7475
Nathan Hjelm <[email protected]>
7576
Netanel Yosephian <[email protected]>
7677
Ofir Farjon <[email protected]>

config/ucx.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
CPU model=Grace
33
UCX_REG_NONBLOCK_MEM_TYPES=host,cuda-managed
44
UCX_IB_ODP_MEM_TYPES=host,cuda-managed
5-
UCX_IB_MLX5_DEVX_OBJECTS=
5+
UCX_IB_MLX5_DEVX_OBJECTS=auto
66
UCX_GDR_COPY_BW=0MBs,get_dedicated:30GBs,put_dedicated:30GBs
77
UCX_GDR_COPY_LAT=30ns
88
UCX_GDR_COPY_RCACHE_OVERHEAD=170ns

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,16 +183,19 @@ private:
183183
params.elements = elems;
184184

185185
ucs_status_t status;
186-
const ucs_time_t deadline = ucs_get_time() + ucs_time_from_sec(5.0);
186+
ucs_time_t deadline = ucs_get_time() + ucs_time_from_sec(60.0);
187187
do {
188+
if (ucs_get_time() > deadline) {
189+
ucs_warn("timeout creating device memory list");
190+
deadline = ULONG_MAX;
191+
}
192+
188193
ucp_worker_progress(perf.ucp.worker);
189194
status = ucp_device_mem_list_create(perf.ucp.ep, &params,
190195
&m_params.mem_list);
191-
} while ((status == UCS_ERR_NOT_CONNECTED) && (ucs_get_time() < deadline));
196+
} while (status == UCS_ERR_NOT_CONNECTED);
192197

193-
if (status == UCS_ERR_NOT_CONNECTED) {
194-
throw std::runtime_error("Timeout waiting for connection");
195-
} else if (status != UCS_OK) {
198+
if (status != UCS_OK) {
196199
throw std::runtime_error("Failed to create memory list");
197200
}
198201
}

src/ucp/core/ucp_device.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,7 @@ static ucs_status_t ucp_device_mem_list_create_handle(
414414
if (i == 0) {
415415
ucs_error("failed to select lane for local device %s",
416416
ucs_topo_sys_device_get_name(local_sys_dev));
417-
return UCS_ERR_NO_RESOURCE;
417+
return UCS_ERR_NO_DEVICE;
418418
}
419419

420420
/* Populate handle header */

src/ucp/wireup/select.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2501,8 +2501,7 @@ ucp_wireup_add_device_lanes(const ucp_wireup_select_params_t *select_params,
25012501
mem_type_tl_bitmap, UCP_NULL_LANE,
25022502
select_ctx, 0);
25032503
if (!found_lane) {
2504-
ucs_error("could not find device lanes");
2505-
return UCS_ERR_UNREACHABLE;
2504+
ucs_debug("ep %p: could not find device lanes", select_params->ep);
25062505
}
25072506

25082507
return UCS_OK;

src/ucs/config/global_opts.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ static ucs_config_field_t ucs_global_opts_table[] = {
129129
#if ENABLE_DEBUG_DATA
130130
"bt,freeze",
131131
#else
132-
"bt",
132+
"none",
133133
#endif
134134
"Error signal handling mode. Either 'none' to disable signal interception,\n"
135135
"or a combination of:\n"

src/ucs/sys/netlink.c

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ typedef struct {
2929
const struct sockaddr *sa_remote;
3030
int if_index;
3131
int found;
32+
int allow_default_gw; /* Allow matching default
33+
gateway routes */
3234
} ucs_netlink_route_info_t;
3335

3436

@@ -174,7 +176,7 @@ ucs_netlink_send_request(int protocol, unsigned short nlmsg_type,
174176

175177
static ucs_status_t
176178
ucs_netlink_get_route_info(const struct rtattr *rta, int len, int *if_index_p,
177-
const void **dst_in_addr)
179+
const void **dst_in_addr, size_t rtm_dst_len)
178180
{
179181
*if_index_p = -1;
180182
*dst_in_addr = NULL;
@@ -187,7 +189,10 @@ ucs_netlink_get_route_info(const struct rtattr *rta, int len, int *if_index_p,
187189
}
188190
}
189191

190-
if ((*if_index_p == -1) || (*dst_in_addr == NULL)) {
192+
if (/* Network interface index is not valid */
193+
(*if_index_p == -1) ||
194+
/* dst_in_addr required but not present */
195+
((rtm_dst_len != 0) && (*dst_in_addr == NULL))) {
191196
return UCS_ERR_INVALID_PARAM;
192197
}
193198

@@ -206,7 +211,8 @@ ucs_netlink_parse_rt_entry_cb(const struct nlmsghdr *nlh, void *arg)
206211
int khret;
207212

208213
if (ucs_netlink_get_route_info(RTM_RTA(rt_msg), RTM_PAYLOAD(nlh),
209-
&iface_index, &dst_in_addr) != UCS_OK) {
214+
&iface_index, &dst_in_addr,
215+
rt_msg->rtm_dst_len) != UCS_OK) {
210216
return UCS_INPROGRESS;
211217
}
212218

@@ -228,12 +234,14 @@ ucs_netlink_parse_rt_entry_cb(const struct nlmsghdr *nlh, void *arg)
228234
ucs_error("could not allocate route entry");
229235
return UCS_ERR_NO_MEMORY);
230236

231-
memset(&new_rule->dest, 0, sizeof(sizeof(new_rule->dest)));
237+
memset(&new_rule->dest, 0, sizeof(new_rule->dest));
232238
new_rule->dest.ss_family = rt_msg->rtm_family;
233-
if (UCS_OK != ucs_sockaddr_set_inet_addr((struct sockaddr *)&new_rule->dest,
234-
dst_in_addr)) {
235-
ucs_array_pop_back(iface_rules);
236-
return UCS_ERR_IO_ERROR;
239+
if (dst_in_addr != NULL) {
240+
if (ucs_sockaddr_set_inet_addr((struct sockaddr *)&new_rule->dest,
241+
dst_in_addr) != UCS_OK) {
242+
ucs_array_pop_back(iface_rules);
243+
return UCS_ERR_IO_ERROR;
244+
}
237245
}
238246

239247
new_rule->subnet_prefix_len = rt_msg->rtm_dst_len;
@@ -256,6 +264,13 @@ static void ucs_netlink_lookup_route(ucs_netlink_route_info_t *info)
256264

257265
iface_rules = &kh_val(&ucs_netlink_routing_table_cache, iter);
258266
ucs_array_for_each(curr_entry, iface_rules) {
267+
268+
if ((curr_entry->subnet_prefix_len == 0) && !info->allow_default_gw) {
269+
ucs_trace("iface_index=%d: skipping default gateway route",
270+
info->if_index);
271+
continue;
272+
}
273+
259274
if (ucs_sockaddr_is_same_subnet(
260275
info->sa_remote,
261276
(const struct sockaddr *)&curr_entry->dest,
@@ -266,7 +281,8 @@ static void ucs_netlink_lookup_route(ucs_netlink_route_info_t *info)
266281
}
267282
}
268283

269-
int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote)
284+
int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote,
285+
int allow_default_gw)
270286
{
271287
static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER;
272288
struct rtmsg rtm = {0};
@@ -285,9 +301,11 @@ int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote)
285301
NULL);
286302
}
287303

288-
info.if_index = if_index;
289-
info.sa_remote = sa_remote;
290-
info.found = 0;
304+
info.if_index = if_index;
305+
info.sa_remote = sa_remote;
306+
info.found = 0;
307+
info.allow_default_gw = allow_default_gw;
308+
291309
ucs_netlink_lookup_route(&info);
292310

293311
return info.found;

src/ucs/sys/netlink.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,17 @@ ucs_netlink_send_request(int protocol, unsigned short nlmsg_type,
4848
* Check whether a routing table rule exists for a given network
4949
* interface index and a destination address.
5050
*
51-
* @param [in] if_index A global index representing the network interface,
52-
as assigned by the system (e.g., obtained via
53-
if_nametoindex()).
54-
* @param [in] sa_remote Pointer to the destination address.
51+
* @param [in] if_index A global index representing the network
52+
interface, as assigned by the system
53+
(e.g., obtained via if_nametoindex()).
54+
* @param [in] sa_remote Pointer to the destination address.
55+
* @param [in] allow_default_gw Allow matching default gateway routes (1) or
56+
* only specific subnet routes (0).
5557
*
5658
* @return 1 if rule exists, or 0 otherwise.
5759
*/
58-
int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote);
60+
int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote,
61+
int allow_default_gw);
5962

6063
END_C_DECLS
6164

src/ucs/sys/sys.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <ucs/sys/checker.h>
1616
#include <ucs/sys/ptr_arith.h>
1717
#include <ucs/sys/string.h>
18+
#include <ucs/sys/sock.h>
1819
#include <ucs/sys/sys.h>
1920
#include <ucs/debug/log.h>
2021
#include <ucs/time/time.h>
@@ -177,6 +178,21 @@ ucs_status_t ucs_ifname_to_index(const char *ndev_name, unsigned *ndev_index_p)
177178
return UCS_OK;
178179
}
179180

181+
int ucs_netif_is_ipoib(const char *if_name)
182+
{
183+
struct ifreq ifr;
184+
ucs_status_t status;
185+
186+
status = ucs_netif_ioctl(if_name, SIOCGIFHWADDR, &ifr);
187+
if (status != UCS_OK) {
188+
/* If we can't determine the hardware type, assume it's not IPoIB */
189+
ucs_debug("failed to get hardware address for %s", if_name);
190+
return 0;
191+
}
192+
193+
return ifr.ifr_hwaddr.sa_family == ARPHRD_INFINIBAND;
194+
}
195+
180196
static uint64_t ucs_get_mac_address()
181197
{
182198
static uint64_t mac_address = 0;

src/ucs/sys/sys.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ uint32_t ucs_file_checksum(const char *filename);
192192
ucs_status_t ucs_ifname_to_index(const char *ndev_name, unsigned *ndev_index_p);
193193

194194

195+
/**
196+
* Check if a network interface is an IPoIB (IP over InfiniBand) device.
197+
*
198+
* @param [in] if_name Network interface name to check.
199+
*
200+
* @return 1 if the interface is IPoIB, 0 otherwise.
201+
*/
202+
int ucs_netif_is_ipoib(const char *if_name);
203+
204+
195205
/**
196206
* Get a globally unique identifier of the machine running the current process.
197207
*/

0 commit comments

Comments
 (0)