Skip to content

Commit

Permalink
add ib roce gid auto selection
Browse files Browse the repository at this point in the history
  • Loading branch information
JiakunYan committed Sep 20, 2024
1 parent 393057c commit b55a7c2
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 25 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

# Vscode related
.vscode/
build/

# CLion related
.idea
Expand Down
122 changes: 121 additions & 1 deletion lci/backend/ibv/lcisi_ibv_detail.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
continue;
}
// Check whether we can get its lid
if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET && !port_attr.lid) {
if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && !port_attr.lid) {
fprintf(stderr, "Couldn't get local LID\n");
continue;
}
Expand Down Expand Up @@ -134,4 +134,124 @@ bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
LCI_Log(LCI_LOG_INFO, "ibv", "No device is available!\n");
return false;
}
}

// RoCE protocol version reported for a GID table entry.
// ROCE_VER_UNKNOWN is returned when the version could not be determined.
typedef enum roce_version_t {
  ROCE_V1 = 0,
  ROCE_V2 = 1,
  ROCE_VER_UNKNOWN = 2
} roce_version_t;

// Determine the RoCE version of the GID table entry `gid_index` on the
// server's device port.
//
// The entry is first queried with ibv_query_gid(); if that succeeds, the type
// string is read from
//   /sys/class/infiniband/<dev>/ports/<port>/gid_attrs/types/<index>
// and matched by prefix against "IB/RoCE v1" and "RoCE v2".
//
// Returns ROCE_V1 or ROCE_V2 on a match; otherwise logs a debug message and
// returns ROCE_VER_UNKNOWN (query failed, file unreadable/empty, or an
// unrecognized type string).
roce_version_t query_gid_roce_version(LCISI_server_t* server,
                                      unsigned gid_index)
{
  // Zero-initialized: the debug message below prints `buf` with %s even when
  // ibv_query_gid() fails and LCT_read_file() never touched it.
  char buf[16] = {0};
  int ret;
  const char* dev_name = ibv_get_device_name(server->ib_dev);

  union ibv_gid gid;
  ret = ibv_query_gid(server->dev_ctx, server->dev_port, gid_index, &gid);
  if (ret == 0) {
    // gid_index is unsigned, so it is formatted with %u.
    ret = LCT_read_file(buf, sizeof(buf),
                        "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%u",
                        dev_name, server->dev_port, gid_index);
    if (ret > 0) {
      if (!strncmp(buf, "IB/RoCE v1", 10)) {
        return ROCE_V1;
      } else if (!strncmp(buf, "RoCE v2", 7)) {
        return ROCE_V2;
      }
    }
  }
  LCI_Log(LCI_LOG_DEBUG, "ibv",
          "failed to parse gid type '%s' (dev=%s port=%d index=%u)\n", buf,
          dev_name, server->dev_port, gid_index);
  return ROCE_VER_UNKNOWN;
}

// Probe whether GID table entry `gid_index` can actually be used on the
// server's port: query the entry, then try to create an address handle whose
// source GID index is `gid_index` and whose destination GID is that same GID.
//
// Returns true if ibv_create_ah() succeeds (the handle is destroyed again),
// false otherwise (a debug message is logged).
// NOTE(review): the failure behavior of the ibv_query_gid() call is whatever
// IBV_SAFECALL does -- defined elsewhere.
bool test_roce_gid_index(LCISI_server_t* server, uint8_t gid_index)
{
  union ibv_gid probe_gid;
  IBV_SAFECALL(
      ibv_query_gid(server->dev_ctx, server->dev_port, gid_index, &probe_gid));

  struct ibv_ah_attr attr;
  memset(&attr, 0, sizeof(attr));
  attr.dlid = 0xC000;
  attr.port_num = server->dev_port;
  attr.is_global = 1;
  attr.grh.flow_label = 1;
  attr.grh.hop_limit = 255;
  attr.grh.sgid_index = gid_index;
  attr.grh.dgid = probe_gid;

  struct ibv_ah* handle = ibv_create_ah(server->dev_pd, &attr);
  if (handle != NULL) {
    // The handle was only needed as a probe.
    ibv_destroy_ah(handle);
    return true;
  }

  LCI_Log(LCI_LOG_DEBUG, "ibv", "gid entry %d is not operational\n",
          gid_index);
  return false;
}

// Automatically pick a GID index for a RoCE port.
//
// GID table entries are examined in priority order of their RoCE version
// (v2 first, then v1, then unknown). Within one priority level the table is
// scanned from index 0 upward, and the first entry that has the wanted
// version and passes test_roce_gid_index() is returned.
//
// Returns the selected GID index, or 0 (the default GID) if no entry
// qualified.
int select_best_gid_for_roce(LCISI_server_t* server)
{
  static const roce_version_t roce_prio[] = {
      ROCE_V2,
      ROCE_V1,
      ROCE_VER_UNKNOWN,
  };
  // Element count, not byte count: iterating up to sizeof(roce_prio) would
  // read past the end of the array.
  const int num_prio = (int)(sizeof(roce_prio) / sizeof(roce_prio[0]));
  int gid_tbl_len = server->port_attr.gid_tbl_len;

  LCI_Log(LCI_LOG_DEBUG, "ibv", "RoCE gid auto selection among %d gids\n",
          gid_tbl_len);
  for (int prio_idx = 0; prio_idx < num_prio; prio_idx++) {
    for (int i = 0; i < gid_tbl_len; i++) {
      roce_version_t version = query_gid_roce_version(server, i);

      if ((roce_prio[prio_idx] == version) && test_roce_gid_index(server, i)) {
        LCI_Log(LCI_LOG_INFO, "ibv", "RoCE gid auto selection: use %d %d\n", i,
                version);
        return i;
      }
    }
  }

  const int default_gid = 0;
  LCI_Log(LCI_LOG_INFO, "ibv",
          "RoCE gid auto selection: fall back to the default gid %d\n",
          default_gid);
  return default_gid;  // default gid for roce
}

// Encode a GID as text for exchange between processes.
//
// The 16-byte GID is treated as four 32-bit words; each word is passed
// through htobe32() and printed as 8 lowercase hex digits.
//
// gid:  GID to encode.
// wgid: output buffer; receives 32 hex characters followed by a terminating
//       NUL, so it must hold at least 33 bytes.
void gid_to_wire_gid(const union ibv_gid* gid, char wgid[])
{
  // sizeof yields size_t; cast so the argument matches the %d conversion.
  LCI_Assert(sizeof(union ibv_gid) == 16, "Unexpected ibv_gid size %d\n",
             (int)sizeof(union ibv_gid));
  uint32_t tmp_gid[4];
  int i;

  memcpy(tmp_gid, gid, sizeof(tmp_gid));
  for (i = 0; i < 4; ++i) {
    // 8 hex digits + NUL per word; the NUL is overwritten by the next word.
    snprintf(&wgid[i * 8], 9, "%08x", htobe32(tmp_gid[i]));
  }
}

// Decode the text form produced by gid_to_wire_gid() back into a GID.
//
// wgid: at least 32 hex characters (four groups of 8 digits).
// gid:  output; receives the 16 decoded bytes.
//
// A group that does not parse as hexadecimal is decoded as 0 instead of
// leaving the word indeterminate.
void wire_gid_to_gid(const char* wgid, union ibv_gid* gid)
{
  // sizeof yields size_t; cast so the argument matches the %d conversion.
  LCI_Assert(sizeof(union ibv_gid) == 16, "Unexpected ibv_gid size %d\n",
             (int)sizeof(union ibv_gid));
  char tmp[9];
  uint32_t tmp_gid[4];
  int i;

  tmp[8] = 0;
  for (i = 0; i < 4; ++i) {
    // %x requires a pointer to unsigned int.
    unsigned int word = 0;

    memcpy(tmp, wgid + i * 8, 8);
    if (sscanf(tmp, "%x", &word) != 1) {
      word = 0;
    }
    tmp_gid[i] = be32toh((uint32_t)word);
  }
  memcpy(gid, tmp_gid, sizeof(*gid));
}
24 changes: 4 additions & 20 deletions lci/backend/ibv/lcisi_ibv_detail.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,12 @@
bool select_best_device_port(struct ibv_device** dev_list, int num_devices,
struct ibv_device** device_o, uint8_t* port_o);

void gid_to_wire_gid(const union ibv_gid* gid, char wgid[])
{
uint32_t tmp_gid[4];
int i;
// Automatically choose a GID index for a RoCE port. Entries are tried in
// order of preference RoCE v2, RoCE v1, then unknown version; the first entry
// for which an address handle can be created is returned. Falls back to
// index 0 if no entry qualifies.
int select_best_gid_for_roce(LCISI_server_t* server);

memcpy(tmp_gid, gid, sizeof(tmp_gid));
for (i = 0; i < 4; ++i) sprintf(&wgid[i * 8], "%08x", htobe32(tmp_gid[i]));
}
// Number of hex characters in the wire (text) form of a GID, excluding the
// terminating NUL. Declared as an enum constant rather than a `const int`
// object: a header-defined object would be defined once per including
// translation unit, and an enum constant is a true integer constant
// expression, so `char wgid[WIRE_GID_NBYTES + 1]` is an ordinary array rather
// than a VLA.
enum { WIRE_GID_NBYTES = 32 };

void wire_gid_to_gid(const char* wgid, union ibv_gid* gid)
{
char tmp[9];
__be32 v32;
int i;
uint32_t tmp_gid[4];
// Encode `gid` as 32 lowercase hex characters plus a terminating NUL into
// `wgid`, which must therefore hold at least 33 bytes.
void gid_to_wire_gid(const union ibv_gid* gid, char wgid[]);

for (tmp[8] = 0, i = 0; i < 4; ++i) {
memcpy(tmp, wgid + i * 8, 8);
sscanf(tmp, "%x", &v32);
tmp_gid[i] = be32toh(v32);
}
memcpy(gid, tmp_gid, sizeof(*gid));
}
// Decode the 32-hex-character text form in `wgid` (as written by
// gid_to_wire_gid) into the 16-byte `gid`.
void wire_gid_to_gid(const char* wgid, union ibv_gid* gid);

#endif
11 changes: 8 additions & 3 deletions lci/backend/ibv/server_ibv.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ void LCISD_server_init(LCIS_server_t* s)
if (rc != 0) {
fprintf(stderr, "Unable to query port\n");
exit(EXIT_FAILURE);
} else if (server->port_attr.link_layer != IBV_LINK_LAYER_ETHERNET &&
} else if (server->port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND &&
!server->port_attr.lid) {
fprintf(stderr, "Couldn't get local LID\n");
exit(EXIT_FAILURE);
Expand All @@ -171,6 +171,11 @@ void LCISD_server_init(LCIS_server_t* s)

// query the gid
server->gid_idx = LCI_IBV_GID_IDX;
if (server->gid_idx < 0 &&
server->port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
// User did not explicitly specify the gid to use and we are using RoCE
server->gid_idx = select_best_gid_for_roce(server);
}
if (server->gid_idx >= 0) {
LCI_Log(LCI_LOG_INFO, "ibv", "Use GID index: %d\n", server->gid_idx);
if (ibv_query_gid(server->dev_ctx, server->dev_port, server->gid_idx,
Expand Down Expand Up @@ -352,7 +357,7 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
exit(EXIT_FAILURE);
}
}
char wgid[33];
char wgid[WIRE_GID_NBYTES + 1];
memset(wgid, 0, sizeof(wgid));
gid_to_wire_gid(&endpoint_p->server->gid, wgid);
// Use this queue pair "i" to connect to rank e.
Expand All @@ -375,7 +380,7 @@ void LCISD_endpoint_init(LCIS_server_t server_pp, LCIS_endpoint_t* endpoint_pp,
uint32_t dest_qpn;
uint16_t dest_lid;
union ibv_gid gid;
char wgid[33];
char wgid[WIRE_GID_NBYTES + 1];
sscanf(value, "%x:%hx:%s", &dest_qpn, &dest_lid, wgid);
wire_gid_to_gid(wgid, &gid);
// Once a queue pair (QP) has receive buffers posted to it, it is now
Expand Down
3 changes: 2 additions & 1 deletion lct/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ target_sources_relative(
tbarrier/tbarrier.cpp
util/thread.cpp
util/time.cpp
util/string.cpp)
util/string.cpp
util/io.cpp)

target_include_directories(LCT PRIVATE ${CMAKE_CURRENT_BINARY_DIR})

Expand Down
4 changes: 4 additions & 0 deletions lct/api/lct.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ LCT_API bool LCT_tbarrier_test(LCT_tbarrier_t tbarrier, int64_t ticket);
LCT_API void LCT_tbarrier_wait(LCT_tbarrier_t tbarrier, int64_t ticket);
LCT_API void LCT_tbarrier_arrive_and_wait(LCT_tbarrier_t tbarrier);

// File IO
// Read the beginning of a file into `buffer`.
// The file name is built printf-style from `filename_fmt` and the variadic
// arguments. `buffer` is zero-filled over `max` bytes first and at most
// `max - 1` bytes are read with a single read(), so the result is always
// NUL-terminated.
// Returns the value returned by read() (number of bytes read), or -1 if the
// file could not be opened.
LCT_API ssize_t LCT_read_file(char* buffer, size_t max,
const char* filename_fmt, ...);

#ifdef __cplusplus
}
#endif
Expand Down
39 changes: 39 additions & 0 deletions lct/util/io.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#include <sys/param.h>
#include <unistd.h>
#include <stdarg.h>
#include <fcntl.h>
#include "lcti.hpp"

// Read the beginning of a file into `buffer` (va_list flavor).
//
// buffer:       destination; zero-filled over `max` bytes before reading, so
//               the content is always NUL-terminated.
// max:          size of `buffer` in bytes; at most `max - 1` bytes are read.
// filename_fmt: printf-style format of the file name.
// ap:           arguments for `filename_fmt`.
//
// Returns the result of a single read() call (number of bytes read, or -1 on
// a read error). Returns -1 without touching the file system if `buffer` is
// null, `max` is 0, or the formatted file name does not fit in MAXPATHLEN.
// Returns -1 if the file cannot be opened.
ssize_t read_file_vararg(char* buffer, size_t max, const char* filename_fmt,
                         va_list ap)
{
  char filename[MAXPATHLEN];

  // With max == 0 nothing can be stored, and `max - 1` below would wrap
  // around to SIZE_MAX.
  if (!buffer || max == 0) {
    return -1;
  }
  memset(buffer, 0, max);

  // Refuse a truncated path rather than opening some other file.
  int len = vsnprintf(filename, sizeof(filename), filename_fmt, ap);
  if (len < 0 || (size_t)len >= sizeof(filename)) {
    return -1;
  }

  int fd = open(filename, O_RDONLY);
  if (fd < 0) {
    return -1;
  }

  // Leave the last byte untouched so the zero fill terminates the string.
  ssize_t read_bytes = read(fd, buffer, max - 1);

  close(fd);
  return read_bytes;
}

// Variadic front end of read_file_vararg(): packs the arguments following
// `filename_fmt` into a va_list, forwards the call, and returns its result
// unchanged.
ssize_t LCT_read_file(char* buffer, size_t max, const char* filename_fmt, ...)
{
  va_list args;
  va_start(args, filename_fmt);
  const ssize_t nread = read_file_vararg(buffer, max, filename_fmt, args);
  va_end(args);
  return nread;
}

0 comments on commit b55a7c2

Please sign in to comment.