Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions fabtests/common/hmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ static struct ft_hmem_ops hmem_ops[] = {
.mem_set = ft_rocr_memset,
.copy_to_hmem = ft_rocr_memcpy,
.copy_from_hmem = ft_rocr_memcpy,
.get_dmabuf_fd = ft_hmem_no_get_dmabuf_fd,
.put_dmabuf_fd = ft_hmem_no_put_dmabuf_fd,
.get_dmabuf_fd = ft_rocr_get_dmabuf_fd,
.put_dmabuf_fd = ft_rocr_put_dmabuf_fd,
},
[FI_HMEM_ZE] = {
.init = ft_ze_init,
Expand Down Expand Up @@ -235,4 +235,4 @@ int ft_hmem_put_dmabuf_fd(enum fi_hmem_iface iface, int fd)
/*
 * Placeholder put_dmabuf_fd operation for interfaces that have no dmabuf
 * release routine. The descriptor is ignored.
 *
 * Returns -FI_ENOSYS unconditionally.
 */
int ft_hmem_no_put_dmabuf_fd(int fd)
{
	(void) fd;

	return -FI_ENOSYS;
}
}
51 changes: 51 additions & 0 deletions fabtests/common/hmem_rocr.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ struct rocr_ops {
hsa_status_t (*hsa_memory_free)(void *ptr);
hsa_status_t (*hsa_amd_memory_fill)(void* ptr, uint32_t value,
size_t count);
hsa_status_t (*hsa_amd_portable_export_dmabuf)(const void *ptr, size_t size,
int *dmabuf, uint64_t *offset);
hsa_status_t (*hsa_amd_portable_close_dmabuf)(int dmabuf);
};

static struct rocr_ops rocr_ops;
Expand Down Expand Up @@ -200,6 +203,11 @@ int ft_rocr_init(void)
goto err_dlclose_rocr;
}

rocr_ops.hsa_amd_portable_export_dmabuf =
dlsym(rocr_handle, "hsa_amd_portable_export_dmabuf");
rocr_ops.hsa_amd_portable_close_dmabuf =
dlsym(rocr_handle, "hsa_amd_portable_close_dmabuf");

hsa_ret = rocr_ops.hsa_init();
if (hsa_ret != HSA_STATUS_SUCCESS) {
ROCR_ERR(hsa_ret, "hsa_init failed");
Expand Down Expand Up @@ -343,6 +351,38 @@ int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size)
return -FI_EIO;
}

/*
 * Export the memory region [buf, buf + len) as a dmabuf through the ROCr
 * runtime.
 *
 * buf            start of the region to export
 * len            size of the region in bytes
 * dmabuf_fd      passed through to hsa_amd_portable_export_dmabuf as its
 *                dmabuf output argument
 * dmabuf_offset  passed through to hsa_amd_portable_export_dmabuf as its
 *                offset output argument
 *
 * Returns FI_SUCCESS when the runtime call succeeds, -FI_EOPNOTSUPP when the
 * export entry point is not available in rocr_ops, and -FI_EIO when the
 * runtime call reports any other status.
 */
int ft_rocr_get_dmabuf_fd(void *buf, size_t len,
			  int *dmabuf_fd, uint64_t *dmabuf_offset)
{
	hsa_status_t hsa_ret;

	/* The function pointer is left NULL when the symbol was not found. */
	if (!rocr_ops.hsa_amd_portable_export_dmabuf)
		return -FI_EOPNOTSUPP;

	hsa_ret = rocr_ops.hsa_amd_portable_export_dmabuf(buf, len, dmabuf_fd,
							  dmabuf_offset);
	if (hsa_ret != HSA_STATUS_SUCCESS) {
		ROCR_ERR(hsa_ret, "hsa_amd_portable_export_dmabuf failed");
		return -FI_EIO;
	}

	return FI_SUCCESS;
}

/*
 * Hand a dmabuf file descriptor back to the ROCr runtime via
 * hsa_amd_portable_close_dmabuf.
 *
 * fd  descriptor to close
 *
 * Returns FI_SUCCESS when the runtime call succeeds, -FI_EOPNOTSUPP when the
 * close entry point is not available in rocr_ops, and -FI_EIO when the
 * runtime call reports any other status.
 */
int ft_rocr_put_dmabuf_fd(int fd)
{
	hsa_status_t hsa_ret;

	/* The function pointer is left NULL when the symbol was not found. */
	if (!rocr_ops.hsa_amd_portable_close_dmabuf)
		return -FI_EOPNOTSUPP;

	hsa_ret = rocr_ops.hsa_amd_portable_close_dmabuf(fd);
	if (hsa_ret != HSA_STATUS_SUCCESS) {
		ROCR_ERR(hsa_ret, "hsa_amd_portable_close_dmabuf failed");
		return -FI_EIO;
	}

	return FI_SUCCESS;
}

#else

int ft_rocr_init(void)
Expand Down Expand Up @@ -375,4 +415,15 @@ int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size)
return -FI_ENOSYS;
}

/*
 * Fallback used in the #else branch of the HAVE_ROCR_RUNTIME_H conditional.
 * No dmabuf can be exported here, so every argument is ignored.
 *
 * Returns -FI_ENOSYS unconditionally.
 */
int ft_rocr_get_dmabuf_fd(void *buf, size_t len,
			  int *dmabuf_fd, uint64_t *dmabuf_offset)
{
	(void) buf;
	(void) len;
	(void) dmabuf_fd;
	(void) dmabuf_offset;

	return -FI_ENOSYS;
}

/*
 * Fallback used in the #else branch of the HAVE_ROCR_RUNTIME_H conditional.
 * The descriptor is ignored.
 *
 * Returns -FI_ENOSYS unconditionally.
 */
int ft_rocr_put_dmabuf_fd(int fd)
{
	(void) fd;

	return -FI_ENOSYS;
}

#endif /* HAVE_ROCR_RUNTIME_H */
4 changes: 3 additions & 1 deletion fabtests/common/shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -3432,7 +3432,7 @@ void ft_usage(char *name, char *desc)
void ft_hmem_usage()
{
FT_PRINT_OPTS_USAGE("-D <device_iface>", "Specify device interface: "
"e.g. cuda, ze, neuron, synapseai (default: None). "
"e.g. cuda, ze, neuron, synapseai, rocr (default: None). "
"Automatically enables FI_HMEM (-H)");
FT_PRINT_OPTS_USAGE("-i <device_id>", "Specify which device to use (default: 0)");
FT_PRINT_OPTS_USAGE("-H", "Enable provider FI_HMEM support");
Expand Down Expand Up @@ -3594,6 +3594,8 @@ void ft_parse_hmem_opts(int op, char *optarg, struct ft_opts *opts)
opts->iface = FI_HMEM_CUDA;
else if (!strncasecmp("neuron", optarg, 6))
opts->iface = FI_HMEM_NEURON;
else if (!strncasecmp("rocr", optarg, 4))
opts->iface = FI_HMEM_ROCR;
else if (!strncasecmp("synapseai", optarg, 9)) {
opts->iface = FI_HMEM_SYNAPSEAI;
opts->options |= FT_OPT_REG_DMABUF_MR;
Expand Down
3 changes: 3 additions & 0 deletions fabtests/include/hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ int ft_rocr_alloc(uint64_t device, void **buf, size_t size);
int ft_rocr_free(void *buf);
int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size);
int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size);
/*
 * Export the region [buf, buf + len) as a dmabuf. The ROCr-enabled
 * implementation forwards fd and offset to hsa_amd_portable_export_dmabuf
 * and returns FI_SUCCESS, -FI_EOPNOTSUPP (entry point unavailable) or
 * -FI_EIO (runtime call failed); the fallback implementation returns
 * -FI_ENOSYS.
 */
int ft_rocr_get_dmabuf_fd(void *buf, size_t len,
int *fd, uint64_t *offset);
/*
 * Close a dmabuf descriptor through hsa_amd_portable_close_dmabuf. Return
 * codes mirror ft_rocr_get_dmabuf_fd.
 */
int ft_rocr_put_dmabuf_fd(int fd);

int ft_neuron_init(void);
int ft_neuron_cleanup(void);
Expand Down
32 changes: 29 additions & 3 deletions fabtests/pytest/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,32 @@ def wait_until_neuron_device_available(ip, device_id):
raise RuntimeError("Error: neuron device {} is not available after {} tries".format(device_id, maxtry))


@functools.lru_cache(10)
@retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
def num_rocr_devices(ip):
    """Count the device rows printed by ``rocm-smi --alldevices`` on a host.

    The command is run over ssh against ``ip`` with a 60 second timeout.
    The retry decorator is configured with stop_max_attempt_number=3 and
    wait_fixed=5000 for exceptions accepted by is_ssh_connection_error, and
    results are memoized by functools.lru_cache(10).

    Args:
        ip: host to ssh into.

    Returns:
        Number of output lines whose first non-blank character is a digit.

    Raises:
        SshConnectionError: if stderr contains an ssh connection error
            message, as judged by has_ssh_connection_err_msg.
    """
    completed = run("ssh {} rocm-smi --alldevices".format(ip), shell=True,
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                    timeout=60, encoding="utf-8")

    if has_ssh_connection_err_msg(completed.stderr):
        raise SshConnectionError()

    # Rows that begin with a digit are taken to be device entries, the digit
    # being the device number.
    stripped_rows = (row.strip() for row in completed.stdout.split("\n"))
    return sum(1 for row in stripped_rows if row and row[0].isdigit())


def num_hmem_devices(ip, hmem_type):
function_table = {
"cuda" : num_cuda_devices,
"neuron" : num_neuron_devices
"neuron" : num_neuron_devices,
"rocr": num_rocr_devices,
}

if hmem_type not in function_table:
Expand All @@ -141,6 +163,10 @@ def has_neuron(ip):
return num_neuron_devices(ip) > 0


def has_rocr(ip):
    """Return True when num_rocr_devices reports at least one device on ``ip``."""
    device_count = num_rocr_devices(ip)
    return device_count > 0


@retry(retry_on_exception=is_ssh_connection_error, stop_max_attempt_number=3, wait_fixed=5000)
def has_hmem_support(cmdline_args, ip):
binpath = cmdline_args.binpath or ""
Expand Down Expand Up @@ -444,7 +470,7 @@ def prepare_base_command(self, command_type, executable,
if host_memory_type == "host":
return command, additional_env # default addtional environment variable

assert host_memory_type == "cuda" or host_memory_type == "neuron"
assert host_memory_type in ["cuda", "neuron", "rocr"]

if not has_hmem_support(self._cmdline_args, host_ip):
pytest.skip("no hmem support")
Expand All @@ -464,7 +490,7 @@ def prepare_base_command(self, command_type, executable,
else:
hmem_device_id = 0

if host_memory_type == "cuda":
if host_memory_type in ["cuda", "rocr"]:
command += " -i {}".format(hmem_device_id)
else:
assert host_memory_type == "neuron"
Expand Down
3 changes: 3 additions & 0 deletions fabtests/pytest/efa/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@
pytest.param("cuda_to_cuda", marks=pytest.mark.cuda_memory),
pytest.param("host_to_neuron", marks=pytest.mark.neuron_memory),
pytest.param("neuron_to_neuron", marks=pytest.mark.neuron_memory),
pytest.param("host_to_rocr", marks=pytest.mark.rocr_memory),
pytest.param("rocr_to_rocr", marks=pytest.mark.rocr_memory),
]

# Add more memory types that are useful for uni-directional tests.
memory_type_list_all = memory_type_list_bi_dir + [
pytest.param("cuda_to_host", marks=pytest.mark.cuda_memory),
pytest.param("neuron_to_host", marks=pytest.mark.neuron_memory),
pytest.param("rocr_to_host", marks=pytest.mark.rocr_memory),
]

@pytest.fixture(scope="module", params=memory_type_list_all)
Expand Down
1 change: 1 addition & 0 deletions fabtests/pytest/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ markers =
ubertest_verify: ubertest tests run with verify config
cuda_memory: testing with cuda device memory direct
neuron_memory: testing with neuron device memory
rocr_memory: testing with ROCr device memory
serial: test must be run in serial mode
unstable: test is unstable and only run when the marker is specified.
junit_suite_name = fabtests
Expand Down
2 changes: 2 additions & 0 deletions include/ofi_hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ int rocr_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src,
int rocr_hmem_get_dmabuf_fd(const void *addr, uint64_t size, int *dmabuf_fd,
uint64_t *offset);
int rocr_hmem_put_dmabuf_fd(int fd);
void *rocr_alloc(size_t size);
void rocr_free(void *ptr);

int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size);
int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size);
Expand Down
1 change: 1 addition & 0 deletions include/rdma/fi_domain.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ struct fi_mr_attr {
int ze;
int neuron;
int synapseai;
int rocr;
} device;
void *hmem_data;
size_t page_size;
Expand Down
4 changes: 4 additions & 0 deletions man/fi_mr.3.md
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,7 @@ struct fi_mr_attr {
int ze;
int neuron;
int synapseai;
int rocr;
} device;
void *hmem_data;
size_t page_size;
Expand Down Expand Up @@ -809,6 +810,9 @@ field is determined by the value specified through iface.
*synapseai*
: For FI_HMEM_SYNAPSEAI, the device identifier for Habana Gaudi hardware.

*rocr*
: For FI_HMEM_ROCR, the device index for an AMD GPU.

## hmem_data
The hmem_data field is reserved for future use and must be null.

Expand Down
Loading