It seems like mixing the NCCL (GPU) backend and MPI in the same process might trigger an invalid group reference inside OpenMPI, but I'm not sure whether this is expected behavior or a PyTorch bug. When I create an MPI subgroup via torch.distributed.new_group after initializing the default process group with NCCL, the program crashes with a segmentation fault.
Here’s the code and the OpenMPI call stack:
Code:
test_mpi_p2p.py
import torch
import torch.distributed as dist

def run():
    # Initialize the global NCCL process group
    # (torchrun sets rank/world_size via environment variables)
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Bind this process to a GPU device
    device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")

    # Create a local MPI subgroup (the MPI backend can only operate on CPU tensors)
    local_ranks = [0, 1]  # custom local group
    mpi_group = dist.new_group(ranks=local_ranks, backend='mpi')

    # GPU tensors use NCCL for global communication
    N = 1 << 20
    send_gpu = torch.full((N,), float(rank + 1), device=device)
    recv_gpu = torch.empty_like(send_gpu)
    dst = (rank + 1) % world_size
    src = (rank - 1 + world_size) % world_size
    req_send = dist.isend(tensor=send_gpu, dst=dst)
    req_recv = dist.irecv(tensor=recv_gpu, src=src)
    req_send.wait()
    req_recv.wait()
    print(f'Rank {rank} received sample (GPU NCCL): {recv_gpu[:5].cpu().numpy()}')

    # CPU tensors use the local MPI group
    send_cpu = torch.full((5,), float(rank + 10))
    recv_cpu = torch.empty_like(send_cpu)
    dist.send(tensor=send_cpu, dst=dst, group=mpi_group)
    dist.recv(tensor=recv_cpu, src=src, group=mpi_group)
    print(f'Rank {rank} received sample (CPU MPI): {recv_cpu.numpy()}')

    dist.destroy_process_group()

if __name__ == '__main__':
    run()
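A typical launch (any two-rank torchrun invocation should do, given local_ranks = [0, 1]; the exact command below is an assumption):

# assumed launch: two ranks on a single node
torchrun --nproc_per_node=2 test_mpi_p2p.py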
OpenMPI version: 4.1.7
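Since the backtrace below reaches ProcessGroupMPI::createProcessGroupMPI, PyTorch is clearly built with MPI support; as a quick sanity check (a minimal sketch, nothing more):

import torch.distributed as dist
print(dist.is_mpi_available())  # expected to print True when PyTorch is built against MPI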
Call stack:
Core was generated by `/usr/bin/python -u test_mpi_p2p_new.py'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0 ompi_group_get_proc_name (rank=1, group=0x479f4e80) at ../ompi/group/group.h:405
warning: 405 ../ompi/group/group.h: No such file or directory
[Current thread is 1 (Thread 0x7f2f2bb74300 (LWP 14892))]
(gdb) bt
#0 ompi_group_get_proc_name (rank=1, group=0x479f4e80) at ../ompi/group/group.h:405
#1 ompi_dpm_group_is_dyn (thisjobid=1035599873, group=0x479f4e80) at dpm/dpm.c:1268
#2 ompi_dpm_mark_dyncomm (comm=comm@entry=0x437f8830) at dpm/dpm.c:1299
#3 0x00007f2f2b2ee331 in ompi_comm_set_nb (ncomm=ncomm@entry=0x7ffe90d3da60, oldcomm=oldcomm@entry=0x7f2f2b3c8ec0 <ompi_mpi_comm_world>, local_size=2, local_size@entry=0, local_ranks=local_ranks@entry=0x0, remote_size=remote_size@entry=0, remote_ranks=remote_ranks@entry=0x0, attr=0x0, errh=0x7f2f2b3c9de0 <ompi_mpi_errors_are_fatal>, copy_topocomponent=false,
local_group=0x479f4e80, remote_group=0x0, req=0x7ffe90d3d9f0) at communicator/comm.c:215
#4 0x00007f2f2b2ee669 in ompi_comm_set (ncomm=ncomm@entry=0x7ffe90d3da60, oldcomm=oldcomm@entry=0x7f2f2b3c8ec0 <ompi_mpi_comm_world>, local_size=local_size@entry=0, local_ranks=local_ranks@entry=0x0, remote_size=remote_size@entry=0, remote_ranks=remote_ranks@entry=0x0, attr=0x0, errh=0x7f2f2b3c9de0 <ompi_mpi_errors_are_fatal>, copy_topocomponent=false,
local_group=0x479f4e80, remote_group=0x0) at communicator/comm.c:116
#5 0x00007f2f2b2eee9b in ompi_comm_create (comm=comm@entry=0x7f2f2b3c8ec0 <ompi_mpi_comm_world>, group=0x479f4e80, newcomm=0x7ffe90d3db28) at communicator/comm.c:344
#6 0x00007f2f2b326a8a in PMPI_Comm_create (comm=0x7f2f2b3c8ec0 <ompi_mpi_comm_world>, group=<optimized out>, newcomm=<optimized out>) at pcomm_create.c:66
#7 0x00007f2f17842859 in c10d::ProcessGroupMPI::createProcessGroupMPI(std::vector<int, std::allocator<int> >) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so
#8 0x00007f2f1fc36d63 in pybind11::cpp_function::initialize<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object*, _object*)::{lambda(std::vector<int, std::allocator<int> >)#99}, c10::intrusive_ptr<c10d::ProcessGroupMPI, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroupMPI> >, std::vector<int, std::allocator<int> >, pybind11::name, pybind11::scope, pybind11::sibling, pybind11::call_guard<pybind11::gil_scoped_release> >(torch::distributed::c10d::(anonymous namespace)::c10d_init(_object*, _object*)::{lambda(std::vector<int, std::allocator<int> >)#99}&&, c10::intrusive_ptr<c10d::ProcessGroupMPI, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroupMPI> > (*)(std::vector<int, std::allocator<int> >), pybind11::name const&, pybind11::scope const&, pybind11::sibling const&, pybind11::call_guard<pybind11::gil_scoped_release> const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN(pybind11::detail::function_call&) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so
#9 0x00007f2f1f36b37d in pybind11::cpp_function::dispatcher(_object*, _object*, _object*) () from /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so
#10 0x000000000058208f in ?? ()
#11 0x0000000000549185 in _PyObject_MakeTpCall ()
#12 0x00000000005d73c9 in _PyEval_EvalFrameDefault ()
#13 0x00000000005d58eb in PyEval_EvalCode ()
#14 0x0000000000608b42 in ?? ()
#15 0x00000000006b4e93 in ?? ()
#16 0x00000000006b4bfa in _PyRun_SimpleFileObject ()
#17 0x00000000006b4a2f in _PyRun_AnyFileObject ()
#18 0x00000000006bca95 in Py_RunMain ()
#19 0x00000000006bc57d in Py_BytesMain ()
#20 0x00007f2f2bb9f1ca in __libc_start_call_main (main=main@entry=0x5189b0, argc=argc@entry=3, argv=argv@entry=0x7ffe90d3e588) at ../sysdeps/nptl/libc_start_call_main.h:58
#21 0x00007f2f2bb9f28b in __libc_start_main_impl (main=0x5189b0, argc=3, argv=0x7ffe90d3e588, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7ffe90d3e578) at ../csu/libc-start.c:360
#22 0x0000000000657ce5 in _start ()
The crash occurs at ../ompi/group/group.h:405, in the following code:
#398 static inline opal_process_name_t ompi_group_get_proc_name(ompi_group_t *group, int rank)
#399 {
#400     ompi_proc_t *proc = ompi_group_get_proc_ptr_raw(group, rank);  // get the pointer to the process at the given rank in the group
#401     if (ompi_proc_is_sentinel(proc)) {                   // check whether this is a sentinel (an encoded name, not a real pointer)
#402         return ompi_proc_sentinel_to_name((intptr_t) proc);  // convert the sentinel to a process name
#403     }
#404
#405     return proc->super.proc_name;                        // crash: proc is dereferenced here
#406 }
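Based on the backtrace, the crash happens inside dist.new_group itself (frame #7, ProcessGroupMPI::createProcessGroupMPI), before any send/recv is issued. A minimal sketch that should reproduce it, assuming the p2p traffic is indeed irrelevant (I have not re-run this reduced form):

import torch.distributed as dist

# Default group uses NCCL only, as in the full script above.
dist.init_process_group(backend="nccl")

# This call reaches MPI_Comm_create via ProcessGroupMPI::createProcessGroupMPI
# (frames #6-#7 in the backtrace) and is where the segfault occurs.
mpi_group = dist.new_group(ranks=[0, 1], backend="mpi")

dist.destroy_process_group()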