You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
[2024-09-05 09:40:49,421] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use).
[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
Traceback (most recent call last):
File "/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 194, in <module>
train()
File "/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 124, in train
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 119, in __init__
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1442, in __post_init__
and (self.device.type != "cuda")
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1887, in device
return self._setup_devices
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1813, in _setup_devices
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/accelerate/state.py", line 180, in __init__
dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/comm.py", line 670, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 120, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 146, in init_process_group
torch.distributed.init_process_group(backend,
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 900, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 245, in _env_rendezvous_handler
store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 176, in _create_c10d_store
return TCPStore(
RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[2024-09-05 09:40:50,578] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777243
[2024-09-05 09:40:50,579] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777244
[2024-09-05 09:40:50,632] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777245
[2024-09-05 09:40:50,685] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777246
The text was updated successfully, but these errors were encountered:
zhangyaoyue01
changed the title
执行微调脚本报错The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use)
多卡执行微调脚本报错The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use)
Sep 5, 2024
[2024-09-05 09:40:49,421] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use).
[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
Traceback (most recent call last):
File "/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 194, in <module>
train()
File "/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 124, in train
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 119, in __init__
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1442, in __post_init__
and (self.device.type != "cuda")
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1887, in device
return self._setup_devices
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1813, in _setup_devices
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/accelerate/state.py", line 180, in __init__
dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/comm.py", line 670, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 120, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 146, in init_process_group
torch.distributed.init_process_group(backend,
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 900, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 245, in _env_rendezvous_handler
store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
File "/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 176, in _create_c10d_store
return TCPStore(
RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[2024-09-05 09:40:50,578] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777243
[2024-09-05 09:40:50,579] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777244
[2024-09-05 09:40:50,632] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777245
[2024-09-05 09:40:50,685] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777246
The text was updated successfully, but these errors were encountered: