-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Description
System Info
File "/mnt/data/verl/verl-main-1120/verl/single_controller/ray/base.py", line 700, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/data/verl/verl-main-1120/verl/single_controller/base/decorator.py", line 442, in inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/mnt/data/verl/verl-main-1120/verl/utils/transferqueue_utils.py", line 199, in dummy_inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/mnt/data/verl/verl-main-1120/verl/workers/fsdp_workers.py", line 1100, in load_checkpoint
self.checkpoint_manager.load_checkpoint(
File "/mnt/data/verl/verl-main-1120/verl/utils/checkpoint/fsdp_checkpoint_manager.py", line 138, in load_checkpoint
self.model.load_state_dict(model_state_dict)
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 2604, in load_state_dict
load(self, state_dict)
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 2575, in load
module._load_from_state_dict(
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 2382, in _load_from_state_dict
hook(
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 88, in __call__
return self.hook(module, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_state_dict_utils.py", line 839, in _pre_load_state_dict_hook
_pre_load_state_dict_hook_fn[fsdp_state._state_dict_type](
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_state_dict_utils.py", line 659, in _sharded_pre_load_state_dict_hook
local_tensor = _ext_all_gather_dtensor(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_fsdp_extensions.py", line 179, in _ext_all_gather_dtensor
return all_gather_dtensor_fn(tensor, parent_mesh)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_shard_utils.py", line 132, in _all_gather_dtensor
tensor = tensor.redistribute(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/_api.py", line 556, in redistribute
return Redistribute.apply(
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/autograd/function.py", line 576, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/_redistribute.py", line 321, in forward
output = redistribute_local_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/_redistribute.py", line 213, in redistribute_local_tensor
new_local_tensor = current_placement._to_replicate_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/placement_types.py", line 251, in _to_replicate_tensor
result = funcol.all_gather_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_functional_collectives.py", line 203, in all_gather_tensor
group_name = _resolve_group_name(group, tag)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_functional_collectives.py", line 783, in _resolve_group_name
return dmesh._dim_group_names[dim]
^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'DeviceMesh' object has no attribute '_dim_group_names'. Did you mean: '_dim_group_infos'?. Did you mean: '_return_value'?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
AttributeError: 'DeviceMesh' object has no attribute '_dim_group_names'.
Expected behavior
Loading the FSDP checkpoint (`load_checkpoint`) should complete without raising an error.