Skip to content

AttributeError: 'DeviceMesh' object has no attribute '_dim_group_names'. #4208

@dachengai

Description

@dachengai

System Info

File "/mnt/data/verl/verl-main-1120/verl/single_controller/ray/base.py", line 700, in func
return getattr(self.worker_dict[key], name)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/data/verl/verl-main-1120/verl/single_controller/base/decorator.py", line 442, in inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/mnt/data/verl/verl-main-1120/verl/utils/transferqueue_utils.py", line 199, in dummy_inner
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/mnt/data/verl/verl-main-1120/verl/workers/fsdp_workers.py", line 1100, in load_checkpoint
self.checkpoint_manager.load_checkpoint(
File "/mnt/data/verl/verl-main-1120/verl/utils/checkpoint/fsdp_checkpoint_manager.py", line 138, in load_checkpoint
self.model.load_state_dict(model_state_dict)
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 2604, in load_state_dict
load(self, state_dict)
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 2575, in load
module._load_from_state_dict(
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 2382, in _load_from_state_dict
hook(
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 88, in __call__
return self.hook(module, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_state_dict_utils.py", line 839, in _pre_load_state_dict_hook
_pre_load_state_dict_hook_fn[fsdp_state._state_dict_type](
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_state_dict_utils.py", line 659, in _sharded_pre_load_state_dict_hook
local_tensor = _ext_all_gather_dtensor(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_fsdp_extensions.py", line 179, in _ext_all_gather_dtensor
return all_gather_dtensor_fn(tensor, parent_mesh)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/fsdp/_shard_utils.py", line 132, in _all_gather_dtensor
tensor = tensor.redistribute(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/_api.py", line 556, in redistribute
return Redistribute.apply(
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/autograd/function.py", line 576, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/_redistribute.py", line 321, in forward
output = redistribute_local_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/_redistribute.py", line 213, in redistribute_local_tensor
new_local_tensor = current_placement._to_replicate_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/tensor/placement_types.py", line 251, in _to_replicate_tensor
result = funcol.all_gather_tensor(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_functional_collectives.py", line 203, in all_gather_tensor
group_name = _resolve_group_name(group, tag)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/distributed/_functional_collectives.py", line 783, in _resolve_group_name
return dmesh._dim_group_names[dim]
^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'DeviceMesh' object has no attribute '_dim_group_names'. Did you mean: '_dim_group_infos'?. Did you mean: '_return_value'?

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

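Loading an FSDP checkpoint (the `load_checkpoint` call in `verl/workers/fsdp_workers.py`, which ends up in `self.model.load_state_dict(model_state_dict)`) fails with: AttributeError: 'DeviceMesh' object has no attribute '_dim_group_names'.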

Expected behavior

The checkpoint loads successfully: `load_state_dict` completes without raising an error.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions