Commit 086fd3e

Fix/fix broadcast overlap with isp (#64)
1 parent ce31407 commit 086fd3e

6 files changed: +191 -132 lines changed

internlm/core/communication/isp.py

Lines changed: 52 additions & 13 deletions
@@ -3,7 +3,7 @@

 from dataclasses import dataclass
 from functools import partial
-from typing import Any, Dict, List, Union
+from typing import Any, Callable, Dict, List, Union

 import torch
 from torch import distributed as dist
@@ -135,7 +135,8 @@ def __init__(self) -> None:
         self.last_ckpt_block: nn.Module = None
         self.isp_outs: List[nn.Module] = []
         self.isp_modules: List[nn.Module] = []
-        self.index_to_isp_module: Dict[int, nn.Module] = {}
+        self.index_to_isp_modules: Dict[int, nn.Module] = {}
+        self.index_to_block: Dict[int, nn.Module] = {}
         self.module_to_index: Dict[nn.Module, int] = {}
         self.weight_global_handle: Dict[str, Any] = {}
         self.weight_global_output: Dict[str, torch.Tensor] = {}
@@ -163,6 +164,7 @@ def __init__(
         self.is_forward = True
         self.reduce_scatter_handlers = {}
         self._module_shapes = {}
+        self._forward_prefetch_prerequisites = []

         # real overlap state for each chunk.
         self._overlap_states: Dict[int, ISPOverlapState] = {}
@@ -186,7 +188,9 @@ def __init__(
         # key: isp module; value: transformer block index
         self._module_to_index = None
         # key: transformer block index; value: isp modules
-        self._index_to_isp_module = None
+        self._index_to_isp_modules = None
+        # key: transformer block index; value: transformer block
+        self._index_to_block = None

         # init overlap states if necessary.
         if self.overlap:
@@ -228,7 +232,8 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None:
                 ]

                 for idx, block in enumerate(children):
-                    self._overlap_states[cid].index_to_isp_module[idx] = []
+                    self._overlap_states[cid].index_to_isp_modules[idx] = []
+                    self._overlap_states[cid].index_to_block[idx] = block
                     for sub_name, sub in block.named_children():
                         for name, child in sub.named_children():
                             if name in ["out_proj", "wo"]:
@@ -243,7 +248,7 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None:
                                     self._module_shapes[name] = torch.Size(origin_shape)
                                 self._overlap_states[cid].module_to_index[child] = idx
                                 self._overlap_states[cid].isp_modules.append(child)
-                                self._overlap_states[cid].index_to_isp_module[idx].append(child)
+                                self._overlap_states[cid].index_to_isp_modules[idx].append(child)

                                 setattr(child, "isp_name", name)

@@ -260,7 +265,7 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None:
                         f"{full_name}.bias",
                     )

-        self._overlap_states[cid].num_blocks = len(self._overlap_states[cid].index_to_isp_module)
+        self._overlap_states[cid].num_blocks = len(self._overlap_states[cid].index_to_isp_modules)

     def _all_gather_module_weight(self, module):
         with_bias = module.bias is not None
@@ -307,7 +312,15 @@ def _all_gather_module_weight(self, module):
         self._weight_global_output[module] = weight_output

     def _all_gather_block_weight(self, block_index: int):
-        for module in self._index_to_isp_module[block_index]:
+        block = self._index_to_block[block_index]
+
+        # wait for prerequisite conditions
+        if self.is_forward:
+            for callback in self._forward_prefetch_prerequisites:
+                callback(block)
+
+        # prefetch parameters for all isp modules of the block
+        for module in self._index_to_isp_modules[block_index]:
             self._all_gather_module_weight(module)

     def _wait_handle(self, module):
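
To illustrate the new prefetch path in _all_gather_block_weight above, here is a minimal standalone sketch. FakeCommunicator, the block/module names, and the print calls are invented stand-ins rather than code from this commit; the real ISPCommunicator all-gathers module weights over a process group instead of printing.

# Minimal standalone sketch of the new prefetch ordering, assuming invented names.
class FakeCommunicator:
    def __init__(self):
        self.is_forward = True
        self._forward_prefetch_prerequisites = []
        self._index_to_block = {0: "block_0"}
        self._index_to_isp_modules = {0: ["wqkv", "out_proj"]}

    def _all_gather_module_weight(self, module):
        print(f"all-gather weight of {module}")

    def _all_gather_block_weight(self, block_index):
        block = self._index_to_block[block_index]
        # prerequisite callbacks run first, and only on the forward pass
        if self.is_forward:
            for callback in self._forward_prefetch_prerequisites:
                callback(block)
        # then every isp module of the block is prefetched
        for module in self._index_to_isp_modules[block_index]:
            self._all_gather_module_weight(module)

comm = FakeCommunicator()
comm._forward_prefetch_prerequisites.append(lambda blk: print(f"prerequisite satisfied for {blk}"))
comm._all_gather_block_weight(0)
# prints the prerequisite message first, then the two all-gather lines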
@@ -358,7 +371,7 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis
         self._wait_handle(module)

     def _pre_forward_hook_for_block(self, *args):  # pylint: disable=W0613
-        for module in self._index_to_isp_module[self._ckpt_block_num - 1]:
+        for module in self._index_to_isp_modules[self._ckpt_block_num - 1]:
             self._all_gather_module_weight(module)

     def _post_forward_hook_for_module(self, module: nn.Module, *args):  # pylint: disable=W0613
@@ -446,13 +459,41 @@ def switch_current_model_chunk(self, chunk_id: int) -> None:
         self._weight_global_output = self._overlap_states[chunk_id].weight_global_output
         self._bias_global_output = self._overlap_states[chunk_id].bias_global_output
         self._module_to_index = self._overlap_states[chunk_id].module_to_index
-        self._index_to_isp_module = self._overlap_states[chunk_id].index_to_isp_module
+        self._index_to_isp_modules = self._overlap_states[chunk_id].index_to_isp_modules
+        self._index_to_block = self._overlap_states[chunk_id].index_to_block
         self._ckpt_block_num = self._overlap_states[chunk_id].ckpt_block_num
         self._last_ckpt_block = self._overlap_states[chunk_id].last_ckpt_block
         self._head = self._overlap_states[chunk_id].head
         self._embedding = self._overlap_states[chunk_id].embedding
         self._num_blocks = self._overlap_states[chunk_id].num_blocks

+    def register_prerequisite_for_forward_prefetch_hooks(self, prerequisite_func: Callable) -> None:
+        """
+        Registers a callback function that specifies a prerequisite condition for
+        prefetching parameters before forward computation.
+
+        This method allows users to define custom logic that must be satisfied before
+        parameters are fetched for the next forward pass. This can be useful for
+        implementing complex parameter update strategies or for coordinating
+        parameter access with other system components.
+
+        Args:
+            prerequisite_func (Callable): A callable that represents the prerequisite
+                                          condition. It is invoked with the transformer
+                                          block whose parameters are about to be prefetched,
+                                          and should return only once that condition holds.
+
+        Returns:
+            None: This method does not return any value.
+
+        Raises:
+            TypeError: If the provided 'prerequisite_func' is not callable.
+        """
+        if not callable(prerequisite_func):
+            raise TypeError("The provided prerequisite function must be callable.")
+
+        self._forward_prefetch_prerequisites.append(prerequisite_func)
+
     # communication operation interfaces

     def all_gather(self, tensor: torch.Tensor, module: nn.Module, is_bias: bool = False):
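
As a usage note, the sketch below mimics only the registration contract of register_prerequisite_for_forward_prefetch_hooks; the module-level list stands in for the communicator's _forward_prefetch_prerequisites and is not code from this commit.

# Standalone mimic of the registration contract, assuming a module-level stand-in list.
from typing import Callable, List

forward_prefetch_prerequisites: List[Callable] = []

def register_prerequisite_for_forward_prefetch_hooks(prerequisite_func: Callable) -> None:
    if not callable(prerequisite_func):
        raise TypeError("The provided prerequisite function must be callable.")
    forward_prefetch_prerequisites.append(prerequisite_func)

register_prerequisite_for_forward_prefetch_hooks(lambda block: None)  # accepted
try:
    register_prerequisite_for_forward_prefetch_hooks("not callable")  # rejected
except TypeError as exc:
    print(exc)  # The provided prerequisite function must be callable.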
@@ -521,8 +562,7 @@ def __init__(self, overlap_handler: ISPCommunicator, zero_optim) -> None:
         self._zero_optim = zero_optim

     def before_forward(self, scheduler, inputs) -> None:
-        if self._isp_communicator._ckpt_block_num > 0:
-            self._isp_communicator.is_forward = True
+        self._isp_communicator.is_forward = True
         # switch model chunk before forward
         chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank
         self._isp_communicator.switch_current_model_chunk(chunk_id)
@@ -537,8 +577,7 @@ def after_criterion(self, scheduler, loss) -> None:
         pass

     def before_backward(self, scheduler, outputs, outputs_grad) -> None:
-        if self._isp_communicator._ckpt_block_num > 0:
-            self._isp_communicator.is_forward = False
+        self._isp_communicator.is_forward = False
         # switch model chunk before backward
         chunk_id = 0 if gpc.virtual_pipeline_parallel_rank is None else gpc.virtual_pipeline_parallel_rank
         self._isp_communicator.switch_current_model_chunk(chunk_id)

internlm/core/communication/utils.py

Lines changed: 107 additions & 1 deletion
@@ -1,12 +1,19 @@
 # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/communication

-from typing import List, Tuple, Union
+from collections import OrderedDict
+from typing import Dict, List, Tuple, Union

 import torch
 import torch.distributed as dist
+from flash_attn.modules.embedding import ParallelGPT2Embeddings
+from torch import nn

+from internlm.core.communication.isp import ISPCommunicator
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.core.naive_amp import NaiveAMPModel
+from internlm.model.embedding import Embedding1D
+from internlm.model.linear import BaseScaleColumnParallelLinear
 from internlm.utils.common import get_current_device

 TensorShape = Union[torch.Size, List[int], Tuple[int]]
@@ -123,3 +130,102 @@ def gather_split_1d_tensor(tensor: torch.Tensor) -> torch.Tensor:
     chunks = [gathered[i * numel : (i + 1) * numel] for i in range(world_size)]
     dist.all_gather(chunks, tensor, group=gpc.get_group(ParallelMode.TENSOR))
     return gathered
+
+
+class ParamAsyncBcastHandler:
+    """
+    Model partition handler for overlapping parameter broadcast with forward computation.
+    """
+
+    def __init__(
+        self, zero1_mode: ParallelMode, model: Union[nn.Module, nn.ModuleList], isp_communicator: ISPCommunicator = None
+    ) -> None:
+        self._block_to_param: Dict[nn.Module, List[nn.Parameter]] = OrderedDict()
+        self._param_to_rank: Dict[nn.Parameter, int] = {}
+        self._block_to_rank: Dict[nn.Module, List[int]] = {}
+        self._bcast_handles: Dict[int, List[dist.Work]] = {}
+
+        zero1_size = gpc.get_world_size(zero1_mode)
+        total_param_num = sum(p.numel() for p in model.parameters())
+        avg_param_num = total_param_num * 1.0 // zero1_size
+
+        # initialize an empty list for _bcast_handles of each rank
+        self._bcast_handles = {rank: [] for rank in range(zero1_size)}
+
+        # just want to share the same for loop for ModuleList and Module
+        if not isinstance(model, nn.ModuleList):
+            model = [model]
+
+        # map parameters to their transformer/embedding/head/norm block
+        for _chunk in model:
+            if isinstance(_chunk, NaiveAMPModel):
+                _chunk = _chunk.model
+
+            for _, children in _chunk.named_children():
+                # should be the transformer block definition in modeling_xxx.py
+                if isinstance(children, nn.ModuleList):
+                    # record the block that a parameter belongs to
+                    for _, block in enumerate(children):
+                        # self._block_to_param[f"{name}.{idx}"] = list(block.parameters())
+                        self._block_to_param[block] = list(block.parameters())
+                else:
+                    # record the block that a parameter belongs to
+                    # self._block_to_param[name] = list(children.parameters())
+                    self._block_to_param[children] = list(children.parameters())
+
+        alloc_num = 0
+        rank_to_go = 0

+        # process the parameters in block_to_param sequentially,
+        # allocating each parameter to a local rank of ParallelMode.ZERO1.
+        # NOTE that we do NOT consider the following scenarios:
+        # 1) whether a parameter is trainable;
+        # 2) parameters may be in different optimizer groups
+        for block, params in self._block_to_param.items():
+            # allocate a model block to a local rank of ParallelMode.ZERO1
+            self._block_to_rank[block] = [rank_to_go]
+            for p in params:
+                alloc_num = alloc_num + p.numel()
+                # in this case, allocate the param to the next rank if possible
+                if alloc_num > avg_param_num * 1.01 and rank_to_go < zero1_size - 1:
+                    rank_to_go = rank_to_go + 1
+                    alloc_num = 0
+                    self._block_to_rank[block].append(rank_to_go)
+                # allocate a parameter to a local rank of ParallelMode.ZERO1
+                self._param_to_rank[p] = rank_to_go
+
+        # register_forward_pre_hook for transformer/embedding/norm/xxx block
+        self._register_sync_parameters_hook(isp_communicator)
+
+    def _register_sync_parameters_hook(self, isp_communicator: ISPCommunicator = None) -> None:
+        def _pre_forward_hook(model: nn.Module, *args, **kwargs):  # pylint: disable=W0613
+            bcast_handles = []
+            # gather all required broadcast handles into a list
+            for rank in self._block_to_rank[model]:
+                bcast_handles.extend(self._bcast_handles[rank])
+                # need to clear _bcast_handles since they would be processed later
+                self._bcast_handles[rank] = []
+            # wait for all required broadcast handles to be completed
+            for handle in bcast_handles:
+                handle.wait()
+
+        # register_forward_pre_hook for transformer/embedding/norm/xxx block
+        for block, _ in self._block_to_rank.items():
+            # TODO: remove special handling for embedding and head layers,
+            # instead implement support for weight parallelism of embedding and head layers within the ISP.
+
+            # NOTE: Although the layernorm layer does not have explicit processing,
+            # both ISPCommunicator and ParamAsyncBcastHandler treat transformer blocks as the granularity,
+            # so everything is fine.
+            if isp_communicator is None or isinstance(
+                block, (Embedding1D, ParallelGPT2Embeddings, BaseScaleColumnParallelLinear)
+            ):
+                block.register_forward_pre_hook(_pre_forward_hook)
+            else:
+                isp_communicator.register_prerequisite_for_forward_prefetch_hooks(_pre_forward_hook)
+
+    def get_rank_by_param(self, param) -> int:
+        return self._param_to_rank[param]
+
+    def add_bcast_handle(self, rank, handle) -> None:
+        self._bcast_handles[rank].append(handle)
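
The greedy allocation in __init__ above can be illustrated with a small standalone sketch. The block names and parameter sizes are invented; the real handler walks block.parameters() and uses p.numel().

# Standalone sketch of the greedy parameter-to-rank allocation, with invented sizes.
blocks = {
    "embedding": [120],
    "block0": [60, 60],
    "block1": [60, 60],
    "block2": [60, 60],
    "head": [100],
}
zero1_size = 4
total_param_num = sum(sum(sizes) for sizes in blocks.values())
avg_param_num = total_param_num / zero1_size

block_to_rank = {}
alloc_num, rank_to_go = 0, 0
for block, param_sizes in blocks.items():
    # a block starts on the current rank and may spill over onto the next one
    block_to_rank[block] = [rank_to_go]
    for numel in param_sizes:
        alloc_num += numel
        if alloc_num > avg_param_num * 1.01 and rank_to_go < zero1_size - 1:
            rank_to_go += 1
            alloc_num = 0
            block_to_rank[block].append(rank_to_go)

print(block_to_rank)
# {'embedding': [0], 'block0': [0, 1], 'block1': [1, 2], 'block2': [2], 'head': [2, 3]}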

internlm/solver/optimizer/hybrid_zero_optim.py

Lines changed: 25 additions & 21 deletions
@@ -3,12 +3,14 @@

 import math
 from functools import partial
+from itertools import product
 from typing import List, Optional

 import torch
 import torch.distributed as dist
 from torch.optim import Optimizer

+from internlm.core.communication.utils import ParamAsyncBcastHandler
 from internlm.core.context import IS_REPLICA_ZERO_PARALLEL, Config, ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.core.context.parallel_context import (
@@ -26,7 +28,6 @@
 )
 from internlm.solver.optimizer.utils import (
     DynamicGradScaler,
-    ParamBcastSyncHandler,
     flatten,
     get_grad_accumulate_object,
     has_inf_or_nan,
@@ -66,7 +67,7 @@ def __init__(
         cpu_offload=False,
         grad_scal_cfg: Config = None,
         zero_cfg: Config = None,
-        param_bcast_sync_handler: ParamBcastSyncHandler = None,
+        param_bcast_sync_handler: ParamAsyncBcastHandler = None,
         isp_communicator=None,
     ):
         # DynamicGradScaler related args
@@ -1053,26 +1054,29 @@ def _step(self, closure=None, norms=None):
     def broadcast_params(self):
         handles = []

-        for group_id in range(self.num_param_groups):
-            for rank in range(self._zero_world_size[group_id]):
-                # The following operations are performed only on the rank to which parameters are assigned.
-                if rank in self.param_group_no_params_ranks[group_id]:
-                    continue
-                fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
-                # grank = gpc.get_ranks_in_group(group_type)[rank]  # need to convert to the global rank
-                # assert grank == rank, f"{grank} == {rank}"
-                g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode[group_id])[rank]
-                handle = dist.broadcast(
-                    fp16_param,
-                    src=g_rank,
-                    group=gpc.get_group(self._broadcast_parallel_mode[group_id]),
-                    async_op=True,
-                )
+        # traverse by rank first, which helps overlap the broadcast communication.
+        for rank, group_id in product(range(max(self._zero_world_size)), range(self.num_param_groups)):
+            # skip ranks that are not in this parameter group.
+            if rank >= self._zero_world_size[group_id]:
+                continue
+            # The following operations are performed only on the rank to which parameters are assigned.
+            if rank in self.param_group_no_params_ranks[group_id]:
+                continue
+            fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
+            # grank = gpc.get_ranks_in_group(group_type)[rank]  # need to convert to the global rank
+            # assert grank == rank, f"{grank} == {rank}"
+            g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode[group_id])[rank]
+            handle = dist.broadcast(
+                fp16_param,
+                src=g_rank,
+                group=gpc.get_group(self._broadcast_parallel_mode[group_id]),
+                async_op=True,
+            )

-                if self._overlap_sync_param:
-                    self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
-                else:
-                    handles.append(handle)
+            if self._overlap_sync_param:
+                self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
+            else:
+                handles.append(handle)

         for handle in handles:
             handle.wait()
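
A small sketch of the traversal-order change in broadcast_params, using invented world sizes. The idea appears to be that issuing one rank's broadcasts across all parameter groups back to back lets ParamAsyncBcastHandler collect and wait on that rank's handles while later ranks' broadcasts are still in flight.

# Sketch of old (group-first) vs new (rank-first) iteration order; values are invented.
from itertools import product

zero_world_size = [4, 2]  # per-parameter-group ZeRO-1 world sizes
num_param_groups = len(zero_world_size)

old_order = [
    (rank, group_id)
    for group_id in range(num_param_groups)
    for rank in range(zero_world_size[group_id])
]
new_order = [
    (rank, group_id)
    for rank, group_id in product(range(max(zero_world_size)), range(num_param_groups))
    if rank < zero_world_size[group_id]  # skip ranks not present in this group
]

print(old_order)  # [(0, 0), (1, 0), (2, 0), (3, 0), (0, 1), (1, 1)]
print(new_order)  # [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (3, 0)]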
