From de53b17506d97ba4e8f46ead33e781076e5182ff Mon Sep 17 00:00:00 2001
From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com>
Date: Mon, 18 Dec 2023 18:33:28 +0800
Subject: [PATCH] fix token grad norm with tp (#547)

---
 internlm/solver/optimizer/utils.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/internlm/solver/optimizer/utils.py b/internlm/solver/optimizer/utils.py
index db9eefa4..c3bdd789 100644
--- a/internlm/solver/optimizer/utils.py
+++ b/internlm/solver/optimizer/utils.py
@@ -242,7 +242,7 @@ def append_grad(g, p):
         elif only_output:
             param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding"
             if (
-                gpc.config.model["vocab_size"] == g.shape[0]
+                gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(ParallelMode.TENSOR)
                 and gpc.config.model["hidden_size"] == g.shape[1]
                 and "embedding" not in param_name.lower()
             ):
@@ -393,9 +393,13 @@ def compute_vocab_grad_norm(
     if param_grads:
         for grad in param_grads:
             # get grad norm of each vocab
-            for i in range(vocab_size):
+            vocab_slice_size = grad.shape[0]
+            local_tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
+            for i in range(vocab_slice_size):
                 cur_vocab_grad_norm = get_norm([grad[i, :]], norm_type, enable_cuda_kernels)[0]
-                vocab_grad_norm[i] += get_tensor_norm(cur_vocab_grad_norm, move_to_cuda=True)
+                vocab_grad_norm[i + vocab_slice_size * local_tp_rank] += get_tensor_norm(
+                    cur_vocab_grad_norm, move_to_cuda=True
+                )

     if last_stage is False:
         return vocab_grad_norm
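
Illustrative sketch (not part of the patch): under tensor parallelism the output head's gradient holds only a slice of the vocabulary rows, so a local row index i on TP rank r corresponds to global vocab index i + vocab_slice_size * r, which is the mapping the patch uses to accumulate per-token grad norms into the full-size buffer. The toy sizes and variable names below are assumptions chosen to mirror the patch, not code from internlm.

# Minimal sketch of the local-to-global vocab index mapping, assuming an
# evenly split, contiguous vocab partition across TP ranks.
import torch

vocab_size = 8        # full vocabulary size (assumed toy value)
tp_world_size = 2     # tensor-parallel world size (assumed toy value)
hidden_size = 4

vocab_grad_norm = torch.zeros(vocab_size)
for local_tp_rank in range(tp_world_size):
    # each rank holds a contiguous slice of the vocab dimension, so
    # grad.shape[0] * tp_world_size == vocab_size (the patched check)
    grad = torch.randn(vocab_size // tp_world_size, hidden_size)
    vocab_slice_size = grad.shape[0]
    for i in range(vocab_slice_size):
        global_index = i + vocab_slice_size * local_tp_rank
        # rank 0 covers rows 0..3, rank 1 covers rows 4..7
        vocab_grad_norm[global_index] += grad[i, :].norm(2) ** 2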