
Commit

fix token grad norm with tp (InternLM#547)
JiaoPL authored Dec 18, 2023
1 parent 513ebb9 commit de53b17
Showing 1 changed file with 7 additions and 3 deletions.
internlm/solver/optimizer/utils.py (10 changes: 7 additions & 3 deletions)
@@ -242,7 +242,7 @@ def append_grad(g, p):
         elif only_output:
             param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding"
             if (
-                gpc.config.model["vocab_size"] == g.shape[0]
+                gpc.config.model["vocab_size"] == g.shape[0] * gpc.get_world_size(ParallelMode.TENSOR)
                 and gpc.config.model["hidden_size"] == g.shape[1]
                 and "embedding" not in param_name.lower()
             ):
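
The condition above identifies the output-head gradient by shape. Under tensor parallelism the output head is sharded along the vocab dimension, so each rank's gradient only holds vocab_size / tp_world_size rows; comparing against the global vocab_size therefore requires scaling the local row count back up by the TP world size. A minimal sketch of that shape relationship (the concrete numbers below are illustrative assumptions, not values from the commit):

```python
# Illustrative shapes only; vocab_size, hidden_size and the TP degree are
# assumed example values, not taken from the diff.
vocab_size = 103168          # global vocabulary size
hidden_size = 4096           # model hidden size
tp_world_size = 4            # tensor-parallel world size

# With vocab-parallel sharding, each TP rank holds one slice of the
# output-head weight, so its gradient has this local shape:
local_grad_shape = (vocab_size // tp_world_size, hidden_size)

# The fixed check compares the global vocab size against the local
# row count scaled by the TP world size, mirroring the hunk above.
assert vocab_size == local_grad_shape[0] * tp_world_size
assert hidden_size == local_grad_shape[1]
```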
@@ -393,9 +393,13 @@ def compute_vocab_grad_norm(
     if param_grads:
         for grad in param_grads:
             # get grad norm of each vocab
-            for i in range(vocab_size):
+            vocab_slice_size = grad.shape[0]
+            local_tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
+            for i in range(vocab_slice_size):
                 cur_vocab_grad_norm = get_norm([grad[i, :]], norm_type, enable_cuda_kernels)[0]
-                vocab_grad_norm[i] += get_tensor_norm(cur_vocab_grad_norm, move_to_cuda=True)
+                vocab_grad_norm[i + vocab_slice_size * local_tp_rank] += get_tensor_norm(
+                    cur_vocab_grad_norm, move_to_cuda=True
+                )
 
     if last_stage is False:
         return vocab_grad_norm
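
The second hunk fixes how per-token norms are written back. Each rank iterates only over its own vocab slice, so indexing vocab_grad_norm with the local index i would make every rank write into the same leading rows; offsetting by vocab_slice_size * local_tp_rank places each rank's contribution at its global vocab position instead. A small standalone sketch of that index mapping, simulating the TP ranks in a loop (the tensors, sizes, and the plain L2 accumulation are illustrative assumptions, not the InternLM implementation):

```python
import torch

# Assumed toy sizes for illustration only.
vocab_size = 8
hidden_size = 4
tp_world_size = 2
vocab_slice_size = vocab_size // tp_world_size   # rows held by each TP rank

# One buffer covering the full vocabulary, as in compute_vocab_grad_norm().
vocab_grad_norm = torch.zeros(vocab_size)

# In the real code every TP rank runs this body for its own shard;
# here the ranks are simulated sequentially.
for local_tp_rank in range(tp_world_size):
    local_grad = torch.randn(vocab_slice_size, hidden_size)  # this rank's shard
    for i in range(vocab_slice_size):
        # Map the local row index to its global vocab index so that
        # different ranks no longer overwrite the same slots.
        global_i = i + vocab_slice_size * local_tp_rank
        vocab_grad_norm[global_i] += local_grad[i, :].norm(2)
```

In the actual code each rank only fills its own slice of the buffer, and the per-rank buffers are presumably combined across the tensor-parallel group later in the optimizer path; the sketch only demonstrates the index offset itself.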
