Commit 5ad2eb0

fix(pp): fix pp get tensor shape err and layernorm input dtype err (#378)
1 parent ae2243c commit 5ad2eb0

File tree

6 files changed: +13 -9 lines changed

internlm/core/scheduler/pipeline_scheduler_1f1b.py

Lines changed: 5 additions & 1 deletion

@@ -35,7 +35,11 @@ def get_tensor_shape():
     if not gpc.is_initialized(ParallelMode.PIPELINE):
         return None
 
-    if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
+    if (
+        hasattr(gpc.config.data, "seq_len")
+        and hasattr(gpc.config.data, "micro_bsz")
+        and hasattr(gpc.config.model, "hidden_size")
+    ):
         if gpc.config.data.use_packed_dataset and gpc.is_evaluating is False:
             if gpc.config.parallel.sequence_parallel:
                 sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)

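Why the guard changed: the old check looked up legacy uppercase keys (gpc.config.SEQ_LEN, gpc.config.HIDDEN_SIZE) on the top-level config, while the current config keeps these values as gpc.config.data.seq_len and gpc.config.model.hidden_size, so the branch that derives the pipeline tensor shape could be skipped entirely. Below is a minimal, illustrative sketch of the corrected guard, assuming the usual (micro_bsz, seq_len, hidden_size) activation layout; it is not the repository function and omits the packed-dataset and sequence-parallel branches shown above.

# Illustrative sketch only: derive a pipeline communication shape from the
# nested config, mirroring the corrected attribute check above.
def sketch_get_tensor_shape(config):
    if (
        hasattr(config.data, "seq_len")
        and hasattr(config.data, "micro_bsz")
        and hasattr(config.model, "hidden_size")
    ):
        # assumed (batch, sequence, hidden) layout for inter-stage activations
        return (config.data.micro_bsz, config.data.seq_len, config.model.hidden_size)
    return None
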
internlm/model/modeling_internlm.py

Lines changed: 2 additions & 2 deletions

@@ -195,7 +195,7 @@ def _forward(self, hidden_states, *args, **kwargs):
         def _dropout_and_norm_attn(_hidden_states):
             _dropped = self.dropout1(_hidden_states)
             _residual = _dropped
-            _hidden_states = self.norm1(_residual.float())
+            _hidden_states = self.norm1(_residual.to(self.norm1.weight.dtype))
             return _residual, _hidden_states
 
         if self.dropout_selective_checkpoint:
@@ -212,7 +212,7 @@ def _dropout_and_norm_attn(_hidden_states):
         def _dropout_and_norm_ffn(_residual, _hidden_states):
             _dropped = self.dropout2(_hidden_states)
             _residual = (_dropped + _residual) if _residual is not None else _dropped
-            _hidden_states = self.norm2(_residual.float())
+            _hidden_states = self.norm2(_residual.to(self.norm2.weight.dtype))
             return _residual, _hidden_states
 
         if self.dropout_selective_checkpoint:

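In this file and the modeling files below, the residual was previously upcast to float32 (via .float() or .to(torch.float32)) before the norm, while under mixed-precision training the norm's own weight stays in the low-precision dtype; casting the input to the weight's dtype keeps input and parameters consistent instead of hard-coding float32. A minimal sketch of the pattern, using a plain torch.nn.LayerNorm as a stand-in (an assumption; the repository's norm layers may be fused or RMS norms):

import torch
import torch.nn as nn

# Stand-in norm whose parameters live in bf16, as they would under
# mixed-precision training.
norm = nn.LayerNorm(4096).to(torch.bfloat16)
residual = torch.randn(2, 4096, dtype=torch.bfloat16)

# Before: the residual was upcast to fp32, so input and weight dtypes diverged.
# hidden = norm(residual.float())

# After: cast the input to whatever dtype the norm's weight actually has,
# so the two agree regardless of the training precision.
hidden = norm(residual.to(norm.weight.dtype))
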
internlm/model/modeling_internlm2.py

Lines changed: 1 addition & 1 deletion

@@ -257,7 +257,7 @@ def _dropout_and_norm_attn(_residual, _hidden_states):
         def _dropout_and_norm_ffn(_residual, _hidden_states):
             _dropped = self.dropout2(_hidden_states)
             _residual = (_dropped + _residual) if _residual is not None else _dropped
-            _hidden_states = self.ffn_norm(_residual.to(torch.float32))
+            _hidden_states = self.ffn_norm(_residual.to(self.ffn_norm.weight.dtype))
 
             return _residual, _hidden_states

internlm/model/modeling_llama.py

Lines changed: 1 addition & 1 deletion

@@ -246,7 +246,7 @@ def _dropout_and_norm_attn(_residual, _hidden_states):
         def _dropout_and_norm_ffn(_residual, _hidden_states):
             _dropped = self.dropout2(_hidden_states)
             _residual = (_dropped + _residual) if _residual is not None else _dropped
-            _hidden_states = self.ffn_norm(_residual.to(torch.float32))
+            _hidden_states = self.ffn_norm(_residual.to(self.ffn_norm.weight.dtype))
 
             return _residual, _hidden_states

internlm/model/modeling_mixtral.py

Lines changed: 2 additions & 2 deletions

@@ -214,7 +214,7 @@ def _forward(self, hidden_states, *args, **kwargs):
         def _dropout_and_norm_attn(_hidden_states):
             _dropped = self.dropout1(_hidden_states)
             _residual = _dropped
-            _hidden_states = self.norm1(_residual.float())
+            _hidden_states = self.norm1(_residual.to(self.norm1.weight.dtype))
             return _residual, _hidden_states
 
         if self.dropout_selective_checkpoint:
@@ -231,7 +231,7 @@ def _dropout_and_norm_attn(_hidden_states):
         def _dropout_and_norm_ffn(_residual, _hidden_states):
             _dropped = self.dropout2(_hidden_states)
             _residual = (_dropped + _residual) if _residual is not None else _dropped
-            _hidden_states = self.norm2(_residual.float())
+            _hidden_states = self.norm2(_residual.to(self.norm2.weight.dtype))
             return _residual, _hidden_states
 
         if self.dropout_selective_checkpoint:

internlm/model/modeling_moe.py

Lines changed: 2 additions & 2 deletions

@@ -205,7 +205,7 @@ def _forward(self, hidden_states, *args, **kwargs):
         def _dropout_and_norm_attn(_hidden_states):
             _dropped = self.dropout1(_hidden_states)
             _residual = _dropped
-            _hidden_states = self.norm1(_residual.float())
+            _hidden_states = self.norm1(_residual.to(self.norm1.weight.dtype))
             return _residual, _hidden_states
 
         if self.dropout_selective_checkpoint:
@@ -222,7 +222,7 @@ def _dropout_and_norm_attn(_hidden_states):
         def _dropout_and_norm_ffn(_residual, _hidden_states):
             _dropped = self.dropout2(_hidden_states)
             _residual = (_dropped + _residual) if _residual is not None else _dropped
-            _hidden_states = self.norm2(_residual.float())
+            _hidden_states = self.norm2(_residual.to(self.norm2.weight.dtype))
             return _residual, _hidden_states
 
         if self.dropout_selective_checkpoint:
