Merge branch 'cherry-pick-df324ee9' into 'core_r0.7.0'
Merge branch 'gaod/moe/add_geglu_to_expertlayer' into 'main'

See merge request ADLR/megatron-lm!1462
jaredcasper committed May 30, 2024
2 parents d5d2a44 + e2fbaf7 commit e2627c7
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions megatron/core/transformer/moe/experts.py
@@ -39,13 +39,13 @@ def __init__(self, num_local_experts: int, config: TransformerConfig):
 
         self.expert_parallel = config.expert_model_parallel_size > 1
         if self.config.gated_linear_unit:
-            if self.config.activation_func != F.silu:
-                raise ValueError("Activation function must be silu when using GroupedMLP.")
+            if self.config.activation_func not in (F.silu, F.gelu):
+                raise ValueError("Activation function must be silu or gelu when using GroupedMLP.")
 
             @jit_fuser
             def glu(x):
                 x = torch.chunk(x, 2, dim=-1)
-                return F.silu(x[0]) * x[1]
+                return self.config.activation_func(x[0]) * x[1]
 
             self.activation_func = glu
         else:
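For context, the sketch below is a standalone illustration of what the `glu` closure in this diff computes; it is not Megatron-LM code, and the helper name `gated_activation` is made up for this example. The projection output is split in two along the last dimension, the configured activation is applied to the first half, and the result multiplicatively gates the second half. With `F.silu` this is SwiGLU (the only option before this commit); with `F.gelu`, newly permitted here, it is GEGLU.

```python
# Minimal sketch of a gated linear unit, assuming a plain PyTorch setting.
# Not Megatron-LM code: `gated_activation` is a hypothetical helper for illustration.
import torch
import torch.nn.functional as F


def gated_activation(x: torch.Tensor, activation_func=F.silu) -> torch.Tensor:
    """Split x in half along the last dim and return activation(x1) * x2."""
    x1, x2 = torch.chunk(x, 2, dim=-1)
    return activation_func(x1) * x2


if __name__ == "__main__":
    x = torch.randn(4, 8)                       # even last dimension so the split is clean
    swiglu_out = gated_activation(x, F.silu)    # allowed both before and after this commit
    geglu_out = gated_activation(x, F.gelu)     # newly permitted by this commit (GEGLU)
    print(swiglu_out.shape, geglu_out.shape)    # both torch.Size([4, 4])
```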
