From 2118a14c389509b7d9ce2cdddc708c720e7d8fb9 Mon Sep 17 00:00:00 2001
From: aurelion-source
Date: Sat, 1 Feb 2025 03:31:27 +0000
Subject: [PATCH] - Updated transformer_engine requirements
 - Import megablocks only if num_experts > 1

---
 megatron/model/transformer.py                   | 5 +++--
 requirements/requirements-transformerengine.txt | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 509e6f176..d738c60c6 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -26,8 +26,6 @@
 from pkg_resources import packaging
 from importlib.metadata import version
 
-from megatron.model.moe import ParallelDroplessMoE
-
 from .norms import get_norm
 from megatron import mpu
 from megatron.model.fused_softmax import FusedScaleMaskSoftmax
@@ -947,6 +945,9 @@ def __init__(
             else 1
         )
 
+        if self.num_experts > 1:
+            from megatron.model.moe import ParallelDroplessMoE
+
         if self.gpt_j_residual:
             # GPT-J style layers allow us to defer the reduction of results across TP ranks until the end of the two sublayers.
             # the reduction we use is a simple allreduce for pure Tensor Parallel,
diff --git a/requirements/requirements-transformerengine.txt b/requirements/requirements-transformerengine.txt
index 10a1f3b82..3202ef4f2 100644
--- a/requirements/requirements-transformerengine.txt
+++ b/requirements/requirements-transformerengine.txt
@@ -1 +1,2 @@
-transformer-engine[pytorch]
+transformer-engine[pytorch]==1.12
+flash-attn==2.6.3
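
Note (not part of the patch): a minimal, self-contained sketch of the deferred-import pattern the transformer.py hunk introduces. ParallelDroplessMoE and megatron.model.moe are the real names from the diff; ToyTransformerLayer and the surrounding structure are simplified assumptions for illustration only.

# Minimal sketch of the conditional (lazy) import applied in the patch above.
# megablocks, pulled in transitively via megatron.model.moe, is only imported
# when experts are actually configured, so dense-only runs do not require it.
class ToyTransformerLayer:  # hypothetical stand-in for the real layer class
    def __init__(self, num_experts: int = 1):
        self.num_experts = num_experts
        self.moe_cls = None

        if self.num_experts > 1:
            # Deferred import: only reached for MoE configurations.
            from megatron.model.moe import ParallelDroplessMoE

            self.moe_cls = ParallelDroplessMoE


# A dense layer (num_experts == 1) never triggers the megablocks import:
layer = ToyTransformerLayer(num_experts=1)
assert layer.moe_cls is None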