From 7861d6640ccf6b0c4f5fdcad3ee6905298ef952a Mon Sep 17 00:00:00 2001
From: chris warner
Date: Fri, 31 Jan 2025 20:43:29 +0000
Subject: [PATCH] - Updated dmoe config - Fixed merge issues - Fixed moe
 sequence parallelism bug - Added assertions for zero stages 2 and 3 with
 moe - Updated moe requirements

---
 configs/125M-dmoe.yml                | 38 ++++++++++++++++++----------
 megatron/model/transformer.py        |  7 +----
 megatron/mpu/__init__.py             |  1 +
 megatron/neox_arguments/arguments.py |  4 +++
 requirements/requirements-moe.txt    |  4 +--
 5 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/configs/125M-dmoe.yml b/configs/125M-dmoe.yml
index e712fc847..3ca28d29f 100644
--- a/configs/125M-dmoe.yml
+++ b/configs/125M-dmoe.yml
@@ -2,18 +2,20 @@
 {
   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
   # across the node boundaries )
-  "pipe_parallel_size": 2, # MoE supports PP
-  "model_parallel_size": 2, # MoE uses model parallel group to split both experts and attention weights
+  "pipe_parallel_size": 1, # MoE supports PP
+  "model_parallel_size": 1, # MoE uses model parallel group to split both experts and attention weights
 
   # model settings
   "num_layers": 12,
-  "hidden_size": 1024,
-  "num_attention_heads": 16,
+  "hidden_size": 768,
+  "num_attention_heads": 12,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
   "norm": "layernorm",
   "pos_emb": "rotary",
   "no_weight_tying": true,
+  "gpt_j_residual": false,
+  "output_layer_parallelism": "column",
 
   # moe settings
   "moe_num_experts": 8,
@@ -24,19 +26,24 @@
 
   "rope_fusion": false,
   "layernorm_fusion": false,
-
+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+
   # optimizer settings
   "optimizer": {
     "type": "Adam",
     "params": {
       "lr": 0.0006,
-      "betas": [0.9, 0.999],
+      "betas": [0.9, 0.95],
       "eps": 1.0e-8,
     }
   },
+  "min_lr": 0.00006,
+
   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
   "zero_optimization": {
-    "stage": 0,
+    "stage": 1,
     "allgather_partitions": True,
     "allgather_bucket_size": 500000000,
     "overlap_comm": True,
@@ -48,7 +55,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data_impl": "mmap",
-  "split": "949,50,1",
 
   # activation checkpointing
   "checkpoint_activations": true,
@@ -58,26 +64,30 @@
 
   # regularization
   "gradient_clipping": 1.0,
-  "weight_decay": 0.0,
+  "weight_decay": 0.1,
   "hidden_dropout": 0.0,
   "attention_dropout": 0.0,
 
   "precision": "bfloat16",
 
   "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
+
   # misc. training settings
-  "train_iters": 5,
+  "train_iters": 320000,
   "lr_decay_iters": 320000,
   "distributed_backend": "nccl",
-  "min_lr": 0.0006,
-  "warmup": 0.0,
+  "lr_decay_style": "cosine",
+  "warmup": 0.1,
   "checkpoint_factor": 10000,
   "eval_interval": 1000,
   "eval_iters": 10,
 
   # logging
-  "log_interval": 1,
-  "steps_per_print": 1,
+  "log_interval": 100,
+  "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true,
+
+  # networking
+  "hostfile": "/mock_path"
 }
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 9a2b16406..509e6f176 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -928,6 +928,7 @@ def __init__(
 
         super().__init__()
         self.layer_number = layer_number
+        self.neox_args = neox_args
 
         norm, eps = get_norm(neox_args)
 
@@ -1014,12 +1015,6 @@ def get_te_lnmlp(**kw):
                 **kw,
             )
 
-        self.num_experts = (
-            neox_args.moe_num_experts
-            if layer_number % neox_args.expert_interval == 0
-            else 1
-        )
-
         if self.num_experts <= 1:
             if neox_args.te_layernorm_mlp:
                 self.mlp = get_te_lnmlp()
diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py
index 318326e5f..b30b6d2b1 100644
--- a/megatron/mpu/__init__.py
+++ b/megatron/mpu/__init__.py
@@ -39,6 +39,7 @@
 from .initialize import get_expert_token_counts_for_rank
 from .initialize import initialize_model_parallel
 from .initialize import model_parallel_is_initialized
+from .initialize import get_fp32_allreduce
 
 from .layers import ColumnParallelLinear
 from .layers import RowParallelLinear
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 7cecccd75..a012868a0 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1084,6 +1084,10 @@ def calculate_derived(self):
         # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
         self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)
 
+        # MoE config
+        if self.moe_num_experts > 1:
+            assert self.zero_optimization["stage"] < 2, "MoE is not compatible with zero stages 2 and 3"
+
         # Attention config
         if self.attention_config is None:
             self.update_value("attention_config", [[["global"], self.num_layers]])
diff --git a/requirements/requirements-moe.txt b/requirements/requirements-moe.txt
index e75e5e9fd..a0627715c 100644
--- a/requirements/requirements-moe.txt
+++ b/requirements/requirements-moe.txt
@@ -1,2 +1,2 @@
-grouped-gemm==0.1.4
-megablocks==0.5.1
\ No newline at end of file
+grouped-gemm==0.1.6
+megablocks==0.7.0