
Commit

Updated dmoe config

- Fixed merge issues
- Fixed moe sequence parallelism bug
- Added assertions for zero stages 2 and 3 with moe
- Updated moe requirements
chris-warner-II committed Jan 31, 2025
1 parent 7b9679a commit 7861d66
Showing 5 changed files with 32 additions and 22 deletions.
38 changes: 24 additions & 14 deletions configs/125M-dmoe.yml
@@ -2,18 +2,20 @@
 {
   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
   # across the node boundaries )
-  "pipe_parallel_size": 2, # MoE supports PP
-  "model_parallel_size": 2, # MoE uses model parallel group to split both experts and attention weights
+  "pipe_parallel_size": 1, # MoE supports PP
+  "model_parallel_size": 1, # MoE uses model parallel group to split both experts and attention weights

   # model settings
   "num_layers": 12,
-  "hidden_size": 1024,
-  "num_attention_heads": 16,
+  "hidden_size": 768,
+  "num_attention_heads": 12,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
   "norm": "layernorm",
   "pos_emb": "rotary",
   "no_weight_tying": true,
+  "gpt_j_residual": false,
+  "output_layer_parallelism": "column",

   # moe settings
   "moe_num_experts": 8,
@@ -24,19 +26,24 @@
   "rope_fusion": false,
   "layernorm_fusion": false,

+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",

   # optimizer settings
   "optimizer": {
     "type": "Adam",
     "params": {
       "lr": 0.0006,
-      "betas": [0.9, 0.999],
+      "betas": [0.9, 0.95],
       "eps": 1.0e-8,
     }
   },
+  "min_lr": 0.00006,

   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
   "zero_optimization": {
-    "stage": 0,
+    "stage": 1,
     "allgather_partitions": True,
     "allgather_bucket_size": 500000000,
     "overlap_comm": True,
@@ -48,7 +55,6 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data_impl": "mmap",
-  "split": "949,50,1",

   # activation checkpointing
   "checkpoint_activations": true,
@@ -58,26 +64,30 @@

   # regularization
   "gradient_clipping": 1.0,
-  "weight_decay": 0.0,
+  "weight_decay": 0.1,
   "hidden_dropout": 0.0,
   "attention_dropout": 0.0,

   "precision": "bfloat16",

   "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32

   # misc. training settings
-  "train_iters": 5,
+  "train_iters": 320000,
+  "lr_decay_iters": 320000,
   "distributed_backend": "nccl",
-  "min_lr": 0.0006,
-  "warmup": 0.0,
+  "lr_decay_style": "cosine",
+  "warmup": 0.1,
   "checkpoint_factor": 10000,
   "eval_interval": 1000,
   "eval_iters": 10,

   # logging
-  "log_interval": 1,
-  "steps_per_print": 1,
+  "log_interval": 100,
+  "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true,

+  # networking
+  "hostfile": "/mock_path"
 }
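
Note: the schedule-related values (peak lr 0.0006, min_lr 0.00006, cosine decay over 320000 iterations, warmup 0.1) are easiest to read together. The sketch below is a rough illustration of how these knobs interact, assuming linear warmup to the peak LR over a fraction of lr_decay_iters followed by cosine decay to min_lr; it is not the GPT-NeoX scheduler itself.

    import math

    # Illustration of the schedule implied by the updated config values.
    LR, MIN_LR = 0.0006, 0.00006            # "lr" / "min_lr"
    DECAY_ITERS, WARMUP_FRAC = 320000, 0.1  # "lr_decay_iters" / "warmup" (assumed to be a fraction of decay iters)

    def lr_at(step):
        warmup_steps = int(WARMUP_FRAC * DECAY_ITERS)
        if step < warmup_steps:
            return LR * step / max(warmup_steps, 1)  # linear warmup to the peak LR
        progress = min(1.0, (step - warmup_steps) / max(DECAY_ITERS - warmup_steps, 1))
        return MIN_LR + 0.5 * (LR - MIN_LR) * (1 + math.cos(math.pi * progress))  # cosine down to min_lr

    for s in (0, 32000, 160000, 320000):
        print(s, f"{lr_at(s):.2e}")  # 0 at step 0, 6e-4 right after warmup, 6e-5 at the end

With both parallel sizes dropped to 1, the config should also run on a single GPU for a quick smoke test; training is typically launched through the repo's deepy.py wrapper, e.g. "python ./deepy.py train.py configs/125M-dmoe.yml configs/local_setup.yml" (exact data/path config depends on your setup).
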
7 changes: 1 addition & 6 deletions megatron/model/transformer.py
@@ -928,6 +928,7 @@ def __init__(

        super().__init__()
        self.layer_number = layer_number
+       self.neox_args = neox_args

        norm, eps = get_norm(neox_args)

@@ -1014,12 +1015,6 @@ def get_te_lnmlp(**kw):
                **kw,
            )

-       self.num_experts = (
-           neox_args.moe_num_experts
-           if layer_number % neox_args.expert_interval == 0
-           else 1
-       )
-
        if self.num_experts <= 1:
            if neox_args.te_layernorm_mlp:
                self.mlp = get_te_lnmlp()
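
Aside: the deleted block is presumably a duplicate left over from the merge; self.num_experts is still read a few lines below, so it must now be assigned once, earlier in __init__. The removed expression is still a handy summary of the placement rule, sketched below with the updated config's values and an assumed expert_interval of 2 (that parameter is not part of this diff).

    # Which layers host experts, per the de-duplicated expression above: a layer gets
    # moe_num_experts experts when its index is divisible by expert_interval, and
    # behaves as a plain dense MLP layer (1 "expert") otherwise.
    moe_num_experts = 8   # from the updated 125M-dmoe.yml
    num_layers = 12       # from the updated 125M-dmoe.yml
    expert_interval = 2   # assumption for illustration; not set in this diff

    experts_per_layer = [
        moe_num_experts if layer % expert_interval == 0 else 1
        for layer in range(num_layers)
    ]
    print(experts_per_layer)  # [8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1]
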
1 change: 1 addition & 0 deletions megatron/mpu/__init__.py
@@ -39,6 +39,7 @@
from .initialize import get_expert_token_counts_for_rank
from .initialize import initialize_model_parallel
from .initialize import model_parallel_is_initialized
+from .initialize import get_fp32_allreduce

from .layers import ColumnParallelLinear
from .layers import RowParallelLinear
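
Note: re-exporting get_fp32_allreduce from megatron.mpu presumably lets the MoE / sequence-parallel code check whether gradient all-reduces should be done in fp32 (cf. the "fp32_allreduce": True setting and its bf16 caveat in the config above), which lines up with the sequence-parallelism fix mentioned in the commit message.
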
4 changes: 4 additions & 0 deletions megatron/neox_arguments/arguments.py
@@ -1084,6 +1084,10 @@ def calculate_derived(self):
        # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
        self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)

+       # MoE config
+       if self.moe_num_experts > 1:
+           assert self.zero_optimization["stage"] < 2, "MoE is not compatible with zero stages 2 and 3"
+
        # Attention config
        if self.attention_config is None:
            self.update_value("attention_config", [[["global"], self.num_layers]])
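
The new check fails fast at argument-derivation time instead of partway into a run. A minimal standalone sketch of the rule it enforces (not the NeoX code path itself, just the condition):

    def check_moe_zero_compat(moe_num_experts, zero_stage):
        # Mirrors the new derived-config assertion: expert parallelism is only
        # allowed with ZeRO stages 0 and 1.
        if moe_num_experts > 1:
            assert zero_stage < 2, "MoE is not compatible with zero stages 2 and 3"

    check_moe_zero_compat(moe_num_experts=8, zero_stage=1)    # updated 125M-dmoe.yml (stage 1): passes
    # check_moe_zero_compat(moe_num_experts=8, zero_stage=2)  # would raise AssertionError
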
4 changes: 2 additions & 2 deletions requirements/requirements-moe.txt
@@ -1,2 +1,2 @@
-grouped-gemm==0.1.4
-megablocks==0.5.1
+grouped-gemm==0.1.6
+megablocks==0.7.0
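
To pick up the new pins after this change, the MoE extras can simply be reinstalled with "pip install -r requirements/requirements-moe.txt".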
