Commit e8f5118

(feat/demo) add internlm2 1.8b config (#73)
1 parent 8d7b78e commit e8f5118

1 file changed: +76 -0

configs/_base_/models/internlm2_1B.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM2_PUBLIC"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 2048
NUM_ATTENTION_HEAD = 16
NUM_KV_ATTENTION_HEAD = 8
MULTIPLE_OF = 128
MLP_RATIO = 4
NUM_LAYER = 24

model = dict(
    num_chunks=1,  # if num_chunks > 1, the interleaved pipeline scheduler is used.
    checkpoint=0.2,  # the proportion of layers for activation checkpointing; valid values are True/False/[0-1]
    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    embed_split_hidden=True,
    num_layers=NUM_LAYER,
    hidden_size=HIDDEN_SIZE,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    num_attention_heads=NUM_ATTENTION_HEAD,
    num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
    mlp_ratio=MLP_RATIO,
    multiple_of=MULTIPLE_OF,
    norm_type="rmsnorm",
    adapt_hf=True,
    apply_post_layer_norm=False,
    no_bias=True,
    layer_norm_epsilon=1e-5,
    rope_base=1000000,
    norm_head=True,
)
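
# Illustrative note (not part of the original config): a rough parameter-count
# sketch for this setting, assuming a LLaMA-style GQA attention block, a gated
# (SwiGLU) MLP with intermediate size HIDDEN_SIZE * MLP_RATIO, and untied
# input/output embeddings:
#   embeddings: 2 * 92544 * 2048                       ~= 0.38B
#   attention per layer: (2 + 2 * 8 / 16) * 2048**2    ~= 12.6M  (wq, wk, wv, wo)
#   mlp per layer: 3 * 2048 * (2048 * 4)                ~= 50.3M  (three projection matrices)
#   total: 24 * ~62.9M + 0.38B                          ~= 1.9B, consistent with the "1.8b" naming.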

hybrid_zero_optimizer = dict(
    # enable low_level_optimizer communication overlap
    overlap_sync_grad=True,
    overlap_sync_param=False,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

"""
zero1 parallel (dict):
    1. size: int
        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
          so parameters will be divided within the range of dp.
        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
    1. size: int, the size of tensor parallel.
    2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
       defaults to 'mtp', which means pure megatron tensor parallel without sequence parallel.
       msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
       fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
       isp: custom intern sequence parallel without tensor parallel, can be used with weight parallel.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
       defaults to False.
weight parallel (dict):
    1. size: int, the size of weight parallel.
    2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
    3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
    zero1=dict(size=8),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True, memory_pool=True),
)
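
The layout in this commit uses no tensor, pipeline, or weight parallelism and shards optimizer states (ZeRO-1) across groups of 8 data-parallel ranks. As a hypothetical illustration of the options documented in the docstring above, and not part of this commit, an ISP-style layout with weight parallel might look like the following sketch (the group sizes are arbitrary example values):

parallel = dict(
    zero1=dict(size=-1),  # size <= 0: the zero group spans the whole data-parallel group
    tensor=dict(size=4, mode="isp"),  # intern sequence parallel, no megatron tensor parallel
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=4, overlap=True, memory_pool=True),  # weight parallel can be combined with isp
)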
