Commit a3a42fb

add llama 3.2 3b configs and files

1 parent 5df7a5d commit a3a42fb

6 files changed: +166 -7 lines

Diff for: submitit_train.py (+1, -1)

@@ -26,7 +26,7 @@
 for _ in range(1):
     # train_config = './train_configs/chemlactica_125m.toml'
     # train_config = './train_configs/chemlactica_1.3b.toml'
-    train_config = "./train_configs/llama3.2_1b.toml"
+    train_config = "./train_configs/llama3.2_3b.toml"
     # train_config = './train_configs/debug_model.toml'
     function = submitit.helpers.CommandFunction(
         [

Diff for: submitit_train_hparam_tuning.py (+2, -2)

@@ -26,8 +26,8 @@
     # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
     # "optimizer.lr": ["2.5e-4"],
     # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"],
-    "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
-    "training.steps": ["31000", "26000", "22.500", "20000"],
+    # "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
+    # "training.steps": ["31000", "26000", "22500", "20000"],
 }

 jobs = []

Diff for: torchtitan/models/llama/__init__.py (+11, -2)

@@ -37,7 +37,16 @@
         n_heads=32,
         n_kv_heads=8,
         rope_theta=500000,
-        share_embeddings=True
+        share_embeddings=True,
+    ),
+    "3B": ModelArgs(
+        dim=3072,
+        n_layers=28,
+        n_heads=24,
+        n_kv_heads=8,
+        rope_theta=500000,
+        ffn_dim_multiplier=2 / 3,  # in Llama3.2-3B dim is 3072, but ffn dim is 8192
+        share_embeddings=True,
     ),
     "8B": ModelArgs(
         dim=4096,

@@ -66,4 +75,4 @@
         multiple_of=4096,
         rope_theta=500000,
     ),
-}
+}
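
The new "3B" entry leans on ffn_dim_multiplier to reproduce Llama 3.2 3B's published FFN width. As a quick sanity check of the inline comment (a sketch only: the authoritative rounding lives in torchtitan's FeedForward code, and the 4 * dim base plus multiple_of round-up below are assumptions), 2/3 of 4 * 3072 lands on 8192:

# Hedged sketch: assumes hidden ≈ ffn_dim_multiplier * 4 * dim, rounded up to a
# multiple of `multiple_of`; torchtitan's FeedForward is the source of truth.
def approx_ffn_dim(dim: int, ffn_dim_multiplier: float, multiple_of: int = 256) -> int:
    hidden = int(ffn_dim_multiplier * 4 * dim)
    return multiple_of * ((hidden + multiple_of - 1) // multiple_of)  # round up

print(approx_ffn_dim(3072, 2 / 3))  # 8192, matching the Llama 3.2 3B intermediate size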

Diff for: train_configs/llama3.2_1b.toml (+2, -2)

@@ -30,11 +30,11 @@ lr = 6e-4

 [training]
 batch_size = 10
-gradient_accumulation_steps = 21
+gradient_accumulation_steps = 16
 seq_len = 2048
 warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 31000
+steps = 40000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
 compile = true
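
For context on the retune above: assuming tokens per optimizer step are roughly batch_size * gradient_accumulation_steps * seq_len per data-parallel rank (an assumption about how torchtitan composes these knobs, not something stated in the config), the change keeps the total per-rank token budget roughly flat while shrinking each step:

# Hedged sketch of the per-rank token budget implied by the 1B config change;
# assumes tokens/step ≈ batch_size * gradient_accumulation_steps * seq_len.
seq_len = 2048
old_run = 10 * 21 * seq_len * 31_000  # ≈ 13.3B tokens per data-parallel rank
new_run = 10 * 16 * seq_len * 40_000  # ≈ 13.1B tokens per data-parallel rank
print(f"{old_run / 1e9:.1f}B -> {new_run / 1e9:.1f}B tokens per rank")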

Diff for: train_configs/llama3.2_3b.toml (+76, new file)

@@ -0,0 +1,76 @@
+# torchtitan Config.toml
+
+[job]
+dump_folder = "/nfs/h100/raid/chem/checkpoints"
+description = "Llama 3.2 training"
+use_for_integration_test = false
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+enable_color_printing = true
+enable_aim = true
+save_aim_folder = "aim"
+
+[model]
+name = "llama3"
+flavor = "3B"
+norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
+tokenizer_path = "torchtitan/tokenizers/Llama-3.2-chem-1B-v1/"
+
+[optimizer]
+name = "AdamW"
+lr = 6e-4
+
+[training]
+batch_size = 6
+gradient_accumulation_steps = 28
+seq_len = 2048
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
+max_norm = 1.0 # grad norm clipping
+steps = 40000
+data_parallel_degree = -1
+tensor_parallel_degree = 1
+compile = true
+# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
+# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+dataset = "chemlactica_train"
+data_processing_style="chemlactica_style"
+representation_type = "SMILES"
+
+[validation]
+valid_freq = 2000
+enable_valid = true
+dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini
+
+[dataloader]
+num_workers = 2
+
+[experimental]
+pipeline_parallel_degree = 1
+enable_async_tensor_parallel = false
+
+[checkpoint]
+enable_checkpoint = true
+save_folder = "yerevann/Llama-3.2-3B"
+load_folder = "meta-llama/Llama-3.2-3B"
+# load_folder = "yerevann/Llama-3.2-1B/ec943c9e63db4cf7b4a8b847"
+# load_at_step = 40000
+interval_type = "steps"
+interval = 2000
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = 'none' # ['none', 'selective', 'full']
+selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
+
+[float8]
+enable_float8_linear = false
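
Under the same assumption as the 1B sketch above (tokens/step ≈ batch_size * gradient_accumulation_steps * seq_len per data-parallel rank), this 3B config runs at 6 * 28 * 2048 ≈ 344k tokens per optimizer step, close to the retuned 1B's 10 * 16 * 2048 ≈ 328k, and its 40000 steps amount to roughly 13.8B tokens per rank.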

Diff for: train_configs/llama3.2_3b_conversion.toml (+74, new file)

@@ -0,0 +1,74 @@
+# torchtitan Config.toml
+
+[job]
+dump_folder = "/nfs/h100/raid/chem/checkpoints"
+description = "Llama 3.2 training"
+use_for_integration_test = false
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+enable_color_printing = true
+enable_aim = false
+save_aim_folder = "aim"
+
+[model]
+name = "llama3"
+flavor = "3B"
+norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
+tokenizer_path = "torchtitan/tokenizers/Llama-3.2-chem-1B-v1"
+# tokenizer_path = "meta-llama/Llama-3.2-1B"
+
+[optimizer]
+name = "AdamW"
+lr = 1.0e-4
+
+[training]
+batch_size = 1
+gradient_accumulation_steps = 3
+seq_len = 2048
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
+max_norm = 1.0 # grad norm clipping
+steps = 10
+data_parallel_degree = -1
+tensor_parallel_degree = 1
+compile = false
+# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
+# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+dataset = "chemlactica_train"
+data_processing_style="chemlactica_style"
+
+[experimental]
+pipeline_parallel_degree = 1
+enable_async_tensor_parallel = false
+
+[checkpoint]
+enable_checkpoint = true
+load_folder = "meta-llama/Llama-3.2-3B"
+save_folder = "meta-llama/Llama-3.2-3B"
+# load_folder = "yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
+load_at_step = 0
+# save_folder = "hf/yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
+interval_type = "steps"
+interval = 1000
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"]
+
+[model_download_export]
+to_titan = true
+weights_source = "huggingface"
+# to_hf = true
+
+[activation_checkpoint]
+mode = 'none' # ['none', 'selective', 'full']
+selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
+
+[float8]
+enable_float8_linear = false
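
Nothing in this commit points submitit_train.py at the conversion config; with steps = 10, compile = false, and [model_download_export] set to to_titan = true / weights_source = "huggingface", it looks intended as a one-off run that pulls the Hugging Face meta-llama/Llama-3.2-3B weights into torchtitan's checkpoint layout rather than as a training config. Presumably it would be selected the same way as the training config (hypothetical usage, mirroring the submitit_train.py change above):

# Hypothetical: swap the training config for the conversion config in submitit_train.py.
train_config = "./train_configs/llama3.2_3b_conversion.toml"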
