
Commit 5df7a5d

update hparam tuning script
1 parent b7a28e4 commit 5df7a5d

4 files changed: +65 -38 lines changed


submitit_train.py (+1 -1)

@@ -9,7 +9,7 @@
 
 if __name__ == "__main__":
     executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
-    n_gpus = 8
+    n_gpus = 6
     node = "h100"
     executor.update_parameters(
         name="titan",

submitit_train_hparam_tuning.py (+57 -30)

@@ -1,49 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 import submitit
-import datetime
-import yaml
-import os
 
 
 if __name__ == "__main__":
     executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
-    n_gpus = 8
+    n_gpus = 6
+    node = "h100"
     executor.update_parameters(
-        name="titan", timeout_min=3 * 60,
+        name="titan",
+        timeout_min=6 * 60,
         gpus_per_node=n_gpus,
-        nodes=1, mem_gb=80, cpus_per_task=n_gpus * 4,
-        slurm_additional_parameters={
-            "partition": "h100"
-        }
+        nodes=1,
+        mem_gb=80,
+        cpus_per_task=n_gpus * 12,
+        slurm_additional_parameters={"partition": node},
     )
 
     hparams = {
         # "optimizer.lr": ["1.2e-3", "9e-4", "6e-4", "3e-4"],
         # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
         # "optimizer.lr": ["2.5e-4"],
         # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"],
+        "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
+        "training.steps": ["31000", "26000", "22500", "20000"],
     }
 
     jobs = []
     with executor.batch():
         for _ in range(1):
-            for hparam_name, value in hparams.items():
-                for v in value:
-                    # train_config = './train_configs/chemlactica_125m.toml'
-                    # train_config = './train_configs/chemlactica_1.3b.toml'
-                    train_config = './train_configs/llama3.2_1b.toml'
-                    # train_config = './train_configs/debug_model.toml'
-                    function = submitit.helpers.CommandFunction([
-                        'python3', '-m', 'torch.distributed.run',
-                        '--nproc_per_node', f'{n_gpus}',
-                        '--rdzv_backend', 'c10d',
-                        '--rdzv_endpoint', 'localhost:0',
-                        '--local-ranks-filter', '0',
-                        '--role', 'rank', '--tee', '3',
-                        'train.py',
-                        '--job.config_file', train_config,
-                        f'--{hparam_name}', v
-                    ])
-                    print(' '.join(function.command))
-                    # subprocess.run(function.command)
-                    job = executor.submit(function)
-                    jobs.append(job)
+            length = len(list(hparams.values())[0])
+            for i in range(length):
+                hparam_dict = {}
+                for key, values in hparams.items():
+                    hparam_dict[key] = values[i]
+
+                # train_config = './train_configs/chemlactica_125m.toml'
+                # train_config = './train_configs/chemlactica_1.3b.toml'
+                train_config = "./train_configs/llama3.2_1b.toml"
+                # train_config = './train_configs/debug_model.toml'
+                command_lst = [
+                    "python3",
+                    "-m",
+                    "torch.distributed.run",
+                    "--nproc_per_node",
+                    f"{n_gpus}",
+                    "--rdzv_backend",
+                    "c10d",
+                    "--rdzv_endpoint",
+                    "localhost:0",
+                    "--local-ranks-filter",
+                    "0",
+                    "--role",
+                    "rank",
+                    "--tee",
+                    "3",
+                    "train.py",
+                    "--job.config_file",
+                    train_config,
+                ]
+
+                # add the hparam
+                for key, value in hparam_dict.items():
+                    command_lst.append(f"--{key}")
+                    command_lst.append(value)
+
+                function = submitit.helpers.CommandFunction(command_lst)
+                print(" ".join(function.command))
+                # subprocess.run(function.command)
+                job = executor.submit(function)
+                jobs.append(job)
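
Note on the sweep semantics above: the rewritten loop pairs the i-th entry of every list in hparams (a column-wise sweep rather than a cross-product grid), so all value lists must have equal length. Below is a minimal standalone sketch of that pairing, under the same hparams and config path as the script, with the rendezvous/logging flags of torch.distributed.run elided for brevity:

# Minimal sketch of the column-wise pairing used above (not a grid search):
# the i-th entries of all lists form one job, so all lists must be equally long.
n_gpus = 6
train_config = "./train_configs/llama3.2_1b.toml"
hparams = {
    "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
    "training.steps": ["31000", "26000", "22500", "20000"],
}

for i in range(len(next(iter(hparams.values())))):
    hparam_dict = {key: values[i] for key, values in hparams.items()}
    # elided: the --rdzv_* / --local-ranks-filter / --role / --tee flags from the script above
    command_lst = ["python3", "-m", "torch.distributed.run", "--nproc_per_node", f"{n_gpus}",
                   "train.py", "--job.config_file", train_config]
    for key, value in hparam_dict.items():
        command_lst += [f"--{key}", value]
    print(" ".join(command_lst))
    # the first printed command ends with:
    #   --training.gradient_accumulation_steps 21 --training.steps 31000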

train_configs/llama3.2_1b.toml (+4 -4)

@@ -30,11 +30,11 @@ lr = 6e-4
 
 [training]
 batch_size = 10
-gradient_accumulation_steps = 16
+gradient_accumulation_steps = 21
 seq_len = 2048
 warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 40000
+steps = 31000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
 compile = true
@@ -50,7 +50,7 @@ enable_valid = true
 dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini
 
 [dataloader]
-num_workers = 4
+num_workers = 2
 
 [experimental]
 pipeline_parallel_degree = 1
@@ -61,7 +61,7 @@ enable_checkpoint = true
 save_folder = "yerevann/Llama-3.2-1B"
 load_folder = "meta-llama/Llama-3.2-1B"
 # load_folder = "yerevann/Llama-3.2-1B/ec943c9e63db4cf7b4a8b847"
-load_at_step = 40000
+# load_at_step = 40000
 interval_type = "steps"
 interval = 2000
 model_weights_only = false
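
For context on the new [training] defaults and the sweep values above: assuming tokens per optimizer step ≈ batch_size × seq_len × gradient_accumulation_steps × data-parallel ranks (with data_parallel_degree = -1 on a single node and n_gpus = 6, that would be 6 ranks), each (gradient_accumulation_steps, steps) pair keeps the total token budget roughly constant. A rough back-of-the-envelope check:

# Rough token-budget check; assumes tokens/step = batch_size * seq_len * grad_accum * dp_ranks,
# with dp_ranks = 6 (data_parallel_degree = -1 on a single 6-GPU node).
batch_size, seq_len, dp_ranks = 10, 2048, 6
for grad_accum, steps in [(16, 40000), (21, 31000), (25, 26000), (29, 22500), (33, 20000)]:
    total_tokens = batch_size * seq_len * grad_accum * dp_ranks * steps
    print(f"grad_accum={grad_accum:>2}, steps={steps:>5}: ~{total_tokens / 1e9:.1f}B tokens")
# old default (16, 40000) -> ~78.6B tokens; the swept settings all land around ~80-81B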

train_configs/llama3.2_1b_conversion.toml (+3 -3)

@@ -52,9 +52,9 @@ enable_async_tensor_parallel = false
 enable_checkpoint = true
 # load_folder = "meta-llama/Llama-3.2-1B"
 # save_folder = "meta-llama/Llama-3.2-1B"
-load_folder = "yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6"
-load_at_step = 20000
-save_folder = "hf/yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6"
+load_folder = "yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
+load_at_step = 40000
+save_folder = "hf/yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
 interval_type = "steps"
 interval = 1000
 model_weights_only = false
