+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
import submitit
-import datetime
-import yaml
-import os


if __name__ == "__main__":
    executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
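+    # %j in the log folder path is replaced by the Slurm job id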
-    n_gpus = 8
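+    # per-job resources: 6 GPUs on one h100 node, 12 CPU cores per GPU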
+    n_gpus = 6
+    node = "h100"
    executor.update_parameters(
-        name="titan", timeout_min=3 * 60,
+        name="titan",
+        timeout_min=6 * 60,
        gpus_per_node=n_gpus,
-        nodes=1, mem_gb=80, cpus_per_task=n_gpus * 4,
-        slurm_additional_parameters={
-            "partition": "h100"
-        }
+        nodes=1,
+        mem_gb=80,
+        cpus_per_task=n_gpus * 12,
+        slurm_additional_parameters={"partition": node},
    )

    hparams = {
        # "optimizer.lr": ["1.2e-3", "9e-4", "6e-4", "3e-4"],
        # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
        # "optimizer.lr": ["2.5e-4"],
        # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"],
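+        # the value lists are paired by index: run i combines
+        # gradient_accumulation_steps[i] with steps[i]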
+        "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
+        "training.steps": ["31000", "26000", "22500", "20000"],
    }

    jobs = []
    with executor.batch():
        for _ in range(1):
-            for hparam_name, value in hparams.items():
-                for v in value:
-                    # train_config = './train_configs/chemlactica_125m.toml'
-                    # train_config = './train_configs/chemlactica_1.3b.toml'
-                    train_config = './train_configs/llama3.2_1b.toml'
-                    # train_config = './train_configs/debug_model.toml'
-                    function = submitit.helpers.CommandFunction([
-                        'python3', '-m', 'torch.distributed.run',
-                        '--nproc_per_node', f'{n_gpus}',
-                        '--rdzv_backend', 'c10d',
-                        '--rdzv_endpoint', 'localhost:0',
-                        '--local-ranks-filter', '0',
-                        '--role', 'rank', '--tee', '3',
-                        'train.py',
-                        '--job.config_file', train_config,
-                        f'--{hparam_name}', v
-                    ])
-                    print(' '.join(function.command))
-                    # subprocess.run(function.command)
-                    job = executor.submit(function)
-                    jobs.append(job)
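+            # all hparam value lists are assumed to have equal length; one job
+            # is submitted per index, taking the i-th value of every hparam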
+            length = len(list(hparams.values())[0])
+            for i in range(length):
+                hparam_dict = {}
+                for key, values in hparams.items():
+                    hparam_dict[key] = values[i]
+
+                # train_config = './train_configs/chemlactica_125m.toml'
+                # train_config = './train_configs/chemlactica_1.3b.toml'
+                train_config = "./train_configs/llama3.2_1b.toml"
+                # train_config = './train_configs/debug_model.toml'
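+                # single-node torchrun launch; the swept hparams are appended
+                # below as --<key> <value> overrides for train.py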
+                command_lst = [
+                    "python3",
+                    "-m",
+                    "torch.distributed.run",
+                    "--nproc_per_node",
+                    f"{n_gpus}",
+                    "--rdzv_backend",
+                    "c10d",
+                    "--rdzv_endpoint",
+                    "localhost:0",
+                    "--local-ranks-filter",
+                    "0",
+                    "--role",
+                    "rank",
+                    "--tee",
+                    "3",
+                    "train.py",
+                    "--job.config_file",
+                    train_config,
+                ]
+
+                # add the hparam
+                for key, value in hparam_dict.items():
+                    command_lst.append(f"--{key}")
+                    command_lst.append(value)
+
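+                # CommandFunction runs the command as a subprocess on the
+                # allocated node; executor.batch() submits the collected jobs
+                # together as a single Slurm job array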
+                function = submitit.helpers.CommandFunction(command_lst)
+                print(" ".join(function.command))
+                # subprocess.run(function.command)
+                job = executor.submit(function)
+                jobs.append(job)