
Commit 5df7a5d

update hparam tuning script
1 parent b7a28e4 commit 5df7a5d

4 files changed: +65 -38 lines changed


submitit_train.py (+1 -1)

@@ -9,7 +9,7 @@
 
 if __name__ == "__main__":
     executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
-    n_gpus = 8
+    n_gpus = 6
     node = "h100"
     executor.update_parameters(
         name="titan",

submitit_train_hparam_tuning.py (+57 -30)

@@ -1,49 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
 import submitit
-import datetime
-import yaml
-import os
 
 
 if __name__ == "__main__":
     executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
-    n_gpus = 8
+    n_gpus = 6
+    node = "h100"
     executor.update_parameters(
-        name="titan", timeout_min=3 * 60,
+        name="titan",
+        timeout_min=6 * 60,
         gpus_per_node=n_gpus,
-        nodes=1, mem_gb=80, cpus_per_task=n_gpus * 4,
-        slurm_additional_parameters={
-            "partition": "h100"
-        }
+        nodes=1,
+        mem_gb=80,
+        cpus_per_task=n_gpus * 12,
+        slurm_additional_parameters={"partition": node},
     )
 
     hparams = {
         # "optimizer.lr": ["1.2e-3", "9e-4", "6e-4", "3e-4"],
         # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
         # "optimizer.lr": ["2.5e-4"],
         # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"],
+        "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
+        "training.steps": ["31000", "26000", "22500", "20000"],
     }
 
     jobs = []
     with executor.batch():
         for _ in range(1):
-            for hparam_name, value in hparams.items():
-                for v in value:
-                    # train_config = './train_configs/chemlactica_125m.toml'
-                    # train_config = './train_configs/chemlactica_1.3b.toml'
-                    train_config = './train_configs/llama3.2_1b.toml'
-                    # train_config = './train_configs/debug_model.toml'
-                    function = submitit.helpers.CommandFunction([
-                        'python3', '-m', 'torch.distributed.run',
-                        '--nproc_per_node', f'{n_gpus}',
-                        '--rdzv_backend', 'c10d',
-                        '--rdzv_endpoint', 'localhost:0',
-                        '--local-ranks-filter', '0',
-                        '--role', 'rank', '--tee', '3',
-                        'train.py',
-                        '--job.config_file', train_config,
-                        f'--{hparam_name}', v
-                    ])
-                    print(' '.join(function.command))
-                    # subprocess.run(function.command)
-                    job = executor.submit(function)
-                    jobs.append(job)
+            length = len(list(hparams.values())[0])
+            for i in range(length):
+                hparam_dict = {}
+                for key, values in hparams.items():
+                    hparam_dict[key] = values[i]
+
+                # train_config = './train_configs/chemlactica_125m.toml'
+                # train_config = './train_configs/chemlactica_1.3b.toml'
+                train_config = "./train_configs/llama3.2_1b.toml"
+                # train_config = './train_configs/debug_model.toml'
+                command_lst = [
+                    "python3",
+                    "-m",
+                    "torch.distributed.run",
+                    "--nproc_per_node",
+                    f"{n_gpus}",
+                    "--rdzv_backend",
+                    "c10d",
+                    "--rdzv_endpoint",
+                    "localhost:0",
+                    "--local-ranks-filter",
+                    "0",
+                    "--role",
+                    "rank",
+                    "--tee",
+                    "3",
+                    "train.py",
+                    "--job.config_file",
+                    train_config,
+                ]
+
+                # add the hparam
+                for key, value in hparam_dict.items():
+                    command_lst.append(f"--{key}")
+                    command_lst.append(value)
+
+                function = submitit.helpers.CommandFunction(command_lst)
+                print(" ".join(function.command))
+                # subprocess.run(function.command)
+                job = executor.submit(function)
+                jobs.append(job)
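
Note on the sweep semantics above: the rewritten loop pairs the i-th entry of every list in hparams (a column-wise sweep rather than a cross-product grid), so all value lists must have equal length. Below is a minimal standalone sketch of that pairing, under the same hparams and config path as the script, with the rendezvous/logging flags of torch.distributed.run elided for brevity:

# Minimal sketch of the column-wise pairing used above (not a grid search):
# the i-th entries of all lists form one job, so all lists must be equally long.
n_gpus = 6
train_config = "./train_configs/llama3.2_1b.toml"
hparams = {
    "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
    "training.steps": ["31000", "26000", "22500", "20000"],
}

for i in range(len(next(iter(hparams.values())))):
    hparam_dict = {key: values[i] for key, values in hparams.items()}
    # elided: the --rdzv_* / --local-ranks-filter / --role / --tee flags from the script above
    command_lst = ["python3", "-m", "torch.distributed.run", "--nproc_per_node", f"{n_gpus}",
                   "train.py", "--job.config_file", train_config]
    for key, value in hparam_dict.items():
        command_lst += [f"--{key}", value]
    print(" ".join(command_lst))
    # the first printed command ends with:
    #   --training.gradient_accumulation_steps 21 --training.steps 31000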

train_configs/llama3.2_1b.toml (+4 -4)

@@ -30,11 +30,11 @@ lr = 6e-4
 
 [training]
 batch_size = 10
-gradient_accumulation_steps = 16
+gradient_accumulation_steps = 21
 seq_len = 2048
 warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 40000
+steps = 31000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
 compile = true
@@ -50,7 +50,7 @@ enable_valid = true
 dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini
 
 [dataloader]
-num_workers = 4
+num_workers = 2
 
 [experimental]
 pipeline_parallel_degree = 1
@@ -61,7 +61,7 @@ enable_checkpoint = true
 save_folder = "yerevann/Llama-3.2-1B"
 load_folder = "meta-llama/Llama-3.2-1B"
 # load_folder = "yerevann/Llama-3.2-1B/ec943c9e63db4cf7b4a8b847"
-load_at_step = 40000
+# load_at_step = 40000
 interval_type = "steps"
 interval = 2000
 model_weights_only = false
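
For context on the new [training] defaults and the sweep values above: assuming tokens per optimizer step ≈ batch_size × seq_len × gradient_accumulation_steps × data-parallel ranks (with data_parallel_degree = -1 on a single node and n_gpus = 6, that would be 6 ranks), each (gradient_accumulation_steps, steps) pair keeps the total token budget roughly constant. A rough back-of-the-envelope check:

# Rough token-budget check; assumes tokens/step = batch_size * seq_len * grad_accum * dp_ranks,
# with dp_ranks = 6 (data_parallel_degree = -1 on a single 6-GPU node).
batch_size, seq_len, dp_ranks = 10, 2048, 6
for grad_accum, steps in [(16, 40000), (21, 31000), (25, 26000), (29, 22500), (33, 20000)]:
    total_tokens = batch_size * seq_len * grad_accum * dp_ranks * steps
    print(f"grad_accum={grad_accum:>2}, steps={steps:>5}: ~{total_tokens / 1e9:.1f}B tokens")
# old default (16, 40000) -> ~78.6B tokens; the swept settings all land around ~80-81B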

train_configs/llama3.2_1b_conversion.toml (+3 -3)

@@ -52,9 +52,9 @@ enable_async_tensor_parallel = false
 enable_checkpoint = true
 # load_folder = "meta-llama/Llama-3.2-1B"
 # save_folder = "meta-llama/Llama-3.2-1B"
-load_folder = "yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6"
-load_at_step = 20000
-save_folder = "hf/yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6"
+load_folder = "yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
+load_at_step = 40000
+save_folder = "hf/yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
 interval_type = "steps"
 interval = 1000
 model_weights_only = false
