Commit a3a42fb

add llama 3.2 3b configs and files

1 parent 5df7a5d commit a3a42fb

6 files changed: +166 -7 lines

Diff for: submitit_train.py (+1, -1)

@@ -26,7 +26,7 @@
 for _ in range(1):
     # train_config = './train_configs/chemlactica_125m.toml'
     # train_config = './train_configs/chemlactica_1.3b.toml'
-    train_config = "./train_configs/llama3.2_1b.toml"
+    train_config = "./train_configs/llama3.2_3b.toml"
     # train_config = './train_configs/debug_model.toml'
     function = submitit.helpers.CommandFunction(
         [

Diff for: submitit_train_hparam_tuning.py (+2, -2)

@@ -26,8 +26,8 @@
     # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
     # "optimizer.lr": ["2.5e-4"],
     # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"],
-    "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
-    "training.steps": ["31000", "26000", "22.500", "20000"],
+    # "training.gradient_accumulation_steps": ["21", "25", "29", "33"],
+    # "training.steps": ["31000", "26000", "22500", "20000"],
 }

 jobs = []

Diff for: torchtitan/models/llama/__init__.py (+11, -2)

@@ -37,7 +37,16 @@
         n_heads=32,
         n_kv_heads=8,
         rope_theta=500000,
-        share_embeddings=True
+        share_embeddings=True,
+    ),
+    "3B": ModelArgs(
+        dim=3072,
+        n_layers=28,
+        n_heads=24,
+        n_kv_heads=8,
+        rope_theta=500000,
+        ffn_dim_multiplier=2 / 3,  # in Llama3.2-3B dim is 3072, but ffn dim is 8192
+        share_embeddings=True,
     ),
     "8B": ModelArgs(
         dim=4096,

@@ -66,4 +75,4 @@
         multiple_of=4096,
         rope_theta=500000,
     ),
-}
+}
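
The new "3B" entry leans on ffn_dim_multiplier to reproduce Llama 3.2 3B's published FFN width. As a quick sanity check of the inline comment (a sketch only: the authoritative rounding lives in torchtitan's FeedForward code, and the 4 * dim base plus multiple_of round-up below are assumptions), 2/3 of 4 * 3072 lands on 8192:

# Hedged sketch: assumes hidden ≈ ffn_dim_multiplier * 4 * dim, rounded up to a
# multiple of `multiple_of`; torchtitan's FeedForward is the source of truth.
def approx_ffn_dim(dim: int, ffn_dim_multiplier: float, multiple_of: int = 256) -> int:
    hidden = int(ffn_dim_multiplier * 4 * dim)
    return multiple_of * ((hidden + multiple_of - 1) // multiple_of)  # round up

print(approx_ffn_dim(3072, 2 / 3))  # 8192, matching the Llama 3.2 3B intermediate size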

Diff for: train_configs/llama3.2_1b.toml (+2, -2)

@@ -30,11 +30,11 @@ lr = 6e-4

 [training]
 batch_size = 10
-gradient_accumulation_steps = 21
+gradient_accumulation_steps = 16
 seq_len = 2048
 warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 31000
+steps = 40000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
 compile = true
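
For context on the retune above: assuming tokens per optimizer step are roughly batch_size * gradient_accumulation_steps * seq_len per data-parallel rank (an assumption about how torchtitan composes these knobs, not something stated in the config), the change keeps the total per-rank token budget roughly flat while shrinking each step:

# Hedged sketch of the per-rank token budget implied by the 1B config change;
# assumes tokens/step ≈ batch_size * gradient_accumulation_steps * seq_len.
seq_len = 2048
old_run = 10 * 21 * seq_len * 31_000  # ≈ 13.3B tokens per data-parallel rank
new_run = 10 * 16 * seq_len * 40_000  # ≈ 13.1B tokens per data-parallel rank
print(f"{old_run / 1e9:.1f}B -> {new_run / 1e9:.1f}B tokens per rank")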

Diff for: train_configs/llama3.2_3b.toml (+76, new file)

@@ -0,0 +1,76 @@
+# torchtitan Config.toml
+
+[job]
+dump_folder = "/nfs/h100/raid/chem/checkpoints"
+description = "Llama 3.2 training"
+use_for_integration_test = false
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+enable_color_printing = true
+enable_aim = true
+save_aim_folder = "aim"
+
+[model]
+name = "llama3"
+flavor = "3B"
+norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
+tokenizer_path = "torchtitan/tokenizers/Llama-3.2-chem-1B-v1/"
+
+[optimizer]
+name = "AdamW"
+lr = 6e-4
+
+[training]
+batch_size = 6
+gradient_accumulation_steps = 28
+seq_len = 2048
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
+max_norm = 1.0 # grad norm clipping
+steps = 40000
+data_parallel_degree = -1
+tensor_parallel_degree = 1
+compile = true
+# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
+# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+dataset = "chemlactica_train"
+data_processing_style="chemlactica_style"
+representation_type = "SMILES"
+
+[validation]
+valid_freq = 2000
+enable_valid = true
+dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini
+
+[dataloader]
+num_workers = 2
+
+[experimental]
+pipeline_parallel_degree = 1
+enable_async_tensor_parallel = false
+
+[checkpoint]
+enable_checkpoint = true
+save_folder = "yerevann/Llama-3.2-3B"
+load_folder = "meta-llama/Llama-3.2-3B"
+# load_folder = "yerevann/Llama-3.2-1B/ec943c9e63db4cf7b4a8b847"
+# load_at_step = 40000
+interval_type = "steps"
+interval = 2000
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = 'none' # ['none', 'selective', 'full']
+selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
+
+[float8]
+enable_float8_linear = false
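
Under the same assumption as the 1B sketch above (tokens/step ≈ batch_size * gradient_accumulation_steps * seq_len per data-parallel rank), this 3B config runs at 6 * 28 * 2048 ≈ 344k tokens per optimizer step, close to the retuned 1B's 10 * 16 * 2048 ≈ 328k, and its 40000 steps amount to roughly 13.8B tokens per rank.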

Diff for: train_configs/llama3.2_3b_conversion.toml (+74, new file)

@@ -0,0 +1,74 @@
+# torchtitan Config.toml
+
+[job]
+dump_folder = "/nfs/h100/raid/chem/checkpoints"
+description = "Llama 3.2 training"
+use_for_integration_test = false
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+enable_color_printing = true
+enable_aim = false
+save_aim_folder = "aim"
+
+[model]
+name = "llama3"
+flavor = "3B"
+norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm
+tokenizer_path = "torchtitan/tokenizers/Llama-3.2-chem-1B-v1"
+# tokenizer_path = "meta-llama/Llama-3.2-1B"
+
+[optimizer]
+name = "AdamW"
+lr = 1.0e-4
+
+[training]
+batch_size = 1
+gradient_accumulation_steps = 3
+seq_len = 2048
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
+max_norm = 1.0 # grad norm clipping
+steps = 10
+data_parallel_degree = -1
+tensor_parallel_degree = 1
+compile = false
+# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M)
+# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
+dataset = "chemlactica_train"
+data_processing_style="chemlactica_style"
+
+[experimental]
+pipeline_parallel_degree = 1
+enable_async_tensor_parallel = false
+
+[checkpoint]
+enable_checkpoint = true
+load_folder = "meta-llama/Llama-3.2-3B"
+save_folder = "meta-llama/Llama-3.2-3B"
+# load_folder = "yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
+load_at_step = 0
+# save_folder = "hf/yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8"
+interval_type = "steps"
+interval = 1000
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"]
+
+[model_download_export]
+to_titan = true
+weights_source = "huggingface"
+# to_hf = true
+
+[activation_checkpoint]
+mode = 'none' # ['none', 'selective', 'full']
+selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
+
+[float8]
+enable_float8_linear = false
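
Nothing in this commit points submitit_train.py at the conversion config; with steps = 10, compile = false, and [model_download_export] set to to_titan = true / weights_source = "huggingface", it looks intended as a one-off run that pulls the Hugging Face meta-llama/Llama-3.2-3B weights into torchtitan's checkpoint layout rather than as a training config. Presumably it would be selected the same way as the training config (hypothetical usage, mirroring the submitit_train.py change above):

# Hypothetical: swap the training config for the conversion config in submitit_train.py.
train_config = "./train_configs/llama3.2_3b_conversion.toml"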
