From 50bdee586ded0c0645cb94b8ab49a16985b50313 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 13:46:24 -0700
Subject: [PATCH 1/6] Update llama2-model.libsonnet

---
 tests/pytorch/r2.1/llama2-model.libsonnet | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pytorch/r2.1/llama2-model.libsonnet b/tests/pytorch/r2.1/llama2-model.libsonnet
index 4e8d55182..1122d8287 100644
--- a/tests/pytorch/r2.1/llama2-model.libsonnet
+++ b/tests/pytorch/r2.1/llama2-model.libsonnet
@@ -222,7 +222,7 @@ local utils = import 'templates/utils.libsonnet';
 
       # save llama2 training
       cd ..
-      echo -e 'python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none > output.txt' >> llama2training.sh
+      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
       echo -e 'import numpy as np' >> getvalue.py
       echo -e 'file = open("output.txt")' >> getvalue.py
       echo -e 'content = file.readlines()' >> getvalue.py
@@ -383,7 +383,7 @@ local utils = import 'templates/utils.libsonnet';
     llama2_inference + v4_8 + common.Functional + timeouts.Hours(3) + infer7B,
     llama2_inference + v4_8 + common.Functional + timeouts.Hours(3) + infer70B,
     llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B,
-    llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B128,
+    llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B256,
     llama2_training + convergence + v4_8 + common.Functional + timeouts.Hours(3) + spmd2Bconv,
   ],
 }

From 901831c5ae15a7596bc6d6009f909b3a6bc49f2c Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:38:15 -0700
Subject: [PATCH 2/6] Update llama2-model.libsonnet

---
 tests/pytorch/r2.1/llama2-model.libsonnet | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/pytorch/r2.1/llama2-model.libsonnet b/tests/pytorch/r2.1/llama2-model.libsonnet
index 1122d8287..a2b653d3d 100644
--- a/tests/pytorch/r2.1/llama2-model.libsonnet
+++ b/tests/pytorch/r2.1/llama2-model.libsonnet
@@ -229,10 +229,10 @@ local utils = import 'templates/utils.libsonnet';
       echo -e 'value_line = content[-1]' >> getvalue.py
       echo -e 'value_value = float((value_line.split())[2])' >> getvalue.py
       echo -e 'value_value = np.reciprocal(value_value)' >> getvalue.py
-      echo -e 'if value_value > 6.863 or value_value < 6.209 :' >> getvalue.py
-      echo -e '    raise ValueError("expose to train_steps_per_second exceeded throuhold 6.536 +- 5%")' >> getvalue.py
+      echo -e 'if value_value > 14.000 or value_value < 12.667 :' >> getvalue.py
+      echo -e '    raise ValueError("train_steps_per_second outside expected threshold 13.333 +- 5%")' >> getvalue.py
       echo -e 'else:' >> getvalue.py
-      echo -e '    print("Finished llama2 test and warm latency/token within expected throuhold 6.536 +- 5%")' >> getvalue.py
+      echo -e '    print("Finished llama2 test; value within expected threshold 13.333 +- 5%")' >> getvalue.py
       echo -e 'cat output.txt' >> llama2training.sh
       echo -e 'python3 transformers/getvalue.py' >> llama2training.sh
       cat llama2training.sh
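Note: the new bounds in patch 2 follow from the 13.333 baseline quoted in its messages with a +-5% band. A quick sanity check of that arithmetic (not part of the patch; a sketch only):

    # 13.333 +- 5% -> [12.666, 14.000], which the patch rounds to 12.667 / 14.000
    baseline = 13.333
    low, high = baseline * 0.95, baseline * 1.05
    print(f"allowed band: [{low:.3f}, {high:.3f}]")
    assert abs(low - 12.667) < 0.01 and abs(high - 14.000) < 0.01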
From 5183affacd9fe84ba4342d310782ddf1ed68a57b Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:41:52 -0700
Subject: [PATCH 3/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index 8b8efcf82..a5d641521 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -148,6 +148,25 @@ local utils = import 'templates/utils.libsonnet';
       cat llama2training.sh
       pwd
       ls
+
+      # save llama2 training
+      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
+      echo -e 'import numpy as np' >> getvalue.py
+      echo -e 'file = open("output.txt")' >> getvalue.py
+      echo -e 'content = file.readlines()' >> getvalue.py
+      echo -e 'value_line = content[-1]' >> getvalue.py
+      echo -e 'value_value = float((value_line.split())[2])' >> getvalue.py
+      echo -e 'value_value = np.reciprocal(value_value)' >> getvalue.py
+      echo -e 'if value_value > 14.000 or value_value < 12.667 :' >> getvalue.py
+      echo -e '    raise ValueError("train_steps_per_second outside expected threshold 13.333 +- 5%")' >> getvalue.py
+      echo -e 'else:' >> getvalue.py
+      echo -e '    print("Finished llama2 test; value within expected threshold 13.333 +- 5%")' >> getvalue.py
+      echo -e 'cat output.txt' >> llama2training.sh
+      echo -e 'python3 transformers/getvalue.py' >> llama2training.sh
+      cat llama2training.sh
+      pwd
+      ls
+
 |||,
     },
   },

From 952cf83b3775afbba72280eeff348957b55f65dd Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:42:22 -0700
Subject: [PATCH 4/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index a5d641521..1086cb80c 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -143,12 +143,6 @@ local utils = import 'templates/utils.libsonnet';
       cd 7B/
       wget https://storage.googleapis.com/manfei_public_experimental/2B.json
 
-      # save llama2 training
-      echo -e 'python transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 32 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/7B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none' >> llama2training.sh
-      cat llama2training.sh
-      pwd
-      ls
-
       # save llama2 training
       echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
       echo -e 'import numpy as np' >> getvalue.py
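Note: patches 2 and 3 build the same checker script line by line with echo. For reference, the getvalue.py those lines assemble would read roughly as below; it assumes the relevant metric is the third whitespace-separated field on the last line of output.txt, and it compares the reciprocal of that value against the band:

    # reconstruction of the echo-assembled getvalue.py (reference only)
    import numpy as np

    file = open("output.txt")                     # trainer output captured by the test
    content = file.readlines()
    value_line = content[-1]                      # last line of the log
    value_value = float((value_line.split())[2])  # third whitespace-separated field
    value_value = np.reciprocal(value_value)      # invert the reported metric
    if value_value > 14.000 or value_value < 12.667:
        raise ValueError("train_steps_per_second outside expected threshold 13.333 +- 5%")
    else:
        print("Finished llama2 test; value within expected threshold 13.333 +- 5%")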
From 1c1f1fa6e47cfc102260b77b47e19d1627a98908 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 15:38:10 -0700
Subject: [PATCH 5/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 33 ++++----------------
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index 1086cb80c..18d135d9b 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -45,23 +45,15 @@ local utils = import 'templates/utils.libsonnet';
       },
       command: self.paramsOverride.trainCommand,
     },
+    local pjrt = self.pjrt,
+    pjrt:: common.PyTorchTpuVmMixin {
+      modelName: 'llama2-pjrt',
+    },
     local infer = self.infer,
-    infer:: common.PyTorchTpuVmMixin {
+    infer:: common.PyTorchTpuVmMixin + pjrt {
       modelName+: '-infer',
       tpuSettings+: {
         tpuVmExtraSetup: |||
-          pip3 uninstall torch torch_xla torchvision libtpu-nightly -y
-          sudo apt-get update -y
-          sudo apt-get install libomp5 -y
-          pip3 install mkl mkl-include
-          pip3 install tf-nightly tb-nightly tbp-nightly
-          pip3 install numpy
-          sudo apt-get install numactl -y
-          sudo apt-get install libopenblas-dev -y
-          pip3 install --user --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
-          pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl
-          pip3 install torch_xla[tpuvm]
-
           # install tokenizer model
           wget https://storage.googleapis.com/tpu-pytorch/lsiyuan-experiment/llama/spiece.model
@@ -93,7 +85,7 @@ local utils = import 'templates/utils.libsonnet';
       },
     },
     local spmd = self.spmd,
-    spmd:: common.PyTorchTpuVmMixin {
+    spmd:: common.PyTorchTpuVmMixin + pjrt {
       modelName+: '-train-spmd',
       tpuSettings+: {
         tpuVmExports+: |||
@@ -110,19 +102,6 @@ local utils = import 'templates/utils.libsonnet';
           export TPU_MEGACORE=megacore_dense
         |||,
         tpuVmExtraSetup: |||
-          pip3 uninstall torch torch_xla torchvision libtpu-nightly -y
-          sudo apt update -y
-          sudo apt-get update -y
-          pip install accelerate -U
-          sudo apt-get install libomp5 -y
-          pip3 install mkl mkl-include
-          pip3 install numpy
-          sudo apt-get install numactl -y
-          sudo apt-get install libopenblas-dev -y
-          pip3 install --user --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
-          pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl
-          pip3 install torch_xla[tpuvm]
-
           # install tokenizer model
           wget https://storage.googleapis.com/tpu-pytorch/lsiyuan-experiment/llama/spiece.model
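Note: patch 5 deduplicates the per-mixin pip/apt setup by introducing a shared pjrt mixin; infer and spmd now compose it via jsonnet's + operator, where right-hand fields override inherited ones and field+: appends to the inherited value. A rough Python analogy of that composition (hypothetical, illustration only; the dicts and field values are stand-ins, not the real mixin contents):

    base = {"modelName": "llama2"}               # ~ common.PyTorchTpuVmMixin
    pjrt = {**base, "modelName": "llama2-pjrt"}  # ~ the new pjrt mixin
    infer = {**base, **pjrt}                     # ~ common.PyTorchTpuVmMixin + pjrt
    infer["modelName"] += "-infer"               # ~ modelName+: '-infer' (append to inherited)
    print(infer["modelName"])                    # -> llama2-pjrt-infer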
From 460a05b1f828e23588ff72b45e43bee4de97888d Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 16:47:11 -0700
Subject: [PATCH 6/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index 18d135d9b..04cdb21f3 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -123,7 +123,7 @@ local utils = import 'templates/utils.libsonnet';
       wget https://storage.googleapis.com/manfei_public_experimental/2B.json
 
       # save llama2 training
-      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
+      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/7B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
       echo -e 'import numpy as np' >> getvalue.py
       echo -e 'file = open("output.txt")' >> getvalue.py
       echo -e 'content = file.readlines()' >> getvalue.py
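Note: the path fix in patch 6 lines up with the setup visible in patch 4's context: the script does cd 7B/ and then wget .../2B.json, so the downloaded config lands in the 7B/ directory and is referenced from the test as transformers/7B/2B.json, not transformers/2B/2B.json. A small pre-flight check one could add before launching training (hypothetical sketch, not part of the patch):

    import os

    # where `cd 7B/ && wget .../2B.json` leaves the file, per the setup above
    config_path = "transformers/7B/2B.json"
    if not os.path.isfile(config_path):
        raise FileNotFoundError(f"expected llama2 config at {config_path}")
    print(f"found config: {config_path}")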