From 50bdee586ded0c0645cb94b8ab49a16985b50313 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 13:46:24 -0700
Subject: [PATCH 1/6] Update llama2-model.libsonnet

---
 tests/pytorch/r2.1/llama2-model.libsonnet | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pytorch/r2.1/llama2-model.libsonnet b/tests/pytorch/r2.1/llama2-model.libsonnet
index 4e8d55182..1122d8287 100644
--- a/tests/pytorch/r2.1/llama2-model.libsonnet
+++ b/tests/pytorch/r2.1/llama2-model.libsonnet
@@ -222,7 +222,7 @@ local utils = import 'templates/utils.libsonnet';
 
       # save llama2 training
       cd ..
-      echo -e 'python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none > output.txt' >> llama2training.sh
+      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
       echo -e 'import numpy as np' >> getvalue.py
       echo -e 'file = open("output.txt")' >> getvalue.py
       echo -e 'content = file.readlines()' >> getvalue.py
@@ -383,7 +383,7 @@ local utils = import 'templates/utils.libsonnet';
     llama2_inference + v4_8 + common.Functional + timeouts.Hours(3) + infer7B,
     llama2_inference + v4_8 + common.Functional + timeouts.Hours(3) + infer70B,
     llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B,
-    llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B128,
+    llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B256,
     llama2_training + convergence + v4_8 + common.Functional + timeouts.Hours(3) + spmd2Bconv,
   ],
 }

From 901831c5ae15a7596bc6d6009f909b3a6bc49f2c Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:38:15 -0700
Subject: [PATCH 2/6] Update llama2-model.libsonnet

---
 tests/pytorch/r2.1/llama2-model.libsonnet | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/pytorch/r2.1/llama2-model.libsonnet b/tests/pytorch/r2.1/llama2-model.libsonnet
index 1122d8287..a2b653d3d 100644
--- a/tests/pytorch/r2.1/llama2-model.libsonnet
+++ b/tests/pytorch/r2.1/llama2-model.libsonnet
@@ -229,10 +229,10 @@ local utils = import 'templates/utils.libsonnet';
       echo -e 'value_line = content[-1]' >> getvalue.py
       echo -e 'value_value = float((value_line.split())[2])' >> getvalue.py
       echo -e 'value_value = np.reciprocal(value_value)' >> getvalue.py
-      echo -e 'if value_value > 6.863 or value_value < 6.209 :' >> getvalue.py
-      echo -e '    raise ValueError("expose to train_steps_per_second exceeded throuhold 6.536 +- 5%")' >> getvalue.py
+      echo -e 'if value_value > 14.000 or value_value < 12.667 :' >> getvalue.py
+      echo -e '    raise ValueError("train_steps_per_second outside expected threshold 13.333 +- 5%")' >> getvalue.py
       echo -e 'else:' >> getvalue.py
-      echo -e '    print("Finished llama2 test and warm latency/token within expected throuhold 6.536 +- 5%")' >> getvalue.py
+      echo -e '    print("Finished llama2 test; value within expected threshold 13.333 +- 5%")' >> getvalue.py
       echo -e 'cat output.txt' >> llama2training.sh
       echo -e 'python3 transformers/getvalue.py' >> llama2training.sh
       cat llama2training.sh
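Note: the new bounds in patch 2 follow from the 13.333 baseline quoted in its messages with a +-5% band. A quick sanity check of that arithmetic (not part of the patch; a sketch only):

    # 13.333 +- 5% -> [12.666, 14.000], which the patch rounds to 12.667 / 14.000
    baseline = 13.333
    low, high = baseline * 0.95, baseline * 1.05
    print(f"allowed band: [{low:.3f}, {high:.3f}]")
    assert abs(low - 12.667) < 0.01 and abs(high - 14.000) < 0.01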
From 5183affacd9fe84ba4342d310782ddf1ed68a57b Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:41:52 -0700
Subject: [PATCH 3/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index 8b8efcf82..a5d641521 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -148,6 +148,25 @@ local utils = import 'templates/utils.libsonnet';
       cat llama2training.sh
       pwd
       ls
+
+      # save llama2 training
+      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
+      echo -e 'import numpy as np' >> getvalue.py
+      echo -e 'file = open("output.txt")' >> getvalue.py
+      echo -e 'content = file.readlines()' >> getvalue.py
+      echo -e 'value_line = content[-1]' >> getvalue.py
+      echo -e 'value_value = float((value_line.split())[2])' >> getvalue.py
+      echo -e 'value_value = np.reciprocal(value_value)' >> getvalue.py
+      echo -e 'if value_value > 14.000 or value_value < 12.667 :' >> getvalue.py
+      echo -e '    raise ValueError("train_steps_per_second outside expected threshold 13.333 +- 5%")' >> getvalue.py
+      echo -e 'else:' >> getvalue.py
+      echo -e '    print("Finished llama2 test; value within expected threshold 13.333 +- 5%")' >> getvalue.py
+      echo -e 'cat output.txt' >> llama2training.sh
+      echo -e 'python3 transformers/getvalue.py' >> llama2training.sh
+      cat llama2training.sh
+      pwd
+      ls
+
 |||,
     },
   },

From 952cf83b3775afbba72280eeff348957b55f65dd Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:42:22 -0700
Subject: [PATCH 4/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index a5d641521..1086cb80c 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -143,12 +143,6 @@ local utils = import 'templates/utils.libsonnet';
       cd 7B/
       wget https://storage.googleapis.com/manfei_public_experimental/2B.json
 
-      # save llama2 training
-      echo -e 'python transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 32 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/7B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none' >> llama2training.sh
-      cat llama2training.sh
-      pwd
-      ls
-
       # save llama2 training
       echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
       echo -e 'import numpy as np' >> getvalue.py
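Note: patches 2 and 3 build the same checker script line by line with echo. For reference, the getvalue.py those lines assemble would read roughly as below; it assumes the relevant metric is the third whitespace-separated field on the last line of output.txt, and it compares the reciprocal of that value against the band:

    # reconstruction of the echo-assembled getvalue.py (reference only)
    import numpy as np

    file = open("output.txt")                     # trainer output captured by the test
    content = file.readlines()
    value_line = content[-1]                      # last line of the log
    value_value = float((value_line.split())[2])  # third whitespace-separated field
    value_value = np.reciprocal(value_value)      # invert the reported metric
    if value_value > 14.000 or value_value < 12.667:
        raise ValueError("train_steps_per_second outside expected threshold 13.333 +- 5%")
    else:
        print("Finished llama2 test; value within expected threshold 13.333 +- 5%")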
From 1c1f1fa6e47cfc102260b77b47e19d1627a98908 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 15:38:10 -0700
Subject: [PATCH 5/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 33 ++++----------------
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index 1086cb80c..18d135d9b 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -45,23 +45,15 @@ local utils = import 'templates/utils.libsonnet';
       },
       command: self.paramsOverride.trainCommand,
     },
+    local pjrt = self.pjrt,
+    pjrt:: common.PyTorchTpuVmMixin {
+      modelName: 'llama2-pjrt',
+    },
     local infer = self.infer,
-    infer:: common.PyTorchTpuVmMixin {
+    infer:: common.PyTorchTpuVmMixin + pjrt {
       modelName+: '-infer',
       tpuSettings+: {
         tpuVmExtraSetup: |||
-          pip3 uninstall torch torch_xla torchvision libtpu-nightly -y
-          sudo apt-get update -y
-          sudo apt-get install libomp5 -y
-          pip3 install mkl mkl-include
-          pip3 install tf-nightly tb-nightly tbp-nightly
-          pip3 install numpy
-          sudo apt-get install numactl -y
-          sudo apt-get install libopenblas-dev -y
-          pip3 install --user --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
-          pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl
-          pip3 install torch_xla[tpuvm]
-
           # install tokenizer model
           wget https://storage.googleapis.com/tpu-pytorch/lsiyuan-experiment/llama/spiece.model
@@ -93,7 +85,7 @@ local utils = import 'templates/utils.libsonnet';
       },
     },
     local spmd = self.spmd,
-    spmd:: common.PyTorchTpuVmMixin {
+    spmd:: common.PyTorchTpuVmMixin + pjrt {
       modelName+: '-train-spmd',
       tpuSettings+: {
         tpuVmExports+: |||
@@ -110,19 +102,6 @@ local utils = import 'templates/utils.libsonnet';
           export TPU_MEGACORE=megacore_dense
         |||,
         tpuVmExtraSetup: |||
-          pip3 uninstall torch torch_xla torchvision libtpu-nightly -y
-          sudo apt update -y
-          sudo apt-get update -y
-          pip install accelerate -U
-          sudo apt-get install libomp5 -y
-          pip3 install mkl mkl-include
-          pip3 install numpy
-          sudo apt-get install numactl -y
-          sudo apt-get install libopenblas-dev -y
-          pip3 install --user --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
-          pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl
-          pip3 install torch_xla[tpuvm]
-
           # install tokenizer model
           wget https://storage.googleapis.com/tpu-pytorch/lsiyuan-experiment/llama/spiece.model
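Note: patch 5 deduplicates the per-mixin pip/apt setup by introducing a shared pjrt mixin; infer and spmd now compose it via jsonnet's + operator, where right-hand fields override inherited ones and field+: appends to the inherited value. A rough Python analogy of that composition (hypothetical, illustration only; the dicts and field values are stand-ins, not the real mixin contents):

    base = {"modelName": "llama2"}               # ~ common.PyTorchTpuVmMixin
    pjrt = {**base, "modelName": "llama2-pjrt"}  # ~ the new pjrt mixin
    infer = {**base, **pjrt}                     # ~ common.PyTorchTpuVmMixin + pjrt
    infer["modelName"] += "-infer"               # ~ modelName+: '-infer' (append to inherited)
    print(infer["modelName"])                    # -> llama2-pjrt-infer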
From 460a05b1f828e23588ff72b45e43bee4de97888d Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Fri, 6 Oct 2023 16:47:11 -0700
Subject: [PATCH 6/6] Update llama2-model.libsonnet

---
 tests/pytorch/nightly/llama2-model.libsonnet | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index 18d135d9b..04cdb21f3 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -123,7 +123,7 @@ local utils = import 'templates/utils.libsonnet';
       wget https://storage.googleapis.com/manfei_public_experimental/2B.json
 
       # save llama2 training
-      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
+      echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/7B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
       echo -e 'import numpy as np' >> getvalue.py
       echo -e 'file = open("output.txt")' >> getvalue.py
       echo -e 'content = file.readlines()' >> getvalue.py
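Note: the path fix in patch 6 lines up with the setup visible in patch 4's context: the script does cd 7B/ and then wget .../2B.json, so the downloaded config lands in the 7B/ directory and is referenced from the test as transformers/7B/2B.json, not transformers/2B/2B.json. A small pre-flight check one could add before launching training (hypothetical sketch, not part of the patch):

    import os

    # where `cd 7B/ && wget .../2B.json` leaves the file, per the setup above
    config_path = "transformers/7B/2B.json"
    if not os.path.isfile(config_path):
        raise FileNotFoundError(f"expected llama2 config at {config_path}")
    print(f"found config: {config_path}")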