test(workflow): change env into flash2 and add rerun workflow (#48)
Co-authored-by: kkscilife <[email protected]>
kkscilife authored Feb 23, 2024
1 parent 1a13152 · commit 14c79be
Showing 9 changed files with 60 additions and 37 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/demo_in_readme.yaml
@@ -23,12 +23,12 @@ jobs:

       - name: raw-chinese-data
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
       - name: alpaca-data
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/data/tokenizer_alpaca.sh
   train:
@@ -44,26 +44,26 @@ jobs:
       - name: slurm-train
         id: basic_train
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
       - name: load_preset_ckpt
         if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
       - name: load_new_ckpt
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
       - name: torchrun-train
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
@@ -79,7 +79,7 @@ jobs:

       - name: convert-model-then-load
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/model/convert_to_hf.sh
           cd ./hf_ckpt
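
The change repeated across these jobs swaps the old "source $evo_env" bootstrap for activating a conda-style environment named by evo_env_torch21_flash2, i.e. a PyTorch 2.1 + FlashAttention 2 stack. A minimal sketch of what a step now does, assuming the variable holds the environment name (the real value is configured on the self-hosted runner and does not appear in this diff):

    # Hypothetical runner-side value; the actual name is not part of this commit.
    export evo_env_torch21_flash2=llm-torch2.1-flash2
    source activate ${evo_env_torch21_flash2}
    python -c "import torch; print(torch.__version__)"   # sanity check, expect 2.1.x
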
7 changes: 4 additions & 3 deletions .github/workflows/e2e_test.yaml
@@ -7,12 +7,13 @@ on:
       - "doc/**"
       - "**.md"
 env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

 jobs:
   training_4GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
       - name: mask env
         run: |
@@ -22,8 +23,8 @@ jobs:

       - name: training_4GPU
         run: |
-          source $evo_env
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+          source activate ${evo_env_torch21_flash2}
+          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

   training_8GPU_ISP:
     runs-on: [t_cluster]
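
Two things change here besides the environment swap: the job's timeout grows from 10 to 15 minutes, and srun gains --exclusive, which asks Slurm for a whole node rather than sharing it with other jobs. The new WORKSPACE_PREFIX entry keeps only the first four '/'-separated fields of GITHUB_WORKSPACE; a quick illustration with a hypothetical runner path:

    # Hypothetical workspace path; the real one depends on the self-hosted runner.
    GITHUB_WORKSPACE=/mnt/petrelfs/ci-runner/workspace
    echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4   # prints /mnt/petrelfs/ci-runner
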
2 changes: 1 addition & 1 deletion .github/workflows/pr_before_merge.yaml
@@ -24,6 +24,6 @@ jobs:

       - name: model_init_tests
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_init.py --seed=1024
4 changes: 2 additions & 2 deletions .github/workflows/pr_merged.yaml
@@ -24,7 +24,7 @@ jobs:

       - name: acc_tests
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py
@@ -40,7 +40,7 @@ jobs:

       - name: loss_tests
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-loss-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_swap_nb_loss_and_gradnorm.py
21 changes: 21 additions & 0 deletions .github/workflows/rerun.yaml
@@ -0,0 +1,21 @@
+name: check-status
+
+on:
+  workflow_run:
+    workflows: [unit-tests,pr-merged,weekly-tests]
+    types: [completed]
+
+jobs:
+  on-failure:
+    runs-on: ubuntu-latest
+    if: ${{ (github.event.workflow_run.head_branch == 'main' || github.event.workflow_run.head_branch == 'develop') && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.run_attempt < 3 }}
+    steps:
+      - run: |
+          echo 'The triggering workflow failed'
+          sleep 600
+          curl -L \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ github.token }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/rerun-failed-jobs
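
This new workflow fires whenever a run of unit-tests, pr-merged, or weekly-tests completes. If the run was on main or develop, concluded in failure, and is still below its third attempt, the job waits ten minutes and then calls the GitHub REST API to rerun only the failed jobs; since each rerun increments run_attempt, the run_attempt < 3 guard allows at most two automatic retries. The same endpoints can be exercised by hand; a sketch, assuming GH_TOKEN, REPO (owner/name), and RUN_ID are set in the shell:

    # Inspect the run's attempt counter and conclusion (jq only for readability).
    curl -s -H "Accept: application/vnd.github+json" \
         -H "Authorization: Bearer $GH_TOKEN" \
         "https://api.github.com/repos/$REPO/actions/runs/$RUN_ID" | jq '{run_attempt, conclusion}'

    # Rerun only the failed jobs, as the workflow does.
    curl -s -X POST -H "Accept: application/vnd.github+json" \
         -H "Authorization: Bearer $GH_TOKEN" \
         "https://api.github.com/repos/$REPO/actions/runs/$RUN_ID/rerun-failed-jobs"
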
12 changes: 6 additions & 6 deletions .github/workflows/unit_tests.yaml
@@ -31,7 +31,7 @@ jobs:

       - name: core_pipeline
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
@@ -47,7 +47,7 @@ jobs:

       - name: utils_storage_manager
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
@@ -63,7 +63,7 @@ jobs:

       - name: model_fused_precision
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_model/test_fused_precision/test_fused_precision.py
@@ -79,7 +79,7 @@ jobs:

       - name: data_batch_sample
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_data/test_batch_sampler.py
@@ -95,7 +95,7 @@ jobs:

       - name: utils_timeout
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python -m pytest -s -v ./tests/test_utils/test_timeout.py
@@ -111,7 +111,7 @@ jobs:

       - name: utils_model_checkpoint
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:2 python -m pytest -s -v ./tests/test_utils/test_model_checkpoint.py
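
All six unit-test steps share one srun pattern; spelled out with comments (the job name below is a hypothetical stand-in for the ${GITHUB_RUN_ID}-${GITHUB_JOB} value CI actually uses):

    # One node, one task, and a per-step GPU count (gpu:8, gpu:2, or gpu:1 above);
    # --kill-on-bad-exit=1 makes Slurm kill the whole step as soon as any task fails.
    srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-demo \
         -N 1 -n 1 --gres=gpu:8 \
         python -m pytest -s -v ./tests/test_core/test_pipeline.py
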
4 changes: 2 additions & 2 deletions .github/workflows/upload_to_pypi.yaml
@@ -34,7 +34,7 @@ jobs:
       - name: build and upload package
         run: |
+          source activate ${evo_env_torch21_flash2}
           export PYTHONPATH=$PWD:$PYTHONPATH
-          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
           twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} dist/*
31 changes: 16 additions & 15 deletions .github/workflows/weekly_test.yaml
@@ -4,12 +4,13 @@ on:
   schedule:
     - cron: '56 18 * * 5'
 env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

 jobs:
   training_4GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
       - name: mask env
         run: |
@@ -21,8 +22,8 @@ jobs:

       - name: training_4GPU
         run: |
-          source $evo_env
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+          source activate ${evo_env_torch21_flash2}
+          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

   training_8GPU_4DP2TP:
     runs-on: [t_cluster]
@@ -38,7 +39,7 @@ jobs:

       - name: training_8GPU_4DP2TP
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
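
Each of these parallelism jobs rewrites one line of configs/7B_sft.py in place before launching its loss test, as in the training_8GPU_4DP2TP step above and the steps that follow. A standalone sketch of the substitution on a throwaway copy (the snippet is hypothetical, not the full config):

    # Work on a copy so the real config stays untouched.
    printf '    tensor=1,\n' > /tmp/7B_sft_snippet.py
    sed -i 's/^.*tensor=.*/ tensor=2,/' /tmp/7B_sft_snippet.py
    cat /tmp/7B_sft_snippet.py   # -> " tensor=2," (the whole matching line is replaced)
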
@@ -56,7 +57,7 @@ jobs:

       - name: training_8GPU_4DP2TPSP
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
@@ -74,7 +75,7 @@ jobs:

       - name: training_8GPU_4DP2PP
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
@@ -92,7 +93,7 @@ jobs:

       - name: training_8GPU_4DP2PP_InterleavedOverlap
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
           sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
@@ -111,7 +112,7 @@ jobs:

       - name: training_16GPU_4DP2TP2PP_MTP
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
           sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
@@ -130,7 +131,7 @@ jobs:

       - name: training_16GPU_4DP2TP2PP_MSP
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
           sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
@@ -149,7 +150,7 @@ jobs:

       - name: training_16GPU_4DP2TP2PP_FSP
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
           sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
@@ -199,7 +200,7 @@ jobs:

       - name: test_optimizer
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py

   unit_test_model:
@@ -216,17 +217,17 @@ jobs:

       - name: test_embedding_accuracy
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
       - name: test_model_internlm_accuracy
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
       - name: test_norm_accuracy
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py

   load_ckpt_then_assert_loss:
@@ -243,7 +244,7 @@ jobs:

       - name: test_ckpt_loss
         run: |
-          source $evo_env
+          source activate ${evo_env_torch21_flash2}
           srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_load_ckpt_loss.py

   notify_to_feishu:
2 changes: 1 addition & 1 deletion ci_scripts/model/convert_to_hf.sh
@@ -12,7 +12,7 @@ readonly TOKENIZER="${GITHUB_WORKSPACE}/hf_ckpt/tokenizer.model"
 readonly CONFIG="${GITHUB_WORKSPACE}/hf_ckpt/config.json"
 readonly INERNLM="${GITHUB_WORKSPACE}/hf_ckpt/modeling_internlm.py"
 exit_code=0
-expected_num=8
+expected_num=9
 
 source ./ci_scripts/common/basic_func.sh
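
expected_num rises from 8 to 9: the conversion step is now expected to leave one more file in hf_ckpt than before. A hedged sketch of the kind of count this value feeds; the actual check lives in convert_to_hf.sh and ci_scripts/common/basic_func.sh, which this diff does not show:

    # Assumed layout: converted checkpoint files sit directly under ./hf_ckpt.
    num=$(find ./hf_ckpt -maxdepth 1 -type f | wc -l)
    if [ "$num" -ne "$expected_num" ]; then
        echo "[ERROR] found $num files in ./hf_ckpt, expected $expected_num"
        exit_code=$(($exit_code + 1))
    fi
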
