Feat(QA): Check loss when swapping micro_num and micro_bsz && Check g… #7
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Test workflow that launches the training tests and the optimizer/model
# unit tests through Slurm (`srun`) on the runner labelled `t_cluster`.
# NOTE(review): the workflow is called "weekly-tests" but the only trigger
# visible here is `push` -- confirm whether a `schedule` trigger is intended.
name: weekly-tests

# Runs on every push to the branches listed below.
on:
  push:
    branches:
      - "main"
      - "develop"

env:
  # Slurm partition passed to every `srun -p` invocation in this workflow.
  SLURM_PARTITION: llm_s

jobs:
  # Runs the tests marked `training_8GPU` in ./tests/test_training as
  # 8 Slurm tasks (8 per node) with one GPU per task.
  training_8GPU:
    runs-on: [t_cluster]
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v3

      - name: training_8GPU
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training

  # Rewrites the `tensor=` line of ./configs/7B_sft.py to `tensor=2,`, then
  # runs the tests marked `training_16GPU_8DP2TP` as 16 Slurm tasks.
  # NOTE(review): the sed replacement text starts with a single space as
  # captured here; confirm the indentation expected in the config file.
  training_16GPU_8DP2TP:
    runs-on: [t_cluster]
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v3

      - name: training_16GPU_8DP2TP
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training

  # Same config rewrite as above plus `sequence_parallel=True,`, then runs
  # the tests marked `training_16GPU_8DP2TPSP` as 16 Slurm tasks.
  training_16GPU_8DP2TPSP:
    runs-on: [t_cluster]
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v3

      - name: training_16GPU_8DP2TPSP
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
          sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training

  # Rewrites the `pipeline=` line of ./configs/7B_sft.py to
  # `pipeline=dict(size=2),`, then runs the tests marked
  # `training_16GPU_8DP2PP` as 16 Slurm tasks.
  training_16GPU_8DP2PP:
    runs-on: [t_cluster]
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v3

      - name: training_16GPU_8DP2PP
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training

  # Sets `pipeline=dict(size=2, interleaved_overlap=True),` and
  # `num_chunks=2,` in ./configs/7B_sft.py, then runs the tests marked
  # `training_16GPU_8DP2PP_InterleavedOverlap` as 16 Slurm tasks.
  training_16GPU_8DP2PP_InterleavedOverlap:
    runs-on: [t_cluster]
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v3

      - name: training_16GPU_8DP2PP_InterleavedOverlap
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
          sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training

  # Runs ./tests/test_solver/test_optimizer.py as a single Slurm task on one
  # node requesting `gpu:8`; this is the only job with a 30-minute timeout.
  unit_test_optimizer:
    runs-on: [t_cluster]
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v3

      - name: test_optimizer
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py

  # Runs the three model test files under ./tests/test_model as separate
  # steps, each as a single Slurm task on one node requesting `gpu:8`.
  unit_test_model:
    runs-on: [t_cluster]
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v3

      - name: test_embedding_accuracy
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py

      - name: test_model_internlm_accuracy
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py

      - name: test_norm_accuracy
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py