17
17
- name : training_8GPU
18
18
run : |
19
19
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
20
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
20
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
21
21
22
22
training_16GPU_8DP2TP :
23
23
runs-on : [t_cluster]
29
29
run : |
30
30
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
31
31
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
32
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
32
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
33
33
34
34
training_16GPU_8DP2TPSP :
35
35
runs-on : [t_cluster]
42
42
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
43
43
sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
44
44
sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
45
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
45
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
46
46
47
47
training_16GPU_8DP2PP :
48
48
runs-on : [t_cluster]
54
54
run : |
55
55
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
56
56
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
57
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
57
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
58
58
59
59
training_16GPU_8DP2PP_InterleavedOverlap :
60
60
runs-on : [t_cluster]
67
67
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
68
68
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
69
69
sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
70
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
70
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
71
71
72
72
unit_test_optimizer :
73
73
runs-on : [t_cluster]
78
78
- name : test_optimizer
79
79
run : |
80
80
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
81
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
81
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
82
82
83
83
unit_test_model :
84
84
runs-on : [t_cluster]
@@ -89,14 +89,14 @@ jobs:
89
89
- name : test_embedding_accuracy
90
90
run : |
91
91
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
92
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
92
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
93
93
94
94
- name : test_model_internlm_accuracy
95
95
run : |
96
96
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
97
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
97
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
98
98
99
99
- name : test_norm_accuracy
100
100
run : |
101
101
source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
102
- srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
102
+ srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
0 commit comments