fix(qwen3): fix qwen3 bias #1466
Workflow file for this run
name: e2e-tests

on:
  pull_request:
    branches:
      - "develop"
    paths-ignore:
      - "doc/**"
      - "**.md"

env:
  # The $(...) below is stored as literal text by GitHub Actions; it is only
  # evaluated by the shell where ${{ env.WORKSPACE_PREFIX }} is substituted
  # into a run step.
  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4)
  SLURM_PARTITION: llm_s

# One run per PR (or ref): a new push cancels the in-flight run.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
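# Every job below follows the same pattern: SSH to the CI login host, activate
# the shared conda environment, launch pytest under Slurm with srun, and pass
# the result to check_slurm_cancled.sh so a cancelled Slurm job is told apart
# from a real test failure. Note the quoting convention inside the unquoted
# << EOF heredocs: plain ${VAR} expands on the GitHub runner before the script
# is sent over SSH, while \$VAR (\$jobname, \$exit_code, \$?) is escaped so it
# expands on the remote host instead.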
jobs:
  training_4GPU:
    runs-on: [t_cluster]
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          # Redact the workspace prefix from all subsequent log output.
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_4GPU
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=\$jobname -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
  training_8GPU_ISP:
    runs-on: [t_cluster]
    timeout-minutes: 10
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_ISP
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=ISP-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
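  # Two-phase checkpoint test: the first srun saves a checkpoint and the
  # second reloads it. Only the load phase's exit status reaches
  # check_slurm_cancled.sh, so a failed save presumably surfaces as a
  # failed load.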
  training_8GPU_ISP_CKPT:
    runs-on: [t_cluster]
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_ISP_CKPT
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=ISP_CKPT-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_SAVE_CKPT" ./tests/test_training/test_loss.py
          jobname=LOAD-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_ISP_LOAD_CKPT" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
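  # Job names encode the parallel layout under test: 4DP2TP is presumably
  # 4-way data parallelism with 2-way tensor parallelism, 4DP2TPSP adds
  # sequence parallelism, and 4DP2PP swaps tensor for 2-stage pipeline
  # parallelism. The layout itself is selected by the pytest -m marker,
  # not by anything in this workflow.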
  training_8GPU_4DP2TP:
    strategy:
      matrix:
        runner: [t_cluster]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TP_T
        if: ${{ matrix.runner == 't_cluster' }}
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
  training_8GPU_4DP2TPSP:
    strategy:
      matrix:
        runner: [t_cluster]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2TPSP_T
        if: ${{ matrix.runner == 't_cluster' }}
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
  training_8GPU_4DP2PP:
    strategy:
      matrix:
        runner: [t_cluster]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2PP_T
        if: ${{ matrix.runner == 't_cluster' }}
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
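  # ZB presumably refers to a zero-bubble pipeline schedule applied to the
  # same 4DP2PP layout as the job above.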
  training_8GPU_4DP2PP_ZB:
    runs-on: [t_cluster]
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_8GPU_4DP2PP_ZB
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_ZB" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
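  # The 16-GPU jobs span two nodes (-n16 with --ntasks-per-node=8) and rerun
  # the same 4DP2TP2PP layout under three modes, MTP, MSP, and FSP;
  # presumably tensor-parallel and sequence-parallel variants whose exact
  # semantics are defined in the test suite.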
  training_16GPU_4DP2TP2PP_MTP:
    strategy:
      matrix:
        runner: [t_cluster]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MTP_T
        if: ${{ matrix.runner == 't_cluster' }}
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
  training_16GPU_4DP2TP2PP_MSP:
    strategy:
      matrix:
        runner: [t_cluster]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_MSP_T
        if: ${{ matrix.runner == 't_cluster' }}
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
  training_16GPU_4DP2TP2PP_FSP:
    strategy:
      matrix:
        runner: [t_cluster]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 15
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_16GPU_4DP2TP2PP_FSP_T
        if: ${{ matrix.runner == 't_cluster' }}
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
  training_llama2:
    strategy:
      matrix:
        runner: [t_cluster]
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: training_llama2_T
        run: |
          ssh ${USER}@${CI_HOST} bash << EOF
          cd $GITHUB_WORKSPACE
          source activate ${evo_env_torch21_flash2}
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=\$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
          exit_code=\$?
          sh ./ci_scripts/common/check_slurm_cancled.sh \$exit_code \$jobname
          EOF
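To reproduce one of these cases by hand, here is a minimal sketch assuming direct access to the same Slurm partition; the conda environment name is hypothetical, since ${evo_env_torch21_flash2} is resolved from a runner-side variable not shown here:

    source activate my-evo-env   # hypothetical name; use your local equivalent
    jobname=local-e2e-$(date +%s)
    srun -p llm_s --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 \
        --cpus-per-task=4 --gpus-per-task=1 \
        pytest -s -v --color=yes -m "training_8GPU_ISP" ./tests/test_training/test_loss.py
    sh ./ci_scripts/common/check_slurm_cancled.sh $? $jobname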