diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml
index e131881b..2a8f56c9 100644
--- a/.github/workflows/demo_in_readme.yaml
+++ b/.github/workflows/demo_in_readme.yaml
@@ -23,12 +23,12 @@ jobs:
     - name: raw-chinese-data
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: alpaca-data
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/data/tokenizer_alpaca.sh

   train:
@@ -44,26 +44,26 @@ jobs:
     - name: slurm-train
       id: basic_train
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: load_preset_ckpt
       if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: load_new_ckpt
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak

     - name: torchrun-train
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
@@ -79,7 +79,7 @@ jobs:
     - name: convert-model-then-load
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/model/convert_to_hf.sh
         cd ./hf_ckpt
diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 22f6ad88..96e5ab00 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -7,12 +7,13 @@ on:
       - "doc/**"
       - "**.md"
 env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

 jobs:
   training_4GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
     - name: mask env
       run: |
@@ -22,8 +23,8 @@ jobs:
     - name: training_4GPU
       run: |
-        source $evo_env
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+        source activate ${evo_env_torch21_flash2}
+        srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

   training_8GPU_ISP:
     runs-on: [t_cluster]
diff --git a/.github/workflows/pr_before_merge.yaml b/.github/workflows/pr_before_merge.yaml
index 3210a5a0..42e2a89b 100644
--- a/.github/workflows/pr_before_merge.yaml
+++ b/.github/workflows/pr_before_merge.yaml
@@ -24,6 +24,6 @@ jobs:
     - name: model_init_tests
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_init.py --seed=1024
diff --git a/.github/workflows/pr_merged.yaml b/.github/workflows/pr_merged.yaml
index ba1913cf..4d7c2c53 100644
--- a/.github/workflows/pr_merged.yaml
+++ b/.github/workflows/pr_merged.yaml
@@ -24,7 +24,7 @@ jobs:
     - name: acc_tests
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py
@@ -40,7 +40,7 @@ jobs:
     - name: loss_tests
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-loss-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_swap_nb_loss_and_gradnorm.py
diff --git a/.github/workflows/rerun.yaml b/.github/workflows/rerun.yaml
new file mode 100644
index 00000000..17c4ef9f
--- /dev/null
+++ b/.github/workflows/rerun.yaml
@@ -0,0 +1,21 @@
+name: check-status
+
+on:
+  workflow_run:
+    workflows: [unit-tests,pr-merged,weekly-tests]
+    types: [completed]
+
+jobs:
+  on-failure:
+    runs-on: ubuntu-latest
+    if: ${{ (github.event.workflow_run.head_branch == 'main' || github.event.workflow_run.head_branch == 'develop') && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.run_attempt < 3 }}
+    steps:
+      - run: |
+          echo 'The triggering workflow failed'
+          sleep 600
+          curl -L \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ github.token }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/rerun-failed-jobs
diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml
index a053b75c..70c3e63d 100644
--- a/.github/workflows/unit_tests.yaml
+++ b/.github/workflows/unit_tests.yaml
@@ -31,7 +31,7 @@ jobs:
     - name: core_pipeline
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
@@ -47,7 +47,7 @@ jobs:
     - name: utils_storage_manager
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
@@ -63,7 +63,7 @@ jobs:
     - name: model_fused_precision
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_model/test_fused_precision/test_fused_precision.py
@@ -79,7 +79,7 @@ jobs:
     - name: data_batch_sample
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_data/test_batch_sampler.py
@@ -95,7 +95,7 @@ jobs:
     - name: utils_timeout
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python -m pytest -s -v ./tests/test_utils/test_timeout.py
@@ -111,7 +111,7 @@ jobs:
     - name: utils_model_checkpoint
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:2 python -m pytest -s -v ./tests/test_utils/test_model_checkpoint.py
diff --git a/.github/workflows/upload_to_pypi.yaml b/.github/workflows/upload_to_pypi.yaml
index 15d4ed85..bfc7c93f 100644
--- a/.github/workflows/upload_to_pypi.yaml
+++ b/.github/workflows/upload_to_pypi.yaml
@@ -34,7 +34,7 @@ jobs:
     - name: build and upload package
       run: |
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
-        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
         twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} dist/*
diff --git a/.github/workflows/weekly_test.yaml b/.github/workflows/weekly_test.yaml
index afbcdc0a..e3c2f827 100644
--- a/.github/workflows/weekly_test.yaml
+++ b/.github/workflows/weekly_test.yaml
@@ -4,12 +4,13 @@ on:
   schedule:
     - cron: '56 18 * * 5'
 env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

 jobs:
   training_4GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
     - name: mask env
       run: |
@@ -21,8 +22,8 @@ jobs:
     - name: training_4GPU
       run: |
-        source $evo_env
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+        source activate ${evo_env_torch21_flash2}
+        srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

   training_8GPU_4DP2TP:
     runs-on: [t_cluster]
@@ -38,7 +39,7 @@ jobs:
     - name: training_8GPU_4DP2TP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
@@ -56,7 +57,7 @@ jobs:
     - name: training_8GPU_4DP2TPSP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
@@ -74,7 +75,7 @@ jobs:
     - name: training_8GPU_4DP2PP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
@@ -92,7 +93,7 @@ jobs:
     - name: training_8GPU_4DP2PP_InterleavedOverlap
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
         sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
@@ -111,7 +112,7 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_MTP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
@@ -130,7 +131,7 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_MSP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
@@ -149,7 +150,7 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_FSP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
@@ -199,7 +200,7 @@ jobs:
     - name: test_optimizer
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py

   unit_test_model:
@@ -216,17 +217,17 @@ jobs:
     - name: test_embedding_accuracy
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py

     - name: test_model_internlm_accuracy
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py

     - name: test_norm_accuracy
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py

   load_ckpt_then_assert_loss:
@@ -243,7 +244,7 @@ jobs:
     - name: test_ckpt_loss
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_load_ckpt_loss.py

   notify_to_feishu:
diff --git a/ci_scripts/model/convert_to_hf.sh b/ci_scripts/model/convert_to_hf.sh
index 8da02196..3bf381c7 100644
--- a/ci_scripts/model/convert_to_hf.sh
+++ b/ci_scripts/model/convert_to_hf.sh
@@ -12,7 +12,7 @@ readonly TOKENIZER="${GITHUB_WORKSPACE}/hf_ckpt/tokenizer.model"
 readonly CONFIG="${GITHUB_WORKSPACE}/hf_ckpt/config.json"
 readonly INERNLM="${GITHUB_WORKSPACE}/hf_ckpt/modeling_internlm.py"
 exit_code=0
-expected_num=8
+expected_num=9

 source ./ci_scripts/common/basic_func.sh
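
A note on the WORKSPACE_PREFIX entry added to e2e_test.yaml and weekly_test.yaml above: the workflow-level env value is the literal string $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4), which only gets command-substituted once a run step expands ${{env.WORKSPACE_PREFIX}} inside its bash shell. A minimal sketch of what that cut invocation produces; the workspace path below is a made-up example, not taken from the actual runners.

# Illustration only: GITHUB_WORKSPACE here is a hypothetical runner path.
GITHUB_WORKSPACE=/mnt/petrelfs/ci-runner/InternLM/InternLM
echo "$GITHUB_WORKSPACE" | cut -d '/' -f 1-4
# prints: /mnt/petrelfs/ci-runner
# i.e. the first four '/'-separated fields of the path, which the rsync steps
# then use as the ${{env.WORKSPACE_PREFIX}}/ci_clean_bak destination for moving
# checkpoints out of the workspace.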
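
The weekly_test.yaml jobs above switch parallelism modes by rewriting configs/7B_sft.py in place with sed before launching pytest. A minimal sketch of that mechanic, run against a hypothetical config excerpt rather than the real file:

# Hypothetical excerpt standing in for the parallel section of configs/7B_sft.py.
cat > /tmp/parallel_excerpt.py <<'EOF'
parallel = dict(
    zero1=dict(size=-1),
    tensor=1,
    pipeline=dict(size=1, interleaved_overlap=True),
)
EOF

# Same pattern as the training_8GPU_4DP2PP step: replace the whole line that
# mentions pipeline= so the job runs with a 2-stage pipeline.
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' /tmp/parallel_excerpt.py
grep 'pipeline=' /tmp/parallel_excerpt.py
# prints: " pipeline=dict(size=2)," (with the single leading space that the sed
# replacement string carries).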