diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml
index e131881b..2a8f56c9 100644
--- a/.github/workflows/demo_in_readme.yaml
+++ b/.github/workflows/demo_in_readme.yaml
@@ -23,12 +23,12 @@ jobs:
     - name: raw-chinese-data
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: alpaca-data
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/data/tokenizer_alpaca.sh

   train:
@@ -44,26 +44,26 @@ jobs:
     - name: slurm-train
       id: basic_train
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: load_preset_ckpt
       if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: load_new_ckpt
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak

     - name: torchrun-train
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
@@ -79,7 +79,7 @@ jobs:
     - name: convert-model-then-load
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/model/convert_to_hf.sh
         cd ./hf_ckpt
diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 22f6ad88..96e5ab00 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -7,12 +7,13 @@ on:
       - "doc/**"
       - "**.md"
 env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

 jobs:
   training_4GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
     - name: mask env
       run: |
@@ -22,8 +23,8 @@ jobs:
     - name: training_4GPU
       run: |
-        source $evo_env
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+        source activate ${evo_env_torch21_flash2}
+        srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

   training_8GPU_ISP:
     runs-on: [t_cluster]
diff --git a/.github/workflows/pr_before_merge.yaml b/.github/workflows/pr_before_merge.yaml
index 3210a5a0..42e2a89b 100644
--- a/.github/workflows/pr_before_merge.yaml
+++ b/.github/workflows/pr_before_merge.yaml
@@ -24,6 +24,6 @@ jobs:
     - name: model_init_tests
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_init.py --seed=1024
diff --git a/.github/workflows/pr_merged.yaml b/.github/workflows/pr_merged.yaml
index ba1913cf..4d7c2c53 100644
--- a/.github/workflows/pr_merged.yaml
+++ b/.github/workflows/pr_merged.yaml
@@ -24,7 +24,7 @@ jobs:
     - name: acc_tests
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py
@@ -40,7 +40,7 @@ jobs:
     - name: loss_tests
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-loss-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_swap_nb_loss_and_gradnorm.py
diff --git a/.github/workflows/rerun.yaml b/.github/workflows/rerun.yaml
new file mode 100644
index 00000000..17c4ef9f
--- /dev/null
+++ b/.github/workflows/rerun.yaml
@@ -0,0 +1,21 @@
+name: check-status
+
+on:
+  workflow_run:
+    workflows: [unit-tests,pr-merged,weekly-tests]
+    types: [completed]
+
+jobs:
+  on-failure:
+    runs-on: ubuntu-latest
+    if: ${{ (github.event.workflow_run.head_branch == 'main' || github.event.workflow_run.head_branch == 'develop') && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.run_attempt < 3 }}
+    steps:
+      - run: |
+          echo 'The triggering workflow failed'
+          sleep 600
+          curl -L \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ github.token }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/rerun-failed-jobs
diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml
index a053b75c..70c3e63d 100644
--- a/.github/workflows/unit_tests.yaml
+++ b/.github/workflows/unit_tests.yaml
@@ -31,7 +31,7 @@ jobs:
     - name: core_pipeline
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
@@ -47,7 +47,7 @@ jobs:
     - name: utils_storage_manager
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
@@ -63,7 +63,7 @@ jobs:
     - name: model_fused_precision
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_model/test_fused_precision/test_fused_precision.py
@@ -79,7 +79,7 @@ jobs:
     - name: data_batch_sample
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_data/test_batch_sampler.py
@@ -95,7 +95,7 @@ jobs:
     - name: utils_timeout
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python -m pytest -s -v ./tests/test_utils/test_timeout.py
@@ -111,7 +111,7 @@ jobs:
     - name: utils_model_checkpoint
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:2 python -m pytest -s -v ./tests/test_utils/test_model_checkpoint.py
diff --git a/.github/workflows/upload_to_pypi.yaml b/.github/workflows/upload_to_pypi.yaml
index 15d4ed85..bfc7c93f 100644
--- a/.github/workflows/upload_to_pypi.yaml
+++ b/.github/workflows/upload_to_pypi.yaml
@@ -34,7 +34,7 @@ jobs:
     - name: build and upload package
       run: |
+        source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
-        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python setup.py sdist bdist_wheel
         twine upload -u __token__ -p ${{ secrets.PYPI_API_TOKEN }} dist/*
diff --git a/.github/workflows/weekly_test.yaml b/.github/workflows/weekly_test.yaml
index afbcdc0a..e3c2f827 100644
--- a/.github/workflows/weekly_test.yaml
+++ b/.github/workflows/weekly_test.yaml
@@ -4,12 +4,13 @@ on:
   schedule:
     - cron: '56 18 * * 5'
 env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
   SLURM_PARTITION: llm_s

 jobs:
   training_4GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
     - name: mask env
       run: |
@@ -21,8 +22,8 @@ jobs:
     - name: training_4GPU
       run: |
-        source $evo_env
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py
+        source activate ${evo_env_torch21_flash2}
+        srun -p ${SLURM_PARTITION} --exclusive --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n4 --ntasks-per-node=4 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_4GPU" ./tests/test_training/test_loss.py

   training_8GPU_4DP2TP:
     runs-on: [t_cluster]
@@ -38,7 +39,7 @@ jobs:
     - name: training_8GPU_4DP2TP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
@@ -56,7 +57,7 @@ jobs:
     - name: training_8GPU_4DP2TPSP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
@@ -74,7 +75,7 @@ jobs:
     - name: training_8GPU_4DP2PP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
@@ -92,7 +93,7 @@ jobs:
     - name: training_8GPU_4DP2PP_InterleavedOverlap
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
         sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP_InterleavedOverlap" ./tests/test_training/test_loss.py
@@ -111,7 +112,7 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_MTP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="mtp"),/' ./configs/7B_sft.py
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
@@ -130,7 +131,7 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_MSP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="msp"),/' ./configs/7B_sft.py
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
@@ -149,7 +150,7 @@ jobs:
     - name: training_16GPU_4DP2TP2PP_FSP
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         sed -i 's/^.*tensor=.*/ tensor=dict(size=2, mode="fsp"),/' ./configs/7B_sft.py
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
@@ -199,7 +200,7 @@ jobs:
     - name: test_optimizer
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py

   unit_test_model:
@@ -216,17 +217,17 @@ jobs:
     - name: test_embedding_accuracy
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py

     - name: test_model_internlm_accuracy
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py

     - name: test_norm_accuracy
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py

   load_ckpt_then_assert_loss:
@@ -243,7 +244,7 @@ jobs:
     - name: test_ckpt_loss
       run: |
-        source $evo_env
+        source activate ${evo_env_torch21_flash2}
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_load_ckpt_loss.py

   notify_to_feishu:
diff --git a/ci_scripts/model/convert_to_hf.sh b/ci_scripts/model/convert_to_hf.sh
index 8da02196..3bf381c7 100644
--- a/ci_scripts/model/convert_to_hf.sh
+++ b/ci_scripts/model/convert_to_hf.sh
@@ -12,7 +12,7 @@ readonly TOKENIZER="${GITHUB_WORKSPACE}/hf_ckpt/tokenizer.model"
 readonly CONFIG="${GITHUB_WORKSPACE}/hf_ckpt/config.json"
 readonly INERNLM="${GITHUB_WORKSPACE}/hf_ckpt/modeling_internlm.py"
 exit_code=0
-expected_num=8
+expected_num=9

 source ./ci_scripts/common/basic_func.sh
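
A note on the WORKSPACE_PREFIX entry added to e2e_test.yaml and weekly_test.yaml above: the workflow-level env value is the literal string $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4), which only gets command-substituted once a run step expands ${{env.WORKSPACE_PREFIX}} inside its bash shell. A minimal sketch of what that cut invocation produces; the workspace path below is a made-up example, not taken from the actual runners.

# Illustration only: GITHUB_WORKSPACE here is a hypothetical runner path.
GITHUB_WORKSPACE=/mnt/petrelfs/ci-runner/InternLM/InternLM
echo "$GITHUB_WORKSPACE" | cut -d '/' -f 1-4
# prints: /mnt/petrelfs/ci-runner
# i.e. the first four '/'-separated fields of the path, which the rsync steps
# then use as the ${{env.WORKSPACE_PREFIX}}/ci_clean_bak destination for moving
# checkpoints out of the workspace.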
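
The weekly_test.yaml jobs above switch parallelism modes by rewriting configs/7B_sft.py in place with sed before launching pytest. A minimal sketch of that mechanic, run against a hypothetical config excerpt rather than the real file:

# Hypothetical excerpt standing in for the parallel section of configs/7B_sft.py.
cat > /tmp/parallel_excerpt.py <<'EOF'
parallel = dict(
    zero1=dict(size=-1),
    tensor=1,
    pipeline=dict(size=1, interleaved_overlap=True),
)
EOF

# Same pattern as the training_8GPU_4DP2PP step: replace the whole line that
# mentions pipeline= so the job runs with a 2-stage pipeline.
sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' /tmp/parallel_excerpt.py
grep 'pipeline=' /tmp/parallel_excerpt.py
# prints: " pipeline=dict(size=2)," (with the single leading space that the sed
# replacement string carries).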