diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index ff992c1f..28b0e4f1 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -1,5 +1,5 @@ name: e2e-tests -on: +on: pull_request: branches: - "develop" @@ -73,7 +73,7 @@ jobs: training_8GPU_4DP2TP: strategy: matrix: - runner: [910B] + runner: [t_cluster] runs-on: ${{ matrix.runner }} timeout-minutes: 15 steps: @@ -81,21 +81,20 @@ jobs: run: | echo "::add-mask::${{env.WORKSPACE_PREFIX}}" echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - uses: actions/checkout@v3 - - name: training_8GPU_4DP2TP_910B - if: ${{ matrix.runner == '910B' }} + - name: training_8GPU_4DP2TP_T + if: ${{ matrix.runner == 't_cluster' }} run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" + source activate ${evo_env_torch21_flash2} + jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py + exit_code=$? + sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname training_8GPU_4DP2TPSP: strategy: matrix: - runner: [910B] + runner: [t_cluster] runs-on: ${{ matrix.runner }} timeout-minutes: 15 steps: @@ -103,21 +102,20 @@ jobs: run: | echo "::add-mask::${{env.WORKSPACE_PREFIX}}" echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - uses: actions/checkout@v3 - - name: training_8GPU_4DP2TPSP_910B - if: ${{ matrix.runner == '910B' }} + - name: training_8GPU_4DP2TPSP_T + if: ${{ matrix.runner == 't_cluster' }} run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" + source activate ${evo_env_torch21_flash2} + jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py + exit_code=$? 
+ sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname training_8GPU_4DP2PP: strategy: matrix: - runner: [910B] + runner: [t_cluster] runs-on: ${{ matrix.runner }} timeout-minutes: 15 steps: @@ -125,16 +123,15 @@ jobs: run: | echo "::add-mask::${{env.WORKSPACE_PREFIX}}" echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - uses: actions/checkout@v3 - - name: training_8GPU_4DP2PP_910B - if: ${{ matrix.runner == '910B' }} + - name: training_8GPU_4DP2PP_T + if: ${{ matrix.runner == 't_cluster' }} run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" + source activate ${evo_env_torch21_flash2} + jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py + exit_code=$? + sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname training_8GPU_4DP2PP_ZB: runs-on: [t_cluster] @@ -157,7 +154,7 @@ jobs: training_16GPU_4DP2TP2PP_MTP: strategy: matrix: - runner: [910B] + runner: [t_cluster] runs-on: ${{ matrix.runner }} timeout-minutes: 15 steps: @@ -165,21 +162,20 @@ jobs: run: | echo "::add-mask::${{env.WORKSPACE_PREFIX}}" echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - uses: actions/checkout@v3 - - name: training_16GPU_4DP2TP2PP_MTP_910B - if: ${{ matrix.runner == '910B' }} + - name: training_16GPU_4DP2TP2PP_MTP_T + if: ${{ matrix.runner == 't_cluster' }} run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce" + source activate ${evo_env_torch21_flash2} + jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py + exit_code=$? 
+ sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname training_16GPU_4DP2TP2PP_MSP: strategy: matrix: - runner: [910B] + runner: [t_cluster] runs-on: ${{ matrix.runner }} timeout-minutes: 15 steps: @@ -187,21 +183,20 @@ jobs: run: | echo "::add-mask::${{env.WORKSPACE_PREFIX}}" echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - uses: actions/checkout@v3 - - name: training_16GPU_4DP2TP2PP_MSP_910B - if: ${{ matrix.runner == '910B' }} + - name: training_16GPU_4DP2TP2PP_MSP_T + if: ${{ matrix.runner == 't_cluster' }} run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce" + source activate ${evo_env_torch21_flash2} + jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py + exit_code=$? + sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname training_16GPU_4DP2TP2PP_FSP: strategy: matrix: - runner: [910B] + runner: [t_cluster] runs-on: ${{ matrix.runner }} timeout-minutes: 15 steps: @@ -209,21 +204,20 @@ jobs: run: | echo "::add-mask::${{env.WORKSPACE_PREFIX}}" echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - uses: actions/checkout@v3 - - name: training_16GPU_4DP2TP2PP_FSP_910B - if: ${{ matrix.runner == '910B' }} + - name: training_16GPU_4DP2TP2PP_FSP_T + if: ${{ matrix.runner == 't_cluster' }} run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce" + source activate ${evo_env_torch21_flash2} + jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py + exit_code=$? 
+ sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname training_llama2: strategy: matrix: - runner: [910B] + runner: [t_cluster] runs-on: ${{ matrix.runner }} timeout-minutes: 20 steps: @@ -231,33 +225,11 @@ jobs: run: | echo "::add-mask::${{env.WORKSPACE_PREFIX}}" echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - uses: actions/checkout@v3 - - name: training_llama2_910B + - name: training_llama2_T run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" - - training_internlm2: - strategy: - matrix: - runner: [910B] - runs-on: ${{ matrix.runner }} - timeout-minutes: 20 - steps: - - name: mask env - run: | - echo "::add-mask::${{env.WORKSPACE_PREFIX}}" - echo "::add-mask::$path_prefix" - if [[ ${{ matrix.runner }} == 910B ]];then - sudo git clean -ffdx - fi - - uses: actions/checkout@v3 - - name: training_internlm2_910B - run: | - jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} - start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py' - bash ../910B_sco.sh $jobname "$start_command" + source activate ${evo_env_torch21_flash2} + jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT} + srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py + exit_code=$? + sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname \ No newline at end of file diff --git a/configs/57B_qwen2_MoE.py b/configs/57B_qwen2_MoE.py index 0fd67603..abfb0a5b 100644 --- a/configs/57B_qwen2_MoE.py +++ b/configs/57B_qwen2_MoE.py @@ -190,7 +190,6 @@ weight parallel (dict): 1. size: int, the size of weight parallel. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. - 3. memory_pool: bool, enable/disable memory pool, defaults to False. expert parallel (dict): 1. size: int * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size @@ -201,15 +200,14 @@ expert weight parallel (dict): 1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. - 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( zero1=dict(size=-1, fsdp=False), tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), - weight=dict(size=1, overlap=True, memory_pool=True), + weight=dict(size=1, overlap=True), expert=dict(size=-1, no_tp=False), - expert_weight=dict(size=1, overlap=True, memory_pool=True), + expert_weight=dict(size=1, overlap=True), ) cudnn_deterministic = False diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py index c558427c..8d8acc40 100644 --- a/configs/7B_MoE4_sft.py +++ b/configs/7B_MoE4_sft.py @@ -103,6 +103,20 @@ clip_grad_norm=1.0, ) + +# loss config (dict): +# 1. label_smoothing +# 2. 
op_type: cross_entropy operator type, we support five types for loss computing, +# including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"] +# default is "py_vocab_parallel". +# "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss +# "apex_naive": cross_entropy from apex +# "py_naive": self-implemented cross_entropy +# "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn +# "py_vocab_parallel": self-implemented vocab parallel cross_entropy +# * op_types that ends with "naive" only support parallel_output=False; +# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported. + loss = dict( label_smoothing=0, moe_loss_coeff=0.1, @@ -183,6 +197,10 @@ weight parallel (dict): 1. size: int, the size of weight parallel. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. launch_allgather_before: str, before which module to launch the all gather communication to + prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'. + Must be used with forward_overlap_per 'layer'. + 4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'. expert parallel (dict): 1. size: int * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size @@ -193,14 +211,18 @@ expert weight parallel (dict): 1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. launch_allgather_before: str, before which module to launch the all gather communication to + prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'. + Must be used with forward_overlap_per 'layer'. + 4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'. """ parallel = dict( zero1=dict(size=-1, fsdp=False), tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), - weight=dict(size=1, overlap=True), + weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"), expert=dict(size=-1, no_tp=False), - expert_weight=dict(size=1, overlap=True), + expert_weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"), ) cudnn_deterministic = False diff --git a/configs/7B_internlm2.py b/configs/7B_internlm2.py index 97758bba..51741703 100644 --- a/configs/7B_internlm2.py +++ b/configs/7B_internlm2.py @@ -98,9 +98,21 @@ clip_grad_norm=1.0, ) -loss = dict( - label_smoothing=0, -) + +# loss config (dict): +# 1. label_smoothing +# 2. op_type: cross_entropy operator type, we support five types for loss computing, +# including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"] +# default is "py_vocab_parallel". +# "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss +# "apex_naive": cross_entropy from apex +# "py_naive": self-implemented cross_entropy +# "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn +# "py_vocab_parallel": self-implemented vocab parallel cross_entropy + +# * op_types that ends with "naive" only support parallel_output=False; +# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported. 
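The `op_type` values documented above only switch which cross-entropy kernel is used; the rest of the loss config is unchanged. As a rough sketch of the constraints spelled out in the comment (not the actual `InternLoss` dispatch; `build_cross_entropy` and the two option sets are hypothetical helpers):

```python
import torch

NAIVE_OPS = {"torch_naive", "apex_naive", "py_naive"}
CPU_SAFE_OPS = {"torch_naive", "py_vocab_parallel"}


def build_cross_entropy(op_type: str, parallel_output: bool, label_smoothing: float = 0.0):
    """Hypothetical helper mirroring the rules stated in the config comment above."""
    if op_type in NAIVE_OPS and parallel_output:
        # "*_naive" ops compute the loss over the full (non-parallel) vocab dimension.
        raise ValueError(f"{op_type} only supports parallel_output=False")
    if not torch.cuda.is_available() and op_type not in CPU_SAFE_OPS:
        raise ValueError(f"{op_type} requires a GPU; use 'torch_naive' or 'py_vocab_parallel' instead")
    if op_type == "torch_naive":
        return torch.nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    # apex / flash-attn / vocab-parallel variants would be constructed here.
    raise NotImplementedError(op_type)
```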
+loss = dict(label_smoothing=0, op_type="py_vocab_parallel") adam = dict( lr=1e-4, diff --git a/configs/7B_isp_sft.py b/configs/7B_isp_sft.py index 2698a82f..39c78660 100644 --- a/configs/7B_isp_sft.py +++ b/configs/7B_isp_sft.py @@ -1,5 +1,5 @@ JOB_NAME = "7b_train" -# model_type = "INTERNLM2_PUBLIC" +model_type = "INTERNLM2_PUBLIC" DO_ALERT = False VOCAB_SIZE = 103168 @@ -31,7 +31,7 @@ # 'load_ckpt_info' setting guide: # 1. the 'path' indicate ckpt path, # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined # load function such as "llama" load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"), # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering @@ -108,8 +108,24 @@ clip_grad_norm=1.0, ) + +# loss config (dict): +# 1. label_smoothing +# 2. op_type: cross_entropy operator type, we support five types for loss computing, +# including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"] +# default is "py_vocab_parallel". +# "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss +# "apex_naive": cross_entropy from apex +# "py_naive": self-implemented cross_entropy +# "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn +# "py_vocab_parallel": self-implemented vocab parallel cross_entropy + +# * op_types that ends with "naive" only support parallel_output=False; +# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported. + loss = dict( label_smoothing=0, + op_type="flash_vocab_parallel", ) adam = dict( @@ -145,7 +161,7 @@ parallel_output=True, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYER, - # no_bias=True, + no_bias=True, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" @@ -186,26 +202,30 @@ weight parallel (dict): 1. size: int, the size of weight parallel. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. launch_allgather_before: str, before which module to launch the all gather communication to + prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'. + Must be used with forward_overlap_per 'layer'. + 4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'. sequence_2D (dict): 1. enable: bool, whether enable the 2D sequence parallel or not. - 2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses). + 2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses). head_size * context_size should be equal tensor size. 3. context_size: int, the parallel degree of context parallelism. head_size * context_size should be equal tensor size. 4. window_size: int, the sliding window size in context parallelism. 5. 
device_placement_strategy: dict, - head_first: bool, if `True`, ranks of the same head parallel group are + head_first: bool, if `True`, ranks of the same head parallel group are given high priority for colocation on the same node; if `False`, ranks of the same context parallel group are given high priority for colocation on the same node; - interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could + interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could interleaved the ranks in the same window to make full use of NIC as much as possible. """ parallel = dict( zero1=dict(size=-1), tensor=dict(size=2, mode="isp"), pipeline=dict(size=1, interleaved_overlap=True), - weight=dict(size=4, overlap=True), + weight=dict(size=4, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"), sequence_2D=dict( enable=False, head_size=2, diff --git a/configs/8x22B_mixtral.py b/configs/8x22B_mixtral.py index 56206bd4..debd423b 100644 --- a/configs/8x22B_mixtral.py +++ b/configs/8x22B_mixtral.py @@ -191,7 +191,6 @@ weight parallel (dict): 1. size: int, the size of weight parallel. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. - 3. memory_pool: bool, enable/disable memory pool, defaults to False. expert parallel (dict): 1. size: int * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size @@ -202,15 +201,14 @@ expert weight parallel (dict): 1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. - 3. memory_pool: bool, enable/disable memory pool, defaults to False. """ parallel = dict( zero1=dict(size=-1, fsdp=False), tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), - weight=dict(size=1, overlap=True, memory_pool=True), + weight=dict(size=1, overlap=True), expert=dict(size=-1, no_tp=False), - expert_weight=dict(size=1, overlap=True, memory_pool=True), + expert_weight=dict(size=1, overlap=True), ) cudnn_deterministic = False diff --git a/configs/8x7B_mixtral.py b/configs/8x7B_mixtral.py index f589c967..322342ea 100644 --- a/configs/8x7B_mixtral.py +++ b/configs/8x7B_mixtral.py @@ -191,7 +191,6 @@ weight parallel (dict): 1. size: int, the size of weight parallel. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. - 3. memory_pool: bool, enable/disable memory pool, defaults to False. expert parallel (dict): 1. size: int * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size @@ -202,15 +201,14 @@ expert weight parallel (dict): 1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size. 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. - 3. memory_pool: bool, enable/disable memory pool, defaults to False. 
""" parallel = dict( zero1=dict(size=-1, fsdp=False), tensor=dict(size=1, mode="mtp"), pipeline=dict(size=1, interleaved_overlap=True), - weight=dict(size=1, overlap=True, memory_pool=True), + weight=dict(size=1, overlap=True), expert=dict(size=-1, no_tp=False), - expert_weight=dict(size=1, overlap=True, memory_pool=True), + expert_weight=dict(size=1, overlap=True), ) cudnn_deterministic = False diff --git a/internlm/checkpoint/load_funcs.py b/internlm/checkpoint/load_funcs.py index d23cae63..dde4bc52 100644 --- a/internlm/checkpoint/load_funcs.py +++ b/internlm/checkpoint/load_funcs.py @@ -1,6 +1,7 @@ # Copyright (c) InternLM. All rights reserved. from internlm.model.modeling_internlm import InternLM1 +from internlm.model.modeling_internlm2 import InternLM2 from internlm.model.modeling_llama import Llama2 from internlm.utils.logger import get_logger @@ -9,4 +10,5 @@ LOAD_FUNC_DICT = { "llama": Llama2.load_llama_pretrained_weights, "internlm_test": InternLM1.load_internlm_with_dynamic_parallel_size, + "internlm2_test": InternLM2.load_internlm2_with_dynamic_parallel_size, } diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index 989b1c00..f4751f59 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -21,8 +21,14 @@ from internlm.utils.timeout import LLM_NCCL_TIMEOUT from internlm.utils.utils import TensorParallelMode -from . import process_group_initializer as pgroup_initializer -from .process_group_initializer import ParallelMode +from .process_group_initializer import ( + GroupConfig, + ParallelMode, + create_parallel_process_groups, + create_single_process_group, + generate_2d_attn_process_group, + generate_parallel_group_configs, +) from .random import add_seed, get_seeds, set_mode # for layernorm @@ -633,60 +639,47 @@ def init_parallel_groups(self): self.check_sanity() - initializer_args = [ - rank, - world_size, - self.weight_parallel_size, - self.weight_data_parallel_size, - self.sequence_parallel_size, - self.data_parallel_size, - self.pipeline_parallel_size, - self.tensor_parallel_size, - self.zero1_parallel_size, - self.nettest_parallel_size, - self.expert_parallel_size, - self.expert_tensor_parallel_size, - self.expert_weight_parallel_size, - self.expert_data_parallel_size, - parallel_config.sequence_2D, - ] - - # run initialization of different process groups - initializers = [] - if "gqa" in parallel_config and parallel_config["gqa"] is True: - initializers.append(pgroup_initializer.Initializer_GQA(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Weight(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Weight_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Data(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_ISP_Data(*initializer_args)) - if ( - isinstance(parallel_config["tensor"], dict) - and parallel_config["tensor"]["mode"] == TensorParallelMode.isp.name - ): - initializers.append(pgroup_initializer.Initializer_Zero1_ISP(*initializer_args)) - else: - initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args)) - if isinstance(parallel_config["zero1"], dict) and parallel_config["zero1"].get("fsdp", False): - initializers.append(pgroup_initializer.Initializer_Zero3_dp(*initializer_args)) - initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args)) - if 
self.pipeline_parallel_size > 1: - initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args)) - if self.config.model.get("num_experts", 1) > 1: - if isinstance(parallel_config["tensor"], dict) and parallel_config["tensor"]["mode"] == "isp": - initializers.append(pgroup_initializer.Initializer_Expert_Weight_Data(*initializer_args)) - else: - initializers.append(pgroup_initializer.Initializer_Expert_Data(*initializer_args)) + parallel_sizes = { + ParallelMode.TENSOR: self.tensor_parallel_size, + ParallelMode.SEQUENCE: self.sequence_parallel_size, + ParallelMode.PIPELINE: self.pipeline_parallel_size, + ParallelMode.DATA: self.data_parallel_size, + ParallelMode.ZERO1: self.zero1_parallel_size, + ParallelMode.WEIGHT: self.weight_parallel_size, + ParallelMode.WEIGHT_DATA: self.weight_data_parallel_size, + ParallelMode.NETTEST: self.nettest_parallel_size, + ParallelMode.EXPERT: self.expert_parallel_size, + ParallelMode.EXPERT_WEIGHT: self.expert_weight_parallel_size, + ParallelMode.EXPERT_TENSOR: self.expert_tensor_parallel_size, + ParallelMode.EXPERT_DATA: self.expert_data_parallel_size, + } + + # process groups for parallelism. + enable_moe = self.config.model.get("num_experts", 1) > 1 + tp_mode = "mtp" if isinstance(parallel_config.tensor, int) else parallel_config.tensor.get("mode", "mtp") + is_fsdp = False if isinstance(parallel_config.zero1, int) else parallel_config.zero1.get("fsdp", False) + parallel_strategy = "fsdp" if is_fsdp else tp_mode + group_configs = generate_parallel_group_configs(parallel_strategy, parallel_sizes, enable_moe) + group_results = create_parallel_process_groups(world_size, rank, group_configs, with_cpu_group=False) + + # process group for network test. + group_results.append( + create_single_process_group( + world_size, + rank, + GroupConfig(ParallelMode.NETTEST, self.nettest_parallel_size, allow_partial_group=True), + ) + ) + + # process group for isp 2D attn. 
if parallel_config.sequence_2D.get("enable", False) is True: - initializers.append(pgroup_initializer.Initializer_2D_SEQUENCE_PARALLEL(*initializer_args)) + group_results.extend( + generate_2d_attn_process_group(world_size, rank, parallel_config.sequence_2D, parallel_sizes) + ) - for initializer in initializers: - parallel_setting = initializer.init_dist_group() - if isinstance(parallel_setting, list): - for args in parallel_setting: - self._register_dist(*args) - else: - self._register_dist(*parallel_setting) + # register process groups + for result in group_results: + self._register_dist(*result) def is_initialized(self, parallel_mode: ParallelMode): """Returns a boolean value indicating whether `parallel_mode` is initialized diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index fbc3e07a..1e805738 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -6,11 +6,16 @@ import math from abc import ABC, abstractmethod from enum import Enum +from functools import reduce +from typing import Any, Dict, List, Optional, Tuple, Union import torch.distributed as dist +from internlm.utils.logger import get_logger from internlm.utils.timeout import LLM_NCCL_TIMEOUT +logger = get_logger(__file__) + # parallel modes class ParallelMode(Enum): @@ -69,9 +74,6 @@ class ParallelMode(Enum): # real data parallel for isp ISP_DATA = "isp_data" - # grouped query attention - GQA = "gqa" - # sequence 2D parallel HEAD = "head" CONTEXT = "context" @@ -81,6 +83,349 @@ class ParallelMode(Enum): DKV_INTRA_WINDOW = "dkv_intra_window" +class GroupConfig: + """config for initialze a process group""" + + def __init__( + self, + mode: ParallelMode, + size: int, + anonymous: bool = False, + allow_partial_group: bool = False, + subgroups: Optional[List["GroupConfig"]] = None, + ) -> None: + self.mode = mode + self.size = size + self.anonymous = anonymous + self.allow_partial_group = allow_partial_group + self.subgroups = subgroups if subgroups is not None else [] + + self._early_subgroup_checking() + + def _early_subgroup_checking(self) -> None: + if len(self.subgroups) == 0: + return + + group_target_size = reduce(lambda x, y: x * y, [_g.size for _g in self.subgroups]) + assert group_target_size <= self.size, "subgroup size should less than father group" + + +def init_cpu_group(group, ranks, use_cpu: bool = False): + if use_cpu: + cpu_group = ( + dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) if dist.get_backend() != "gloo" else group + ) + else: + cpu_group = None + + return cpu_group + + +def get_group_ranks( + global_ranks_or_sizes: Union[int, List[int]], + cur_group_size: int, + pre_group_size: int, + allow_partial_group: bool = False, +): + group_ranks = [] + + if isinstance(global_ranks_or_sizes, list): + global_size = len(global_ranks_or_sizes) + global_ranks = global_ranks_or_sizes + else: + global_size = global_ranks_or_sizes + global_ranks = None + + real_global_size = global_size + + if allow_partial_group: + global_size = math.ceil(global_size / cur_group_size) * cur_group_size + + assert global_size % cur_group_size == 0, "err1" + + def _get_local_starts(): + for i in range(0, global_size, cur_group_size * pre_group_size): + for j in range(pre_group_size): + yield 0 + i + j + + for start in _get_local_starts(): + ranks = [ + start + i * pre_group_size for i in range(cur_group_size) if start + i * pre_group_size < real_global_size + ] + if global_ranks is not 
None: + ranks = [global_ranks[_idx] for _idx in ranks] + + group_ranks.append(ranks) + + assert len(group_ranks) == global_size // cur_group_size, f"{group_ranks}, {global_size}, {cur_group_size}" + + return group_ranks + + +def _create_parallel_process_groups( + global_ranks_or_sizes: int, + self_rank: int, + pre_group_size: int, + group_configs: List[GroupConfig], + with_cpu_group: bool = False, +): + group_results = [] + + for group in group_configs: + if group.anonymous is True: + pre_group_size = pre_group_size * group.size + continue + + group_ranks, accelerator_group = None, None + all_group_ranks = get_group_ranks(global_ranks_or_sizes, group.size, pre_group_size, group.allow_partial_group) + + for idx, ranks in enumerate(all_group_ranks): + _pg = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) + if self_rank in ranks: + group_ranks, accelerator_group = all_group_ranks[idx], _pg + else: + dist.destroy_process_group(_pg) + + if group_ranks is None: + pre_group_size = pre_group_size * group.size + continue + + cpu_group = init_cpu_group(accelerator_group, group_ranks, with_cpu_group) + + group_results.append( + (group_ranks.index(self_rank), len(group_ranks), accelerator_group, cpu_group, group_ranks, group.mode) + ) + + if len(group.subgroups) > 0: + subgroup_results = _create_parallel_process_groups( + global_ranks_or_sizes, self_rank, pre_group_size, group.subgroups, with_cpu_group + ) + group_results.extend(subgroup_results) + + pre_group_size = pre_group_size * group.size + + return group_results + + +def create_parallel_process_groups( + world_size: int, self_rank: int, group_configs: List[List[GroupConfig]], with_cpu_group: bool = False +): + group_results = [] + already_allocated_group = {} + + def _checker(order: str, result: Tuple[Any]) -> bool: + parallel_mode = result[-1] + + if parallel_mode not in already_allocated_group: + already_allocated_group[parallel_mode] = (order, result) + return True + else: + # check + ranks_in_group_idx = -2 + pre_order, pre_allocate_result = already_allocated_group[parallel_mode] + + error_msg = ( + f"The ranks allocated for {parallel_mode} are inconsistent in config {pre_order} and {order}: " + + f"{pre_allocate_result[ranks_in_group_idx]} != {result[ranks_in_group_idx]}" + ) + assert pre_allocate_result[ranks_in_group_idx] == result[ranks_in_group_idx], error_msg + + # release process group + dist.destroy_process_group(result[2]) # accelerator_group + if with_cpu_group: + dist.destroy_process_group(result[3]) # cpu_group + + return False + + for order, group_config in group_configs: + pre_group_size = 1 + + results = _create_parallel_process_groups( + world_size, + self_rank, + pre_group_size, + group_config, + with_cpu_group, + ) + + for result in results: + if _checker(order, result) is True: + group_results.append(result) + + return group_results + + +def create_single_process_group( + world_size: int, self_rank: int, config: GroupConfig, with_cpu_group: bool = False, pre_anonymous_size: int = 1 +): + pre_group_size = pre_anonymous_size + + return _create_parallel_process_groups( + world_size, + self_rank, + pre_group_size, + [config], + with_cpu_group, + )[0] + + +MTP_GROUP_ORDER = [ParallelMode.TENSOR, ParallelMode.DATA, ParallelMode.PIPELINE] +MTP_MOE_GROUP_ORDER = [ParallelMode.EXPERT_TENSOR, ParallelMode.EXPERT, ParallelMode.EXPERT_DATA, ParallelMode.PIPELINE] +ISP_SP_GROUP_ORDER = [ParallelMode.TENSOR, ParallelMode.DATA, ParallelMode.PIPELINE] +ISP_WP_GROUP_ORDER = [ParallelMode.WEIGHT, ParallelMode.WEIGHT_DATA, 
ParallelMode.PIPELINE] +ISP_MOE_GROUP_ORDER = [ParallelMode.EXPERT_WEIGHT, ParallelMode.EXPERT, ParallelMode.EXPERT_DATA, ParallelMode.PIPELINE] +FSDP_ORDER = [ParallelMode.DATA] # TODO: should we support moe for fsdp? + +SUBGROUP_SPEC = { + "mtp": { + ParallelMode.DATA: [ParallelMode.ZERO1], + }, + "isp": { + ParallelMode.WEIGHT_DATA: [ParallelMode.ZERO1], + }, # TODO: WEIGHT_ZERO1 + "fsdp": { + ParallelMode.DATA: [ParallelMode.ZERO3_DP, ParallelMode.ZERO1], + }, +} + + +def generate_parallel_group_configs( + parallel_strategy: str, parallel_sizes: Dict[ParallelMode, int], enable_moe: bool = False +) -> List[List[GroupConfig]]: + + group_configs = [] + subgroup_spec = SUBGROUP_SPEC.get(parallel_strategy, SUBGROUP_SPEC["mtp"]) + + def _recurse_generater(order: List[ParallelMode]): + config = [] + + for mode in order: + # disable pp process group for compatibility when pp size is 1. + anonymous = mode is ParallelMode.PIPELINE and parallel_sizes[mode] == 1 + + if mode not in subgroup_spec: + config.append(GroupConfig(mode, parallel_sizes[mode], anonymous)) + else: + config.append( + GroupConfig( + mode, parallel_sizes[mode], anonymous, subgroups=_recurse_generater(subgroup_spec[mode]) + ) + ) + + return config + + if parallel_strategy == "isp": + # sp configs + group_configs.append(("isp-sp", _recurse_generater(ISP_SP_GROUP_ORDER))) + # wp configs + group_configs.append(("isp-wp", _recurse_generater(ISP_WP_GROUP_ORDER))) + if enable_moe: + group_configs.append(("isp-moe", _recurse_generater(ISP_MOE_GROUP_ORDER))) + elif parallel_strategy == "fsdp": + group_configs.append(("fsdp", _recurse_generater(FSDP_ORDER))) + else: # 3d parallel: mtp, msp, fsp + group_configs.append(("3d", _recurse_generater(MTP_GROUP_ORDER))) + if enable_moe: + group_configs.append(("3d-moe", _recurse_generater(MTP_MOE_GROUP_ORDER))) + + return group_configs + + +def generate_2d_attn_process_group( + world_size: int, + self_rank: int, + config: Dict[str, Any], + parallel_sizes: Dict[ParallelMode, int], + with_cpu_group: bool = False, +): + + assert config.context_size * config.head_size == parallel_sizes[ParallelMode.SEQUENCE] + assert world_size % parallel_sizes[ParallelMode.SEQUENCE] == 0 + + if config.window_size >= 8 or config.window_size == config.context_size: + logger.warning("interleaved is forced False when window size > 8 or equals context size.") + config.interleaved = False + + if config.device_placement_strategy.head_first and config.head_size > 1: + logger.warning("interleaved is forced False when head_first is True and head size > 1.") + config.interleaved = False + + group_results = [] + sp_pre_group_size = 1 + for parallel_mode in ISP_SP_GROUP_ORDER: + if parallel_mode is ParallelMode.TENSOR: # assert sp is tp. + break + else: + sp_pre_group_size *= parallel_sizes[parallel_mode] + + # head and context process groups. + if config.device_placement_strategy.head_first: + group_configs = [ + GroupConfig(ParallelMode.HEAD, config.head_size), + GroupConfig(ParallelMode.CONTEXT, config.context_size), + ] + context_results_index = 1 + else: + group_configs = [ + GroupConfig(ParallelMode.CONTEXT, config.context_size), + GroupConfig(ParallelMode.HEAD, config.head_size), + ] + context_results_index = 0 + + group_results.extend( + _create_parallel_process_groups(world_size, self_rank, sp_pre_group_size, group_configs, with_cpu_group) + ) + + # window process groups. 
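The grouping helpers above all reduce to the same pattern: cut the ranks into blocks of `cur_group_size`, strided by `pre_group_size` (the product of the sizes of the groups created before it). A standalone sketch of that pattern, simplified from `get_group_ranks` (no partial-group or explicit-rank-list handling):

```python
from typing import List


def simple_group_ranks(world_size: int, group_size: int, stride: int = 1) -> List[List[int]]:
    """Simplified get_group_ranks: contiguous groups of `group_size`, strided by `stride`."""
    groups = []
    for start in range(0, world_size, group_size * stride):
        for offset in range(stride):
            groups.append([start + offset + i * stride for i in range(group_size)])
    return groups


# e.g. 8 ranks with sequence_2D head_size=2, context_size=4 and head_first placement:
print(simple_group_ranks(8, 2, 1))  # head groups:    [[0, 1], [2, 3], [4, 5], [6, 7]]
print(simple_group_ranks(8, 4, 2))  # context groups: [[0, 2, 4, 6], [1, 3, 5, 7]]
```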
+ window_num = config.context_size // config.window_size + cp_pre_group_size = 1 if context_results_index == 0 else config.head_size + every_context_ranks = get_group_ranks(world_size, config.context_size, cp_pre_group_size) + + def _gen_window_process_groups(context_ranks: List[int]): + if not config.device_placement_strategy.interleaved: + window_ranks = context_ranks + else: + _indexes = [ + j * 2 + i * config.window_size if i % 2 == 0 else j * 2 + 1 + (i - 1) * config.window_size + for i in range(window_num) + for j in range(config.window_size) + ] + window_ranks = [context_ranks[_i] for _i in _indexes] + + group_results.extend( + _create_parallel_process_groups( + window_ranks, + self_rank, + 1, + [ + GroupConfig(ParallelMode.INTRA_WINDOW, config.window_size), + GroupConfig(ParallelMode.INTER_WINDOW, window_num), + ], + with_cpu_group, + ) + ) + group_results.extend( + _create_parallel_process_groups( + window_ranks, + self_rank, + 1, + [ + GroupConfig(ParallelMode.DKV_INTRA_WINDOW, config.window_size), + GroupConfig(ParallelMode.DKV_INTER_WINDOW, window_num), + ], + with_cpu_group, + ) + ) + + for context_ranks in every_context_ranks: + _gen_window_process_groups(context_ranks) + + # print(get_group_ranks(window_ranks, config.window_size, 1)) + # print(get_group_ranks(window_ranks, window_num, config.window_size)) + + return group_results + + class ProcessGroupInitializer(ABC): """An object, knowing the parallelism configuration, that initializes parallel groups. @@ -1106,86 +1451,6 @@ def init_dist_group(self, use_cpu: bool = False): return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode -class Initializer_GQA(ProcessGroupInitializer): - """A ProcessGroupInitializer for allreduce kv gradients with common attention head. - - Args: - rank (int): The rank of current process. - world_size (int): Size of whole communication world. - weight_parallel_size (int): Size of model weight parallel. - weight_data_parallel_size (int): Size of data parallel for common weight. - sequence_parallel_size (int): Size of data sequence parallel. - data_parallel_size (int): Size of data parallel. - pipeline_parallel_size (int): Size of pipeline parallel. - tensor_parallel_size (int): Size of tensor parallel. - zero1_parallel_size (int): Size of zero1 parallel. - nettest_parallel_size (int): Size of net testing parallel. - expert_parallel_size (int): Size of expert parallel. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # TODO: should adapt to general case - self.num_kv_attention_heads = 8 - self.NUM_ATTENTION_HEAD = 32 - self.kv_head_repeats_num = self.NUM_ATTENTION_HEAD // self.num_kv_attention_heads - self.num_kv_group_per_tp = self.num_kv_attention_heads - self.num_kv_groups = self.num_kv_group_per_tp * self.data_parallel_size - - assert self.world_size % self.tensor_parallel_size == 0 - assert self.world_size % (self.pipeline_parallel_size * self.tensor_parallel_size) == 0 - assert self.pipeline_parallel_size == 1 - - def init_dist_group(self, use_cpu: bool = False): - """Initialize weight's data parallel groups, and assign local_ranks and groups to each gpu. - - Returns: - Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode): - A WEIGHT_DATA parallelism's information tuple. - - n=128 sp=32 wp=64 zo1=1 with nopp - sp groups: [0-31] [32-63] [64-95] [96-127] - wp groups: [0-63] [64-127] - kv_head groups: [0,1,2,3] [4,5,6,7] [8,9,10,11] [12,13,14,15] - [16,17,18,19] [20,21,22,23] [24,25,26,27] [28,29,30,31] - ... 
- ... - ... - """ - local_rank = None - ranks_in_group = None - process_group = None - cpu_group = None - group_world_size = None - mode = ParallelMode.GQA - - # TODO: consider PP - for i in range(self.data_parallel_size): - for j in range(self.num_kv_group_per_tp): - ranks = [ - i * self.tensor_parallel_size + j * self.kv_head_repeats_num + k - for k in range(self.kv_head_repeats_num) - ] - group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT) - if use_cpu: - group_cpu = ( - dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT) - if dist.get_backend() != "gloo" - else group - ) - else: - group_cpu = None - - if self.rank in ranks: - local_rank = ranks.index(self.rank) - group_world_size = len(ranks) - process_group = group - cpu_group = group_cpu - ranks_in_group = ranks - - return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode - - class Initializer_2D_SEQUENCE_PARALLEL(ProcessGroupInitializer): """ A ProcessGroupInitializer for 2D sequence parallel. diff --git a/internlm/core/parallel/comm/__init__.py b/internlm/core/parallel/comm/__init__.py index e69de29b..be170f28 100644 --- a/internlm/core/parallel/comm/__init__.py +++ b/internlm/core/parallel/comm/__init__.py @@ -0,0 +1,3 @@ +from .attn_offload import get_offload_manager, initialize_offload_manager + +__all__ = ["initialize_offload_manager", "get_offload_manager"] diff --git a/internlm/core/parallel/comm/attn_offload.py b/internlm/core/parallel/comm/attn_offload.py new file mode 100644 index 00000000..da23f3ae --- /dev/null +++ b/internlm/core/parallel/comm/attn_offload.py @@ -0,0 +1,127 @@ +import torch + +from internlm.utils.common import get_current_device + +global_attn_offload = None + + +class AttnOffloadManager: + """ + A manager for attention output CPU offloading and GPU prefetch loading. 
+ """ + + def __init__(self, enable_cpu_offload: bool = False) -> None: + # cpu offload overlapping + self.cpu_offload = enable_cpu_offload + # layer id mapping to flash attn output + self.fa_output_mapping = {} + self.fa_stream = torch.cuda.Stream() + self.d2h_final_event = torch.cuda.Event() + self.h2d_final_event = torch.cuda.Event() + # prepare for tensor buffer + self.tensor_id_to_tensor_bufs = {} + + def get_tensor_buf_for_offloaded_tensor(self, tensor, layer_id, tensor_id): + """Get tensor buffer for offloaded tensor.""" + layer_id = layer_id % 2 + if layer_id not in self.tensor_id_to_tensor_bufs: + self.tensor_id_to_tensor_bufs[layer_id] = {} + + if tensor_id not in self.tensor_id_to_tensor_bufs[layer_id]: + allocate_new_buf = True + else: + tensor_buf = self.tensor_id_to_tensor_bufs[layer_id][tensor_id] + allocate_new_buf = tensor_buf.size() == tensor.size() and tensor_buf.dtype == tensor.dtype + + if allocate_new_buf: + # supposed to only execute once + buffer = torch.empty( + tensor.size(), + dtype=tensor.dtype, + layout=tensor.layout, + device=tensor.device, + ) + + self.tensor_id_to_tensor_bufs[layer_id][tensor_id] = buffer + + return self.tensor_id_to_tensor_bufs[layer_id][tensor_id] + + def insert_fa_output_with_layer(self, layer_idx, output): + assert layer_idx not in self.fa_output_mapping + if self.cpu_offload is False: + self.fa_output_mapping[layer_idx] = output + return + + tensors = [] + for tensor_id, tensor in enumerate(output): + if tensor is None: + tensors.append(None) + continue + tensor_buf = self.get_tensor_buf_for_offloaded_tensor(tensor, layer_idx, tensor_id) + tensor_buf.copy_(tensor) + tensors.append(tensor_buf) + self.fa_output_mapping[layer_idx] = tensors + + def get_fa_output_with_layer(self, layer_idx): + assert layer_idx in self.fa_output_mapping + return self.fa_output_mapping.pop(layer_idx) + + def offload_fa_output_with_layer(self, layer_idx): + assert layer_idx in self.fa_output_mapping + + self.fa_stream.wait_stream(torch.cuda.current_stream()) + self.fa_stream.wait_event(self.d2h_final_event) + + with torch.cuda.stream(self.fa_stream): + _gpu_tensors = self.fa_output_mapping.pop(layer_idx) + _cpu_tensors = [] + for _tensor in _gpu_tensors: + if _tensor is None: + _cpu_tensors.append(_tensor) + continue + + _cpu_backup = torch.empty( + _tensor.size(), + dtype=_tensor.dtype, + layout=_tensor.layout, + device="cpu", + pin_memory=True, + ) + _cpu_backup.copy_(_tensor, non_blocking=True) + _cpu_tensors.append(_cpu_backup) + + # _cpu_tensors.append(_tensor.to("cpu", non_blocking=False)) + + self.fa_output_mapping[layer_idx] = _cpu_tensors + + self.fa_stream.record_event(self.d2h_final_event) + + def preload_fa_output_with_layer(self, layer_idx): + assert layer_idx in self.fa_output_mapping + + self.fa_stream.wait_stream(torch.cuda.current_stream()) + self.fa_stream.wait_event(self.h2d_final_event) + + # Important: get device before with stream, in stream get device is error + _device = get_current_device() + with torch.cuda.stream(self.fa_stream): + _cpu_tensors = self.fa_output_mapping.pop(layer_idx) + self.fa_output_mapping[layer_idx] = [ + _tensor.to(device=_device, non_blocking=True) if _tensor is not None else _tensor + for _tensor in _cpu_tensors + ] + + self.fa_stream.record_event(self.h2d_final_event) + + +def initialize_offload_manager(enable_cpu_offload: bool = False): + global global_attn_offload + if global_attn_offload is None: + global_attn_offload = AttnOffloadManager(enable_cpu_offload) + + return global_attn_offload + + +def 
get_offload_manager(): + assert global_attn_offload is not None + return global_attn_offload diff --git a/internlm/core/parallel/comm/isp.py b/internlm/core/parallel/comm/isp.py index c80c32d8..23a92980 100644 --- a/internlm/core/parallel/comm/isp.py +++ b/internlm/core/parallel/comm/isp.py @@ -37,6 +37,8 @@ params_dispatch_with_condition, ) +from .attn_offload import get_offload_manager + # not really useful, only for code hint. class WPCommunicator(ABC): @@ -266,7 +268,6 @@ def __init__( dtype: torch.dtype = torch.half, device: torch.device = None, activation_checkpointing: float = 0.0, - module_shapes: Dict[str, torch.Size] = None, ) -> None: self.dtype = dtype if device is None: @@ -274,7 +275,6 @@ def __init__( else: self.device = device self.activation_checkpointing = activation_checkpointing - self.module_shapes = module_shapes class ISPOverlapState: @@ -285,7 +285,7 @@ class ISPOverlapState: def __init__(self) -> None: self.num_blocks: int = 0 self.ckpt_block_num: int = 0 - self.isp_outs: List[nn.Module] = [] + self.isp_prefetch_launch_module: List[nn.Module] = [] self.isp_modules: List[nn.Module] = [] self.index_to_isp_modules: Dict[int, nn.Module] = {} self.index_to_block: Dict[int, nn.Module] = {} @@ -308,6 +308,7 @@ def __init__( overlap: bool = False, process_group: dist.ProcessGroup = None, is_moe: bool = False, + selective_ckpt_offload: bool = False, early_reduce_scatter_release: bool = True, ) -> None: self.process_group = process_group @@ -316,13 +317,22 @@ def __init__( self.is_moe = is_moe self.is_forward = True self.reduce_scatter_handlers = {} - self._module_shapes = {} self._forward_prefetch_prerequisites = [] self._zero_const_pool = {} self._enable_early_reduce_scatter_release = early_reduce_scatter_release self._early_prev_layer_rs_handles = [] self._early_curr_layer_rs_handles = [] + self._forward_overlap_per = self._get_forward_overlap_granularity() + self._launch_before_module = self._get_launch_before_module() + # As an optimization, do not release weight after forward for the last + # transformer block since wp would prefetch it immediately + self.layers_wp_not_release = [] # [gpc.config.isp_num_layers - 1] + self.layers_fa_not_release = [ + gpc.config.isp_num_layers - 1, + int(gpc.config.model.checkpoint * gpc.config.isp_num_layers) - 1, + ] + self.sc_offload = selective_ckpt_offload # real overlap state for each chunk. self._overlap_states: Dict[int, ISPOverlapState] = {} @@ -330,7 +340,7 @@ def __init__( # inner interface variables of overlap state. self._num_blocks = None self._ckpt_block_num = None - self._isp_outs = None + self._isp_prefetch_launch_module = None self._isp_modules = None # key: isp module; value: module global all-gather op handle self._weight_global_handle = None @@ -357,7 +367,32 @@ def __init__( self._register_sync_parameters_hook() # switch to chunk 0 at first. 
self.switch_current_model_chunk(0) - self.model_conf.module_shapes = self._module_shapes + + def _get_launch_before_module(self): + if self.is_moe is True: + _launch_before = gpc.config.parallel.expert_weight.get("launch_allgather_before", "wo") + else: + _launch_before = gpc.config.parallel.weight.get("launch_allgather_before", "wo") + + if _launch_before == "wqkv": + return ["wqkv", "Wqkv", "qkv", "q_a_proj", "q_proj"] + elif _launch_before == "attn": + return ["attn"] + elif _launch_before == "wo": + return ["out_proj", "wo"] + elif _launch_before == "w1": + return ["w1", "fused_w1_w3"] + else: + assert False, "launch module should be in ['wqkv', 'attn', 'wo', 'w1']" + + def _get_forward_overlap_granularity(self): + if self.is_moe is True: + _overlap_granularity = gpc.config.parallel.expert_weight.get("forward_overlap_per", "layer") + else: + _overlap_granularity = gpc.config.parallel.weight.get("forward_overlap_per", "layer") + + assert _overlap_granularity in ["module", "layer"] + return _overlap_granularity def _parse_model_structure(self, cid: int, model: nn.Module) -> None: self._overlap_states[cid] = ISPOverlapState() @@ -365,6 +400,13 @@ def _parse_model_structure(self, cid: int, model: nn.Module) -> None: def get_model(obj: nn.Module) -> nn.Module: return get_model(obj.model) if hasattr(obj, "model") else obj + def is_allgather_launch_module(name, module): + return ( + hasattr(module, "is_attn_cls") + and getattr(module, "is_attn_cls") + and self._launch_before_module == ["attn"] + ) or (name.split(".")[-1] in self._launch_before_module) + # Important: only works for llama-class models children_name = get_model(model).named_children() for _, children in children_name: @@ -375,23 +417,18 @@ def get_model(obj: nn.Module) -> nn.Module: self._overlap_states[cid].index_to_isp_modules[idx] = [] self._overlap_states[cid].index_to_block[idx] = block for name, child in block.named_modules(): - if name.split(".")[-1] in ["out_proj", "wo"]: - self._overlap_states[cid].isp_outs.append(child) - self._overlap_states[cid].module_to_index[child] = idx + if is_allgather_launch_module(name, child): + self._overlap_states[cid].isp_prefetch_launch_module.append(child) if isinstance(child, (ParallelLinearWithCommExt)): if is_moe_param(child.weight) != self.is_moe: continue - if name not in self._module_shapes: - weight_parallel_size = dist.get_world_size(self.process_group) - origin_shape = tuple( - [child.weight.shape[0] * weight_parallel_size] + list(child.weight.shape[1:]) - ) - self._module_shapes[name] = torch.Size(origin_shape) + self._overlap_states[cid].module_to_index[child] = idx self._overlap_states[cid].isp_modules.append(child) self._overlap_states[cid].index_to_isp_modules[idx].append(child) setattr(child, "isp_name", name) + setattr(child, "isp_layer_idx", idx) full_name = f"{cid}.{idx}.{name}" setattr( @@ -409,25 +446,28 @@ def get_model(obj: nn.Module) -> nn.Module: self._overlap_states[cid].num_blocks = len(self._overlap_states[cid].index_to_isp_modules) def _all_gather_module_weight(self, module): + assert module not in self._bias_global_output and module not in self._weight_global_output with_bias = module.bias is not None # submit the all-gather communication for weight and bias. 
if with_bias: - bias_output, bias_handle = all_gather_raw( - module.bias, + if module not in self._bias_global_output: + bias_output, bias_handle = all_gather_raw( + module.bias, + self.process_group, + async_op=True, + ) + self._bias_global_handle[module] = bias_handle + self._bias_global_output[module] = bias_output + + if module not in self._weight_global_output: + weight_output, weight_handle = all_gather_raw( + module.weight, self.process_group, async_op=True, ) - self._bias_global_handle[module] = bias_handle - self._bias_global_output[module] = bias_output - - weight_output, weight_handle = all_gather_raw( - module.weight, - self.process_group, - async_op=True, - ) - self._weight_global_handle[module] = weight_handle - self._weight_global_output[module] = weight_output + self._weight_global_handle[module] = weight_handle + self._weight_global_output[module] = weight_output def _all_gather_block_weight(self, block_index: int): block = self._index_to_block[block_index] @@ -469,23 +509,39 @@ def _pre_forward_hook_for_first_block(self, *args): # pylint: disable=W0613 """ prefetch weight for block 0 before forward. """ - if self.is_forward is True: + if self._forward_overlap_per == "layer" and self.is_forward is True: self._all_gather_block_weight(0) - def _pre_forward_hook_for_last_ckpt_block(self, *args): # pylint: disable=W0613 - if self.is_forward is False: - self._all_gather_block_weight(self._ckpt_block_num - 1) - - def _pre_forward_hook_for_out_proj(self, module: nn.Module, *args): # pylint: disable=W0613 + def _pre_forward_hook_for_prefetch_launch_module(self, module: nn.Module, *args): # pylint: disable=W0613 block_index = self._module_to_index[module] - if (block_index - 1 < self._ckpt_block_num) and self.is_forward is False: - if block_index - 1 >= 0: - self._all_gather_block_weight(block_index - 1) - else: - # start the all-gather for next block - if block_index + 1 < self._num_blocks: - self._all_gather_block_weight(block_index + 1) + if self._forward_overlap_per == "layer": + if (block_index - 1 < self._ckpt_block_num) and self.is_forward is False: + if block_index - 1 >= 0: + self._all_gather_block_weight(block_index - 1) + else: + # start the all-gather for next block + if block_index + 1 < self._num_blocks: + self._all_gather_block_weight(block_index + 1) + + # register offload and prefetch hook for selective ckpt with wo linear + if self.sc_offload is True: + # move current layer's attn output from GPU to CPU asynchronizely + if ( + self.is_forward is True + and gpc.config.selective_checkpoint + and block_index not in self.layers_fa_not_release + and block_index < self._ckpt_block_num + ): + get_offload_manager().offload_fa_output_with_layer(layer_idx=block_index) + + # load previous layer's attn output from CPU to GPU asynchronizely + if ( + self.is_forward is False + and gpc.config.selective_checkpoint + and (0 <= (block_index - 1) < self._ckpt_block_num) + ): + get_offload_manager().preload_fa_output_with_layer(layer_idx=block_index - 1) def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 if module not in self._weight_global_handle: @@ -493,7 +549,36 @@ def _pre_forward_hook_for_module(self, module: nn.Module, *args): # pylint: dis self._wait_handle(module) + if self._forward_overlap_per == "module": + # start the all-gather for next module + # 1.forward prefetch for next module + module_index = self._isp_modules.index(module) + module_layer_id = self._module_to_index[module] + if module_index + 1 < len(self._isp_modules) and 
self.is_forward is True: + next_module = self._isp_modules[module_index + 1] + self._all_gather_module_weight(next_module) + + # 2.recompute forward prefetch for next module + if self.is_forward is False: + if module_index + 1 < len(self._isp_modules): + next_module = self._isp_modules[module_index + 1] + next_module_layer_id = self._module_to_index[next_module] + if module_layer_id == next_module_layer_id: + self._all_gather_module_weight(next_module) + # if current module is the last module in current layer, prefetch previous layer's first module + elif module_layer_id - 1 >= 0: + next_module = self._index_to_isp_modules[module_layer_id - 1][0] + self._all_gather_module_weight(next_module) + else: + # if current module is the last module, prefetch previous layer's first module + if module_layer_id - 1 >= 0: + next_module = self._index_to_isp_modules[module_layer_id - 1][0] + self._all_gather_module_weight(next_module) + def _post_forward_hook_for_module(self, module: nn.Module, *args): # pylint: disable=W0613 + if int(module.isp_layer_idx) in self.layers_wp_not_release: + # print(f"the layer {module.isp_layer_idx} after forward not clear weight") + return if not ((self._module_to_index[module] < self._ckpt_block_num) and self.is_forward is False): self._clear_handle(module) self._clear_weight(module) @@ -528,29 +613,24 @@ def _register_sync_parameters_hook(self) -> None: register forward hooks and backward hooks for isp modules. """ # register forward hooks - # 1. register pre_forward_hook @block_0 to prefetch for block 0 - # 2. register pre_forward_hook @block_(ckpt_block_num-1) to prefetch for the last ckpt block - # 3. register pre_forward_hook @out_proj module to prefetch for next block, - # notice that next block's all_gather op should be after current block's all_to_all op - # 4. register pre_forward_hook @isp_module to wait handle for current module - # 5. register post_forward_hook @isp_module to release resource + # 1. register pre_forward_hook @block_0 to prefetch weight for block 0. + # 2. register pre_forward_hook @prefetch_launch_module to prefetch weight for next block, + # when forward overlap granularity is 'layer'. + # 3. register pre_forward_hook @isp_module to wait handle for current module, + # and prefetch weight for next module when forward overlap granularity is 'module'. + # 4. register post_forward_hook @isp_module to release memory resource. self._index_to_block[0].register_forward_pre_hook(self._pre_forward_hook_for_first_block) - if self._ckpt_block_num >= 1: - self._index_to_block[self._ckpt_block_num - 1].register_forward_pre_hook( - self._pre_forward_hook_for_last_ckpt_block - ) - - for out_proj in self._isp_outs: - out_proj.register_forward_pre_hook(self._pre_forward_hook_for_out_proj) + for module in self._isp_prefetch_launch_module: + module.register_forward_pre_hook(self._pre_forward_hook_for_prefetch_launch_module) for module in self._isp_modules: module.register_forward_pre_hook(self._pre_forward_hook_for_module) module.register_forward_hook(self._post_forward_hook_for_module) # register backward hooks - # 1. register pre_backward_hook @isp_module to wait handle for current module and to prefetch for next module - # 2. register post_backward_hook @isp_module to release resource + # 1. register pre_backward_hook @isp_module to wait handle for current module and to prefetch for next module. + # 2. register post_backward_hook @isp_module to release memory resource. 
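The hook layout described in these comments follows the standard PyTorch forward-hook pattern: a pre-forward hook waits for (and prefetches) gathered weights, a post-forward hook releases them. A minimal, self-contained illustration of that pattern (illustrative only, not the `ISPCommunicator` implementation):

```python
import torch
from torch import nn


class PrefetchDemo:
    """Toy prefetch/release bookkeeping driven by forward hooks."""

    def __init__(self, modules):
        self.modules = list(modules)
        self.gathered = {}  # module index -> placeholder standing in for an all-gather result
        for idx, module in enumerate(self.modules):
            module.register_forward_pre_hook(self._make_pre_hook(idx))
            module.register_forward_hook(self._make_post_hook(idx))

    def _make_pre_hook(self, idx):
        def hook(module, args):  # pylint: disable=unused-argument
            self.gathered.setdefault(idx, "gathered")  # wait for / complete current module's gather
            if idx + 1 < len(self.modules):
                self.gathered.setdefault(idx + 1, "gathered")  # prefetch the next module
        return hook

    def _make_post_hook(self, idx):
        def hook(module, args, output):  # pylint: disable=unused-argument
            self.gathered.pop(idx, None)  # release the gathered weight after use
        return hook


layers = [nn.Linear(4, 4) for _ in range(3)]
demo = PrefetchDemo(layers)
x = torch.randn(1, 4)
for layer in layers:
    x = layer(x)
```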
if self._ckpt_block_num < self._num_blocks: for module in self._isp_modules: module.register_full_backward_pre_hook(self._pre_backward_hook_for_module) @@ -575,7 +655,7 @@ def communication_mode(self) -> str: return "wp" def switch_current_model_chunk(self, chunk_id: int) -> None: - self._isp_outs = self._overlap_states[chunk_id].isp_outs + self._isp_prefetch_launch_module = self._overlap_states[chunk_id].isp_prefetch_launch_module self._isp_modules = self._overlap_states[chunk_id].isp_modules self._weight_global_handle = self._overlap_states[chunk_id].weight_global_handle self._bias_global_handle = self._overlap_states[chunk_id].bias_global_handle diff --git a/internlm/core/scheduler/pipeline_scheduler_1f1b.py b/internlm/core/scheduler/pipeline_scheduler_1f1b.py index 4864c77f..289bc37d 100644 --- a/internlm/core/scheduler/pipeline_scheduler_1f1b.py +++ b/internlm/core/scheduler/pipeline_scheduler_1f1b.py @@ -35,7 +35,11 @@ def get_tensor_shape(): if not gpc.is_initialized(ParallelMode.PIPELINE): return None - if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"): + if ( + hasattr(gpc.config.data, "seq_len") + and hasattr(gpc.config.data, "micro_bsz") + and hasattr(gpc.config.model, "hidden_size") + ): if gpc.config.data.use_packed_dataset and gpc.is_evaluating is False: if gpc.config.parallel.sequence_parallel: sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR) diff --git a/internlm/core/trainer_builder.py b/internlm/core/trainer_builder.py index d0ef284d..71c30d00 100644 --- a/internlm/core/trainer_builder.py +++ b/internlm/core/trainer_builder.py @@ -11,12 +11,13 @@ from internlm.checkpoint.checkpoint_manager import CheckpointManager from internlm.core.context import global_context as gpc from internlm.core.context.process_group_initializer import ParallelMode +from internlm.core.parallel.comm import initialize_offload_manager from internlm.core.trainer import Trainer from internlm.data.streaming.utils import streaming_simple_resume from internlm.data.train_state import get_train_state from internlm.eval.evaluation import evaluate_on_val_dls from internlm.initialize.initialize_trainer import initialize_trainer -from internlm.model.losses.ce_loss import FlashGPTLMLoss +from internlm.model.losses.ce_loss import InternLoss from internlm.model.metrics import AccPerplex from internlm.monitor.monitor import send_alert_message from internlm.train.pipeline import ( @@ -118,6 +119,9 @@ def __init__( # initialize isp communicator isp_communicator = initialize_parallel_communicator(model) + # initialize cpu offload manager for selective checkpoint + initialize_offload_manager(gpc.config.get("selective_checkpoint_offload", False)) + # initialize train state train_state = get_train_state(train_dl) @@ -172,9 +176,11 @@ def _read_config(self, config_path: str) -> list: with open(config_path, "r") as f: return f.readlines() - def _initialize_criterion(self) -> FlashGPTLMLoss: - return FlashGPTLMLoss( - parallel_output=gpc.config.model.parallel_output, label_smoothing=gpc.config.loss.label_smoothing + def _initialize_criterion(self) -> InternLoss: + return InternLoss( + parallel_output=gpc.config.model.parallel_output, + label_smoothing=gpc.config.loss.label_smoothing, + op_type=gpc.config.loss.op_type, ) def _initialize_checkpoint_manager( diff --git a/internlm/data/build_dataloader.py b/internlm/data/build_dataloader.py index 64da9539..e99bbfc7 100644 --- a/internlm/data/build_dataloader.py +++ b/internlm/data/build_dataloader.py 
@@ -2,12 +2,13 @@ import subprocess from functools import partial +import torch import torch.distributed as dist from torch.utils.data import ConcatDataset, DataLoader +from internlm.accelerator.abstract_accelerator import get_accelerator from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc -from internlm.data.megatron.batch_sampler import MegatronBatchSampler from internlm.data.megatron.collaters import megatron_collate_fn from internlm.data.megatron.dataset import build_megatron_dataset from internlm.data.mocked.batch_sampler import MockedSequentialBatchSampler @@ -41,8 +42,8 @@ from internlm.utils.logger import get_logger from internlm.utils.utils import DataType -# global llm logger logger = get_logger(__file__) +internlm_accelerator = get_accelerator() def get_tokenized_train_loader_items(data_cfg): @@ -156,10 +157,14 @@ def get_streaming_train_loader_items(data_cfg): def get_megatron_train_loader_items(data_cfg): + assert data_cfg.get( + "pack_sample_into_one", False + ), "megatron dataloader curently only supports pack_sample_into_one=True" try: from internlm.data.megatron import helpers # noqa # pylint: disable=W0611 except ImportError: - if gpc.is_rank_for_log(): + # Compile dynamic library on-demand + if gpc.get_global_rank() % internlm_accelerator.device_count() == 0: subprocess.run( # noqa # pylint: disable=W1510 [ "g++", @@ -173,23 +178,28 @@ def get_megatron_train_loader_items(data_cfg): "internlm/data/megatron/helpers.cpp", "-o", "internlm/data/megatron/helpers.so", - ] + ], ) + torch.distributed.barrier() + + # NOTICE: Currently we only support single megatron dataset, a.k.a., single .bin and .idx + # Megatron dataset (.bin and.idx) should be generated by Megatron-LM tools/preprocess_data.py + # https://github.com/NVIDIA/Megatron-LM/blob/main/tools/preprocess_data.py train_ds = build_megatron_dataset( data_prefix=data_cfg.train_folder, - data_impl=data_cfg.get("data_impl", "infer"), - splits_string="1.0, 0.0, 0.0", - train_valid_test_num_samples=[9600000, 0, 0], seq_len=data_cfg.seq_len, seed=data_cfg.get("seed", 1024), - skip_warmup=True, ) - train_sampler = MegatronBatchSampler( - total_samples=len(train_ds), - consumed_samples=0, + train_sampler = StaticBatchSampler( + train_ds.datasets if isinstance(train_ds, ConcatDataset) else [train_ds], batch_size=data_cfg.micro_num * data_cfg.micro_bsz, + rampup_batch_size=data_cfg.rampup_batch_size, + micro_bsz=data_cfg.micro_bsz, + seed=data_cfg.get("seed", 1024), drop_last=True, + data_rank=gpc.get_local_rank(ParallelMode.DATA), + data_world_size=gpc.get_world_size(ParallelMode.DATA), ) train_collate_fn = partial( @@ -203,14 +213,18 @@ def get_mock_train_loader_items(data_cfg): assert data_cfg.get( "pack_sample_into_one", False ), "mocked dataloader curently only supports pack_sample_into_one=True" + train_ds = MockedDataset( train_folder=data_cfg.train_folder, micro_bsz=data_cfg.micro_bsz, micro_num=data_cfg.micro_num, seq_len=data_cfg.seq_len, ) + train_sampler = MockedSequentialBatchSampler(train_ds, data_cfg.micro_num) + train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.seq_len * data_cfg.micro_bsz) + return train_ds, train_sampler, train_collate_fn diff --git a/internlm/data/megatron/__init__.py b/internlm/data/megatron/__init__.py index 5e447596..5405f6f8 100644 --- a/internlm/data/megatron/__init__.py +++ b/internlm/data/megatron/__init__.py @@ -1,9 +1,7 @@ -from .batch_sampler import MegatronBatchSampler from .collaters import megatron_collate_fn from 
.dataset import build_megatron_dataset __all__ = [ - "MegatronBatchSampler", "build_megatron_dataset", "megatron_collate_fn", ] diff --git a/internlm/data/megatron/batch_sampler.py b/internlm/data/megatron/batch_sampler.py deleted file mode 100644 index 049cfcf7..00000000 --- a/internlm/data/megatron/batch_sampler.py +++ /dev/null @@ -1,62 +0,0 @@ -import copy -import math - -from internlm.core.context import ParallelMode -from internlm.core.context import global_context as gpc - - -class MegatronBatchSampler: - """ - MegatronBatchSampler - """ - - def __init__(self, total_samples, consumed_samples, batch_size, drop_last=True): - # Keep a copy of input params for later use. - self.total_samples = total_samples - self.consumed_samples = consumed_samples - self.batch_size = batch_size - self.drop_last = drop_last - - self.dp_rank = gpc.get_local_rank(ParallelMode.DATA) - self.dp_size = gpc.get_world_size(ParallelMode.DATA) - - # Sanity checks. - assert self.total_samples > 0, "no sample to consume: {}".format(self.total_samples) - assert self.consumed_samples < self.total_samples, "no samples left to consume: {}, {}".format( - self.consumed_samples, self.total_samples - ) - assert self.batch_size > 0 - assert self.dp_size > 0 - assert self.dp_rank < self.dp_size, "dp_rank should be smaller than dp_size: {}, " "{}".format( - self.dp_rank, self.dp_size - ) - - def __len__(self): - if self.drop_last and self.total_samples % self.dp_size != 0: - return math.ceil(self.total_samples - self.dp_size) / self.dp_size - else: - return math.ceil(self.total_samples / self.dp_size) - - def get_start_end_idx(self): - start_idx = self.dp_rank * self.batch_size - end_idx = start_idx + self.batch_size - return start_idx, end_idx - - def __iter__(self): - batch = [] - # Last batch will be dropped if drop_last is not set False - for idx in range(self.consumed_samples, self.total_samples): - batch.append(idx) - if len(batch) == self.batch_size * self.dp_size: - start_idx, end_idx = self.get_start_end_idx() - yield batch[start_idx:end_idx] - batch = [] - - # Check the last partial batch and see drop_last is set - if len(batch) > 0 and not self.drop_last: - start_idx, end_idx = self.get_start_end_idx() - yield batch[start_idx:end_idx] - - # TODO: implement copy method that compatible with InternEvo trainstate - def copy(self): - return copy.deepcopy(self) diff --git a/internlm/data/megatron/collaters.py b/internlm/data/megatron/collaters.py index 252bc289..c6ffc80e 100644 --- a/internlm/data/megatron/collaters.py +++ b/internlm/data/megatron/collaters.py @@ -2,48 +2,36 @@ def megatron_collate_fn(batch, micro_num, micro_bsz, seq_len): - - input_ids_result = [[] for _ in range(micro_num)] - labels_result = [[] for _ in range(micro_num)] - cu_seqlens = [] + input_ids_list = [[] for _ in range(micro_num)] + labels_list = [[] for _ in range(micro_num)] cu_seqlens_list = [] - indexes = [] indexes_list = [] - for i, item in enumerate(batch): - assert i < micro_num * micro_bsz - seq_len_list = item["text"] - assert len(seq_len_list) == seq_len + 1 - - micro_bsz_index = i % micro_bsz - micro_num_index = i // micro_bsz - - input_ids_result[micro_num_index].append(seq_len_list[:-1]) - labels_result[micro_num_index].append(seq_len_list[1:]) - - cu_seqlens.append(seq_len * micro_bsz_index) - indexes = indexes + list(range(seq_len)) + assert len(batch) == micro_bsz * micro_num + for idx, b in enumerate(batch): + tokens = b["text"] + # The length of megatron preprocessed data samples is (seq_len + 1) + # So we use the first 
seq_len tokens as input and the last seq_len tokens as shifted labels + assert len(tokens) == seq_len + 1 + micro_bsz_index = idx % micro_bsz + micro_num_index = idx // micro_bsz + input_ids_list[micro_num_index].append(tokens[:-1]) + labels_list[micro_num_index].append(tokens[1:]) if micro_bsz_index == micro_bsz - 1: - input_ids_result[micro_num_index] = torch.cat( - [torch.from_numpy(arr).long() for arr in input_ids_result[micro_num_index]], dim=0 + # Since megatron data sample is numpy format, we need to convert it to tensor and concate within micro batch + input_ids_list[micro_num_index] = torch.cat( + [torch.from_numpy(arr) for arr in input_ids_list[micro_num_index]], dim=0 ) - labels_result[micro_num_index] = torch.cat( - [torch.from_numpy(arr).long() for arr in labels_result[micro_num_index]], dim=0 + labels_list[micro_num_index] = torch.cat( + [torch.from_numpy(arr) for arr in labels_list[micro_num_index]], dim=0 ) - cu_seqlens.append(seq_len * micro_bsz) - cu_seqlens_list.append(torch.IntTensor(cu_seqlens)) - cu_seqlens = [] - indexes_list.append(torch.IntTensor(indexes)) - indexes = [] - - input_ids = torch.stack(input_ids_result) - labels = torch.stack(labels_result) - indexes = torch.stack(indexes_list) + cu_seqlens_list.append(torch.IntTensor([i * seq_len for i in range(micro_bsz + 1)])) + indexes_list.append(torch.IntTensor(list(range(seq_len)) * micro_bsz)) return { - "input_ids": input_ids, + "input_ids": torch.stack(input_ids_list), "cu_seqlens": cu_seqlens_list, - "indexes": indexes, + "indexes": torch.stack(indexes_list), "type_ids": torch.zeros(micro_num, micro_bsz * seq_len, dtype=torch.int64), - }, labels + }, torch.stack(labels_list) diff --git a/internlm/data/megatron/dataset.py b/internlm/data/megatron/dataset.py index 7dba0294..88f4697b 100644 --- a/internlm/data/megatron/dataset.py +++ b/internlm/data/megatron/dataset.py @@ -1,5 +1,6 @@ # adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/gpt_dataset.py # adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/datasets/indexed_dataset.py + import hashlib import os import struct @@ -764,82 +765,25 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): return indexed_dataset -def get_train_valid_test_split_(splits_string, size): - """Get dataset splits from comma or '/' separated string list.""" - - splits = [] - if splits_string.find(",") != -1: - splits = [float(s) for s in splits_string.split(",")] - elif splits_string.find("/") != -1: - splits = [float(s) for s in splits_string.split("/")] - else: - splits = [float(splits_string)] - while len(splits) < 3: - splits.append(0.0) - splits = splits[:3] - splits_sum = sum(splits) - assert splits_sum > 0.0 - splits = [split / splits_sum for split in splits] - splits_index = [0] - for index, split in enumerate(splits): - splits_index.append(splits_index[index] + int(round(split * float(size)))) - diff = splits_index[-1] - size - for index in range(1, len(splits_index)): - splits_index[index] -= diff - assert len(splits_index) == 4 - assert splits_index[-1] == size - return splits_index - - def build_megatron_dataset( data_prefix, - data_impl, - splits_string, - train_valid_test_num_samples, seq_len, seed, - skip_warmup, - return_doc_ids=False, - *, - data_cache_path=None, ): - # Indexed dataset. 
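# Illustrative sketch (hypothetical, for clarity): the shapes produced by the reworked
# megatron_collate_fn above -- each sample carries seq_len + 1 tokens, the first seq_len
# become input_ids and the last seq_len become the shifted labels, grouped into micro_num
# micro-batches of micro_bsz packed samples. The values below are made up for demonstration.
import numpy as np
import torch

micro_num, micro_bsz, seq_len = 2, 2, 8
batch = [{"text": np.arange(seq_len + 1)} for _ in range(micro_num * micro_bsz)]

input_ids = torch.stack([
    torch.cat([torch.from_numpy(b["text"][:-1]) for b in batch[i * micro_bsz:(i + 1) * micro_bsz]])
    for i in range(micro_num)
])
cu_seqlens = torch.IntTensor([i * seq_len for i in range(micro_bsz + 1)])

assert input_ids.shape == (micro_num, micro_bsz * seq_len)
assert cu_seqlens.tolist() == [0, 8, 16]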
- indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - - # Print stats about the splits. - print_rank_0(" > dataset split:") - - def print_split_stats(index, name): - print_rank_0(" {}:".format(name)) - print_rank_0( - " document indices in [{}, {}) total of {} " - "documents".format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) - ) - - print_split_stats(0, "train") - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) - dataset = GPTDataset( - name, - data_prefix, - documents, - indexed_dataset, - splits_string, - train_valid_test_num_samples[index], - seq_len, - seed, - return_doc_ids, - data_cache_path=data_cache_path, - ) - return dataset - - train_dataset = build_dataset(0, "train") - - return train_dataset + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl="infer", skip_warmup=True) + + # GPT dataset. + return GPTDataset( + name="train", + data_prefix=data_prefix, + documents=np.arange(start=0, stop=indexed_dataset.sizes.shape[0], step=1, dtype=np.int32), + indexed_dataset=indexed_dataset, + splits_string="1.0, 0.0, 0.0", # proportion of dataset for train/valid/test, we set 1.0 for train only + num_samples=gpc.config.data.micro_bsz + * gpc.config.data.micro_num + * gpc.get_world_size(ParallelMode.DATA) + * gpc.config.data.total_steps, # total number of train samples + seq_length=seq_len, + seed=seed, + ) diff --git a/internlm/data/mocked/batch_sampler.py b/internlm/data/mocked/batch_sampler.py index 737566fa..62f3dcea 100644 --- a/internlm/data/mocked/batch_sampler.py +++ b/internlm/data/mocked/batch_sampler.py @@ -1,24 +1,46 @@ -import copy - - class MockedSequentialBatchSampler: """ - MockedSequentialBatchSampler + A batch sampler that yields sequential batches of a specified size from a dataset. """ def __init__(self, train_ds, micro_num): + """ + Initialize the MockedSequentialBatchSampler. + + Args: + train_ds: The training dataset to sample from. + micro_num (int): The number of micro batches. 
+ """ self.train_ds = train_ds self.micro_num = micro_num + self.batch_count = 0 + self.num_consumed_samples_in_epoch = 0 + def __iter__(self): num_samples = len(self.train_ds) - for start in range(0, num_samples, self.micro_num): + while self.num_consumed_samples_in_epoch < num_samples: + start = self.num_consumed_samples_in_epoch end = min(start + self.micro_num, num_samples) + self.batch_count += 1 + self.num_consumed_samples_in_epoch += end - start yield list(range(start, end)) def __len__(self): return (len(self.train_ds) + self.micro_num - 1) // self.micro_num - # TODO: implement copy method that compatible with InternEvo trainstate + def state_dict(self): + states = { + "batch_count": self.batch_count, + "num_consumed_samples_in_epoch": self.num_consumed_samples_in_epoch, + } + return states + + def load_state_dict(self, states): + self.batch_count = states["batch_count"] + self.num_consumed_samples_in_epoch = states["num_consumed_samples_in_epoch"] + def copy(self): - return copy.deepcopy(self) + copy_sampler = MockedSequentialBatchSampler(self.train_ds, self.micro_num) + copy_sampler.load_state_dict(self.state_dict()) + return copy_sampler diff --git a/internlm/data/mocked/dataset.py b/internlm/data/mocked/dataset.py index 0d0e488e..88020a78 100644 --- a/internlm/data/mocked/dataset.py +++ b/internlm/data/mocked/dataset.py @@ -108,7 +108,7 @@ def __init__(self, train_folder: str, micro_bsz: int, micro_num: int, seq_len: i ] # simple sanity check: ensure loaded per-step data is equivalent to saved per-step data - self.sanity_check(tokens_list, labels_list) + self._sanity_check(tokens_list, labels_list) def __len__(self) -> int: return len(self.db_tokens) @@ -122,7 +122,7 @@ def __getitem__(self, idx: int) -> Dict[str, List[int]]: "type_ids": [0] * (self.micro_bsz * self.seq_len), } - def sanity_check(self, tokens_list: List[torch.Tensor], labels_list: List[torch.Tensor]): + def _sanity_check(self, tokens_list: List[torch.Tensor], labels_list: List[torch.Tensor]): tokens_list_tocheck = [] for i in range(len(self.db_tokens)): tokens_list_tocheck += self.db_tokens[i] diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index e255c810..d6038d18 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -66,6 +66,8 @@ def get_default_parser(): def args_sanity_check(): assert gpc.config is not None, "config is not load!" 
+ gpc.is_forward = True + if "JOB_NAME" not in gpc.config: gpc.config._add_item("JOB_NAME", "AnonymousJob") @@ -73,6 +75,13 @@ def args_sanity_check(): if "model_type" not in gpc.config: gpc.config._add_item("model_type", ModelType.INTERNLM.name) + if gpc.config.model_type == "InternLM3_M": + # TODO: need check for isp overlap + num_layers = gpc.config.model.num_self_decoder_layers + gpc.config.model.num_cross_decoder_layers + else: + num_layers = gpc.config.model.num_layers + gpc.config.isp_num_layers = num_layers + if "use_apex_adam" not in gpc.config: gpc.config._add_item("use_apex_adam", False) @@ -94,13 +103,17 @@ def args_sanity_check(): gpc.config.parallel._add_item("tensor", dict(size=1, mode=TensorParallelMode.mtp.name)) if "weight" not in gpc.config.parallel: - gpc.config.parallel._add_item("weight", dict(size=1, overlap=False)) + gpc.config.parallel._add_item( + "weight", dict(size=1, overlap=False, launch_allgather_before="wo", forward_overlap_per="layer") + ) if "expert" not in gpc.config.parallel: gpc.config.parallel._add_item("expert", dict(size=-1, no_tp=False)) if "expert_weight" not in gpc.config.parallel: - gpc.config.parallel._add_item("expert_weight", dict(size=1, overlap=False)) + gpc.config.parallel._add_item( + "expert_weight", dict(size=1, overlap=False, launch_allgather_before="wo", forward_overlap_per="layer") + ) if isinstance(gpc.config.parallel.pipeline, int): pp = gpc.config.parallel.pipeline @@ -347,17 +360,6 @@ def args_sanity_check(): if "use_flash_attn" not in gpc.config.model: gpc.config.model._add_item("use_flash_attn", True) - old_parallel_output = gpc.config.model.get("parallel_output", None) - # Try to change user setting - if internlm_accelerator.get_accelerator_backend() is not AcceleratorType.GPU: - gpc.config.model.update({"parallel_output": False}) - if old_parallel_output is True and gpc.is_rank_for_log(): - logger.warning( - "'parallel_output' is converted from 'True' to 'False'." - "Because 'parallel_output' only support by FlashCrossEntropyLoss." - "Please make sure you are using flash attention in cuda device." 
- ) - if "MoE" in gpc.config.get("model_type", ModelType.INTERNLM.name): if "num_experts" not in model: model._add_item("num_experts", 1) @@ -395,17 +397,18 @@ def args_sanity_check(): gpc.config.parallel["tensor"] = dict(size=gpc.config.parallel["tensor"], mode=TensorParallelMode.mtp.name) if gpc.config.parallel["tensor"].get("mode", None) is None: gpc.config.parallel["tensor"]["mode"] = TensorParallelMode.mtp.name - assert ( - gpc.config.VOCAB_SIZE % gpc.config.parallel.tensor.size == 0 - ), "VOCAB_SIZE must be integer multiple of tensor parallel size" if gpc.config.parallel["tensor"]["mode"] == TensorParallelMode.isp.name: assert not gpc.config.parallel.zero1.fsdp, "FSDP does not support isp" assert ( torch.__version__ >= "2.1.0" ), f"requires torch>=2.1.0 when using isp but current version is {torch.__version__}" - assert ( - gpc.config.VOCAB_SIZE % gpc.config.parallel.weight.size == 0 - ), "VOCAB_SIZE must be integer multiple of wp size" + + assert ( + gpc.config.model.vocab_size % gpc.config.parallel.weight.size == 0 + ), "model.vocab_size must be integer multiple of weight parallel size" + assert ( + gpc.config.model.vocab_size % gpc.config.parallel.tensor.size == 0 + ), "model.vocab_size must be integer multiple of tensor parallel size" assert gpc.config.parallel["tensor"].get("mode", None) in [ TensorParallelMode.mtp.name, @@ -445,6 +448,9 @@ def args_sanity_check(): ]: gpc.config.parallel.sequence_parallel = True + if gpc.config.model.get("parallel_output", False) is False: + logger.warning("When enable sequence parallel, it recommend to enable parallel_output") + # set default value for weight parallel if gpc.config.parallel["weight"].get("overlap", None) is None: gpc.config.parallel["weight"]["overlap"] = False @@ -536,7 +542,20 @@ def args_sanity_check(): gpc.config.loss._add_item("moe_loss_coeff", 1.0) if "selective_checkpoint" not in gpc.config: - gpc.config._add_item("selective_checkpoint", False) + gpc.config.selective_checkpoint = False + if "selective_checkpoint_offload" not in gpc.config: + gpc.config.selective_checkpoint_offload = False + if gpc.config.selective_checkpoint is True: + assert ( + gpc.config.parallel["tensor"]["mode"] == "isp" + ), "When using selective_checkpoint, tensor parallel mode must be isp" + if gpc.config.selective_checkpoint_offload is True: + assert ( + gpc.config.selective_checkpoint is True + ), "When using selective_checkpoint_offload, selective_checkpoint must be True" + assert ( + gpc.config.parallel.weight.launch_allgather_before == "wo" + ), "When using selective_checkpoint_offload, wp launch allgather communication should be set before 'wo' module" # moe not support overlap and zero1.5 for now if gpc.config.model.get("num_experts", 1) > 1: @@ -587,6 +606,11 @@ def args_sanity_check(): gpc.config.data.use_packed_dataset is False ), "only unpacked data is supported when using 2D sequence parallel." 
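# Illustrative sketch (hypothetical, partial config fragment): field values that satisfy the
# selective-checkpoint sanity checks added above -- tensor parallel must run in "isp" mode,
# and the offload variant additionally requires the weight-parallel all-gather to be
# launched before the "wo" module. Sizes here are arbitrary examples, not recommendations.
selective_checkpoint = True
selective_checkpoint_offload = True
parallel = dict(
    tensor=dict(size=8, mode="isp"),
    weight=dict(size=4, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
)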
+ # loss operator type + loss_cfg = gpc.config.loss + if loss_cfg.get("op_type", None) is None: + loss_cfg._add_item("op_type", "py_vocab_parallel") + def launch( config: Union[str, Path, Config, Dict], diff --git a/internlm/model/losses/__init__.py b/internlm/model/losses/__init__.py index 58287815..5d6c8db3 100644 --- a/internlm/model/losses/__init__.py +++ b/internlm/model/losses/__init__.py @@ -1,5 +1,5 @@ -from .ce_loss import FlashGPTLMLoss +from .ce_loss import InternLoss __all__ = [ - "FlashGPTLMLoss", + "InternLoss", ] diff --git a/internlm/model/losses/ce_loss.py b/internlm/model/losses/ce_loss.py index 69e09d2f..5b2a380e 100644 --- a/internlm/model/losses/ce_loss.py +++ b/internlm/model/losses/ce_loss.py @@ -1,36 +1,61 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - +import torch from torch import nn -from internlm.core.context import global_context as gpc +from internlm.accelerator import get_accelerator from internlm.model.ops.cross_entropy import new_cross_entropy -from internlm.utils.logger import get_logger -logger = get_logger(__file__) +internlm_accelerator = get_accelerator() -class FlashGPTLMLoss(nn.Module): - """ - Loss function for flash GPT Language Model. +class InternLoss(nn.Module): + """We use a base class to wrap different CrossEntropy implementations + and unify input and output parameters. + + This class is designed not to rely on gpc, making it easy to transplant. + + Different variants of CrossEntropy, with supporting parallel computation and inplace operations. + + If parallel_output is False, the output will gather head's output, only 'FlashCrossEntropyLoss' and + 'CrossEntropyApexVocabParallel' support it. """ - def __init__(self, parallel_output=True, label_smoothing=0): + def __init__( + self, + parallel_output=False, + ignore_index=-100, + reduction="mean", + label_smoothing=0.0, + inplace_backward=True, + op_type="py_vocab_parallel", + ) -> None: super().__init__() if label_smoothing is not None: if label_smoothing != 0: - if gpc.is_rank_for_log(): - print(f"use label_smoothing: {label_smoothing}") + print(f"use label_smoothing: {label_smoothing}", flush=True) else: label_smoothing = 0 self.label_smoothing = label_smoothing + + self.reduction = reduction + self.ignore_index = ignore_index + self.op_type = op_type + + assert self.reduction in [ + "mean", + "none", + ], f"Only support reduction is mean/none, but the passed in reduction is {self.reduction}" + + # In order to facilitate the calculation of loss for different datasets, we set reduction as 'none', + # and do loss reduction ourselves. 
self.loss_fn = new_cross_entropy( - reduction="mean", - label_smoothing=self.label_smoothing, + op_type=op_type, + ignore_index=ignore_index, + label_smoothing=label_smoothing, parallel_output=parallel_output, - inplace_backward=True, + inplace_backward=inplace_backward, + reduction="none", ) def forward(self, *args): @@ -44,9 +69,18 @@ def forward(self, *args): raise RuntimeError(f"The number of criterion inputs are:{len(args)}") shift_logits = logits.contiguous().view(-1, logits.size(-1)) shift_labels = labels.contiguous().view(-1) - loss = self.loss_fn( - shift_logits, shift_labels - ) # There is no need to consider the ignore_index problem here, because the loss calculation will be - # calculated through the calculation range, and -100 must be outside this range, so there is no problem + + with torch.autocast(device_type=internlm_accelerator.get_backend_name()): + loss_list = self.loss_fn( + shift_logits, shift_labels + ) # There is no need to consider the ignore_index problem here, because the loss calculation will be + # # calculated through the calculation range, and -100 must be outside this range, so there is no problem + + cond = shift_labels != self.ignore_index + if self.reduction == "mean": + # This loss is only for one dp rank. + loss = loss_list.sum() / (cond).sum() + else: + loss = loss_list return loss diff --git a/internlm/model/metrics.py b/internlm/model/metrics.py index af52858f..a7f6c966 100644 --- a/internlm/model/metrics.py +++ b/internlm/model/metrics.py @@ -305,6 +305,7 @@ def __init__(self, device, dp_pg, dataset_types: List[str] = None) -> None: reduction="none", parallel_output=gpc.config.model.parallel_output, inplace_backward=True, + op_type=gpc.config.loss.op_type, ) self.scatter_sum = scatter_sum_impl diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index e2837724..ebf7d0b0 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -195,7 +195,7 @@ def _forward(self, hidden_states, *args, **kwargs): def _dropout_and_norm_attn(_hidden_states): _dropped = self.dropout1(_hidden_states) _residual = _dropped - _hidden_states = self.norm1(_residual.float()) + _hidden_states = self.norm1(_residual.to(self.norm1.weight.dtype)) return _residual, _hidden_states if self.dropout_selective_checkpoint: @@ -212,7 +212,7 @@ def _dropout_and_norm_attn(_hidden_states): def _dropout_and_norm_ffn(_residual, _hidden_states): _dropped = self.dropout2(_hidden_states) _residual = (_dropped + _residual) if _residual is not None else _dropped - _hidden_states = self.norm2(_residual.float()) + _hidden_states = self.norm2(_residual.to(self.norm2.weight.dtype)) return _residual, _hidden_states if self.dropout_selective_checkpoint: diff --git a/internlm/model/modeling_internlm2.py b/internlm/model/modeling_internlm2.py index a4389b63..69da0837 100644 --- a/internlm/model/modeling_internlm2.py +++ b/internlm/model/modeling_internlm2.py @@ -1,6 +1,7 @@ # Copyright (c) InternLM. All rights reserved. 
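# Illustrative sketch (hypothetical, for clarity): the loss change above computes per-token
# losses with reduction="none" and then averages manually over tokens whose label is not
# ignore_index, which is equivalent to this minimal torch-only example.
import torch
import torch.nn.functional as F

ignore_index = -100
logits = torch.randn(6, 32)                      # (tokens, vocab)
labels = torch.tensor([3, 7, ignore_index, 1, ignore_index, 9])

loss_list = F.cross_entropy(logits, labels, ignore_index=ignore_index, reduction="none")
manual_mean = loss_list.sum() / (labels != ignore_index).sum()
torch.testing.assert_close(
    manual_mean, F.cross_entropy(logits, labels, ignore_index=ignore_index)
)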
import math import os +from functools import reduce from typing import Optional import torch @@ -11,6 +12,7 @@ from internlm.accelerator import get_accelerator from internlm.core.context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc +from internlm.core.parallel.shard import partition_uniform from internlm.initialize.initialize_tensor import ( normal_, scaled_init_method_normal, @@ -26,6 +28,7 @@ from internlm.model.utils import ( convert_attn_args_to_kwargs, convert_attn_kwargs_to_args, + get_parallel_size_from_file, ) from internlm.solver.activation_checkpoint import activation_checkpoint from internlm.utils.logger import get_logger @@ -254,7 +257,7 @@ def _dropout_and_norm_attn(_residual, _hidden_states): def _dropout_and_norm_ffn(_residual, _hidden_states): _dropped = self.dropout2(_hidden_states) _residual = (_dropped + _residual) if _residual is not None else _dropped - _hidden_states = self.ffn_norm(_residual.to(torch.float32)) + _hidden_states = self.ffn_norm(_residual.to(self.ffn_norm.weight.dtype)) return _residual, _hidden_states @@ -576,6 +579,196 @@ def load_hf_weights(folder: str, model: nn.Module) -> None: internlm_accelerator.empty_cache() + @staticmethod + def load_internlm2_with_dynamic_parallel_size(folder, model): + """Load InternLM2 with dynamic parallel size.""" + assert folder is not None, "Please specify the folder of the pretrained model" + assert gpc.config.model_type in ["INTERNLM2_PUBLIC"], "dynamic_parallel is only for INTERNLM2_PUBLIC" + + fns = get_fns(folder) + if gpc.is_rank_for_log(): + logger.info(f"Loading pretrained model from {folder}") + model_fns, old_tp, old_pp = get_parallel_size_from_file(fns) # pylint: disable=W0612 + + tp = gpc.get_world_size(ParallelMode.TENSOR) + tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + assert old_tp % tp == 0 or tp % old_tp == 0, ( + f"Expected TP size in loaded checkpoint to be fit with TP size in current config, but got {old_tp} in " + f"checkpoint and {tp} in current config" + ) + + correspond_tps = [] + + if old_tp <= tp: + correspond_tps.append(tp_rank // (tp // old_tp)) + ratio = tp // old_tp + rank = tp_rank % ratio + else: + for i in range(old_tp // tp): + correspond_tps.append(tp_rank * (old_tp // tp) + i) + rank = 0 + ratio = 1 + + current_states = {} + + pp = gpc.get_world_size(ParallelMode.PIPELINE) # noqa: F841 # pylint: disable=W0612 + + assert gpc.config.model.num_chunks == 1, "May cause future collisions, ignore this if necessary" + + old_pp_partition = partition_uniform(gpc.config.model.num_layers, old_pp, 1) + + for idx, parts in enumerate(old_pp_partition): + start, end = parts[0] + if model.last_layer <= start or model.first_layer >= end: + continue + tmp_states = {} + + for correspond_tp in correspond_tps: + model_name = f"model_tp{correspond_tp}_pp{idx}.pt" + states = llm_load(os.path.join(folder, model_name), map_location="cpu") + states = {k.replace("model.", ""): v for k, v in states.items()} + for i in range(start, end): + if i >= model.last_layer: + break + if i < model.first_layer: + continue + + for name in list(states.keys()): + if f".{i-start}." 
in name: + to_name = name.replace(f".{i-start}.", f".{i-model.first_layer}.") + + if gpc.config.model_type == "INTERNLM2_PUBLIC": + if "norm" in name: + tmp_states[to_name] = [states.pop(name)] + elif any(x in name for x in ("wo", "w2")): + tmp_states[to_name] = tmp_states.get(to_name, []) + tmp_states[to_name].append(states.pop(name).chunk(ratio, dim=1)[rank]) + elif any(x in name for x in ("w1", "w3")): + tmp_states[to_name] = tmp_states.get(to_name, []) + tmp_states[to_name].append(states.pop(name).chunk(ratio, dim=0)[rank]) + elif any(x in name for x in ("wqkv",)): + tmp_states[to_name] = tmp_states.get(to_name, []) + if tp > gpc.config.model.num_kv_attention_heads: + assert old_tp <= gpc.config.model.num_kv_attention_heads, ( + f"`old_tp ({old_tp}) => tp ({tp})` is not supported. " + "At least one of `tp` and `old_tp` should be less than or " + "equal to `num_kv_attention_heads`" + ) + # Suitable for cases where the num_kv_attention_head is small, + # but you want to have a large TP Size + q_per_kv = ( + gpc.config.model.num_attention_heads + // gpc.config.model.num_kv_attention_heads + ) + head_dim = gpc.config.model.hidden_size // gpc.config.model.num_attention_heads + index = torch.concat( + ( + torch.arange(q_per_kv).chunk(ratio, dim=0)[tp_rank % ratio], + torch.tensor([q_per_kv, q_per_kv + 1]), + ) + ) + index = index + (q_per_kv + 2) * (tp_rank // ratio) + index = index % ( + (q_per_kv + 2) * (gpc.config.model.num_kv_attention_heads / old_tp) + ) + index = index * head_dim + index = index.repeat_interleave(head_dim) + torch.arange(head_dim).repeat( + index.shape[0] + ) + tmp_states[to_name].append( + torch.index_select(states.pop(name), 0, index.to(torch.int32)) + ) + else: + tmp_states[to_name].append(states.pop(name).chunk(ratio, dim=0)[rank]) + else: + raise KeyError(f"Unknown key {name}.") + + else: + assert False, "unsupported model type" + + if "tok_embeddings.weight" in states and model.first_layer == 0: + tmp_states["tok_embeddings.weight"] = tmp_states.get("tok_embeddings.weight", []) + tmp_states["tok_embeddings.weight"].append( + states["tok_embeddings.weight"].chunk(ratio, dim=1)[rank] + ) + if "output.weight" in states and model.last_layer == gpc.config.model.num_layers: + tmp_states["norm.weight"] = [states["norm.weight"]] + tmp_states["output.weight"] = tmp_states.get("output.weight", []) + tmp_states["output.weight"].append(states["output.weight"].chunk(ratio, dim=0)[rank]) + + states = {} + + for name in list(tmp_states.keys()): + data = tmp_states.pop(name) + if len(data) == 1: + current_states[name] = data[0] + else: + current_states[name] = torch.concat( + data, dim=1 if name == "tok_embeddings.weight" or any(x in name for x in ("wo", "w2")) else 0 + ) + # Merge copied kv heads + if "wqkv" in name and old_tp > gpc.config.model.num_kv_attention_heads: + assert ( + tp <= gpc.config.model.num_kv_attention_heads + ), "new_tp should be less than or equal to num_kv_attention_heads" + head_dim = gpc.config.model.hidden_size // gpc.config.model.num_attention_heads + q_per_kv = gpc.config.model.num_attention_heads // gpc.config.model.num_kv_attention_heads + copied_times = old_tp // gpc.config.model.num_kv_attention_heads + cur_q_per_kv = q_per_kv // copied_times + + # pylint: disable=all + def duplicate_kv_index(i): + if i % (cur_q_per_kv + 2) >= cur_q_per_kv: + return i + else: + return -100 + + def unique_kv_index(i): + if i // (cur_q_per_kv + 2) == copied_times - 1 or i % (cur_q_per_kv + 2) < cur_q_per_kv: + return i + else: + return -100 + + # pylint: enable=all 
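# Illustrative sketch (hypothetical, for clarity): the core resharding idea used above when
# loading a checkpoint saved with a smaller tensor-parallel size (old_tp) into a larger one
# (tp) -- each new rank takes its slice of the old shard via chunk(ratio, dim)[rank], with
# dim=0 for weights like w1/w3 and dim=1 for weights like wo/w2. Shapes are made up.
import torch

old_tp, tp, tp_rank = 2, 8, 5
ratio = tp // old_tp                 # 4 new shards per old shard
rank = tp_rank % ratio               # this rank's position inside the old shard

old_shard_w1 = torch.randn(1024, 512)             # split along dim 0, like w1/w3
new_shard_w1 = old_shard_w1.chunk(ratio, dim=0)[rank]
assert new_shard_w1.shape == (256, 512)

old_shard_wo = torch.randn(512, 1024)             # split along dim 1, like wo/w2
new_shard_wo = old_shard_wo.chunk(ratio, dim=1)[rank]
assert new_shard_wo.shape == (512, 256)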
+ + # Verify + duplicate_index = [duplicate_kv_index(i) for i in range((cur_q_per_kv + 2) * copied_times)] + duplicate_index = [i for i in duplicate_index if i != -100] + duplicate_index = _duplicate_index = torch.tensor(duplicate_index) + for i in range(gpc.config.model.num_kv_attention_heads // tp - 1): + duplicate_index = torch.concat( + (duplicate_index, _duplicate_index + duplicate_index.max() + 1), dim=0 + ) + duplicate_kv = [] + for index in duplicate_index.reshape(-1, copied_times * 2).chunk(copied_times, dim=-1): + index = index.reshape(-1) * head_dim + index = index.repeat_interleave(head_dim) + torch.arange(head_dim).repeat(index.shape[0]) + duplicate_kv.append(torch.index_select(current_states[name], 0, index)) + assert reduce( + lambda x, y: x and y, + [torch.allclose(duplicate_kv[0], x, atol=1e-5) for x in duplicate_kv[1:]], + ), "Copied kv heads are not equal after training!" + + # Merge + unique_index = [unique_kv_index(i) for i in range((cur_q_per_kv + 2) * copied_times)] + unique_index = [i for i in unique_index if i != -100] + unique_index = _unique_index = torch.tensor(unique_index) + for i in range(gpc.config.model.num_kv_attention_heads // tp - 1): + unique_index = torch.concat((unique_index, _unique_index + unique_index.max() + 1), dim=0) + unique_index = unique_index * head_dim + unique_index = unique_index.repeat_interleave(head_dim) + torch.arange(head_dim).repeat( + unique_index.shape[0] + ) + current_states[name] = torch.index_select(current_states[name], 0, unique_index) + missing_keys, unexpected_keys = model.load_state_dict(current_states, strict=False) + + if gpc.get_local_rank(ParallelMode.DATA) == 0: + pp_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE) + logger.info( + f"Missing keys:{missing_keys}, unexpected keys:{unexpected_keys} in " + f"tp:{gpc.get_local_rank(ParallelMode.TENSOR)}, pp:{pp_rank}" + ) + @staticmethod def convert_internevo2hf_weights(src: str, tgt: str) -> None: def permute(qkv, num_heads, num_kv_heads, head_dim, adapt_hf=True): diff --git a/internlm/model/modeling_llama.py b/internlm/model/modeling_llama.py index 46fc9c03..56b88e83 100644 --- a/internlm/model/modeling_llama.py +++ b/internlm/model/modeling_llama.py @@ -246,7 +246,7 @@ def _dropout_and_norm_attn(_residual, _hidden_states): def _dropout_and_norm_ffn(_residual, _hidden_states): _dropped = self.dropout2(_hidden_states) _residual = (_dropped + _residual) if _residual is not None else _dropped - _hidden_states = self.ffn_norm(_residual.to(torch.float32)) + _hidden_states = self.ffn_norm(_residual.to(self.ffn_norm.weight.dtype)) return _residual, _hidden_states diff --git a/internlm/model/modeling_mixtral.py b/internlm/model/modeling_mixtral.py index 844b5081..8e8767ce 100644 --- a/internlm/model/modeling_mixtral.py +++ b/internlm/model/modeling_mixtral.py @@ -214,7 +214,7 @@ def _forward(self, hidden_states, *args, **kwargs): def _dropout_and_norm_attn(_hidden_states): _dropped = self.dropout1(_hidden_states) _residual = _dropped - _hidden_states = self.norm1(_residual.float()) + _hidden_states = self.norm1(_residual.to(self.norm1.weight.dtype)) return _residual, _hidden_states if self.dropout_selective_checkpoint: @@ -231,7 +231,7 @@ def _dropout_and_norm_attn(_hidden_states): def _dropout_and_norm_ffn(_residual, _hidden_states): _dropped = self.dropout2(_hidden_states) _residual = (_dropped + _residual) if _residual is not None else _dropped - _hidden_states = self.norm2(_residual.float()) + _hidden_states = 
self.norm2(_residual.to(self.norm2.weight.dtype)) return _residual, _hidden_states if self.dropout_selective_checkpoint: diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 964b268e..f40d35f3 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -205,7 +205,7 @@ def _forward(self, hidden_states, *args, **kwargs): def _dropout_and_norm_attn(_hidden_states): _dropped = self.dropout1(_hidden_states) _residual = _dropped - _hidden_states = self.norm1(_residual.float()) + _hidden_states = self.norm1(_residual.to(self.norm1.weight.dtype)) return _residual, _hidden_states if self.dropout_selective_checkpoint: @@ -222,7 +222,7 @@ def _dropout_and_norm_attn(_hidden_states): def _dropout_and_norm_ffn(_residual, _hidden_states): _dropped = self.dropout2(_hidden_states) _residual = (_dropped + _residual) if _residual is not None else _dropped - _hidden_states = self.norm2(_residual.float()) + _hidden_states = self.norm2(_residual.to(self.norm2.weight.dtype)) return _residual, _hidden_states if self.dropout_selective_checkpoint: diff --git a/internlm/model/modules/linear.py b/internlm/model/modules/linear.py index 856e6ba0..6f190268 100644 --- a/internlm/model/modules/linear.py +++ b/internlm/model/modules/linear.py @@ -344,6 +344,9 @@ def forward( ctx.compute_weight_gradient = weight.requires_grad ctx.backend = backend + saved_x = None if ctx.compute_weight_gradient is False else x + ctx.save_for_backward(saved_x, weight, batch_sizes) + if torch.is_autocast_enabled(): x = x.to(dtype=torch.get_autocast_gpu_dtype()) x = x.contiguous() @@ -358,8 +361,7 @@ def forward( output = torch.matmul(x, weight) - saved_x = None if ctx.compute_weight_gradient is False else x - ctx.save_for_backward(saved_x, weight, batch_sizes) + assert len(output.shape) == len(x.shape) return output @@ -372,6 +374,14 @@ def backward(ctx, grad_output): x, weight, batch_sizes = ctx.saved_tensors grad_input, grad_weight = None, None + if grad_output.numel() == 0: + if ctx.needs_input_grad[1]: + grad_weight = torch.zeros_like(weight) + if ctx.needs_input_grad[0]: + grad_input = torch.zeros_like(x) + + return grad_input, grad_weight, None, None, None, None, None + if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient if backend == "gmm": @@ -450,6 +460,8 @@ def forward( saved_x = None if ctx.compute_weight_gradient is False else x ctx.save_for_backward(saved_x, weight, batch_sizes) + assert len(output.shape) == len(x.shape) + return output @staticmethod @@ -461,20 +473,28 @@ def backward(ctx, grad_output): backend = ctx.backend full_weight_shape = ctx.full_weight_shape - grad_output = grad_output.contiguous() - - total_weight = communicator.weight_hook(weight, module=module) - total_weight = total_weight.reshape(full_weight_shape) - grad_input, grad_weight = None, None if grad_output.numel() == 0: + if ctx.needs_input_grad[1]: + total_weight_shape = torch.Size( + (full_weight_shape.numel() // full_weight_shape[-1], full_weight_shape[-1]) + ) + grad_weight = torch.zeros(total_weight_shape, dtype=weight.dtype, device=weight.device) + grad_weight, grad_weight_sync = communicator.grad_hook( + grad_weight, async_op=True, module=module, is_bias=False + ) if ctx.needs_input_grad[0]: grad_input = torch.zeros_like(x) if ctx.needs_input_grad[1]: - grad_weight = torch.zeros_like(total_weight).reshape(-1, full_weight_shape[-1]) - grad_weight, _ = communicator.grad_hook(grad_weight, async_op=False, module=module, is_bias=False) + grad_weight_sync.wait() return grad_input, 
grad_weight, None, None, None, None, None + grad_output = grad_output.contiguous() + + total_weight = communicator.weight_hook(weight, module=module) + total_weight = total_weight.reshape(full_weight_shape) + grad_input, grad_weight = None, None + if ctx.needs_input_grad[1]: assert ctx.compute_weight_gradient if backend == "gmm": diff --git a/internlm/model/modules/mlp.py b/internlm/model/modules/mlp.py index 6e74d6b6..e51e5897 100644 --- a/internlm/model/modules/mlp.py +++ b/internlm/model/modules/mlp.py @@ -99,12 +99,12 @@ def __init__( self.w1 = new_linear( "w1", in_features, hidden_features, bias, device=device, dtype=dtype, is_expert=is_expert ) - self.w2 = new_linear( - "w2", hidden_features, out_features, bias, device=device, dtype=dtype, is_expert=is_expert - ) self.w3 = new_linear( "w3", in_features, hidden_features, bias, device=device, dtype=dtype, is_expert=is_expert ) + self.w2 = new_linear( + "w2", hidden_features, out_features, bias, device=device, dtype=dtype, is_expert=is_expert + ) def forward(self, x): if not self.mlp_layer_fusion: @@ -177,10 +177,10 @@ def __init__( backend=backend, is_expert=is_expert, ) - self.w2 = new_linear( - "grouped_w2", + self.w3 = new_linear( + "grouped_w3", + in_features, hidden_features, - out_features, bias, device=device, dtype=dtype, @@ -188,10 +188,10 @@ def __init__( backend=backend, is_expert=is_expert, ) - self.w3 = new_linear( - "grouped_w3", - in_features, + self.w2 = new_linear( + "grouped_w2", hidden_features, + out_features, bias, device=device, dtype=dtype, diff --git a/internlm/model/ops/_flash_attn.py b/internlm/model/ops/_flash_attn.py new file mode 100644 index 00000000..87aac2eb --- /dev/null +++ b/internlm/model/ops/_flash_attn.py @@ -0,0 +1,331 @@ +# Copyright (c) InternLM. All rights reserved. +import torch + +from internlm.accelerator import get_accelerator +from internlm.core.context import global_context as gpc +from internlm.core.parallel.comm import get_offload_manager + +try: + import flash_attn + from flash_attn.flash_attn_interface import ( + _flash_attn_varlen_backward, + _flash_attn_varlen_forward, + ) + + gpu_flash_attn_impl = True +except (ModuleNotFoundError, ImportError): + gpu_flash_attn_impl = False + +internlm_accelerator = get_accelerator() +device_backend = internlm_accelerator.get_accelerator_backend() + + +class FlashAttnVarlenKVPackedFunc_V263(torch.autograd.Function): + """ + Varlen KVPacked Func from Flash Attn v2.6.3. 
+ """ + + @staticmethod + def forward( + ctx, + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_softmax, + layer_idx, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + + k, v = kv[:, 0], kv[:, 1] + + _ckpt_block_num = int(gpc.config.model.checkpoint * gpc.config.isp_num_layers) + + if gpc.is_forward is False and gpc.config.selective_checkpoint and layer_idx < _ckpt_block_num: + out, out_padded, softmax_lse, S_dmask, rng_state = get_offload_manager().get_fa_output_with_layer(layer_idx) + else: + ( + out, + q, + k, + v, + out_padded, + softmax_lse, + S_dmask, + rng_state, + ) = _flash_attn_varlen_forward( # pylint: disable=E1123 + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + return_softmax=return_softmax and dropout_p > 0, + block_table=None, + ) + + # store attn forward output to avoid re-computation of attn when activation checkpoint is enabled + if gpc.is_forward and gpc.config.selective_checkpoint and layer_idx < _ckpt_block_num: + get_offload_manager().insert_fa_output_with_layer( + layer_idx=layer_idx, output=(out, out_padded, softmax_lse, S_dmask, rng_state) + ) + + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): # pylint: disable=W0613 + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors + dq = torch.empty_like(q) + kv_shape = k.shape[:-2] + (2, *k.shape[-2:]) + dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device) + _flash_attn_varlen_backward( # pylint: disable=E1121,E1124 + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dkv[:, 0], + dkv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ctx.window_size, + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state=rng_state, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dkv = dkv[..., : dout.shape[-1]] + return dq, dkv, None, None, None, None, None, None, None, None, None, None, None, None, None + + +class FlashAttnVarlenKVPackedFunc_V221(torch.autograd.Function): + """ + Varlen KVPacked Func from Flash Attn v2.2.1. 
+ """ + + @staticmethod + def forward( + ctx, + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + layer_idx, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + + k, v = kv[:, 0], kv[:, 1] + + _ckpt_block_num = int(gpc.config.model.checkpoint * gpc.config.isp_num_layers) + + if gpc.is_forward is False and gpc.config.selective_checkpoint and layer_idx < _ckpt_block_num: + out, out_padded, softmax_lse, S_dmask, rng_state = get_offload_manager().get_fa_output_with_layer(layer_idx) + else: + out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax and dropout_p > 0, + ) + + # store attn forward output to avoid re-computation of attn when activation checkpoint is enabled + if gpc.is_forward and gpc.config.selective_checkpoint and layer_idx < _ckpt_block_num: + get_offload_manager().insert_fa_output_with_layer( + layer_idx=layer_idx, output=(out, out_padded, softmax_lse, S_dmask, rng_state) + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): # pylint: disable=W0613 + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state = ctx.saved_tensors + dq = torch.empty_like(q) + kv_shape = k.shape[:-2] + (2, *k.shape[-2:]) + dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dkv[:, 0], + dkv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + rng_state=rng_state, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dkv = dkv[..., : dout.shape[-1]] + return dq, dkv, None, None, None, None, None, None, None, None, None + + +def flash_attn_varlen_kvpacked_func( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p=0.0, + softmax_scale=None, + causal=False, + window_size=(-1, -1), # -1 means infinite context window + softcap=0.0, # 0.0 means deactivated + alibi_slopes=None, + deterministic=False, + return_attn_probs=False, + layer_idx=0, +): + """dropout_p should be set to 0.0 during evaluation + If K, V are already stacked into 1 tensor, this function will be faster than + calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation + of the gradients of K, V. + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. 
+ For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + softcap: float. Anything > 0 activates softcapping attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). 
+ """ + + assert gpu_flash_attn_impl is True and flash_attn.__version__ in [ + "2.2.1", + "2.6.3", + ], "flash-attn should be installed and version must be v2.2.1 or v2.6.3" + + if flash_attn.__version__ == "2.2.1": + return FlashAttnVarlenKVPackedFunc_V221.apply( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + layer_idx, + ) + + return FlashAttnVarlenKVPackedFunc_V263.apply( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + window_size, + softcap, + alibi_slopes, + deterministic, + return_attn_probs, + layer_idx, + ) diff --git a/internlm/model/ops/attention.py b/internlm/model/ops/attention.py index d0a668c8..3aec51f5 100644 --- a/internlm/model/ops/attention.py +++ b/internlm/model/ops/attention.py @@ -93,13 +93,14 @@ from flash_attn.flash_attn_interface import ( flash_attn_varlen_func as _flash_varlen_qkvsplited_func, ) - from flash_attn.flash_attn_interface import ( - flash_attn_varlen_kvpacked_func as _flash_varlen_kvpacked_func, - ) from flash_attn.flash_attn_interface import ( flash_attn_varlen_qkvpacked_func as _flash_varlen_qkvpacked_func, ) + from ._flash_attn import ( + flash_attn_varlen_kvpacked_func as _flash_varlen_kvpacked_func, + ) + gpu_flash_attn_impl = True except (ModuleNotFoundError, ImportError): gpu_flash_attn_impl = False @@ -187,6 +188,7 @@ def _flash_varlen_kvpacked_attn( dropout_p=0.0, softmax_scale=None, causal=False, + layer_idx=0, ): # compatible data format: [1, packelen, 3, n_head, headim] q, kv = q.squeeze(dim=0), kv.squeeze(dim=0) @@ -204,6 +206,7 @@ def _flash_varlen_kvpacked_attn( dropout_p, softmax_scale, causal, + layer_idx=layer_idx, ) return output.unsqueeze(dim=0) @@ -521,6 +524,7 @@ def _npu_varlen_kvpacked_attn( dropout_p=0.0, softmax_scale=None, causal=False, + layer_idx=0, # pylint: disable=W0613 ): # TODO: support npu native varlen flash attention k, v = kv.unbind(dim=2) @@ -579,6 +583,7 @@ def _deeplink_varlen_kvpacked_attn( dropout_p=0.0, softmax_scale=None, causal=False, + layer_idx=0, # pylint: disable=W0613 ): # compatible data format: [1, packelen, 3, n_head, headim] q, kv = q.squeeze(dim=0), kv.squeeze(dim=0) @@ -886,6 +891,8 @@ class SelfAttention(nn.Module): attention_dropout (float): Dropout rate for attention scores. Defaults to 0.0. 
""" + is_attn_cls = True + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, layer_idx=0): super().__init__() self.causal = causal @@ -928,7 +935,7 @@ def _qkv_without_cu_seqlens(self, qkv, softmax_scale=None, causal=None, key_padd # TODO: more unified interface dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () extra_kwargs = {} if attn_type is AttnType.SlidingWindowZigZagFlash: @@ -944,7 +951,7 @@ def _q_kv_without_cu_seqlens(self, q, kv, softmax_scale=None, causal=None, key_p attn_type, op = _select_attn_op(AttnOpType.FixedLenKVPacked) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () extra_kwargs = {} if attn_type is AttnType.SlidingWindowZigZagFlash: @@ -960,7 +967,7 @@ def _q_k_v_without_cu_seqlens(self, q, k, v, softmax_scale=None, causal=None, ke attn_type, op = _select_attn_op(AttnOpType.FixedLenQKVSplited) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if (attn_type is AttnType.Torch and key_padding_mask is not None) else () + extra_args = (key_padding_mask,) if (attn_type is AttnType.Torch and key_padding_mask is not None) else () extra_kwargs = {} if attn_type is AttnType.SlidingWindowZigZagFlash: @@ -984,7 +991,7 @@ def _qkv_with_cu_seqlens( attn_type, op = _select_attn_op(AttnOpType.VarLenQKVPacked) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () return op(qkv, cu_seqlens, max_seqlen, dropout, softmax_scale, causal, *extra_args) @@ -1007,10 +1014,20 @@ def _q_kv_with_cu_seqlens( attn_type, op = _select_attn_op(AttnOpType.VarLenKVPacked) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () return op( - q, kv, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout, softmax_scale, causal, *extra_args + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout, + softmax_scale, + causal, + *extra_args, + layer_idx=self.layer_idx, ) @forward.register(conditions=(str(QKVPackType.QKVSPLITED), str(CuSeqlenType.With))) @@ -1033,7 +1050,7 @@ def _q_k_v_with_cu_seqlens( attn_type, op = _select_attn_op(AttnOpType.VarLenQKVSplited) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () return op( q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout, softmax_scale, causal, *extra_args @@ -1088,7 +1105,7 @@ def _q_kv_without_cu_seqlens(self, q, kv, softmax_scale=None, causal=None, key_p attn_type, op = _select_attn_op(AttnOpType.FixedLenKVPacked) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () return op(q, kv, dropout, softmax_scale, causal, 
*extra_args) @@ -1100,7 +1117,7 @@ def _q_k_v_without_cu_seqlens(self, q, k, v, softmax_scale=None, causal=None, ke attn_type, op = _select_attn_op(AttnOpType.FixedLenQKVSplited) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () return op(q, k, v, dropout, softmax_scale, causal, *extra_args) @@ -1123,7 +1140,7 @@ def _q_kv_with_cu_seqlens( attn_type, op = _select_attn_op(AttnOpType.VarLenKVPacked) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () return op( q, kv, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout, softmax_scale, causal, *extra_args @@ -1149,7 +1166,7 @@ def _q_k_v_with_cu_seqlens( attn_type, op = _select_attn_op(AttnOpType.VarLenQKVSplited) dropout = self.dropout if attn_type is AttnType.Torch else self.dropout.p - extra_args = (key_padding_mask) if attn_type is AttnType.Torch else () + extra_args = (key_padding_mask,) if attn_type is AttnType.Torch else () return op( q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout, softmax_scale, causal, *extra_args diff --git a/internlm/model/ops/cross_entropy.py b/internlm/model/ops/cross_entropy.py index 82a2da70..99bf1e04 100644 --- a/internlm/model/ops/cross_entropy.py +++ b/internlm/model/ops/cross_entropy.py @@ -6,354 +6,131 @@ This file implements support for the cross entropy operators. """ +from enum import Enum + import torch -import torch.distributed as dist from torch import nn from internlm.accelerator import AcceleratorType, get_accelerator from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc +from internlm.model.ops.cross_entropy_ops import ( + CrossEntropyApexVocabParallel, + CrossEntropyLossApex, + CrossEntropyPython, +) from internlm.utils.logger import get_logger logger = get_logger(__file__) internlm_accelerator = get_accelerator() -# Adapted from https://github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/core/ \ -# sequence_parallel/cross_entropy.py -class _VocabSequenceParallelCrossEntropy(torch.autograd.Function): - """ - Cross Entropy module for isp. 
- """ - - @staticmethod - def forward(ctx, vocab_seq_parallel_logits, target, reduction, label_smoothing=0.0): # pylint: disable=W0613 - sp_size = gpc.get_world_size(ParallelMode.TENSOR) - - # reshape - # vocab_seq_parallel_logits: [B * (S/P), V] -> [B, S/P, V] - # target: [B * S/P] -> [B, S/P] - bsz = gpc.config.data.micro_bsz if gpc.config.data.use_packed_dataset is False else 1 - vocab_seq_parallel_logits = vocab_seq_parallel_logits.view(bsz, -1, gpc.config.model.vocab_size) - target = target.view(bsz, -1) - - # transpose - # vocab_seq_parallel_logits: [B, S/P, V] -> [S/P, B, V] - # target: [B, S/P] -> [S/P, B] - # return: [S, B] - vocab_seq_parallel_logits = vocab_seq_parallel_logits.transpose(0, 1).contiguous() - target = target.transpose(0, 1).contiguous() - - ctx.seqlen = vocab_seq_parallel_logits.size(0) * sp_size - batch_size = vocab_seq_parallel_logits.size(1) - - # Need softmax for backward - softmax = torch.nn.functional.softmax(vocab_seq_parallel_logits, dim=-1) - ctx.vocab_size = vocab_seq_parallel_logits.size(2) - loss = torch.nn.functional.nll_loss(softmax.log().view(-1, ctx.vocab_size), target.view(-1), reduction="none") - - loss_all = torch.empty( - ctx.seqlen, batch_size, dtype=vocab_seq_parallel_logits.dtype, device=vocab_seq_parallel_logits.device - ) +def average_losses_across_data_parallel_group(losses): + """Reduce a tensor of losses across all GPUs.""" + averaged_losses = torch.cat([loss.clone().detach().view(1) for loss in losses]) + torch.distributed.all_reduce(averaged_losses, group=gpc.get_group(ParallelMode.DATA)) + averaged_losses = averaged_losses / gpc.get_world_size(ParallelMode.DATA) - torch.distributed.all_gather_into_tensor(loss_all, loss, group=gpc.get_group(ParallelMode.TENSOR)) + return averaged_losses - # [s b] => [b, s] - loss_all = loss_all.transpose(0, 1).contiguous() - ctx.save_for_backward(softmax, target) +class CrossEntropyOpType(Enum): + torch_naive = 1 # CrossEntropy from torch + flash_vocab_parallel = 2 # VocabParallel CorssEntropy from flash_attn + apex_naive = 3 # CrossEntropy from apex + py_vocab_parallel = 4 # self-implemented VocabParallel CrossEntropy + py_naive = 5 # self-implemented CrossEntropy + # sequence_parallel = 6 # self-implemented SequenceParallel CrossEntropy - return loss_all - @staticmethod - def backward(ctx, grad_output): - softmax, target = ctx.saved_tensors +cross_entropy_op_name_map = { + "torch_naive": CrossEntropyOpType.torch_naive, + "flash_vocab_parallel": CrossEntropyOpType.flash_vocab_parallel, + "apex_naive": CrossEntropyOpType.apex_naive, + "py_vocab_parallel": CrossEntropyOpType.py_vocab_parallel, + "py_naive": CrossEntropyOpType.py_naive, + # "sequence_parallel": CrossEntropyOpType.sequence_parallel, +} - # transpose - grad_output = grad_output.transpose(0, 1).contiguous() - step_seqlen = ctx.seqlen // gpc.get_world_size(ParallelMode.TENSOR) - sp_rank = gpc.get_local_rank(ParallelMode.TENSOR) - grad_output_part = grad_output[step_seqlen * sp_rank : step_seqlen * (sp_rank + 1), :] +# TODO: ops是否需要实现更加统一的形式 +def new_cross_entropy( + op_type: str = "py_vocab_parallel", + ignore_index: int = -100, + label_smoothing: float = 0, + parallel_output: bool = False, + inplace_backward: bool = True, + reduction: str = "none", +): + try: + op_type = cross_entropy_op_name_map[op_type] + except KeyError: + raise KeyError(f"op_type only support: {cross_entropy_op_name_map.keys()}") - grad_input = softmax - grad_2d = grad_input.view(-1, ctx.vocab_size) - arange_1d = torch.arange(start=0, end=grad_2d.size()[0], 
device=grad_2d.device) + if internlm_accelerator.get_accelerator_backend() is not AcceleratorType.GPU: + assert op_type in [ + CrossEntropyOpType.torch_naive, + CrossEntropyOpType.py_vocab_parallel, + ], "non-GPU env only supports the 'torch_naive' or 'py_vocab_parallel' loss function" - grad_2d[arange_1d, target.view(-1)] -= 1 - grad_input.mul_(grad_output_part.unsqueeze(dim=-1)) + if op_type == CrossEntropyOpType.torch_naive: - # transpose - grad_input = grad_input.transpose(0, 1).contiguous() - # reshape - grad_input = grad_input.view(-1, gpc.config.model.vocab_size) + assert parallel_output is False, ( + "'torch_naive' (nn.CrossEntropyLoss) doesn't support parallel_output, " + "try using 'flash_vocab_parallel' or 'py_vocab_parallel'" + ) - return grad_input, None, None + return nn.CrossEntropyLoss(reduction=reduction, label_smoothing=label_smoothing, ignore_index=ignore_index) + elif op_type == CrossEntropyOpType.flash_vocab_parallel: -def vocab_sequence_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0): - return _VocabSequenceParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing) + assert gpc.get_group(ParallelMode.TENSOR) is not None, "The process group should not be None." + try: + from flash_attn.losses.cross_entropy import ( + CrossEntropyLoss as FlashCrossEntropyLoss, + ) -def average_losses_across_data_parallel_group(losses): - """Reduce a tensor of losses across all GPUs.""" - averaged_losses = torch.cat([loss.clone().detach().view(1) for loss in losses]) - torch.distributed.all_reduce(averaged_losses, group=gpc.get_group(ParallelMode.DATA)) - averaged_losses = averaged_losses / gpc.get_world_size(ParallelMode.DATA) + flash_cross_entropy_impl = True + except (ModuleNotFoundError, ImportError): + flash_cross_entropy_impl = False - return averaged_losses + assert ( + gpc.config.model.get("use_flash_attn", False) and flash_cross_entropy_impl + ), "Only flash cross entropy supports parallel_output" + assert ( + internlm_accelerator.get_accelerator_backend() is AcceleratorType.GPU + ), "flash cross entropy only supports the gpu backend" -class VocabSequenceParallelCrossEntropyLoss(nn.Module): - """ - Cross Entropy module for isp. - """ - - def __init__( - self, - ignore_index: int = -100, - reduction: str = "mean", - label_smoothing: float = 0, - process_group=None, - ): - super().__init__() - if reduction not in ["mean", "none"]: - raise NotImplementedError("Only support reduction = 'mean' or 'none'") - self.ignore_index = ignore_index - self.reduction = reduction - self.label_smoothing = label_smoothing - self.process_group = process_group - - def loss_mean_func(self, output_tensor): - losses = output_tensor.float() - loss = torch.sum(losses.view(-1)) / losses.numel() - - # TODO: allreduce loss in dp group - - return loss - - def forward(self, _input, target): - assert _input.is_cuda and target.is_cuda - - _loss_list = vocab_sequence_parallel_cross_entropy(_input, target, self.label_smoothing) - - if self.reduction == "mean": - loss = self.loss_mean_func(_loss_list) - return loss - - return _loss_list.view(-1) - - -class _VocabParallelCrossEntropy(torch.autograd.Function): - """Adapt from: https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/cross_entropy.py - Supports vocab parallel loss calculation, but does not support inplace backward. - NOTE: This class is different from the original Apex implementation. Apex will calculate the loss of - ignore_index and flashCrossEntropy will set it to 0. InterEvo adapts the second approach.
- """ - - @staticmethod - @internlm_accelerator.amp.custom_fwd - def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0, process_group=None): - # Maximum value along vocab dimension across all GPUs. - logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] - if process_group is not None and dist.get_world_size(process_group) > 1: - torch.distributed.all_reduce(logits_max, op=torch.distributed.ReduceOp.MAX, group=process_group) - # Subtract the maximum value. - vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1) - - # Get the partition's vocab indecies - # get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size - partition_vocab_size = vocab_parallel_logits.size()[-1] - if process_group is not None and dist.get_world_size(process_group) > 1: - rank = dist.get_rank(process_group) - # world_size = dist.get_world_size(process_group) - part_len = vocab_parallel_logits.shape[-1] - vocab_start_index, vocab_end_index = part_len * rank, part_len * (rank + 1) - else: - vocab_start_index, vocab_end_index = 0, vocab_parallel_logits.shape[-1] - - # vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) - # Create a mask of valid vocab ids (1 means it needs to be masked). - target_mask = (target < vocab_start_index) | (target >= vocab_end_index) - ignore_mask = target == -100 - masked_target = target.clone() - vocab_start_index - masked_target[target_mask] = 0 - - # Get predicted-logits = logits[target]. - # For Simplicity, we convert logits to a 2-D tensor with size - # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. - logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) - masked_target_1d = masked_target.view(-1) - arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) - predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] - predicted_logits_1d = predicted_logits_1d.clone().contiguous() - predicted_logits = predicted_logits_1d.view_as(target) - predicted_logits[target_mask] = 0.0 - - # All reduce is needed to get the chunks from other GPUs. - if process_group is not None and dist.get_world_size(process_group) > 1: - torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) - - # Sum of exponential of logits along vocab dimension across all GPUs. - exp_logits = vocab_parallel_logits - torch.exp(vocab_parallel_logits, out=exp_logits) - sum_exp_logits = exp_logits.sum(dim=-1) - - if process_group is not None and dist.get_world_size(process_group) > 1: - torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) - - # Normalize and optionally smooth logits - exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) - - # Loss = log(sum(exp(logits))) - predicted-logit. - sum_exp_logits = torch.log(sum_exp_logits) - loss = sum_exp_logits - predicted_logits - loss[ignore_mask] = 0.0 - - vocab_size = exp_logits.size(-1) - if label_smoothing > 0: - r""" - We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. 
- = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) - = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i - = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i - = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i - = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K - From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py - """ - assert 1.0 > label_smoothing > 0.0 - smoothing = label_smoothing * vocab_size / (vocab_size - 1) - - # Exp logits at this point are normalized probabilities. So we can just take the log to get log-probs. - log_probs = torch.log(exp_logits) - mean_log_probs = log_probs.mean(dim=-1) - loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs - - ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size - # Store softmax, target-mask and masked-target for backward pass. - ctx.save_for_backward(exp_logits, target_mask, masked_target_1d, ignore_mask) - - return loss - - @staticmethod - @internlm_accelerator.amp.custom_bwd - def backward(ctx, grad_output): - - # Retreive tensors from the forward path. - softmax, target_mask, masked_target_1d, ignore_mask = ctx.saved_tensors - label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size - - # All the inputs have softmax as thier gradient. - grad_input = softmax # s_{k} - # For simplicity, work with the 2D gradient. - partition_vocab_size = softmax.size()[-1] - grad_2d = grad_input.view(-1, partition_vocab_size) - - # Add the gradient from matching classes. - arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) - - softmax_update = 1.0 - target_mask.view(-1).float() - - if label_smoothing > 0: - smoothing = label_smoothing * vocab_size / (vocab_size - 1) - grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update - average_grad = 1 / vocab_size - grad_2d[arange_1d, :] -= smoothing * average_grad - else: - grad_2d[arange_1d, masked_target_1d] -= softmax_update - - # Finally elementwise multiplication with the output gradients. - grad_input.mul_(grad_output.unsqueeze(dim=-1)) - grad_input[ignore_mask] = 0.0 # set ignore token loss as 0. - - return grad_input, None, None, None - - -class CrossEntropyApexVocabParallel(nn.Module): - """Adapt from: https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/cross_entropy.py - Supports vocab parallel loss calculation, but does not support inplace backward. 
- """ - - def __init__( - self, ignore_index=-100, reduction="mean", label_smoothing=0.0, process_group=None, inplace_backward=False - ): - super().__init__() - if reduction not in ["mean", "none"]: - raise NotImplementedError("Only support reduction = 'mean' or 'none'") - assert inplace_backward is False, "does not support inplace backward" - self.ignore_index = ignore_index - self.reduction = reduction - self.label_smoothing = label_smoothing - self.process_group = process_group - - def forward(self, vocab_parallel_logits, target): - # assert vocab_parallel_logits.is_cuda and vocab_parallel_logits.is_cuda - - # SoftmaxCrossEntropyLoss implicitly casts to float - loss = _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, self.label_smoothing, self.process_group) - if self.reduction == "mean": - return loss.sum() / (target != self.ignore_index).sum() - else: - return loss - - -def flash_loss( - ignore_index=-100, - reduction="mean", - label_smoothing=0.0, - process_group=None, - inplace_backward=False, # pylint:disable=W0613 -): - try: - from flash_attn.losses.cross_entropy import ( - CrossEntropyLoss as FlashCrossEntropyLoss, + logger.warning( + "You are using flash_attn cross_entropy operators, \ + which may result loss divergency in long sequence." ) - flash_cross_entropy_impl = True - except (ModuleNotFoundError, ImportError): - flash_cross_entropy_impl = False - - assert ( - gpc.config.model.get("use_flash_attn", False) and flash_cross_entropy_impl - ), "Only flash cross entropy support parallel_output" - - assert ( - internlm_accelerator.get_accelerator_backend() is AcceleratorType.GPU - ), "flash cross entropy only support gpu backend" + return FlashCrossEntropyLoss( + ignore_index=ignore_index, + reduction=reduction, + label_smoothing=label_smoothing, + process_group=gpc.get_group(ParallelMode.TENSOR), + inplace_backward=inplace_backward, + ) - return FlashCrossEntropyLoss( - ignore_index=ignore_index, - reduction=reduction, - label_smoothing=label_smoothing, - process_group=process_group, - ) + elif op_type == CrossEntropyOpType.apex_naive: + assert parallel_output is False, ( + "'apex_naive' (nn.CrossEntropyLoss) can'ts support parallel_output," + "try use 'flash_vocab_parallel' or 'py_vocab_parallel'" + ) + return CrossEntropyLossApex( + ignore_index=ignore_index, + reduction=reduction, + inplace_backward=inplace_backward, + label_smoothing=label_smoothing, + ) -# TODO: ops是否需要实现更加统一的形式 -def new_cross_entropy( - ignore_index: int = -100, - reduction: str = "mean", - label_smoothing: float = 0, - parallel_output: bool = False, - **kwargs, -): - # if is_using_isp() and parallel_output: - # if gpc.is_rank_for_log(): - # logger.warning("Use VocabSequenceParallelCrossEntropyLoss.") - # return VocabSequenceParallelCrossEntropyLoss( - # ignore_index=ignore_index, - # reduction=reduction, - # label_smoothing=label_smoothing, - # process_group=gpc.get_group(ParallelMode.TENSOR), - # ) - - if parallel_output: - # return flash_loss( - # ignore_index=ignore_index, - # reduction=reduction, - # label_smoothing=label_smoothing, - # process_group=gpc.get_group(ParallelMode.TENSOR), - # ) + elif op_type == CrossEntropyOpType.py_vocab_parallel: + assert gpc.get_group(ParallelMode.TENSOR) is not None, "The process group should not be None." 
return CrossEntropyApexVocabParallel( ignore_index=ignore_index, @@ -361,13 +138,13 @@ def new_cross_entropy( label_smoothing=label_smoothing, process_group=gpc.get_group(ParallelMode.TENSOR), ) - else: - if gpc.is_rank_for_log(): - logger.warning( - "Use nn.CrossEntropyLoss rather than flashattn CrossEntropyLoss." - "parallel_output must be set false. Please note this!" - ) - kwargs.pop("inplace_backward", None) - return nn.CrossEntropyLoss( - ignore_index=ignore_index, reduction=reduction, label_smoothing=label_smoothing, **kwargs + + elif op_type == CrossEntropyOpType.py_naive: + assert parallel_output is False, ( + "'py_naive' (CrossEntropyPython) doesn't support parallel_output, " + "try using 'flash_vocab_parallel' or 'py_vocab_parallel'" ) + return CrossEntropyPython(ignore_index=ignore_index, reduction=reduction) + + else: + raise RuntimeError(f"unknown loss function type: {op_type}") diff --git a/internlm/model/ops/cross_entropy_ops/__init__.py b/internlm/model/ops/cross_entropy_ops/__init__.py new file mode 100644 index 00000000..1f4b6630 --- /dev/null +++ b/internlm/model/ops/cross_entropy_ops/__init__.py @@ -0,0 +1,11 @@ +from .apex_naive_loss import CrossEntropyLossApex +from .py_naive_loss import CrossEntropyPython +from .py_vocab_parallel_loss import CrossEntropyApexVocabParallel +from .sequence_parallel_loss import VocabSequenceParallelCrossEntropyLoss + +__all__ = [ + "CrossEntropyLossApex", + "CrossEntropyPython", + "CrossEntropyApexVocabParallel", + "VocabSequenceParallelCrossEntropyLoss", +] diff --git a/internlm/model/ops/cross_entropy_ops/apex_naive_loss.py b/internlm/model/ops/cross_entropy_ops/apex_naive_loss.py new file mode 100644 index 00000000..139f20a2 --- /dev/null +++ b/internlm/model/ops/cross_entropy_ops/apex_naive_loss.py @@ -0,0 +1,77 @@ +import torch +from torch import nn + +from internlm.accelerator import get_accelerator + +try: + import xentropy_cuda_lib +except (ImportError, ModuleNotFoundError): + has_xentropy_cuda_lib = False +else: + has_xentropy_cuda_lib = True + + +internlm_accelerator = get_accelerator() + + +class SoftmaxCrossEntropyLossFn(torch.autograd.Function): + """ + Adapt from: https://github.com/NVIDIA/apex/blob/master/apex/contrib/xentropy/softmax_xentropy.py + Inplace backward is supported, but loss calculation of vocab parallel is not supported. + NOTE: it should be noted that when the pack_length exceeds 40K, the loss will not decrease. + """ + + @staticmethod + @internlm_accelerator.amp.custom_fwd + def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, inplace_backward=False): + losses, max_log_sum_exp = xentropy_cuda_lib.forward(logits, labels, smoothing) + losses.masked_fill_(labels == padding_idx, 0) + ctx.save_for_backward(logits, max_log_sum_exp, labels) + ctx.smoothing = smoothing + ctx.padding_idx = padding_idx + ctx.inplace_backward = inplace_backward + return losses + + @staticmethod + @internlm_accelerator.amp.custom_bwd + def backward(ctx, grad_loss): + logits, max_log_sum_exp, labels = ctx.saved_tensors + if not grad_loss.is_contiguous(): + grad_loss = grad_loss.contiguous() + grad_loss.masked_fill_(labels == ctx.padding_idx, 0) + grad_logits = xentropy_cuda_lib.backward( + grad_loss, logits, max_log_sum_exp, labels, ctx.smoothing, ctx.inplace_backward + ) + return grad_logits, None, None, None, None + + +class CrossEntropyLossApex(nn.Module): + """ + Inplace backward is supported, but loss calculation of vocab parallel is not supported.
+ NOTE: it should be noted that when the pack_length exceeds 40K, the loss will not decrease. + """ + + def __init__(self, ignore_index=-100, reduction="mean", label_smoothing=0.0, inplace_backward=False): + super().__init__() + if reduction not in ["mean", "none"]: + raise NotImplementedError("Only support reduction = 'mean' or 'none'") + + assert ( + has_xentropy_cuda_lib is True + ), "The 'xentropy_cuda_lib' package which CrossEntropyLossApex needed was not found in your environment!" + self.ignore_index = ignore_index + self.reduction = reduction + self.label_smoothing = label_smoothing + self.inplace_backward = inplace_backward + + def forward(self, logits, target): + # assert logits.is_cuda and target.is_cuda + + # SoftmaxCrossEntropyLoss implicitly casts to float + loss = SoftmaxCrossEntropyLossFn.apply( + logits, target, self.label_smoothing, self.ignore_index, self.inplace_backward + ) + if self.reduction == "mean": + return loss.sum() / (target != self.ignore_index).sum() + else: + return loss diff --git a/internlm/model/ops/cross_entropy_ops/py_naive_loss.py b/internlm/model/ops/cross_entropy_ops/py_naive_loss.py new file mode 100644 index 00000000..f391933f --- /dev/null +++ b/internlm/model/ops/cross_entropy_ops/py_naive_loss.py @@ -0,0 +1,83 @@ +import torch +from torch import nn + +from internlm.accelerator import get_accelerator + +internlm_accelerator = get_accelerator() + + +class CrossEntropyWriteInPython(torch.autograd.Function): + """baseline for unit test.""" + + @staticmethod + @internlm_accelerator.amp.custom_fwd + def forward(ctx, logits, target, ignore_idx): + # (1) cal mask + ignore_mask = target == ignore_idx + target[ignore_mask] = 0 + + # (2) safe softmax for logist + logits_max = torch.max(logits, dim=-1)[0] + logits = logits - logits_max.unsqueeze(dim=-1) + + # (3) cal predicted_logits + vocab_size = logits.shape[-1] + logits_2d = logits.view(-1, vocab_size) + target = target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits = logits_2d[arange_1d, target].clone().contiguous().view_as(target) + + # (4) softmax + exp_logits = logits + torch.exp(logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + + # (5) Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + # (6) cal log + sum_exp_logits = torch.log(sum_exp_logits) + + # (7) cal loss + loss = sum_exp_logits - predicted_logits + + # (8) apply ignore_mask + loss[ignore_mask] = 0.0 + ctx.save_for_backward(exp_logits, target, ignore_mask) + return loss + + @staticmethod + @internlm_accelerator.amp.custom_bwd + def backward(ctx, grad_output): + # The deriving of cross entropy ref: + # https://shivammehta25.github.io/posts/deriving-categorical-cross-entropy-and-softmax/ + softmax, target, ignore_mask = ctx.saved_tensors + + # Add the gradient from matching classes(which is indicate by target). + grad_input = softmax + grad_2d = grad_input.view(-1, softmax.shape[-1]) + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, target] -= 1.0 + + grad_input.mul_(grad_output.unsqueeze(dim=-1)) # elementwise multiplication + grad_input[ignore_mask] = 0.0 # set ignore token loss as 0. + + return grad_input, None, None, None + + +class CrossEntropyPython(nn.Module): + """ + Baseline for unit test. Please do not use this class directly. 
+ """ + + def __init__(self, ignore_index=-100, reduction="mean"): + super().__init__() + self.ignore_index = ignore_index + self.reduction = reduction + + def forward(self, logits, target): + loss = CrossEntropyWriteInPython.apply(logits, target, self.ignore_index) + if self.reduction == "mean": + return loss.sum() / (target != self.ignore_index).sum() + else: + return loss diff --git a/internlm/model/ops/cross_entropy_ops/py_vocab_parallel_loss.py b/internlm/model/ops/cross_entropy_ops/py_vocab_parallel_loss.py new file mode 100644 index 00000000..6f5457c8 --- /dev/null +++ b/internlm/model/ops/cross_entropy_ops/py_vocab_parallel_loss.py @@ -0,0 +1,160 @@ +import torch +import torch.distributed as dist +from torch import nn + +from internlm.accelerator import get_accelerator + +internlm_accelerator = get_accelerator() + + +class _VocabParallelCrossEntropy(torch.autograd.Function): + """Adapt from: https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/cross_entropy.py + Supports vocab parallel loss calculation, but does not support inplace backward. + NOTE: This class is different from the original Apex implementation. Apex will calculate the loss of + ignore_index and flashCrossEntropy will set it to 0. InterEvo adapts the second approach. + """ + + @staticmethod + @internlm_accelerator.amp.custom_fwd + def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0, process_group=None): + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] + if process_group is not None and dist.get_world_size(process_group) > 1: + torch.distributed.all_reduce(logits_max, op=torch.distributed.ReduceOp.MAX, group=process_group) + # Subtract the maximum value. + vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1) + + # Get the partition's vocab indecies + # get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size + partition_vocab_size = vocab_parallel_logits.size()[-1] + if process_group is not None and dist.get_world_size(process_group) > 1: + rank = dist.get_rank(process_group) + # world_size = dist.get_world_size(process_group) + part_len = vocab_parallel_logits.shape[-1] + vocab_start_index, vocab_end_index = part_len * rank, part_len * (rank + 1) + else: + vocab_start_index, vocab_end_index = 0, vocab_parallel_logits.shape[-1] + + # vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size) + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (target < vocab_start_index) | (target >= vocab_end_index) + ignore_mask = target == -100 + masked_target = target.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits_1d = predicted_logits_1d.clone().contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + + # All reduce is needed to get the chunks from other GPUs. 
+ if process_group is not None and dist.get_world_size(process_group) > 1: + torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) + + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = vocab_parallel_logits + torch.exp(vocab_parallel_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + + if process_group is not None and dist.get_world_size(process_group) > 1: + torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) + + # Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + # Loss = log(sum(exp(logits))) - predicted-logit. + sum_exp_logits = torch.log(sum_exp_logits) + loss = sum_exp_logits - predicted_logits + loss[ignore_mask] = 0.0 + + vocab_size = exp_logits.size(-1) + if label_smoothing > 0: + r""" + We'd like to assign 1 / (K - 1) probability mass to every index that is not the ground truth. + = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt}) + = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i + = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i + = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i + = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K + From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py + """ + assert 1.0 > label_smoothing > 0.0 + smoothing = label_smoothing * vocab_size / (vocab_size - 1) + + # Exp logits at this point are normalized probabilities. So we can just take the log to get log-probs. + log_probs = torch.log(exp_logits) + mean_log_probs = log_probs.mean(dim=-1) + loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs + + ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d, ignore_mask) + + return loss + + @staticmethod + @internlm_accelerator.amp.custom_bwd + def backward(ctx, grad_output): + + # Retreive tensors from the forward path. + softmax, target_mask, masked_target_1d, ignore_mask = ctx.saved_tensors + label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size + + # All the inputs have softmax as thier gradient. + grad_input = softmax # s_{k} + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + + softmax_update = 1.0 - target_mask.view(-1).float() + + if label_smoothing > 0: + smoothing = label_smoothing * vocab_size / (vocab_size - 1) + grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update + average_grad = 1 / vocab_size + grad_2d[arange_1d, :] -= smoothing * average_grad + else: + grad_2d[arange_1d, masked_target_1d] -= softmax_update + + # Finally elementwise multiplication with the output gradients. + grad_input.mul_(grad_output.unsqueeze(dim=-1)) + grad_input[ignore_mask] = 0.0 # set ignore token loss as 0. + + return grad_input, None, None, None + + +class CrossEntropyApexVocabParallel(nn.Module): + """Adapt from: https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/cross_entropy.py + Supports vocab parallel loss calculation, but does not support inplace backward. 
+ """ + + def __init__( + self, ignore_index=-100, reduction="mean", label_smoothing=0.0, process_group=None, inplace_backward=False + ): + super().__init__() + if reduction not in ["mean", "none"]: + raise NotImplementedError("Only support reduction = 'mean' or 'none'") + assert inplace_backward is False, "does not support inplace backward" + self.ignore_index = ignore_index + self.reduction = reduction + self.label_smoothing = label_smoothing + self.process_group = process_group + + def forward(self, vocab_parallel_logits, target): + # assert vocab_parallel_logits.is_cuda and vocab_parallel_logits.is_cuda + + # SoftmaxCrossEntropyLoss implicitly casts to float + loss = _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, self.label_smoothing, self.process_group) + if self.reduction == "mean": + return loss.sum() / (target != self.ignore_index).sum() + else: + return loss diff --git a/internlm/model/ops/cross_entropy_ops/sequence_parallel_loss.py b/internlm/model/ops/cross_entropy_ops/sequence_parallel_loss.py new file mode 100644 index 00000000..2072944f --- /dev/null +++ b/internlm/model/ops/cross_entropy_ops/sequence_parallel_loss.py @@ -0,0 +1,121 @@ +import torch +from torch import nn + +from internlm.core.context import ParallelMode +from internlm.core.context import global_context as gpc + + +# Adapted from https://github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/core/ \ +# sequence_parallel/cross_entropy.py +class _VocabSequenceParallelCrossEntropy(torch.autograd.Function): + """ + Cross Entropy module for isp. + """ + + @staticmethod + def forward(ctx, vocab_seq_parallel_logits, target, reduction, label_smoothing=0.0): # pylint: disable=W0613 + sp_size = gpc.get_world_size(ParallelMode.TENSOR) + + # reshape + # vocab_seq_parallel_logits: [B * (S/P), V] -> [B, S/P, V] + # target: [B * S/P] -> [B, S/P] + bsz = gpc.config.data.micro_bsz if gpc.config.data.use_packed_dataset is False else 1 + vocab_seq_parallel_logits = vocab_seq_parallel_logits.view(bsz, -1, gpc.config.model.vocab_size) + target = target.view(bsz, -1) + + # transpose + # vocab_seq_parallel_logits: [B, S/P, V] -> [S/P, B, V] + # target: [B, S/P] -> [S/P, B] + # return: [S, B] + vocab_seq_parallel_logits = vocab_seq_parallel_logits.transpose(0, 1).contiguous() + target = target.transpose(0, 1).contiguous() + + ctx.seqlen = vocab_seq_parallel_logits.size(0) * sp_size + batch_size = vocab_seq_parallel_logits.size(1) + + # Need softmax for backward + softmax = torch.nn.functional.softmax(vocab_seq_parallel_logits, dim=-1) + ctx.vocab_size = vocab_seq_parallel_logits.size(2) + loss = torch.nn.functional.nll_loss(softmax.log().view(-1, ctx.vocab_size), target.view(-1), reduction="none") + + loss_all = torch.empty( + ctx.seqlen, batch_size, dtype=vocab_seq_parallel_logits.dtype, device=vocab_seq_parallel_logits.device + ) + + torch.distributed.all_gather_into_tensor(loss_all, loss, group=gpc.get_group(ParallelMode.TENSOR)) + + # [s b] => [b, s] + loss_all = loss_all.transpose(0, 1).contiguous() + + ctx.save_for_backward(softmax, target) + + return loss_all + + @staticmethod + def backward(ctx, grad_output): + softmax, target = ctx.saved_tensors + + # transpose + grad_output = grad_output.transpose(0, 1).contiguous() + + step_seqlen = ctx.seqlen // gpc.get_world_size(ParallelMode.TENSOR) + sp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + grad_output_part = grad_output[step_seqlen * sp_rank : step_seqlen * (sp_rank + 1), :] + + grad_input = softmax + grad_2d = grad_input.view(-1, ctx.vocab_size) + 
arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + + grad_2d[arange_1d, target.view(-1)] -= 1 + grad_input.mul_(grad_output_part.unsqueeze(dim=-1)) + + # transpose + grad_input = grad_input.transpose(0, 1).contiguous() + # reshape + grad_input = grad_input.view(-1, gpc.config.model.vocab_size) + + return grad_input, None, None + + +def vocab_sequence_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0): + return _VocabSequenceParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing) + + +class VocabSequenceParallelCrossEntropyLoss(nn.Module): + """ + Cross Entropy module for isp. + """ + + def __init__( + self, + ignore_index: int = -100, + reduction: str = "mean", + label_smoothing: float = 0, + process_group=None, + ): + super().__init__() + if reduction not in ["mean", "none"]: + raise NotImplementedError("Only support reduction = 'mean' or 'none'") + self.ignore_index = ignore_index + self.reduction = reduction + self.label_smoothing = label_smoothing + self.process_group = process_group + + def loss_mean_func(self, output_tensor): + losses = output_tensor.float() + loss = torch.sum(losses.view(-1)) / losses.numel() + + # TODO: allreduce loss in dp group + + return loss + + def forward(self, _input, target): + assert _input.is_cuda and target.is_cuda + + _loss_list = vocab_sequence_parallel_cross_entropy(_input, target, self.label_smoothing) + + if self.reduction == "mean": + loss = self.loss_mean_func(_loss_list) + return loss + + return _loss_list.view(-1) diff --git a/internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py b/internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py index 6d531158..5c22fed3 100644 --- a/internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py +++ b/internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py @@ -481,7 +481,6 @@ def forward( @staticmethod def backward(ctx, dout, *args): # pylint: disable=W0613 - torch.cuda.synchronize() q, k, v, out, softmax_lse = ctx.saved_tensors dq, dk, dv = zigzag_double_ring_flash_attn_backward( @@ -504,8 +503,6 @@ def backward(ctx, dout, *args): # pylint: disable=W0613 deterministic=ctx.deterministic, ) - torch.cuda.synchronize() - return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None, None, None diff --git a/internlm/model/utils.py b/internlm/model/utils.py index e3ebf44d..7c974abe 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -6,9 +6,12 @@ from internlm.core.context.parallel_context import global_context as gpc from internlm.model.modules.mha import MHA +from internlm.utils.logger import get_logger from internlm.utils.storage_manager import get_fns, llm_load from internlm.utils.utils import TensorParallelMode +logger = get_logger(__file__) + def internlm1_mha_pre_load_convert( model: MHA, state_dict: Dict, prefix: str, *args, **kwargs # pylint: disable=W0613 @@ -138,3 +141,20 @@ def merge_pp_src_states(states): layer_shift += _layer_shift + 1 merged_states.append(shifted_state) return merged_states + + +def get_parallel_size_from_file(fns, suffix=None): + model_fns, old_tp, old_pp = [], -1, -1 + for fn in fns: + # filter with `_t` is for avoiding conflict with model_config.py + + if fn.startswith("model_t"): + if (suffix and fn.endswith(suffix)) or (suffix is None and not fn.endswith("md5")): + model_fns.append(fn) + _, tp, pp = os.path.splitext(fn)[0].split("_") + old_tp = max(old_tp, 
int(tp[2:]) + 1) + old_pp = max(old_pp, int(pp[2:]) + 1) + + assert old_tp > 0 and old_pp > 0, f"ckpt with tp:{old_tp} and pp:{old_pp} is illegal" + model_fns.sort() + return model_fns, old_tp, old_pp diff --git a/internlm/train/pipeline.py b/internlm/train/pipeline.py index 72c67ae4..784a5305 100644 --- a/internlm/train/pipeline.py +++ b/internlm/train/pipeline.py @@ -164,7 +164,7 @@ def set_fp32_attr_for_model(model: Union[nn.Module, nn.ModuleList]): def set_parallel_attr_for_param_groups(model: Union[nn.Module, nn.ModuleList]): - def _check_module_pure_dp_wdp(name, module): # pylint: disable=W0613 + def _check_module_pure_dp(name, module): # pylint: disable=W0613 for param in module.parameters(): setattr(param, IS_REPLICA_ZERO_PARALLEL, True) @@ -220,11 +220,13 @@ def _check_module(name, module): setattr(param, IS_REPLICA_ZERO_PARALLEL, True) for _chunk in unwrap_naive_amp(model): - # special case for pure dp or pure wdp mode - if gpc.get_world_size(ParallelMode.DATA) == gpc.get_world_size(ParallelMode.GLOBAL) and gpc.get_world_size( - ParallelMode.WEIGHT_DATA - ) == gpc.get_world_size(ParallelMode.GLOBAL): - _check_module_func = _check_module_pure_dp_wdp + # special case for pure dp mode + if ( + isinstance(gpc.config.parallel["tensor"], dict) + and gpc.config.parallel["tensor"].get("mode", TensorParallelMode.mtp.name) == TensorParallelMode.mtp.name + and gpc.get_world_size(ParallelMode.DATA) == gpc.get_world_size(ParallelMode.GLOBAL) + ): + _check_module_func = _check_module_pure_dp else: _check_module_func = _check_module # set param parallel attribute @@ -361,6 +363,7 @@ def initialize_parallel_communicator(model: Union[nn.Module, nn.ModuleList]): gpc.config.parallel.weight.overlap, gpc.get_group(ParallelMode.WEIGHT), is_moe=False, + selective_ckpt_offload=gpc.config.get("selective_checkpoint_offload", False), early_reduce_scatter_release=gpc.config.parallel.weight.early_reduce_scatter_release, ) # register communicator for isp column parallel linear. 
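A rough sketch of the checkpoint-name convention that the new get_parallel_size_from_file helper in internlm/model/utils.py assumes (the shard names below are made up for illustration): files named model_tp{i}_pp{j}.pt yield the old tensor- and pipeline-parallel sizes from the largest indices seen.

import os

fns = ["model_tp0_pp0.pt", "model_tp1_pp0.pt", "model_tp0_pp1.pt", "model_tp1_pp1.pt"]  # hypothetical shards
old_tp, old_pp = -1, -1
for fn in fns:
    # "model_tp1_pp1.pt" -> "model_tp1_pp1" -> ["model", "tp1", "pp1"]
    _, tp, pp = os.path.splitext(fn)[0].split("_")
    old_tp, old_pp = max(old_tp, int(tp[2:]) + 1), max(old_pp, int(pp[2:]) + 1)
assert (old_tp, old_pp) == (2, 2)  # the checkpoint was written with tp=2, pp=2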
@@ -955,22 +958,34 @@ def traverse(module): def inject_config(model: nn.Module) -> None: + # Compatibility for Vision-Language Model if hasattr(model.config, "text_config"): - model_config = model.config.text_config + llm_cfg = model.config.text_config else: - model_config = model.config - gpc.config.model.vocab_size = gpc.config.VOCAB_SIZE = model_config.vocab_size - gpc.config.model.hidden_size = gpc.config.HIDDEN_SIZE = model_config.hidden_size - gpc.config.model.num_layers = gpc.config.NUM_LAYER = model_config.num_hidden_layers - gpc.config.model.num_attention_heads = gpc.config.NUM_ATTENTION_HEAD = model_config.num_attention_heads - gpc.config.model.mlp_ratio = gpc.config.MLP_RATIO = model_config.intermediate_size / model_config.hidden_size + llm_cfg = model.config + gpc.config.model.vocab_size = gpc.config.VOCAB_SIZE = llm_cfg.vocab_size + gpc.config.model.hidden_size = gpc.config.HIDDEN_SIZE = llm_cfg.hidden_size + gpc.config.model.num_layers = gpc.config.NUM_LAYER = llm_cfg.num_hidden_layers + # Compatibility for Mamba + if hasattr(llm_cfg, "num_attention_heads"): + gpc.config.model.num_attention_heads = gpc.config.NUM_ATTENTION_HEAD = llm_cfg.num_attention_heads + gpc.config.model.mlp_ratio = gpc.config.MLP_RATIO = llm_cfg.intermediate_size / llm_cfg.hidden_size # For models that use GQA - if hasattr(model_config, "num_key_value_heads"): - gpc.config.model.num_kv_attention_heads = gpc.config.NUM_KV_ATTENTION_HEAD = model_config.num_key_value_heads + if hasattr(llm_cfg, "num_key_value_heads"): + gpc.config.model.num_kv_attention_heads = gpc.config.NUM_KV_ATTENTION_HEAD = llm_cfg.num_key_value_heads def inject_model_helper(model: Union[nn.Module, nn.ModuleList], inject_info: Optional[Dict] = None) -> None: - # get inject_info + """ + Inject model helper functions. + + Args: + model (Union[nn.Module, nn.ModuleList]): + For built-in models, it is nn.Module for no pp and nn.ModuleList for pp. + For injected models, it is nn.Module. + inject_info (Optional[Dict]): configurations for injected_models. 
+ """ + # parse inject_info if inject_info is not None: inject = inject_info.get("inject", False) interactive = inject_info.get("interactive", False) @@ -992,31 +1007,37 @@ def inject_model_helper(model: Union[nn.Module, nn.ModuleList], inject_info: Opt "norm": inject_norm, } + # inject config + if inject: + inject_config(model) + if not isinstance(model, nn.ModuleList): model = [model] - - # inject modules for _chunk in model: - if gpc.get_world_size(ParallelMode.DATA) == gpc.get_world_size(ParallelMode.GLOBAL) and gpc.get_world_size( - ParallelMode.WEIGHT_DATA - ) == gpc.get_world_size(ParallelMode.GLOBAL): + # Special case for pure dp mode: skip + if ( + isinstance(gpc.config.parallel["tensor"], dict) + and gpc.config.parallel["tensor"].get("mode", TensorParallelMode.mtp.name) == TensorParallelMode.mtp.name + and gpc.get_world_size(ParallelMode.DATA) == gpc.get_world_size(ParallelMode.GLOBAL) + ): continue + # In-place replacement or check for modules: "embed", "linear", "norm" + # (1) If inject=True, in-place replacement + # (2) If inject=False, check for mod in modules: inject_funcs[mod](_chunk, inject, interactive) - - # reset parameters and move model to device + # reset parameters if needed, model should have reset_parameters() method + if reset_params: + _chunk.reset_parameters() for _chunk in model: - if inject: - if reset_params: - _chunk.reset_parameters() + # If model is initialized on cpu, model should be moved to cuda device after injection + if not next(_chunk.parameters()).is_cuda: _chunk.to(get_current_device()) - # inject configs - if inject: - inject_config(model[0]) - if gpc.is_rank_for_log(): - logger.info( - f"inject is enabled, please check the model carefully, " - f"if there are any problems, please report issue to us. " - f"The injected model is \n {model}" - ) + # print injected model + if inject and gpc.is_rank_for_log(): + logger.info( + f"inject is enabled, please check the model carefully, " + f"if there are any problems, please report issue to us. 
" + f"The injected model is \n {model}" + ) diff --git a/tests/test_infer/test_trainer_generate.py b/tests/test_infer/test_trainer_generate.py index 3ccbfb54..537a4077 100644 --- a/tests/test_infer/test_trainer_generate.py +++ b/tests/test_infer/test_trainer_generate.py @@ -10,7 +10,7 @@ from internlm.core.trainer import TrainState, Trainer # noqa: E402 from internlm.data import build_train_loader_with_data_type # noqa: E402 from internlm.initialize import initialize_distributed_env # noqa: E402 -from internlm.model.losses import FlashGPTLMLoss # noqa: E402 +from internlm.model.losses import InternLoss # noqa: E402 from internlm.train import ( # noqa: E402 get_scheduler_hooks, initialize_model, @@ -25,7 +25,7 @@ def setup_generator(config, tokenizer): model = initialize_model() isp_communicator = initialize_parallel_communicator(model) - criterion = FlashGPTLMLoss() + criterion = InternLoss() # initialize the train data loader train_dl, _ = build_train_loader_with_data_type() diff --git a/tests/test_training/7B_check_acc.py b/tests/test_training/7B_check_acc.py index 3b727d7c..cb3902bc 100644 --- a/tests/test_training/7B_check_acc.py +++ b/tests/test_training/7B_check_acc.py @@ -1,16 +1,20 @@ import os -JOB_NAME = "7b_train" +JOB_NAME = "7b_internlm2_train" +model_type = "INTERNLM2_PUBLIC" DO_ALERT = False +VOCAB_SIZE = 92544 SEQ_LEN = 2048 HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 +NUM_KV_ATTENTION_HEAD = 8 +MLP_RATIO = 3.5 NUM_LAYER = 32 -VOCAB_SIZE = 103168 -MODEL_ONLY_FOLDER = os.path.join(os.environ["share_path"], "quailty_assurance/7B_model_weights_ckpt/init") +MODEL_ONLY_FOLDER = os.path.join( + os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init" +) # Ckpt folder format: # fs: 'local:/mnt/nfs/XXX' # SAVE_CKPT_FOLDER = "local:llm_ckpts_0925_9" @@ -121,7 +125,8 @@ ) model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + checkpoint=False, + num_chunks=1, num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, @@ -129,13 +134,22 @@ parallel_output=True, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYER, + no_bias=True, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + dtype="torch.bfloat16", norm_type="rmsnorm", layer_norm_epsilon=1e-5, + num_kv_attention_heads=NUM_KV_ATTENTION_HEAD, use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. + # Whether the odd and even columns of the query and key in the model are normally interleaved. + # If it's True, the model's odd and even columns are normally ordered; if it's False, + # it means that the model has prematurely concatenated all odd columns and even columns in front + # and back, in order to improve the RoPE's computational efficiency. + # Example: + # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...] + # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...] + qk_interleaved=False, ) """ zero1 parallel: @@ -150,9 +164,9 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node. 
""" parallel = dict( - zero1=dict(size=8), - tensor=dict(size=1, mode="mtp"), - pipeline=dict(size=1, interleaved_overlap=True), + zero1=dict(size=-1), + tensor=dict(size=2, mode="mtp"), + pipeline=dict(size=2, interleaved_overlap=True), weight=dict(size=1, overlap=True), ) @@ -165,5 +179,30 @@ enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, ), ) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" + +generation = dict( + ckpt_folder="/path/to/saved/ckpt", + output_folder="/path/to/save/generation", + batch_size=1, + eos_id=[2, 0], + bos_id=1, + max_length=100, + do_sample=True, + temperature=1.0, + top_k=50, + top_p=1.0, + repetition_penalty=1, + length_penalty=1.0, +) + +enable_tb = False diff --git a/tests/test_training/7B_check_init.py b/tests/test_training/7B_check_init.py index 6f72c7d7..03107d02 100644 --- a/tests/test_training/7B_check_init.py +++ b/tests/test_training/7B_check_init.py @@ -1,12 +1,14 @@ -JOB_NAME = "7b_train" +JOB_NAME = "7b_internlm2_train" +model_type = "INTERNLM2_PUBLIC" DO_ALERT = False +VOCAB_SIZE = 92544 SEQ_LEN = 2048 HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 +NUM_KV_ATTENTION_HEAD = 8 +MLP_RATIO = 3.5 NUM_LAYER = 32 -VOCAB_SIZE = 103168 CHECK_INIT = 1 @@ -128,7 +130,8 @@ ) model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + checkpoint=False, + num_chunks=1, num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, @@ -136,13 +139,22 @@ parallel_output=True, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYER, + no_bias=True, mlp_ratio=MLP_RATIO, apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + dtype="torch.bfloat16", norm_type="rmsnorm", layer_norm_epsilon=1e-5, + num_kv_attention_heads=NUM_KV_ATTENTION_HEAD, use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. + # Whether the odd and even columns of the query and key in the model are normally interleaved. + # If it's True, the model's odd and even columns are normally ordered; if it's False, + # it means that the model has prematurely concatenated all odd columns and even columns in front + # and back, in order to improve the RoPE's computational efficiency. + # Example: + # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...] + # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...] 
+ qk_interleaved=False, ) parallel = dict( @@ -161,5 +173,30 @@ enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, ), ) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" + +generation = dict( + ckpt_folder="/path/to/saved/ckpt", + output_folder="/path/to/save/generation", + batch_size=1, + eos_id=[2, 0], + bos_id=1, + max_length=100, + do_sample=True, + temperature=1.0, + top_k=50, + top_p=1.0, + repetition_penalty=1, + length_penalty=1.0, +) + +enable_tb = False diff --git a/tests/test_training/test_forward_output_no_fa.py b/tests/test_training/test_forward_output_no_fa.py index 69c9eb90..ba7f0118 100644 --- a/tests/test_training/test_forward_output_no_fa.py +++ b/tests/test_training/test_forward_output_no_fa.py @@ -15,7 +15,7 @@ from internlm.core.trainer import Trainer from internlm.data import build_train_loader_with_data_type from internlm.initialize.launch import args_sanity_check -from internlm.model.losses import FlashGPTLMLoss +from internlm.model.losses import InternLoss from internlm.model.metrics import AccPerplex, SchedulerMetricHook from internlm.train import ( initialize_model, @@ -56,7 +56,7 @@ checkpoint=True, num_attention_heads=32, embed_split_hidden=True, - vocab_size=103168, + vocab_size=92544, embed_grad_scale=1, parallel_output=False, hidden_size=4096, @@ -68,8 +68,9 @@ layer_norm_epsilon=1e-5, use_flash_attn=False, num_chunks=1, + no_bias=True, ), - model_type="INTERNLM", + model_type="INTERNLM2_PUBLIC", alert_address=None, monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), grad_scaler=dict( @@ -174,11 +175,11 @@ def train_check_output(args): _ = initialize_parallel_communicator(model) # initialize loss function - criterion = FlashGPTLMLoss(parallel_output=False, label_smoothing=gpc.config.loss.label_smoothing) + criterion = InternLoss(parallel_output=False, label_smoothing=gpc.config.loss.label_smoothing) optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) - train_dl, dataset_types = build_train_loader_with_data_type() + _, dataset_types = build_train_loader_with_data_type() metric = AccPerplex( device=get_current_device(), @@ -226,9 +227,9 @@ def train_check_output(args): if gpc.is_rank_for_log(): standard_output_with_fa = torch.load( - os.path.join(share_path, "quailty_assurance/7B_no_flash_attention/output_with_fa.pt") + os.path.join(share_path, "quailty_assurance/7B_no_flash_attention/output_with_fa_internlm2.pt") ) - tensor1 = standard_output_with_fa + tensor1 = standard_output_with_fa[0][0] tensor2 = output[0][0][0] if torch.equal(tensor1, tensor2): diff --git a/tests/test_training/test_load_ckpt_loss.py b/tests/test_training/test_load_ckpt_loss.py index e6890517..ddbb24a0 100644 --- a/tests/test_training/test_load_ckpt_loss.py +++ b/tests/test_training/test_load_ckpt_loss.py @@ -38,7 +38,7 @@ args_sanity_check, ) from internlm.model.losses import ( # noqa: E402 #pylint: disable=wrong-import-position - FlashGPTLMLoss, + InternLoss, ) from internlm.model.metrics import ( # noqa: E402 #pylint: disable=wrong-import-position AccPerplex, @@ -224,7 +224,7 @@ def train_model(args): _ = initialize_parallel_communicator(model) # initialize loss function - criterion = 
FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing) + criterion = InternLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing) # initialize the train and validation data loader train_dl, dataset_types = build_train_loader_with_data_type() diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index 4094c582..8b506d2d 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -13,7 +13,7 @@ from internlm.core.trainer import Trainer, TrainState from internlm.data import build_train_loader_with_data_type from internlm.initialize import initialize_distributed_env -from internlm.model.losses import FlashGPTLMLoss +from internlm.model.losses import InternLoss from internlm.train import ( get_scheduler_hooks, initialize_model, @@ -25,25 +25,26 @@ from internlm.utils.gputest import empty_cache_and_diag from internlm.utils.megatron_timers import megatron_timer as timer -CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH", "./configs/7B_sft.py") -INTERNLM1_CKPT_PATH = os.path.join(os.environ["share_path"], "quailty_assurance/test_loss/model_ckpt") +CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH", "./configs/7B_internlm2.py") +INTERNLM2_CKPT_PATH = os.path.join(os.environ["share_path"], "quailty_assurance/test_loss_pri/model_ckpt") TOTAL_STEPS = 10 LOSS_SPIKE_LIMIT = 1.5 LOSS_DEVIATION_LIMIT = 0.02 # dp_size = 4 BASELINE_LOSS_LIST = [ - 11.63298511505127, - 7.82645320892334, - 6.727725505828857, - 6.182029724121094, - 5.395882606506348, - 5.394383430480957, - 5.053952217102051, - 4.742049694061279, - 4.629276752471924, - 4.616517543792725, + 12.362918853759766, + 12.404379844665527, + 12.348219871520996, + 12.194982528686523, + 11.80469036102295, + 11.573806762695312, + 10.045475006103516, + 9.660882949829102, + 9.172087669372559, + 4.799427032470703, ] + cur_loss_list = [] internlm_accelerator = get_accelerator() @@ -59,7 +60,7 @@ def train( enable_sp: bool = False, save_ckpt: bool = False, load_ckpt: bool = False, - model_type: str = "INTERNLM", + model_type: str = "INTERNLM2_PUBLIC", optimizer_ver: str = "v1", pp_mode: str = "1F1B", ): @@ -67,24 +68,31 @@ def train( config = Config.from_file(CONFIG_FILE_PATH) # init setting - config.data.total_steps = TOTAL_STEPS + config.data.total_steps = 50000 config.data.fixed_random_dataset_seqlen = False - config.lr_scheduler.total_steps = TOTAL_STEPS + config.data.micro_num = 4 + config.data.micro_bsz = 2 + config.lr_scheduler.total_steps = config.data.total_steps config.model_type = model_type config.ckpt.load_ckpt_folder = None config.ckpt.load_ckpt_info = None config.ckpt.auto_resume = False - total_steps = config.data.total_steps + total_steps = TOTAL_STEPS skip_batches = config.data.skip_batches label_smoothing = config.loss.label_smoothing + config.parallel.zero1 = dict(size=-1) + config.parallel.tensor = dict(size=1, mode="mtp") + config.parallel.pipeline = dict(size=1, interleaved_overlap=True, mode="1f1b") + config.parallel.weight = dict(size=1, overlap=True) if optimizer_ver == "v2": config.hybrid_zero_optimizer.use_split_tensor_optim = True config.all_gather_size = 512 * 1024 * 1024 + config.model.checkpoint = True # update ckpt config - if model_type == "INTERNLM" and tp_mode != "isp" and interleaved is False: - config.ckpt.load_ckpt_info = dict(path=INTERNLM1_CKPT_PATH, content=("model",), ckpt_type="internlm_test") + if model_type == "INTERNLM2_PUBLIC" and tp_mode != "isp" and interleaved is False: + config.ckpt.load_ckpt_info = 

     if save_ckpt:
         config.ckpt.enable_save_ckpt = True
@@ -101,7 +109,7 @@
         config.hybrid_zero_optimizer.overlap_sync_grad = False

     config.parallel.pipeline = dict(size=pp_size, mode=pp_mode)
-    config.parallel.weight = dict(size=wp_size, overlap=True)
+    config.parallel.weight = dict(size=wp_size, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer")
     if interleaved is True:
         config.parallel.pipeline = dict(size=pp_size, interleaved_overlap=True, mode=pp_mode)
         config.model.num_chunks = num_chunks
@@ -166,7 +174,7 @@
     isp_communicator = initialize_parallel_communicator(model)

     # initialize loss function
-    criterion = FlashGPTLMLoss(parallel_output=gpc.config.model.parallel_output, label_smoothing=label_smoothing)
+    criterion = InternLoss(parallel_output=gpc.config.model.parallel_output, label_smoothing=label_smoothing)

     # initialize the train data loader
     train_dl, _ = build_train_loader_with_data_type()
@@ -213,7 +221,7 @@
     train_iter = iter(train_dl)

-    if model_type == "INTERNLM":
+    if model_type == "INTERNLM2_PUBLIC":
         data_path = os.path.join(os.environ["share_path"], "quailty_assurance/test_loss/data_batch_4DP")
         data_batch = torch.load(f"{data_path}/{gpc.get_local_rank(ParallelMode.DATA)}_data_batch.pt")

@@ -222,7 +230,7 @@
             empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval)
         timer("one-batch").start()

-        if model_type == "INTERNLM":
+        if model_type == "INTERNLM2_PUBLIC":
            if batch_count >= 10:
                batch = data_batch[batch_count - 10]
            else:
@@ -296,7 +304,6 @@ def check_loss_spike():

 def check_loss_accuracy():
     if gpc.is_rank_for_log():
-        print(f"cur_loss_list:{cur_loss_list}", flush=True)
         for cur, target in zip(cur_loss_list, BASELINE_LOSS_LIST):
             assert (
                 abs(cur - target) < LOSS_DEVIATION_LIMIT
@@ -464,16 +471,16 @@ def test_training_with_isp():
     global CONFIG_FILE_PATH, BASELINE_LOSS_LIST
     CONFIG_FILE_PATH = "./configs/7B_isp_sft.py"
     BASELINE_LOSS_LIST = [
-        11.595988273620605,
-        7.988386154174805,
-        6.821506500244141,
-        6.2768449783325195,
-        5.478013515472412,
-        5.4622697830200195,
-        5.162247180938721,
-        4.854615211486816,
-        4.744818210601807,
-        4.75523567199707,
+        12.225811004638672,
+        12.103824615478516,
+        12.223844528198242,
+        11.87704849243164,
+        11.651590347290039,
+        11.629219055175781,
+        10.242591857910156,
+        9.768388748168945,
+        9.330610275268555,
+        5.505439758300781,
     ]

     # model training
@@ -516,12 +523,3 @@ def test_training_llama2():
     CONFIG_FILE_PATH = "./configs/7B_llama2.py"

     train(dp_size=8, model_type="LLAMA2")
-
-
-@pytest.mark.training_internlm2
-def test_training_internlm2():
-    # update config file
-    global CONFIG_FILE_PATH
-    CONFIG_FILE_PATH = "./configs/7B_internlm2.py"
-
-    train(dp_size=8, model_type="INTERNLM2_PUBLIC")
diff --git a/tests/test_training/test_no_fa_train_temp.py b/tests/test_training/test_no_fa_train_temp.py
index f142e503..5f0782b4 100644
--- a/tests/test_training/test_no_fa_train_temp.py
+++ b/tests/test_training/test_no_fa_train_temp.py
@@ -8,7 +8,7 @@
 from internlm.core.context import global_context as gpc
 from internlm.core.trainer import Trainer
 from internlm.data import build_train_loader_with_data_type
-from internlm.model.losses import FlashGPTLMLoss
+from internlm.model.losses import InternLoss
 from internlm.model.metrics import AccPerplex
 from internlm.train import (
     get_scheduler_hooks,
@@ -58,7 +58,7 @@ def train_check(args):
     isp_communicator = initialize_parallel_communicator(model)

     # initialize loss function
-    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)
+    criterion = InternLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)

     optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model, isp_communicator)
diff --git a/tests/test_training/test_norm_weight.py b/tests/test_training/test_norm_weight.py
index 0fd24926..990b334a 100644
--- a/tests/test_training/test_norm_weight.py
+++ b/tests/test_training/test_norm_weight.py
@@ -11,7 +11,7 @@
 from internlm.core.context import global_context as gpc
 from internlm.core.trainer import Trainer
 from internlm.data import build_train_loader_with_data_type
-from internlm.model.losses import FlashGPTLMLoss
+from internlm.model.losses import InternLoss
 from internlm.model.metrics import AccPerplex
 from internlm.train import (
     get_scheduler_hooks,
@@ -78,7 +78,7 @@ def train_check_norm_weight(args):
     isp_communicator = initialize_parallel_communicator(model)

     # initialize loss function
-    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)
+    criterion = InternLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)

     optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model, isp_communicator)
diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
index 4fa096a5..13c01b1c 100644
--- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py
+++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
@@ -21,7 +21,7 @@
 )
 from internlm.eval.evaluation import switch_evaluation_mode
 from internlm.initialize.launch import args_sanity_check
-from internlm.model.losses import FlashGPTLMLoss
+from internlm.model.losses import InternLoss
 from internlm.model.metrics import AccPerplex, SchedulerMetricHook
 from internlm.train import (
     initialize_model,
@@ -275,7 +275,7 @@ def exam_loss(args):
     _ = initialize_parallel_communicator(model)

     # initialize loss function
-    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)
+    criterion = InternLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)

     # initialize the train and validation data loader
     train_dl, dataset_types = build_train_loader_with_data_type()
diff --git a/tests/test_training/train_CI.py b/tests/test_training/train_CI.py
index b33cf4c3..c7da6f85 100644
--- a/tests/test_training/train_CI.py
+++ b/tests/test_training/train_CI.py
@@ -20,14 +20,14 @@
 from internlm.checkpoint import CheckpointManager  # noqa: E402
 from internlm.core.context import ParallelMode  # noqa: E402
 from internlm.core.context import global_context as gpc  # noqa: E402
-from internlm.core.trainer import TrainState, Trainer  # noqa: E402
+from internlm.core.trainer import Trainer, TrainState  # noqa: E402
 from internlm.data import (  # noqa: E402
     build_train_loader_with_data_type,
     build_valid_loader_with_data_type,
 )
 from internlm.eval.evaluation import evaluate_on_val_dls  # noqa: E402
 from internlm.initialize import initialize_distributed_env  # noqa: E402
-from internlm.model.losses import FlashGPTLMLoss  # noqa: E402
+from internlm.model.losses import InternLoss  # noqa: E402
 from internlm.model.metrics import AccPerplex, SchedulerMetricHook  # noqa: E402
 from internlm.monitor import (  # noqa: E402
     initialize_monitor_manager,
@@ -60,6 +60,7 @@


 def check_model_weights(model, ckpt_path, total_equal=False):
+    model = model.model
     model1_dict = torch.load(ckpt_path, map_location="cuda")
     model2_dict = model.state_dict()

@@ -122,7 +123,7 @@ def main(args):
         config_lines = f.readlines()

     # initialize loss function
-    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
+    criterion = InternLoss(parallel_output=True, label_smoothing=label_smoothing)

     # initialize the train and validation data loader
     train_dl, dataset_types = build_train_loader_with_data_type()
@@ -214,13 +215,14 @@ def main(args):
     # check model init weights
     if hasattr(gpc.config, "CHECK_INIT") and gpc.config.CHECK_INIT == 1:
         ckpt_name = (
-            f"model_dp{gpc.get_local_rank(ParallelMode.DATA)}"
+            f"model"
             f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
             f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
         )
-        ckpt_path = os.path.join(os.environ["share_path"], "quailty_assurance/7B_init_dp=2_tp=2_pp=2_ckpt", ckpt_name)
+        ckpt_path = os.path.join(
+            os.environ["share_path"], "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt/init", ckpt_name
+        )
         check_model_weights(model, ckpt_path, total_equal=True)
-
     with initialize_llm_profile(profiling=args.profiling, start_time=current_time) as prof:
         # start iterating the train data and begin training
         for batch_count in range(train_state.batch_count, total_steps):
@@ -327,12 +329,17 @@
             )

             # check model weights
-            if gpc.is_rank_for_log() and batch_count > 0 and batch_count % 100 == 0:
+            if batch_count > 0 and batch_count % 100 == 0:
+                ckpt_name = (
+                    f"model"
+                    f"_tp{gpc.get_local_rank(ParallelMode.TENSOR)}"
+                    f"_pp{gpc.get_local_rank(ParallelMode.PIPELINE)}.pt"
+                )
                 ckpt_path = os.path.join(
                     os.environ["share_path"],
-                    "quailty_assurance/7B_model_weights_ckpt",
+                    "quailty_assurance/7B_internlm2_init_dp=2_tp=2_pp=2_ckpt",
                     str(batch_count),
-                    "model_tp0_pp0.pt",
+                    ckpt_name,
                 )
                 check_model_weights(model, ckpt_path)
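
Note (reference sketch, not part of the patch): the loss gate these tests apply reduces to comparing each recorded step loss against its baseline within LOSS_DEVIATION_LIMIT, as in check_loss_accuracy() above. A minimal self-contained Python sketch, using made-up sample numbers rather than the real baselines:

BASELINE_LOSS_LIST = [12.36, 12.40, 12.35]  # placeholder baselines, not the values from test_loss.py
LOSS_DEVIATION_LIMIT = 0.02

def check_loss_accuracy(cur_loss_list):
    # every recorded step loss must stay within LOSS_DEVIATION_LIMIT of its baseline
    for cur, target in zip(cur_loss_list, BASELINE_LOSS_LIST):
        assert abs(cur - target) < LOSS_DEVIATION_LIMIT, f"loss {cur} deviates from baseline {target}"

check_loss_accuracy([12.37, 12.41, 12.34])  # passes: each deviation is 0.01, below the 0.02 limit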