Merge branch 'develop' into feat/early-release-isp-rs-memory
mwiacx committed Jan 2, 2025
2 parents 48f1b94 + e3f5001 commit d89de09
Showing 54 changed files with 2,396 additions and 1,023 deletions.
140 changes: 56 additions & 84 deletions .github/workflows/e2e_test.yaml
@@ -1,5 +1,5 @@
 name: e2e-tests
-on:
+on:
   pull_request:
     branches:
       - "develop"
@@ -73,68 +73,65 @@ jobs:
   training_8GPU_4DP2TP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2TPSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TPSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TPSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2PP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2PP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP_ZB:
     runs-on: [t_cluster]
@@ -157,107 +154,82 @@ jobs:
   training_16GPU_4DP2TP2PP_MTP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MTP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MTP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_MSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_FSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_FSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_FSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_llama2:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_llama2_910B
+      - name: training_llama2_T
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
-
-  training_internlm2:
-    strategy:
-      matrix:
-        runner: [910B]
-    runs-on: ${{ matrix.runner }}
-    timeout-minutes: 20
-    steps:
-      - name: mask env
-        run: |
-          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
-          echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
-      - uses: actions/checkout@v3
-      - name: training_internlm2_910B
-        run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

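The new steps above all follow the same pattern: launch the test under srun, capture the exit status, and hand it to check_slurm_cancled.sh so that a Slurm cancellation can be told apart from a real test failure. Below is a minimal Python sketch of that pattern, purely for illustration; the srun flags, pytest arguments, and script path are taken from the diff, while the function name and the use of subprocess are assumptions, not part of this commit.

import os
import subprocess

def run_e2e_marker(jobname: str, marker: str, ntasks: int = 8) -> int:
    # Illustrative sketch only (not from this commit): mirrors the workflow's
    # `srun ... pytest -m <marker>` step followed by the cancellation check.
    srun_cmd = [
        "srun", "-p", os.environ["SLURM_PARTITION"], "--kill-on-bad-exit=1",
        f"--job-name={jobname}", f"-n{ntasks}", "--ntasks-per-node=8",
        "--cpus-per-task=4", "--gpus-per-task=1",
        "pytest", "-s", "-v", "--color=yes", "-m", marker,
        "./tests/test_training/test_loss.py",
    ]
    exit_code = subprocess.call(srun_cmd)  # like `exit_code=$?` in the workflow step
    # like `sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname`
    return subprocess.call(
        ["sh", "./ci_scripts/common/check_slurm_cancled.sh", str(exit_code), jobname]
    )
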
6 changes: 2 additions & 4 deletions configs/57B_qwen2_MoE.py
@@ -190,7 +190,6 @@
 weight parallel (dict):
     1. size: int, the size of weight parallel.
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
-    3. memory_pool: bool, enable/disable memory pool, defaults to False.
 expert parallel (dict):
     1. size: int
         * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size
@@ -201,15 +200,14 @@
 expert weight parallel (dict):
     1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size.
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
-    3. memory_pool: bool, enable/disable memory pool, defaults to False.
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
-    weight=dict(size=1, overlap=True, memory_pool=True),
+    weight=dict(size=1, overlap=True),
     expert=dict(size=-1, no_tp=False),
-    expert_weight=dict(size=1, overlap=True, memory_pool=True),
+    expert_weight=dict(size=1, overlap=True),
 )
 
 cudnn_deterministic = False

26 changes: 24 additions & 2 deletions configs/7B_MoE4_sft.py
@@ -103,6 +103,20 @@
     clip_grad_norm=1.0,
 )
 
+
+# loss config (dict):
+#     1. label_smoothing
+#     2. op_type: cross_entropy operator type, we support five types for loss computing,
+#         including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"]
+#         default is "py_vocab_parallel".
+#         "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss
+#         "apex_naive": cross_entropy from apex
+#         "py_naive": self-implemented cross_entropy
+#         "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn
+#         "py_vocab_parallel": self-implemented vocab parallel cross_entropy
+# * op_types that ends with "naive" only support parallel_output=False;
+# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported.
+
 loss = dict(
     label_smoothing=0,
     moe_loss_coeff=0.1,
@@ -183,6 +197,10 @@
 weight parallel (dict):
     1. size: int, the size of weight parallel.
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
+    3. launch_allgather_before: str, before which module to launch the all gather communication to
+        prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'.
+        Must be used with forward_overlap_per 'layer'.
+    4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
 expert parallel (dict):
     1. size: int
         * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size
@@ -193,14 +211,18 @@
 expert weight parallel (dict):
     1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size.
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
+    3. launch_allgather_before: str, before which module to launch the all gather communication to
+        prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'.
+        Must be used with forward_overlap_per 'layer'.
+    4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
-    weight=dict(size=1, overlap=True),
+    weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
     expert=dict(size=-1, no_tp=False),
-    expert_weight=dict(size=1, overlap=True),
+    expert_weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
 )
 
 cudnn_deterministic = False

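The docstring added above documents two new overlap options, launch_allgather_before and forward_overlap_per, for both weight and expert weight parallel. A minimal sketch of an alternative setting follows; it is illustrative only and not part of this commit, and it simply applies the documented constraint that launch_allgather_before only takes effect when forward_overlap_per is "layer".

# Illustrative sketch (not from this commit): prefetch the next layer's weights
# before the attention module instead of the default output projection "wo".
# launch_allgather_before must be one of ['wqkv', 'attn', 'wo', 'w1'].
parallel = dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True, launch_allgather_before="attn", forward_overlap_per="layer"),
    expert=dict(size=-1, no_tp=False),
    expert_weight=dict(size=1, overlap=True, launch_allgather_before="attn", forward_overlap_per="layer"),
)
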
18 changes: 15 additions & 3 deletions configs/7B_internlm2.py
@@ -98,9 +98,21 @@
     clip_grad_norm=1.0,
 )
 
-loss = dict(
-    label_smoothing=0,
-)
+
+# loss config (dict):
+#     1. label_smoothing
+#     2. op_type: cross_entropy operator type, we support five types for loss computing,
+#         including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"]
+#         default is "py_vocab_parallel".
+#         "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss
+#         "apex_naive": cross_entropy from apex
+#         "py_naive": self-implemented cross_entropy
+#         "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn
+#         "py_vocab_parallel": self-implemented vocab parallel cross_entropy
+
+# * op_types that ends with "naive" only support parallel_output=False;
+# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported.
+loss = dict(label_smoothing=0, op_type="py_vocab_parallel")
 
 adam = dict(
     lr=1e-4,
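The comment block added above lists the supported cross_entropy operator types and their constraints. The short sketch below is illustrative only, not part of this commit: it picks an operator that stays within the documented rules, namely that any "*_naive" op_type requires parallel_output=False in the model config and that only "torch_naive" and "py_vocab_parallel" work in a no-GPU environment.

# Illustrative sketch (not from this commit): a CPU-friendly loss configuration.
# "torch_naive" maps to torch.nn.CrossEntropyLoss and, per the comment above,
# requires parallel_output=False elsewhere in the config.
loss = dict(
    label_smoothing=0,
    op_type="torch_naive",
)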