Commit f763177

fix check CUDA_DEVICE_MAX_CONNECTIONS
1 parent 141e9eb commit f763177

File tree

5 files changed: +39 -8 lines changed


.github/workflows/demo_in_readme.yaml (+8)

@@ -45,25 +45,33 @@ jobs:
       id: basic_train
       run: |
         source activate ${evo_env_torch21_flash2}
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: load_preset_ckpt
       if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
       run: |
         source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}

     - name: load_new_ckpt
       run: |
         source activate ${evo_env_torch21_flash2}
         export PYTHONPATH=$PWD:$PYTHONPATH
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rm -rf $GITHUB_WORKSPACE/llm_ckpts

     - name: torchrun-train
       run: |
         source activate ${evo_env_torch21_flash2}
+        export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+        export CUDA_DEVICE_MAX_CONNECTIONS=1
         sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
         rm -rf $GITHUB_WORKSPACE/llm_ckpts
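Each CI step now exports both variables before invoking its training script, since the stricter runtime check introduced below turns a missing variable into a hard failure. Both variables must be in the process environment before CUDA/NCCL initialization, which is why the CI sets them in the shell. A minimal launcher-side sketch in Python; the "train.py" entry point is a placeholder, not necessarily this repo's actual script:

# Minimal sketch: inject the required env vars before spawning the trainer,
# mirroring what the CI steps above do with `export`.
import os
import subprocess

env = dict(os.environ)
env["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"  # avoid recordStream-based memory retention in NCCL ops
env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"      # single hardware queue: kernels issue in launch order

# "train.py" is a hypothetical entry point used for illustration only.
subprocess.run(["python", "train.py"], env=env, check=True)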

internlm/core/trainer_builder.py (+28 -1)

@@ -1,5 +1,6 @@
 import gc
 import logging
+import os
 import time
 from functools import partial
 from typing import Dict, List, Optional, Union
@@ -8,6 +9,7 @@
 import torch.distributed as dist
 from torch.utils.data import DataLoader

+from internlm.accelerator import AcceleratorType, get_accelerator
 from internlm.checkpoint.checkpoint_manager import CheckpointManager
 from internlm.core.context import global_context as gpc
 from internlm.core.context.process_group_initializer import ParallelMode
@@ -31,7 +33,6 @@
 )
 from internlm.utils.common import (
     BatchSkipper,
-    check_cuda_env,
     enable_pytorch_expandable_segments,
     get_current_device,
     get_megatron_flops,
@@ -47,6 +48,32 @@

 # global llm logger
 logger = logging.getLogger(__file__)
+internlm_accelerator = get_accelerator()
+
+
+def check_cuda_env():
+    if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
+        wp_fwd_per = gpc.config.parallel.weight.get("forward_overlap_per", "layer")
+        ewp_fwd_per = gpc.config.parallel.expert_weight.get("forward_overlap_per", "layer")
+        wp_size = gpc.config.parallel.weight.get("size", 1)
+        ewp_size = gpc.config.parallel.expert_weight.get("size", 1)
+        open_max_conns = (wp_size == 1 or wp_fwd_per != "layer") and (ewp_size == 1 or ewp_fwd_per != "layer")
+        if open_max_conns:
+            max_connections = os.getenv("CUDA_DEVICE_MAX_CONNECTIONS")
+            assert (
+                max_connections is not None
+            ), "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
+            assert (
+                max_connections == "1"
+            ), "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, it should be set to 1!".format(max_connections)

+        avoid_record_streams = os.getenv("TORCH_NCCL_AVOID_RECORD_STREAMS")
+        assert (
+            avoid_record_streams is not None
+        ), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS has not been set, please set it to 1!"
+        assert (
+            avoid_record_streams == "1"
+        ), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS is set to {}, it should be set to 1!".format(avoid_record_streams)


 class TrainerBuilder(Trainer):
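The check moves out of internlm/utils/common.py (see the removal below) and becomes both stricter and conditional: CUDA_DEVICE_MAX_CONNECTIONS=1 is only enforced when neither weight parallelism nor expert weight parallelism overlaps forward communication per layer, while TORCH_NCCL_AVOID_RECORD_STREAMS=1 is required unconditionally on GPU backends. A standalone sketch of the gating predicate, using plain dicts in place of gpc.config.parallel; the example config values are hypothetical:

# Standalone sketch of the new gating logic; `weight` and `expert_weight`
# stand in for gpc.config.parallel.weight / .expert_weight.
import os

def needs_max_connections(weight: dict, expert_weight: dict) -> bool:
    # Enforce CUDA_DEVICE_MAX_CONNECTIONS=1 only when no per-layer forward
    # overlap is configured for either form of weight parallelism.
    wp_off = weight.get("size", 1) == 1 or weight.get("forward_overlap_per", "layer") != "layer"
    ewp_off = expert_weight.get("size", 1) == 1 or expert_weight.get("forward_overlap_per", "layer") != "layer"
    return wp_off and ewp_off

# Hypothetical configs: no weight parallelism, but expert weights overlap per layer.
weight = {"size": 1}
expert_weight = {"size": 4, "forward_overlap_per": "layer"}

if needs_max_connections(weight, expert_weight):
    assert os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") == "1", "set CUDA_DEVICE_MAX_CONNECTIONS=1"
else:
    print("per-layer overlap active: CUDA_DEVICE_MAX_CONNECTIONS=1 not required")

The rationale: with per-layer forward overlap disabled, compute/communication overlap relies on kernels being issued to the device in launch order, which a single hardware connection guarantees.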

internlm/data/tokenized/dummy_dataset.py (+2 -2)

@@ -4,7 +4,7 @@
 import numpy as np
 from torch.utils.data import Dataset

-# from internlm.core.context.parallel_context import global_context as gpc
+from internlm.core.context.parallel_context import global_context as gpc


 class RandomDataset(Dataset):
@@ -30,7 +30,7 @@ def __init__(self, num_samples=10000, max_len=1024, fixed_seqlen: bool = False)
         while len(d) < max_len:
             r *= 2
             d = list(range(n)) * r
-        # r = r % gpc.config.model.vocab_size
+        r = r % gpc.config.model.vocab_size
         d = [n, r] + d
         d = d[:max_len]
         data.append(d)
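Re-enabling the modulo matters because every value placed in a sample must be a valid token id, i.e. strictly less than the model's vocabulary size, or the embedding lookup downstream will index out of range. A toy illustration, with a deliberately small, hypothetical vocab_size to make the overflow visible:

# Toy repro of the RandomDataset sample construction; vocab_size here is
# made up for demonstration, not a real model config.
vocab_size = 100
n = 3
max_len = 2048

d = list(range(n))
r = 1
while len(d) < max_len:
    r *= 2                     # r grows geometrically and can exceed vocab_size
    d = list(range(n)) * r

# Here r == 1024 > vocab_size; without the restored modulo the sample
# would contain an out-of-range token id.
r = r % vocab_size
d = ([n, r] + d)[:max_len]

assert all(tok < vocab_size for tok in d)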

internlm/utils/common.py (-5)

@@ -249,11 +249,6 @@ def enable_pytorch_expandable_segments():
         logger.warning("To support the 'expandable_segments' configuration, please upgrade torch to version 2.1.0.")


-def check_cuda_env():
-    if os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") is None:
-        logger.warning("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!")
-
-
 class DummyProfile:
     """
     Dummy Profile.

tests/test_data/test_batch_sampler.py (+1)

@@ -45,6 +45,7 @@ def do_warmup(args):
     rank, worldsize, init_config, should_sccuess, answer = args
     build_environment(rank, worldsize, init_config)
     gpc.config.model.num_chunks = 1 if gpc.get_world_size(ParallelMode.PIPELINE) == 1 else 2
+    gpc.config.model.vocab_size = 92544
     engine, scheduler = init_model_and_optim(
         8,
         gpc.config.model.num_chunks,
