-
Notifications
You must be signed in to change notification settings - Fork 737
Description
Checklist
- 1. I have searched related issues but cannot get the expected help.
- 2. The bug has not been fixed in the latest version.
- 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
Describe the bug
程序停在首步卡死不动,半小时后超时失败
Reproduction
internvl3_5_30b_sft.sh脚本内容:
#!/usr/bin/env bash
# Single-node supervised fine-tuning launcher for InternVL3_5-30B-A3B-Instruct.
#
# Starts internvl/train/internvl_chat_finetune.py through torchrun with
# NPROC_PER_NODE worker processes and the DeepSpeed config
# "zero_stage3_config.json". All relative paths are resolved against the
# directory the script is started from, e.g.:
#   bash shell/internvl3_5_qwen3/internvl3_5_30b_sft.sh
#
# Outputs: ${OUTPUT_DIR} (training output) and ${JOBLOG} (combined
# stdout/stderr of the run, appended via tee).
#
# NOTE(review): this cleanup does not by itself explain a hang at the first
# training step. The model is presumably a mixture-of-experts model (the
# "A3B" suffix); DeepSpeed documents that ZeRO-3 can stall on such models
# unless the expert blocks are registered with
# deepspeed.utils.set_z3_leaf_modules -- confirm against the training code.

# Without pipefail the pipeline below would always report tee's exit status,
# hiding a failed torchrun run from the caller.
set -o pipefail

export TF_CPP_MIN_LOG_LEVEL=3
export USE_TCS_LOADER=0
export LAUNCHER=pytorch

CURRENT_PATH=$(pwd)
PROJECT_NAME=internvl3_5_30b_sft
TASK_NAME=$(basename "$0")
TASK_NAME="${TASK_NAME%.*}"   # drop the file extension
echo "TASK_NAME: $TASK_NAME"
echo "PROJECT_NAME: $PROJECT_NAME"

RANK=0
log_file="gpu_$(date +%Y%m%d_%H%M%S).log"

export OUTPUT_DIR="${CURRENT_PATH}/work_dirs/${PROJECT_NAME}"
export TENSORBOARD_DIR="${OUTPUT_DIR}/tensorboard"
export JOBLOG="${OUTPUT_DIR}/${log_file}"

# mkdir -p is a no-op when the directory already exists.
mkdir -p "${OUTPUT_DIR}" || exit 1

NPROC_PER_NODE=8
WORLD_SIZE=1
BATCH_SIZE=8
PER_DEVICE_BATCH_SIZE=1
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / WORLD_SIZE / NPROC_PER_NODE))

# Integer division yields 0 when BATCH_SIZE is smaller than the total number
# of per-step samples; fail early instead of passing 0 to the trainer.
if ((GRADIENT_ACC < 1)); then
  echo "BATCH_SIZE=${BATCH_SIZE} is too small for" \
    "${WORLD_SIZE} node(s) x ${NPROC_PER_NODE} process(es) x" \
    "${PER_DEVICE_BATCH_SIZE} sample(s) per device" >&2
  exit 1
fi

# Append the working directory; avoid a leading ':' when PYTHONPATH is empty.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${CURRENT_PATH}"
export TRITON_CACHE_DIR="/dev/shm/triton_wwy/"
export VLLM_CACHE_ROOT="/dev/shm/vllmca_wwy/"
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=34245

# Arguments consumed by torchrun itself.
launcher_args=(
  --node-rank="${RANK}"
  --nnodes="${WORLD_SIZE}"
  --nproc-per-node="${NPROC_PER_NODE}"
  --master-addr="${MASTER_ADDR}"
  --master-port="${MASTER_PORT}"
)

# Arguments forwarded to the training entry point.
train_args=(
  --model_name_or_path "/internvl3.5/InternVL3_5-30B-A3B-Instruct"
  --conv_style "internvl2_5"
  --use_fast_tokenizer False
  --output_dir "${OUTPUT_DIR}"
  --meta_path "${CURRENT_PATH}/shell/data/debug_sft.json"
  --overwrite_output_dir True
  --force_image_size 448
  --max_dynamic_patch 12
  --down_sample_ratio 0.5
  --drop_path_rate 0.0
  --min_num_frame 8
  --max_num_frame 32
  --freeze_llm False
  --freeze_mlp False
  --freeze_backbone False
  --vision_select_layer -1
  --dataloader_num_workers 0
  --bf16 True
  --max_steps 50
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}"
  --gradient_accumulation_steps "${GRADIENT_ACC}"
  --save_strategy "steps"
  --save_steps 5000
  --save_total_limit 100
  --learning_rate 1.0e-5
  --weight_decay 0
  --warmup_ratio 0.1
  --lr_scheduler_type "cosine"
  --logging_steps 1
  --max_seq_length 4096
  --split_annotations False
  --do_train True
  --grad_checkpoint False
  --gradient_checkpointing True
  --group_by_length False
  --dynamic_image_size True
  --use_thumbnail True
  --ps_version 'v2'
  --use_custom_flash_attn False
  --report_to "tensorboard"
  --deepspeed "zero_stage3_config.json"
  --use_packed_ds True
  --num_images_expected 96
  --max_packed_tokens 4096
  --max_buffer_size 20
  --log_freq 1000
  --strict_mode False
  --replacement True
  --allow_overflow False
  --remove_unused_columns False
  --loss_reduction "sample"
  --seed 42
)

# One single command: arrays keep every option attached to torchrun without
# relying on fragile backslash line continuations.
torchrun "${launcher_args[@]}" \
  internvl/train/internvl_chat_finetune.py "${train_args[@]}" \
  2>&1 | tee -a "${JOBLOG}"
启动脚本
bash shell/internvl3_5_qwen3/internvl3_5_30b_sft.sh
Environment
模型:InternVL3_5-30B-A3B-Instruct
pip list:
Package Version
-------------------------------- -----------
absl-py 2.3.1
accelerate 1.9.0
addict 2.4.0
aiohappyeyeballs 2.6.1
aiohttp 3.12.15
aiosignal 1.4.0
annotated-types 0.7.0
antlr4-python3-runtime 4.11.0
asgiref 3.9.1
asttokens 3.0.0
async-timeout 5.0.1
attrs 25.3.0
backports-datetime-fromisoformat 2.0.3
bidict 0.22.1
blake3 1.0.5
boto3 1.40.3
botocore 1.40.3
certifi 2025.8.3
charset-normalizer 3.4.2
cli_exit_tools 1.2.7
click 8.2.1
conda-pack 0.8.1
contourpy 1.3.2
cycler 0.12.1
cyclopts 3.22.5
datasets 4.0.0
decorator 5.2.1
decord 0.6.0
deepspeed 0.17.4
dill 0.3.8
Distance 0.1.3
Django 5.2.5
docstring_parser 0.17.0
docutils 0.22
einops 0.8.1
environs 14.3.0
et_xmlfile 2.0.0
exceptiongroup 1.3.0
executing 2.2.0
Faker 37.5.3
fastrlock 0.8.3
filelock 3.18.0
flash_attn 2.8.2
fonttools 4.59.0
frozenlist 1.7.0
fsspec 2025.3.0
funcy 1.18
gitdb 4.0.12
GitPython 3.1.45
grpcio 1.74.0
hf-xet 1.1.7
hjson 3.1.0
huggingface-hub 0.34.3
humanize 4.12.3
idna 3.10
image 1.5.33
ImageHash 4.3.2
imageio 2.37.0
ipdb 0.13.13
ipython 8.37.0
jedi 0.19.2
Jinja2 3.1.6
jmespath 1.0.1
joblib 1.5.1
kernels 0.9.0
kiwisolver 1.4.8
latex2sympy2_extended 1.10.2
lib-detect-testenv 2.0.8
Markdown 3.8.2
markdown-it-py 3.0.0
MarkupSafe 3.0.2
marshmallow 4.0.0
math-verify 0.8.0
matplotlib 3.10.5
matplotlib-inline 0.1.7
mdurl 0.1.2
mmengine 0.10.7
modelscope 1.28.2
mpmath 1.3.0
msgpack 1.1.1
multidict 6.6.3
multiprocess 0.70.16
networkx 3.4.2
ninja 1.11.1.4
nltk 3.9.1
numpy 2.2.6
nvidia-cublas-cu12 12.6.4.1
nvidia-cuda-cupti-cu12 12.6.80
nvidia-cuda-nvrtc-cu12 12.6.77
nvidia-cuda-runtime-cu12 12.6.77
nvidia-cudnn-cu12 9.5.1.17
nvidia-cufft-cu12 11.3.0.4
nvidia-cufile-cu12 1.11.1.6
nvidia-curand-cu12 10.3.7.77
nvidia-cusolver-cu12 11.7.1.2
nvidia-cusparse-cu12 12.5.4.2
nvidia-cusparselt-cu12 0.6.3
nvidia-ml-py 13.580.65
nvidia-nccl-cu12 2.26.2
nvidia-nvjitlink-cu12 12.6.85
nvidia-nvtx-cu12 12.6.77
opencv-python 4.12.0.88
openpyxl 3.1.5
packaging 25.0
pandas 2.3.1
parso 0.8.4
pexpect 4.9.0
pillow 11.3.0
pip 25.2
platformdirs 4.3.8
portalocker 3.2.0
prompt_toolkit 3.0.51
propcache 0.3.2
protobuf 6.31.1
psutil 7.0.0
ptyprocess 0.7.0
pure_eval 0.2.3
py-aiger 6.2.3
py-aiger-cnf 5.0.8
py-cpuinfo 9.0.0
py-spy 0.4.1
pyapproxmc 4.1.24
pyarrow 21.0.0
pycryptosat 5.11.23
pydantic 2.11.7
pydantic_core 2.33.2
Pygments 2.19.2
pyparsing 3.2.3
pypblib 0.0.4
pyrsistent 0.19.3
python-dateutil 2.9.0.post0
python-dotenv 1.1.1
python-sat 1.8.dev18
pytz 2025.2
PyWavelets 1.8.0
PyYAML 6.0.2
regex 2025.7.34
requests 2.32.4
rich 14.1.0
rich-rst 1.3.1
s3transfer 0.13.1
safetensors 0.5.3
scipy 1.15.3
sentencepiece 0.2.0
sentry-sdk 2.34.1
setuptools 80.9.0
six 1.17.0
smmap 5.0.2
sortedcontainers 2.4.0
sqlparse 0.5.3
stack-data 0.6.3
sty 1.0.6
sympy 1.14.0
tabulate 0.9.0
tensorboard 2.20.0
tensorboard-data-server 0.7.2
termcolor 3.1.0
tiktoken 0.10.0
timeout-decorator 0.5.0
timm 1.0.19
tokenizers 0.21.4
tomli 2.2.1
torch 2.7.0
torchaudio 2.7.0
torchvision 0.22.0
tqdm 4.67.1
traitlets 5.14.3
transformers 4.55.0
triton 3.4.0
trl 0.21.0
typing_extensions 4.14.1
typing-inspection 0.4.1
tzdata 2025.2
urllib3 2.5.0
uvloop 0.21.0
validators 0.35.0
wandb 0.21.0
wcwidth 0.2.13
websockets 15.0.1
Werkzeug 3.1.3
wheel 0.45.1
wrapt 1.17.2
wrapt_timeout_decorator 1.5.1
xlsxwriter 3.2.5
xxhash 3.5.0
yapf 0.43.0
yarl 1.20.1