My setup is as follows:

steps=20000
if [ "$stage" == "train" ] || [ "$stage" == "all" ]; then
  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus west/bin/train.py \
    --model_config_or_dir $model_conf \
    --data_path $data/train.jsonl \
    --output_dir $dir \
    --pack_size 8192 \
    --bf16 True \
    --max_steps $steps \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 100 \
    --learning_rate 3e-4 \
    --weight_decay 0.01 \
    --adam_beta2 0.95 \
    --warmup_steps 2000 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "tensorboard" \
    --gradient_checkpointing \
    --dataloader_num_workers 2 \
    --dataloader_prefetch_factor 10 \
    --ignore_data_skip True \
    --deepspeed conf/ds_config_zero2.json \
    --accelerator_config conf/accelerator_config.json
fi
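
For reference, a minimal ZeRO-2 config of the kind conf/ds_config_zero2.json usually contains (the "auto" fields are filled in by the HF Trainer from the arguments above) would be something like this sketch, written as a heredoc so it is self-contained; this is an assumption about a typical file, not necessarily the exact one used here:

# Sketch only: a typical minimal DeepSpeed ZeRO-2 config for the HF Trainer,
# not necessarily the actual conf/ds_config_zero2.json used in this run.
cat > conf/ds_config_zero2.json <<'EOF'
{
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 2,
    "overlap_comm": true,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "steps_per_print": 100
}
EOF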
Am I missing something here? Can anybody help?