Skip to content

Commit

Permalink
+scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Panlichen committed Apr 24, 2023
1 parent 41135de commit bfaa2ce
Showing 1 changed file with 202 additions and 0 deletions.
202 changes: 202 additions & 0 deletions tools/train_a100.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
#!/usr/bin/env bash

clear

FILE=$1
CONFIG=$2
GPUS=$3
NODE=${NODE:-1}
NODE_RANK=${NODE_RANK:-0}
ADDR=${ADDR:-127.0.0.1}
PORT=${PORT:-12345}

export GLOG_logtostderr=1
export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=0 # 禁用lightweight actor

export NCCL_PROTO=Simple
export NCCL_ALGO=Ring
# export NCCL_MAX_NCHANNELS=1
# export NCCL_MIN_NCHANNELS=1
# export NCCL_NTHREADS=64

if [ -z $RUN_TYPE ];then
RUN_TYPE="PURE"
# RUN_TYPE="GDB"
# RUN_TYPE="NSYS"
fi

export ONEFLOW_ENABLE_OFCCL=1
export DISABLE_NCCL_COMPUTE_STREAM=1
# export ONEFLOW_TIME_SHAPE=1
export ONEFLOW_DEBUG_MODE=1
export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1

export GLOG_vmodule=plan_util*=1,of_collective_actor*=1,of_collective_boxing_kernels*=1,collective_backend_ofccl*=1,hierarchical_sub_task_graph_builder_impl*=1,of_request_store*=1,request_store*=1,runtime*=1,scheduler*=1,collective_manager*=1,of_collective_boxing_sub_task_graph_builder*=1,collective_boxing_sub_task_graph_builder*=1
# nn_graph*=1,
# export GLOG_v=1

export SHOW_ALL_PREPARED_COLL=1

export DEV_TRY_ROUND=10
export CHECK_REMAINING_SQE_INTERVAL=10000
export DEBUG_FILE="/home/panlichen/work/oneflow/log/oneflow_cpu_rank_"

export NUM_ITER_ENV=200
echo NUM_ITER_ENV=$NUM_ITER_ENV

if [ $GPUS = 2 ]; then
export CUDA_VISIBLE_DEVICES=4,5

#pure dp
# export RECV_SUCCESS_FACTOR=5
# export RECV_SUCCESS_THRESHOLD=10000
# export BASE_CTX_SWITCH_THRESHOLD=100
# export TOLERANT_UNPROGRESSED_CNT=2000
# export NUM_TRY_TASKQ_HEAD=40

#pure tp
export RECV_SUCCESS_FACTOR=20
export RECV_SUCCESS_THRESHOLD=10000
export BASE_CTX_SWITCH_THRESHOLD=120
export TOLERANT_UNPROGRESSED_CNT=10000
export NUM_TRY_TASKQ_HEAD=100
elif [ $GPUS = 4 ]; then
export CUDA_VISIBLE_DEVICES=0,1,4,5
export ONEFLOW_OFCCL_SKIP_NEGO=0

#pure dp
# export ONEFLOW_OFCCL_SKIP_NEGO=0
# export RECV_SUCCESS_FACTOR=40
# export RECV_SUCCESS_THRESHOLD=10000
# export BASE_CTX_SWITCH_THRESHOLD=30000
# export TOLERANT_UNPROGRESSED_CNT=30000
# export NUM_TRY_TASKQ_HEAD=200

#pure tp
export ONEFLOW_OFCCL_SKIP_NEGO=0
export RECV_SUCCESS_FACTOR=40
export RECV_SUCCESS_THRESHOLD=1000000000
export BASE_CTX_SWITCH_THRESHOLD=100000
export TOLERANT_UNPROGRESSED_CNT=16000
export NUM_TRY_TASKQ_HEAD=200

elif [ $GPUS = 8 ]; then

#pure dp
# export ONEFLOW_OFCCL_SKIP_NEGO=0
# export RECV_SUCCESS_FACTOR=30
# export RECV_SUCCESS_THRESHOLD=100000000
# export BASE_CTX_SWITCH_THRESHOLD=120000
# export TOLERANT_UNPROGRESSED_CNT=180000
# export NUM_TRY_TASKQ_HEAD=240

#pure tp
# export ONEFLOW_OFCCL_SKIP_NEGO=0
# export RECV_SUCCESS_FACTOR=10
# export RECV_SUCCESS_THRESHOLD=1000000
# export BASE_CTX_SWITCH_THRESHOLD=6000
# export TOLERANT_UNPROGRESSED_CNT=8000
# export NUM_TRY_TASKQ_HEAD=10

#4tp2dp
# export ONEFLOW_OFCCL_SKIP_NEGO=0
# export RECV_SUCCESS_FACTOR=10
# export RECV_SUCCESS_THRESHOLD=10000000
# export BASE_CTX_SWITCH_THRESHOLD=20000
# export TOLERANT_UNPROGRESSED_CNT=9000
# export NUM_TRY_TASKQ_HEAD=10

#3d
export ONEFLOW_OFCCL_SKIP_NEGO=0
export RECV_SUCCESS_FACTOR=5
export RECV_SUCCESS_THRESHOLD=10000000
export BASE_CTX_SWITCH_THRESHOLD=20000
export TOLERANT_UNPROGRESSED_CNT=80000
export NUM_TRY_TASKQ_HEAD=10

#2dp4pp
# export ONEFLOW_OFCCL_SKIP_NEGO=0
# export RECV_SUCCESS_FACTOR=5
# export RECV_SUCCESS_THRESHOLD=10000
# export BASE_CTX_SWITCH_THRESHOLD=8000
# export TOLERANT_UNPROGRESSED_CNT=80000
# export NUM_TRY_TASKQ_HEAD=10

#2tp4pp
# export ONEFLOW_OFCCL_SKIP_NEGO=1
# export RECV_SUCCESS_FACTOR=10
# export RECV_SUCCESS_THRESHOLD=10000
# export BASE_CTX_SWITCH_THRESHOLD=12000
# export TOLERANT_UNPROGRESSED_CNT=8000
# export NUM_TRY_TASKQ_HEAD=10

#4tp2pp
# export ONEFLOW_OFCCL_SKIP_NEGO=1
# export RECV_SUCCESS_FACTOR=10
# export RECV_SUCCESS_THRESHOLD=10000
# export BASE_CTX_SWITCH_THRESHOLD=14000
# export TOLERANT_UNPROGRESSED_CNT=8000
# export NUM_TRY_TASKQ_HEAD=10

fi

echo GPUS=$GPUS
echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_ENABLE_OFCCL
echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_OFCCL_SKIP_NEGO
echo ONEFLOW_ENABLE_OFCCL=$ONEFLOW_DEBUG_MODE
echo ONEFLOW_OFCCL_SKIP_NEGO=$ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE
echo NCCL_PROTO=$NCCL_PROTO
echo NCCL_ALGO=$NCCL_ALGO
echo NCCL_MAX_NCHANNELS=$NCCL_MAX_NCHANNELS
echo NCCL_NTHREADS=$NCCL_NTHREADS
echo ONEFLOW_OFCCL_CHAIN=$ONEFLOW_OFCCL_CHAIN
echo GLOG_vmodule=$GLOG_vmodule
echo GLOG_v=$GLOG_v
echo GLOG_logtostderr=$GLOG_logtostderr

echo RECV_SUCCESS_FACTOR=$RECV_SUCCESS_FACTOR
echo TOLERANT_UNPROGRESSED_CNT=$TOLERANT_UNPROGRESSED_CNT
echo BASE_CTX_SWITCH_THRESHOLD=$BASE_CTX_SWITCH_THRESHOLD
echo NUM_TRY_TASKQ_HEAD=$NUM_TRY_TASKQ_HEAD
echo DEV_TRY_ROUND=$DEV_TRY_ROUND
echo CHECK_REMAINING_SQE_INTERVAL=$CHECK_REMAINING_SQE_INTERVAL
echo DEBUG_FILE=$DEBUG_FILE

export PYTHONUNBUFFERED=1
echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED
export NCCL_LAUNCH_MODE=PARALLEL
echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE
# export NCCL_DEBUG=INFO

rm -rf /home/panlichen/work/libai/log
mkdir -p /home/panlichen/work/libai/log

rm -rf /home/panlichen/work/oneflow/log
mkdir -p /home/panlichen/work/oneflow/log

export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true

if [ "$ONEFLOW_ENABLE_OFCCL" == "1" ]; then
NSYS_FILE="ofccl_vit"_${HOST}_${GPUS}_card
else
NSYS_FILE="nccl_vit"_${HOST}_${GPUS}_card
fi

if [ "$RUN_TYPE" == "PURE" ];then
cmd="python3 -m oneflow.distributed.launch"
elif [ "$RUN_TYPE" == "GDB" ];then
cmd="gdb -ex r --args python3 -m oneflow.distributed.launch"
elif [ "$RUN_TYPE" == "NSYS" ];then
if [ ! -d "/home/panlichen/work/oneflow/log/nsys" ];then
mkdir -p /home/panlichen/work/oneflow/log/nsys
fi
# cmd="nsys profile -f true --trace=cuda,cudnn,cublas,osrt,nvtx -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch"
cmd="nsys profile -f true -o /home/panlichen/work/oneflow/log/nsys/$NSYS_FILE python3 -m oneflow.distributed.launch"
fi
echo cmd=$cmd

$cmd \
--nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \
$FILE --config-file $CONFIG ${@:4} \
> /home/panlichen/work/oneflow/log/oneflow.log 2>&1

0 comments on commit bfaa2ce

Please sign in to comment.