-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_single_vg_exp.sh
130 lines (117 loc) · 4.2 KB
/
train_single_vg_exp.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash
# Launches a training run of train.py through torch.distributed.launch and
# writes the combined stdout/stderr of that run to a log file.

# Repository root: two directory levels above this script's resolved path
# (readlink -f follows symlinks, so the script may be invoked via a link).
root=$(dirname "$(dirname "$(readlink -f "$0")")")

# Load the shared helpers. The path is quoted so a checkout location that
# contains whitespace is not word-split, and the script stops here if the
# file cannot be sourced: later steps call assert_file_exists and
# display_exp_settings, which are not defined in this file (presumably they
# come from this helper file).
# shellcheck source=/dev/null
source "${root}/scripts/helper_functions.sh" || {
  printf 'error: cannot source %s\n' "${root}/scripts/helper_functions.sh" >&2
  exit 1
}
# Basic Settings
# Exported so that child processes started by this script inherit them.
export CUDA_VISIBLE_DEVICES="0,1" \
  LOCAL_RANK="0" \
  MASTER_PORT="6033" \
  GPUS_PER_NODE="2"

# Experiment Settings
# Experiment tag.
exp_tag="example"
# Model architecture (defined in the pretrained_weights folder,
# e.g. huge, large, base, medium, tiny).
arch="tiny"
# Training prompt id.
trainP="Instruct-2"
# Validation prompt id.
valP="Base"

# Hyperparameter Settings
# Loss.
criterion="adjust_label_smoothed_cross_entropy"
label_smoothing="0.1"
# Optimisation schedule.
lr="3e-5"
max_epoch="1"
warmup_ratio="0.06"
batch_size="4"
update_freq="1"
# Regularisation.
resnet_drop_path_rate="0.0"
encoder_drop_path_rate="0.2"
decoder_drop_path_rate="0.2"
dropout="0.1"
attention_dropout="0.0"
# Sequence length limits, bin count and image patch size.
max_src_length="80"
max_tgt_length="20"
num_bins="1000"
patch_image_size="512"
# ================================================================================
# Please do not change the settings below
# ================================================================================
# Basic Settings
task="wsdm_vqa"
selected_cols="0,1,2,3,4,5,6,7"

# Path Settings
# Everything below is derived from ${root}, ${arch}, ${exp_tag}, ${trainP}
# and ${valP}, which are set earlier in this script.
pretrained_weights="${root}/pretrained_weights/ofa_${arch}.pt"
folder_struc="${arch}/${exp_tag}/train-P${trainP}"
bpe_dir="${root}/utils/BPE"
user_dir="${root}/gvig_module"

# Dataset Settings
data_dir="${root}/datasets/${exp_tag}"               # dataset path
train_data="${data_dir}/train-P${trainP}.csv"        # train data path
val_data="${data_dir}/test_public-P${valP}.csv"      # validation data path
# assert_file_exists is not defined in this file (presumably it comes from
# the sourced helper script and aborts when the file is missing - confirm).
# Arguments are quoted so a path containing whitespace or glob characters
# reaches the helper as a single word.
assert_file_exists "${train_data}"
assert_file_exists "${val_data}"
# Train and validation files are handed to train.py as one comma-joined value.
train_val_files="${train_data},${val_data}"

# Tensorboard Settings
tensorboard_dir="${root}/tensorboard/${folder_struc}/val-P${valP}" # tensorboard log path
mkdir -p "${tensorboard_dir}"

# Logging Settings
log_dir="${root}/logs/${folder_struc}"   # log directory path
log_path="${log_dir}/val-P${valP}.log"   # log file path
mkdir -p "${log_dir}"

# Output Checkpoint Settings
save_dir="${root}/checkpoints/${folder_struc}" # checkpoint directory path
save_path="${save_dir}/val-P${valP}"           # checkpoint file path
mkdir -p "${save_dir}"

# Display Experiment Settings
# display_exp_settings is defined outside this file; it presumably reads the
# variables assigned above, so their names must stay unchanged.
display_exp_settings
# Main Execution
# Runs ${root}/train.py under torch.distributed.launch with
# ${GPUS_PER_NODE} processes and redirects stdout and stderr to ${log_path}.
#
# The arguments are collected in an array and every expansion is quoted, so a
# value containing whitespace or glob characters is passed as exactly one
# argument (and the log redirect cannot become an "ambiguous redirect").
# Argument order is the same as before.
#
# NOTE(review): torch.distributed.launch is deprecated upstream in favour of
# torchrun. Switching changes how the local rank reaches train.py, so it is
# deliberately left as is here - confirm against train.py before migrating.
train_args=(
  # Input data and module locations.
  "${train_val_files}"
  "--selected-cols=${selected_cols}"
  "--bpe-dir=${bpe_dir}"
  "--user-dir=${user_dir}"

  # Start from the pretrained weights with optimizer, dataloader and meter
  # state reset.
  "--restore-file=${pretrained_weights}"
  --reset-optimizer --reset-dataloader --reset-meters
  # Note: the save directory passed here is ${save_path}, not ${save_dir}.
  "--save-dir=${save_path}"

  # Task, model and loss.
  "--task=${task}"
  "--arch=ofa_${arch}"
  "--criterion=${criterion}"
  "--label-smoothing=${label_smoothing}"
  "--batch-size=${batch_size}"
  "--update-freq=${update_freq}"
  --encoder-normalize-before
  --decoder-normalize-before
  --share-decoder-input-output-embed
  --share-all-embeddings
  --layernorm-embedding
  --patch-layernorm-embedding
  --code-layernorm-embedding

  # Drop-path and dropout rates.
  "--resnet-drop-path-rate=${resnet_drop_path_rate}"
  "--encoder-drop-path-rate=${encoder_drop_path_rate}"
  "--decoder-drop-path-rate=${decoder_drop_path_rate}"
  "--dropout=${dropout}"
  "--attention-dropout=${attention_dropout}"

  # Optimizer and learning-rate schedule.
  --weight-decay=0.01 --optimizer=adam '--adam-betas=(0.9,0.999)' --adam-eps=1e-08 --clip-norm=1.0
  --lr-scheduler=polynomial_decay "--lr=${lr}"
  "--max-epoch=${max_epoch}" "--warmup-ratio=${warmup_ratio}"

  # Logging, checkpointing and validation cadence.
  --log-format=simple --log-interval=10
  --fixed-validation-seed=7
  --no-epoch-checkpoints --keep-best-checkpoints=1
  --save-interval=1 --validate-interval=1
  --save-interval-updates=500 --validate-interval-updates=500

  # Evaluation settings and best-checkpoint selection.
  --eval-acc
  '--eval-args={"beam":5,"min_len":4,"max_len_a":0,"max_len_b":4}'
  --best-checkpoint-metric=score --maximize-best-checkpoint-metric

  # Sequence limits and remaining model switches.
  "--max-src-length=${max_src_length}"
  "--max-tgt-length=${max_tgt_length}"
  --find-unused-parameters
  --add-type-embedding
  --scale-attn
  --scale-fc
  --scale-heads
  --disable-entangle
  "--num-bins=${num_bins}"
  "--patch-image-size=${patch_image_size}"

  # Precision, tensorboard output and data-loading workers.
  --fp16
  --fp16-scale-window=512
  "--tensorboard-logdir=${tensorboard_dir}"
  --num-workers=0
)

CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}" \
  python3 -m torch.distributed.launch \
  "--nproc_per_node=${GPUS_PER_NODE}" \
  "--master_port=${MASTER_PORT}" \
  "${root}/train.py" \
  "${train_args[@]}" \
  >"${log_path}" 2>&1