File tree 5 files changed +25
-34
lines changed
5 files changed +25
-34
lines changed Original file line number Diff line number Diff line change @@ -9,17 +9,11 @@ do_once() {
9
9
}
10
10
11
11
test_body () {
12
- start=` date +%s`
13
- (sleep 10 && pkill -HUP ls && true) &
14
- (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
15
- wait
16
- end=` date +%s`
17
- runtime=$(( end- start))
18
- echo " Data access time: $runtime seconds"
19
- if [ $runtime -gt 3 ]; then
20
- echo " Data access time is greater than 3 seconds, skipping the test"
12
+ if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
13
+ echo " Not available locally, skipping the test"
21
14
return 0
22
15
fi
16
+
23
17
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
24
18
--cpu_size 2 --gpu_size 2 --fp16 --nhwc
25
19
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
Original file line number Diff line number Diff line change @@ -13,17 +13,11 @@ test_body() {
13
13
python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
14
14
--workers 3 --prefetch 2 --epochs 3
15
15
done
16
- start=` date +%s`
17
- (sleep 10 && pkill -HUP ls && true) &
18
- (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
19
- wait
20
- end=` date +%s`
21
- runtime=$(( end- start))
22
- echo " Data access time: $runtime seconds"
23
- if [ $runtime -gt 3 ]; then
24
- echo " Data access time is greater than 3 seconds, skipping the test"
16
+ if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
17
+ echo " Not available locally, skipping the test"
25
18
return 0
26
19
fi
20
+
27
21
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
28
22
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
29
23
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader
Original file line number Diff line number Diff line change @@ -8,18 +8,11 @@ do_once() {
8
8
}
9
9
10
10
test_body () {
11
- start=` date +%s`
12
- (sleep 10 && pkill -HUP ls && true) &
13
- (ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
14
- wait
15
- end=` date +%s`
16
- runtime=$(( end- start))
17
- echo " Data access time: $runtime seconds"
18
- if [ $runtime -gt 3 ]; then
19
- echo " Data access time is greater than 3 seconds, skipping the test"
11
+ if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
12
+ echo " Not available locally, skipping the test"
20
13
return 0
21
14
fi
22
- # test code
15
+
23
16
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "legacy"
24
17
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "experimental"
25
18
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 16 --workers 3 --prefetch 11 --decoder_type "legacy"
Original file line number Diff line number Diff line change @@ -13,10 +13,10 @@ cd /opt/dali/docs/examples/use_cases/pytorch/resnet50
13
13
NUM_GPUS=$( nvidia-smi -L | wc -l)
14
14
15
15
if [ ! -d "val" ]; then
16
- ln -sf /data_raid/imagenet/val-jpeg/ val
16
+ ln -sf /data/imagenet/val-jpeg/ val
17
17
fi
18
18
if [ ! -d "train" ]; then
19
- ln -sf /data_raid/imagenet/train-jpeg/ train
19
+ ln -sf /data/imagenet/train-jpeg/ train
20
20
fi
21
21
22
22
LOG=dali.log
@@ -26,7 +26,7 @@ SECONDS=0
26
26
# turn off SHARP to avoid NCCL errors
27
27
export NCCL_NVLS_ENABLE=0
28
28
29
- torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 5 ./ 2>&1 | tee $LOG
29
+ torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 2 ./ 2>&1 | tee $LOG
30
30
31
31
RET=${PIPESTATUS[0]}
32
32
echo " Training ran in $SECONDS seconds"
@@ -57,7 +57,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
57
57
printf " TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP5 $MIN_TOP5 $TOP5_RESULT
58
58
printf " Average perf: %.2f (expect at least %f) samples/sec %s\n" $PERF $MIN_PERF $PERF_RESULT
59
59
60
- if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" && "$PERF_RESULT" == "OK" ]]; then
60
+ # check perf only if data is locally available
61
+ if [ $(stat /data/imagenet/val-jpeg --format="%T" -f) == "ext2/ext3" ] && [ "$PERF_RESULT" != "OK" ]; then
62
+ CLEAN_AND_EXIT 4
63
+ fi
64
+
65
+ if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" ]]; then
61
66
CLEAN_AND_EXIT 0
62
67
fi
63
68
Original file line number Diff line number Diff line change @@ -6,7 +6,7 @@ mkdir -p idx-files/
6
6
7
7
NUM_GPUS=$( nvidia-smi -L | wc -l)
8
8
9
- DATA_SET_DIR=/data_raid/imagenet/train-val-tfrecord
9
+ DATA_SET_DIR=/data/imagenet/train-val-tfrecord
10
10
for file in $(ls $DATA_SET_DIR/*-of-*);
11
11
do
12
12
file=$( basename ${file} )
@@ -69,7 +69,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
69
69
printf " TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP5 $MIN_TOP5 $TOP5_RESULT
70
70
printf " mean speed %.2f (expect at least %f) samples/sec %s\n" $PERF $MIN_PERF $PERF_RESULT
71
71
72
- if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" && "$PERF_RESULT" == "OK" ]]; then
72
+ # check perf only if data is locally available
73
+ if [ $(stat /data/imagenet/train-val-tfrecord --format="%T" -f) == "ext2/ext3" ] && [ "$PERF_RESULT" != "OK" ]; then
74
+ CLEAN_AND_EXIT 4
75
+ fi
76
+
77
+ if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" ]]; then
73
78
CLEAN_AND_EXIT 0
74
79
fi
75
80
You can’t perform that action at this time.
0 commit comments