Skip to content

Commit 211abfa

Browse files
JanuszL authored and stiepan committed
Fix data paths in TL3 short tests (#5845)
- unifies data paths in TL3 short tests with other tests
- improves the way the remote file system is detected in the tests

Signed-off-by: Janusz Lisiecki <[email protected]>
1 parent 6f1d20c commit 211abfa

File tree

5 files changed

+25
-34
lines changed

5 files changed

+25
-34
lines changed

qa/TL1_separate_executor/test_nofw.sh

+3-9
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,11 @@ do_once() {
99
}
1010

1111
test_body() {
12-
start=`date +%s`
13-
(sleep 10 && pkill -HUP ls && true) &
14-
(ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
15-
wait
16-
end=`date +%s`
17-
runtime=$((end-start))
18-
echo "Data access time: $runtime seconds"
19-
if [ $runtime -gt 3 ]; then
20-
echo "Data access time is greater than 3 seconds, skipping the test"
12+
if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
13+
echo "Not available locally, skipping the test"
2114
return 0
2215
fi
16+
2317
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \
2418
--cpu_size 2 --gpu_size 2 --fp16 --nhwc
2519
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --separate_queue \

qa/TL2_FW_iterators_perf/test_pytorch.sh

+3-9
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,11 @@ test_body() {
1313
python test_RN50_data_fw_iterators.py --framework ${fw} --gpus ${NUM_GPUS} -b 13 \
1414
--workers 3 --prefetch 2 --epochs 3
1515
done
16-
start=`date +%s`
17-
(sleep 10 && pkill -HUP ls && true) &
18-
(ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
19-
wait
20-
end=`date +%s`
21-
runtime=$((end-start))
22-
echo "Data access time: $runtime seconds"
23-
if [ $runtime -gt 3 ]; then
24-
echo "Data access time is greater than 3 seconds, skipping the test"
16+
if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
17+
echo "Not available locally, skipping the test"
2518
return 0
2619
fi
20+
2721
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init fork --benchmark_iters 500 --test_pipes parallel
2822
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --worker_init spawn --benchmark_iters 500 --test_pipes parallel
2923
torchrun --nproc_per_node=${NUM_GPUS} ./test_RN50_external_source_parallel_train_ddp.py /data/imagenet/train-jpeg/ --workers 6 --py_workers 6 --epochs 3 --batch_size 256 --reader_queue_depth 2 --benchmark_iters 500 --test_pipes file_reader

qa/TL2_RN50_data_perf/test.sh

+3-10
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,11 @@ do_once() {
88
}
99

1010
test_body() {
11-
start=`date +%s`
12-
(sleep 10 && pkill -HUP ls && true) &
13-
(ls /data/imagenet/train-jpeg > /dev/null && pkill -HUP sleep) &
14-
wait
15-
end=`date +%s`
16-
runtime=$((end-start))
17-
echo "Data access time: $runtime seconds"
18-
if [ $runtime -gt 3 ]; then
19-
echo "Data access time is greater than 3 seconds, skipping the test"
11+
if [ $(stat /data/imagenet/train-jpeg --format="%T" -f) != "ext2/ext3" ]; then
12+
echo "Not available locally, skipping the test"
2013
return 0
2114
fi
22-
# test code
15+
2316
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "legacy"
2417
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 256 --workers 3 --prefetch 2 --decoder_type "experimental"
2518
python test_RN50_data_pipeline.py --gpus ${NUM_GPUS} -b 16 --workers 3 --prefetch 11 --decoder_type "legacy"

qa/TL3_RN50_short/test_pytorch.sh

+9-4
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ cd /opt/dali/docs/examples/use_cases/pytorch/resnet50
1313
NUM_GPUS=$(nvidia-smi -L | wc -l)
1414

1515
if [ ! -d "val" ]; then
16-
ln -sf /data_raid/imagenet/val-jpeg/ val
16+
ln -sf /data/imagenet/val-jpeg/ val
1717
fi
1818
if [ ! -d "train" ]; then
19-
ln -sf /data_raid/imagenet/train-jpeg/ train
19+
ln -sf /data/imagenet/train-jpeg/ train
2020
fi
2121

2222
LOG=dali.log
@@ -26,7 +26,7 @@ SECONDS=0
2626
# turn off SHARP to avoid NCCL errors
2727
export NCCL_NVLS_ENABLE=0
2828

29-
torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 5 ./ 2>&1 | tee $LOG
29+
torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 2 ./ 2>&1 | tee $LOG
3030

3131
RET=${PIPESTATUS[0]}
3232
echo "Training ran in $SECONDS seconds"
@@ -57,7 +57,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
5757
printf "TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP5 $MIN_TOP5 $TOP5_RESULT
5858
printf "Average perf: %.2f (expect at least %f) samples/sec %s\n" $PERF $MIN_PERF $PERF_RESULT
5959

60-
if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" && "$PERF_RESULT" == "OK" ]]; then
60+
# check perf only if data is locally available
61+
if [ $(stat /data/imagenet/val-jpeg --format="%T" -f) == "ext2/ext3" ] && [ "$PERF_RESULT" != "OK" ]; then
62+
CLEAN_AND_EXIT 4  # perf below threshold on locally available data: fail via the same exit helper used for the success path
63+
fi
64+
65+
if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" ]]; then
6166
CLEAN_AND_EXIT 0
6267
fi
6368

qa/TL3_RN50_short/test_tensorflow.sh

+7-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mkdir -p idx-files/
66

77
NUM_GPUS=$(nvidia-smi -L | wc -l)
88

9-
DATA_SET_DIR=/data_raid/imagenet/train-val-tfrecord
9+
DATA_SET_DIR=/data/imagenet/train-val-tfrecord
1010
for file in $(ls $DATA_SET_DIR/*-of-*);
1111
do
1212
file=$(basename ${file})
@@ -69,7 +69,12 @@ printf "TOP-1 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP1 $MIN_TOP1 $TOP
6969
printf "TOP-5 Accuracy: %.2f%% (expect at least %f%%) %s\n" $TOP5 $MIN_TOP5 $TOP5_RESULT
7070
printf "mean speed %.2f (expect at least %f) samples/sec %s\n" $PERF $MIN_PERF $PERF_RESULT
7171

72-
if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" && "$PERF_RESULT" == "OK" ]]; then
72+
# check perf only if data is locally available
73+
if [ $(stat /data/imagenet/train-val-tfrecord --format="%T" -f) == "ext2/ext3" ] && [ "$PERF_RESULT" != "OK" ]; then
74+
CLEAN_AND_EXIT 4  # perf below threshold on locally available data: fail via the same exit helper used for the success path
75+
fi
76+
77+
if [[ "$TOP1_RESULT" == "OK" && "$TOP5_RESULT" == "OK" ]]; then
7378
CLEAN_AND_EXIT 0
7479
fi
7580

0 commit comments

Comments
 (0)