This repository has been archived by the owner on Jan 3, 2023. It is now read-only.

Mingshan/Adding resnet50 validation script #478

Open · wants to merge 17 commits into base: master
test/validate_resnet50/datasets_make_deterministic.patch (new file, 89 additions)
diff --git a/scripts/tf_cnn_benchmarks/benchmark_cnn.py b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
index 09b118e..4cf9a12 100644
--- a/scripts/tf_cnn_benchmarks/benchmark_cnn.py
+++ b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
@@ -34,6 +34,7 @@ import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
+import ngraph_bridge

from google.protobuf import text_format

@@ -2479,6 +2480,7 @@ class BenchmarkCNN(object):
fetches = self._build_fetches(global_step, all_logits, losses, device_grads,
enqueue_ops, update_ops, all_accuracy_ops,
phase_train)
+
if global_input_producer_op:
global_input_producer_op = tf.group(*global_input_producer_op)
else:
diff --git a/scripts/tf_cnn_benchmarks/data_utils.py b/scripts/tf_cnn_benchmarks/data_utils.py
index 0376d0b..992ee75 100644
--- a/scripts/tf_cnn_benchmarks/data_utils.py
+++ b/scripts/tf_cnn_benchmarks/data_utils.py
@@ -112,7 +112,10 @@ def create_dataset(batch_size,
if not file_names:
raise ValueError('Found no files in --data_dir matching: {}'
.format(glob_pattern))
- ds = tf.data.TFRecordDataset.list_files(file_names)
+
+ # ds = tf.data.TFRecordDataset.list_files(file_names)
+ ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=False, seed=10)
+
ds = ds.apply(
interleave_ops.parallel_interleave(
tf.data.TFRecordDataset, cycle_length=10))
@@ -122,8 +125,9 @@ def create_dataset(batch_size,
counter = counter.repeat()
ds = tf.data.Dataset.zip((ds, counter))
ds = ds.prefetch(buffer_size=batch_size)
- if train:
- ds = ds.shuffle(buffer_size=10000)
+ # Make dataset loader deterministic
+ # if train:
+ # ds = ds.shuffle(buffer_size=10000)
ds = ds.repeat()
ds = ds.apply(
batching.map_and_batch(
diff --git a/scripts/tf_cnn_benchmarks/preprocessing.py b/scripts/tf_cnn_benchmarks/preprocessing.py
index 6a270b0..4e84a1a 100644
--- a/scripts/tf_cnn_benchmarks/preprocessing.py
+++ b/scripts/tf_cnn_benchmarks/preprocessing.py
@@ -335,9 +335,11 @@ def train_image(image_buffer,
else:
image = tf.image.decode_jpeg(image_buffer, channels=3,
dct_method='INTEGER_FAST')
- image = tf.slice(image, bbox_begin, bbox_size)

- distorted_image = tf.image.random_flip_left_right(image)
+ #image = tf.slice(image, bbox_begin, bbox_size)
+
+ #distorted_image = tf.image.random_flip_left_right(image)
+ distorted_image = image

# This resizing operation may distort the images because the aspect
# ratio is not respected.
@@ -361,7 +363,7 @@ def train_image(image_buffer,
distorted_image = distort_color(distorted_image, batch_position,
distort_color_in_yiq=distort_color_in_yiq)

- # Note: This ensures the scaling matches the output of eval_image
+ #Note: This ensures the scaling matches the output of eval_image
distorted_image *= 255

if summary_verbosity >= 3:
@@ -487,10 +489,11 @@ class RecordInputImagePreprocessor(BaseImagePreprocessor):
"""Preprocessing image_buffer as a function of its batch position."""
if self.train:
image = train_image(image_buffer, self.height, self.width, bbox,
- batch_position, self.resize_method, self.distortions,
+ batch_position, self.resize_method, False,
None, summary_verbosity=self.summary_verbosity,
distort_color_in_yiq=self.distort_color_in_yiq,
- fuse_decode_and_crop=self.fuse_decode_and_crop)
+ #fuse_decode_and_crop=self.fuse_decode_and_crop
+ fuse_decode_and_crop=False)
else:
image = tf.image.decode_jpeg(
image_buffer, channels=3, dct_method='INTEGER_FAST')
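
The patch above makes the input pipeline repeatable by listing files in a fixed order, disabling the dataset shuffle, and removing the random crop/flip distortions. A minimal standalone sketch of the same idea (make_deterministic_dataset and parse_fn are illustrative names, not from tf_cnn_benchmarks):

import tensorflow as tf

def make_deterministic_dataset(glob_pattern, batch_size, parse_fn):
    # shuffle=False plus a fixed seed keeps the file order identical run to run
    ds = tf.data.Dataset.list_files(glob_pattern, shuffle=False, seed=10)
    # A sequential interleave reads records in a fixed order; the benchmark's
    # parallel_interleave is what the pinned list order above has to tame
    ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=1)
    ds = ds.map(parse_fn)
    # No ds.shuffle(...) here -- that is exactly the call the patch comments out
    ds = ds.repeat()
    ds = ds.batch(batch_size)
    return ds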
test/validate_resnet50/one_encapsulate.patch (new file, 38 additions)
diff --git a/scripts/tf_cnn_benchmarks/benchmark_cnn.py b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
index 09b118e..d5a4e29 100644
--- a/scripts/tf_cnn_benchmarks/benchmark_cnn.py
+++ b/scripts/tf_cnn_benchmarks/benchmark_cnn.py
@@ -34,6 +34,7 @@ import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
+import ngraph_bridge

from google.protobuf import text_format

@@ -726,13 +727,23 @@ def benchmark_one_step(sess,
summary_str = None
start_time = time.time()
if summary_op is None:
- results = sess.run(fetches, options=run_options, run_metadata=run_metadata)
+ # get a new set of fetch operation
+ new_fetches = {}
+ for f in fetches:
+ if f == "average_loss":
+ continue
+ new_fetches[f] = fetches[f]
+
+ results = sess.run(new_fetches, options=run_options, run_metadata=run_metadata)
+ #results = sess.run(fetches, options=run_options, run_metadata=run_metadata)
else:
(results, summary_str) = sess.run(
[fetches, summary_op], options=run_options, run_metadata=run_metadata)

if not params.forward_only:
- lossval = results['average_loss']
+ # the calculation is removed in the operations to be fetched
+ #lossval = results['average_loss']
+ lossval = 0
else:
lossval = 0.
if image_producer is not None:
test/validate_resnet50/validation.py (new file, 263 additions)
from subprocess import check_output, call, Popen, PIPE
import numpy as np
import os

# This script runs ResNet50 training validation with synthetic data and real
# data and compares the results against a reference run. If the reference
# files are not provided, it runs on TF (without nGraph) and uses that output
# as the reference.
# This validation.py script is assumed to live inside a tensorflow/benchmarks
# repo checkout with git HEAD at commit ab01ecc.
# TODO:
# 1. num_batches is hard-coded to 100
# 2. Makes certain assumptions about the reference file's name and the batch size
# 3. Add arguments to take in the backend, the reference log files, the number
#    of iterations/batches, and the data type (real or synthetic)
# 4. Automate cloning the benchmarks repo and running the script

# run_validation appends --batch_size=<N> to each of these commands
validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py ' \
    + '--num_inter_threads=2 --data_format=NCHW --model=resnet50 ' \
    + '--num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet ' \
    + '--datasets_use_prefetch=False --print_training_accuracy=True ' \
    + '--num_learning_rate_warmup_epochs=0 --num_batches=100'
validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py ' \
    + '--num_inter_threads=2 --data_format=NHWC --model=resnet50 ' \
    + '--num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet ' \
    + '--datasets_use_prefetch=False --print_training_accuracy=True ' \
    + '--num_learning_rate_warmup_epochs=0 --num_batches=100'
validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py ' \
    + '--num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW ' \
    + '--model=resnet50 --num_gpus=1 --data_name=imagenet ' \
    + '--datasets_use_prefetch=False --print_training_accuracy=True ' \
    + '--num_learning_rate_warmup_epochs=0 --num_batches=100'
validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py ' \
    + '--num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 ' \
    + '--num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False ' \
    + '--print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100'

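For context on the flag choices (stated here as background, not from the PR text): NGRAPH_TF_BACKEND=GPU selects the nGraph GPU backend, NGRAPH_TF_DISABLE=1 turns the bridge off so those runs serve as the plain-TensorFlow baseline, and the NCHW vs. NHWC difference matches each path's preferred data layout.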

def command_executor(cmd, verbose=False, msg=None, stdout=None):
if verbose or msg is not None:
tag = 'Running COMMAND: ' if msg is None else msg
print(tag + cmd)

    p = Popen(
        cmd,
        shell=True,
        stdin=PIPE,
        stdout=PIPE,
        stderr=PIPE,
        close_fds=True)
    # communicate() drains stdout and stderr concurrently; reading the two
    # pipes one after the other can deadlock once a pipe buffer fills up.
    output, error_output = p.communicate()

    return output, error_output
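
For instance (an illustrative command, not one the script runs):

# Both streams come back as raw bytes; decode before parsing
out, err = command_executor('echo hello', verbose=True)
print(str(out, 'utf-8'))  # -> hello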


def write_to_file(filename, content):
with open(filename, "w") as text_file:
text_file.write(content)


def parse_training_output(output):
    to_parse = False
    total_loss = []
    top1_acc = []
    top5_acc = []

    for line in output.strip().split("\n"):
        fields = line.split()
        if not fields:
            continue
        if fields[0] == 'Step':
            to_parse = True
            continue
        elif line.startswith('-----'):
            to_parse = False
            continue

        if to_parse:
            total_loss.append(fields[-3])
            top1_acc.append(fields[-2])
            top5_acc.append(fields[-1])

    return total_loss, top1_acc, top5_acc


[Review comment, Contributor]: parse_reference_file and parse_training_output can be a single function... I think they are separate because one parses a file, and the other parses a string. Maybe we keep the string-parsing function and just read the file into a string and reuse it.

def parse_reference_file(filename):
to_parse = False
total_loss = []
top1_acc = []
top5_acc = []

    with open(filename) as reference_result:
        for line in reference_result:
            fields = line.split()
            if not fields:
                continue
            if fields[0] == 'Step':
                to_parse = True
                continue
            elif line.startswith('-----'):
                to_parse = False
                continue

            if to_parse:
                total_loss.append(fields[-3])
                top1_acc.append(fields[-2])
                top5_acc.append(fields[-1])

    return total_loss, top1_acc, top5_acc

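Per the review comment above, a minimal sketch of the suggested refactor (not what the PR currently does): read the file into a string and reuse the string parser.

# Hypothetical refactor: the file parser becomes a thin wrapper
def parse_reference_file(filename):
    with open(filename) as reference_result:
        return parse_training_output(reference_result.read())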

[Review thread on the 0.1 threshold]
Contributor: So if we get ref accuracy = 75, and ng accuracy = 75.3, then is it a failure?
Contributor: Yes.
Contributor: This script is not comparing the accuracy. It compares the training loss value at every iteration.
Contributor: Let me check.
Contributor: Same thing for loss. If ref loss is 1, and we get 0.8, is the test passing?

def check_validation_results(norm_dict, metric):
    test_pass = True
    for norm in norm_dict:
        if norm_dict[norm] > 0.1:
            print(metric + " " + norm +
                  " is greater than the threshold 0.1, validation failed")
            test_pass = False
    return test_pass


# Return the L1, L2, and infinity norms of the difference of the input arrays
def calculate_norm_values(result1, result2):
    diff = np.array(result1, dtype=np.float) - np.array(result2, dtype=np.float)

    l1_norm = np.linalg.norm(diff, 1)
    l2_norm = np.linalg.norm(diff, 2)
    inf_norm = np.linalg.norm(diff, np.inf)
    return {"l1_norm": l1_norm, "l2_norm": l2_norm, "inf_norm": inf_norm}
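
A worked example of the check, using the numbers from the review thread: a reference top-1 accuracy of 75 against an nGraph accuracy of 75.3 at a single step differs by 0.3, so all three norms are 0.3 and the 0.1 threshold fails it (a loss of 0.8 against a reference of 1 fails the same way).

# Worked example from the review thread: 75.3 vs 75 differs by 0.3 > 0.1
norms = calculate_norm_values([75.3], [75.0])
print(norms)  # l1, l2 and inf norms are all ~0.3
assert check_validation_results(norms, "top1 Accuracy") is False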


def run_validation(data_format, reference_file_name, batch_size):
# Apply the patch to make input data loader deterministic for real data validation
# Assume the current directory already has the required patch
if os.path.isfile('./datasets_make_deterministic.patch'):
output, error_output = command_executor(
'git apply --check --whitespace=nowarn ' +
'./datasets_make_deterministic.patch')
if error_output:
            print(
                "Warning: datasets_make_deterministic.patch is already applied")
else:
command_executor('git apply --whitespace=nowarn ' +
'./datasets_make_deterministic.patch')

    # Run the validation command on nGraph; append the requested batch size
    if data_format == "real_data":
        command_to_run = validate_with_real_data_command_NG
    elif data_format == "synthetic_data":
        command_to_run = validate_with_synthetic_data_command_NG
    command_to_run += ' --batch_size=' + str(batch_size)

print("Running: ", command_to_run)
output, error_output = command_executor(command_to_run)
output_string = str(output, 'utf-8')

if output:
ngraph_outputs_total_loss, ngraph_outputs_top1_acc, ngraph_outputs_top5_acc = parse_training_output(
output_string)

    elif error_output:
        print("Something went wrong executing the command ", command_to_run)
        print(str(error_output, 'utf-8'))
        exit(1)

print("ngraph total loss ", ngraph_outputs_total_loss)
print("ngraph top1 Accuracy ", ngraph_outputs_top1_acc)
print("ngraph top5 Accuracy ", ngraph_outputs_top5_acc)

write_to_file(
"resnet50_validationResult_NG_" + data_format + "_BS" + str(batch_size)
+ ".txt", output_string)

    # Get the TF output either from a reference file or from an actual run.
    # First check whether a TF result file already exists.
    cwd = os.getcwd()
    reference_file_path = os.path.join(
        cwd, reference_file_name + '_BS' + str(batch_size) + ".txt")
print("Finding reference file ", reference_file_path)
if os.path.isfile(reference_file_path):
# parse the text file directly
reference_outputs_total_loss, reference_outputs_top1_acc, reference_outputs_top5_acc = parse_reference_file(
reference_file_path)
    else:
        # Run the validation command on TF.
        # This requires TF to be built with GPU support.
        print("No reference output file found, begin running reference command")
        if data_format == "real_data":
            reference_command = validate_with_real_data_command_TF
        elif data_format == "synthetic_data":
            reference_command = validate_with_synthetic_data_command_TF
        reference_command += ' --batch_size=' + str(batch_size)
        print("Running: ", reference_command)
        output, error_output = command_executor(reference_command)
        output_string = str(output, 'utf-8')

        if output:
            reference_outputs_total_loss, reference_outputs_top1_acc, reference_outputs_top5_acc = parse_training_output(
                output_string)
        elif error_output:
            print("Something went wrong executing the command ",
                  reference_command)
            print(str(error_output, 'utf-8'))
            exit(1)

        write_to_file(
            "resnet50_validationResultReference_" + data_format + "_BS" +
            str(batch_size) + ".txt", output_string)

print("reference total loss ", reference_outputs_total_loss)
print("reference top1Acc ", reference_outputs_top1_acc)
print("reference top5Acc ", reference_outputs_top5_acc)

    # Compare the TF output and the nGraph output.
    # TF CPU results and GPU results differ, so the reference must come from
    # a TF GPU run.
assert len(ngraph_outputs_total_loss) == len(
reference_outputs_total_loss), "Number of total_loss values mismatch"
assert len(ngraph_outputs_top1_acc) == len(
reference_outputs_top1_acc), "Number of top1_accuracy values mismatch"
assert len(ngraph_outputs_top5_acc) == len(
reference_outputs_top5_acc), "Number of top5_accuracy values mismatch"

loss_norms = calculate_norm_values(ngraph_outputs_total_loss,
reference_outputs_total_loss)
top1Acc_norms = calculate_norm_values(ngraph_outputs_top1_acc,
reference_outputs_top1_acc)
top5Acc_norms = calculate_norm_values(ngraph_outputs_top5_acc,
reference_outputs_top5_acc)

print(
"loss norms are %f %f %f " %
(loss_norms["l1_norm"], loss_norms["l2_norm"], loss_norms["inf_norm"]))
print("top1Acc norms are %f %f %f " %
(top1Acc_norms["l1_norm"], top1Acc_norms["l2_norm"],
top1Acc_norms["inf_norm"]))
print("top5Acc norms are %f %f %f " %
(top5Acc_norms["l1_norm"], top5Acc_norms["l2_norm"],
top5Acc_norms["inf_norm"]))

    loss_result = check_validation_results(loss_norms, "total_loss")
    top1Acc_result = check_validation_results(top1Acc_norms, "top1 Accuracy")
    top5Acc_result = check_validation_results(top5Acc_norms, "top5 Accuracy")

    if loss_result and top1Acc_result and top5Acc_result:
        print("Validation test passed")

    # Revert the determinism patch (git apply -R undoes it)
    output, error_output = command_executor(
        'git apply -R ' + './datasets_make_deterministic.patch')


if __name__ == "__main__":
    # Run validation with real data, then with synthetic data
    reference_file_name_realData = ''
    reference_file_name_syntheticData = ''
    batch_size = 100
    run_validation("real_data", reference_file_name_realData, batch_size)
    run_validation("synthetic_data", reference_file_name_syntheticData,
                   batch_size)
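
A minimal sketch of the argument handling that TODO item 3 calls for (these flag names are hypothetical; none exist in the PR yet):

import argparse

def parse_args():
    # Hypothetical CLI per TODO item 3
    parser = argparse.ArgumentParser(description='ResNet50 nGraph validation')
    parser.add_argument('--backend', default='GPU',
                        help='nGraph backend, exported as NGRAPH_TF_BACKEND')
    parser.add_argument('--reference_file', default='',
                        help='path prefix of a saved reference log')
    parser.add_argument('--num_batches', type=int, default=100)
    parser.add_argument('--data_type', default='synthetic_data',
                        choices=['real_data', 'synthetic_data'])
    return parser.parse_args()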