From 2ec2dc3b8ce54ffa40085e32a06723dadc54f751 Mon Sep 17 00:00:00 2001 From: "mingshan.wang" <mingshan@nervana-titanxp26.fm.intel.com> Date: Fri, 29 Mar 2019 15:47:10 -0700 Subject: [PATCH 1/7] Adding resnet50 validation script --- .../datasets_make_deterministic.patch | 89 +++++++ test/validate_resnet50/one_encapsulate.patch | 38 +++ .../resnet50_tf_realData_BS100.txt | 53 ++++ .../resnet50_tf_realData_BS200.txt | 63 +++++ .../resnet50_tf_syntheticData_BS200.txt | 57 +++++ .../resnet50_tf_syntheticData_BS400.txt | 77 ++++++ test/validate_resnet50/validation.py | 226 ++++++++++++++++++ 7 files changed, 603 insertions(+) create mode 100644 test/validate_resnet50/datasets_make_deterministic.patch create mode 100644 test/validate_resnet50/one_encapsulate.patch create mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt create mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt create mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt create mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt create mode 100644 test/validate_resnet50/validation.py diff --git a/test/validate_resnet50/datasets_make_deterministic.patch b/test/validate_resnet50/datasets_make_deterministic.patch new file mode 100644 index 00000000..f923b1b6 --- /dev/null +++ b/test/validate_resnet50/datasets_make_deterministic.patch @@ -0,0 +1,89 @@ +diff --git a/scripts/tf_cnn_benchmarks/benchmark_cnn.py b/scripts/tf_cnn_benchmarks/benchmark_cnn.py +index 09b118e..4cf9a12 100644 +--- a/scripts/tf_cnn_benchmarks/benchmark_cnn.py ++++ b/scripts/tf_cnn_benchmarks/benchmark_cnn.py +@@ -34,6 +34,7 @@ import numpy as np + import six + from six.moves import xrange # pylint: disable=redefined-builtin + import tensorflow as tf ++import ngraph_bridge + + from google.protobuf import text_format + +@@ -2479,6 +2480,7 @@ class BenchmarkCNN(object): + fetches = self._build_fetches(global_step, all_logits, losses, device_grads, + enqueue_ops, update_ops, all_accuracy_ops, + phase_train) ++ + if global_input_producer_op: + global_input_producer_op = tf.group(*global_input_producer_op) + else: +diff --git a/scripts/tf_cnn_benchmarks/data_utils.py b/scripts/tf_cnn_benchmarks/data_utils.py +index 0376d0b..992ee75 100644 +--- a/scripts/tf_cnn_benchmarks/data_utils.py ++++ b/scripts/tf_cnn_benchmarks/data_utils.py +@@ -112,7 +112,10 @@ def create_dataset(batch_size, + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) +- ds = tf.data.TFRecordDataset.list_files(file_names) ++ ++ # ds = tf.data.TFRecordDataset.list_files(file_names) ++ ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=False, seed=10) ++ + ds = ds.apply( + interleave_ops.parallel_interleave( + tf.data.TFRecordDataset, cycle_length=10)) +@@ -122,8 +125,9 @@ def create_dataset(batch_size, + counter = counter.repeat() + ds = tf.data.Dataset.zip((ds, counter)) + ds = ds.prefetch(buffer_size=batch_size) +- if train: +- ds = ds.shuffle(buffer_size=10000) ++ # Make dataset loader deterministic ++ # if train: ++ # ds = ds.shuffle(buffer_size=10000) + ds = ds.repeat() + ds = ds.apply( + batching.map_and_batch( +diff --git a/scripts/tf_cnn_benchmarks/preprocessing.py b/scripts/tf_cnn_benchmarks/preprocessing.py +index 6a270b0..4e84a1a 100644 +--- a/scripts/tf_cnn_benchmarks/preprocessing.py ++++ b/scripts/tf_cnn_benchmarks/preprocessing.py +@@ -335,9 +335,11 @@ def train_image(image_buffer, + else: + image = tf.image.decode_jpeg(image_buffer, channels=3, + dct_method='INTEGER_FAST') +- image = tf.slice(image, bbox_begin, bbox_size) + +- distorted_image = tf.image.random_flip_left_right(image) ++ #image = tf.slice(image, bbox_begin, bbox_size) ++ ++ #distorted_image = tf.image.random_flip_left_right(image) ++ distorted_image = image + + # This resizing operation may distort the images because the aspect + # ratio is not respected. +@@ -361,7 +363,7 @@ def train_image(image_buffer, + distorted_image = distort_color(distorted_image, batch_position, + distort_color_in_yiq=distort_color_in_yiq) + +- # Note: This ensures the scaling matches the output of eval_image ++ #Note: This ensures the scaling matches the output of eval_image + distorted_image *= 255 + + if summary_verbosity >= 3: +@@ -487,10 +489,11 @@ class RecordInputImagePreprocessor(BaseImagePreprocessor): + """Preprocessing image_buffer as a function of its batch position.""" + if self.train: + image = train_image(image_buffer, self.height, self.width, bbox, +- batch_position, self.resize_method, self.distortions, ++ batch_position, self.resize_method, False, + None, summary_verbosity=self.summary_verbosity, + distort_color_in_yiq=self.distort_color_in_yiq, +- fuse_decode_and_crop=self.fuse_decode_and_crop) ++ #fuse_decode_and_crop=self.fuse_decode_and_crop ++ fuse_decode_and_crop=False) + else: + image = tf.image.decode_jpeg( + image_buffer, channels=3, dct_method='INTEGER_FAST') diff --git a/test/validate_resnet50/one_encapsulate.patch b/test/validate_resnet50/one_encapsulate.patch new file mode 100644 index 00000000..f7200e66 --- /dev/null +++ b/test/validate_resnet50/one_encapsulate.patch @@ -0,0 +1,38 @@ +diff --git a/scripts/tf_cnn_benchmarks/benchmark_cnn.py b/scripts/tf_cnn_benchmarks/benchmark_cnn.py +index 09b118e..d5a4e29 100644 +--- a/scripts/tf_cnn_benchmarks/benchmark_cnn.py ++++ b/scripts/tf_cnn_benchmarks/benchmark_cnn.py +@@ -34,6 +34,7 @@ import numpy as np + import six + from six.moves import xrange # pylint: disable=redefined-builtin + import tensorflow as tf ++import ngraph_bridge + + from google.protobuf import text_format + +@@ -726,13 +727,23 @@ def benchmark_one_step(sess, + summary_str = None + start_time = time.time() + if summary_op is None: +- results = sess.run(fetches, options=run_options, run_metadata=run_metadata) ++ # get a new set of fetch operation ++ new_fetches = {} ++ for f in fetches: ++ if f == "average_loss": ++ continue ++ new_fetches[f] = fetches[f] ++ ++ results = sess.run(new_fetches, options=run_options, run_metadata=run_metadata) ++ #results = sess.run(fetches, options=run_options, run_metadata=run_metadata) + else: + (results, summary_str) = sess.run( + [fetches, summary_op], options=run_options, run_metadata=run_metadata) + + if not params.forward_only: +- lossval = results['average_loss'] ++ # the calculation is removed in the operations to be fetched ++ #lossval = results['average_loss'] ++ lossval = 0 + else: + lossval = 0. + if image_producer is not None: diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt new file mode 100644 index 00000000..8f0cce0b --- /dev/null +++ b/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt @@ -0,0 +1,53 @@ +W0328 15:31:38.178014 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:121: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version. +Instructions for updating: +Use `tf.data.experimental.parallel_interleave(...)`. +W0328 15:31:38.197099 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:136: map_and_batch (from tensorflow.contrib.data.python.ops.batching) is deprecated and will be removed in a future version. +Instructions for updating: +Use `tf.data.experimental.map_and_batch(...)`. +W0328 15:31:41.318058 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. +Instructions for updating: +Please switch to tf.train.MonitoredTrainingSession +2019-03-28 15:31:41.734261: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA +2019-03-28 15:31:42.684223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: +name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 +pciBusID: 0000:0c:00.0 +totalMemory: 11.91GiB freeMemory: 11.75GiB +2019-03-28 15:31:42.684259: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 +2019-03-28 15:31:42.996125: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: +2019-03-28 15:31:42.996179: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 +2019-03-28 15:31:42.996185: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N +2019-03-28 15:31:42.996527: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) +I0328 15:31:43.550181 140647503410944 tf_logging.py:115] Running local_init_op. +I0328 15:31:49.378333 140647503410944 tf_logging.py:115] Done running local_init_op. +TensorFlow: 1.12 +Model: resnet50 +Dataset: imagenet +Mode: training +SingleSess: False +Batch size: 32 global + 32.0 per device +Num batches: 100 +Num epochs: 0.00 +Devices: ['/gpu:0'] +Data format: NCHW +Optimizer: sgd +Variables: parameter_server +========== +Generating model +Running warm up +Done warm up +Step Img/sec total_loss top_1_accuracy top_5_accuracy +1 images/sec: 217.4 +/- 0.0 (jitter = 0.0) 8.360 0.000 0.000 +10 images/sec: 215.6 +/- 0.4 (jitter = 1.2) 8.027 0.000 0.000 +20 images/sec: 215.0 +/- 0.3 (jitter = 1.0) 8.333 0.000 0.031 +30 images/sec: 214.9 +/- 0.3 (jitter = 1.0) 8.215 0.000 0.000 +40 images/sec: 214.8 +/- 0.2 (jitter = 1.1) 8.189 0.000 0.000 +50 images/sec: 214.6 +/- 0.2 (jitter = 1.3) 8.177 0.000 0.000 +60 images/sec: 214.3 +/- 0.2 (jitter = 1.4) 8.112 0.000 0.000 +70 images/sec: 214.3 +/- 0.2 (jitter = 1.5) 8.185 0.000 0.000 +80 images/sec: 214.2 +/- 0.2 (jitter = 1.5) 8.120 0.000 0.000 +90 images/sec: 214.2 +/- 0.2 (jitter = 1.5) 8.254 0.000 0.000 +100 images/sec: 214.3 +/- 0.2 (jitter = 1.6) 8.093 0.000 0.000 +---------------------------------------------------------------- +total images/sec: 213.97 +---------------------------------------------------------------- diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt new file mode 100644 index 00000000..fcb83c9e --- /dev/null +++ b/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt @@ -0,0 +1,63 @@ +W0328 15:32:46.495650 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:121: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version. +Instructions for updating: +Use `tf.data.experimental.parallel_interleave(...)`. +W0328 15:32:46.520555 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:136: map_and_batch (from tensorflow.contrib.data.python.ops.batching) is deprecated and will be removed in a future version. +Instructions for updating: +Use `tf.data.experimental.map_and_batch(...)`. +W0328 15:32:49.878959 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. +Instructions for updating: +Please switch to tf.train.MonitoredTrainingSession +2019-03-28 15:32:50.317750: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA +2019-03-28 15:32:51.857166: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: +name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 +pciBusID: 0000:0c:00.0 +totalMemory: 11.91GiB freeMemory: 11.75GiB +2019-03-28 15:32:51.857212: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 +2019-03-28 15:32:52.273544: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: +2019-03-28 15:32:52.273603: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 +2019-03-28 15:32:52.273610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N +2019-03-28 15:32:52.273926: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) +I0328 15:32:52.851403 140049874691840 tf_logging.py:115] Running local_init_op. +I0328 15:32:58.568639 140049874691840 tf_logging.py:115] Done running local_init_op. +TensorFlow: 1.12 +Model: resnet50 +Dataset: imagenet +Mode: training +SingleSess: False +Batch size: 32 global + 32.0 per device +Num batches: 200 +Num epochs: 0.00 +Devices: ['/gpu:0'] +Data format: NCHW +Optimizer: sgd +Variables: parameter_server +========== +Generating model +Running warm up +Done warm up +Step Img/sec total_loss top_1_accuracy top_5_accuracy +1 images/sec: 217.6 +/- 0.0 (jitter = 0.0) 8.360 0.000 0.000 +10 images/sec: 216.7 +/- 0.4 (jitter = 0.3) 8.027 0.000 0.000 +20 images/sec: 216.6 +/- 0.3 (jitter = 0.6) 8.333 0.000 0.031 +30 images/sec: 216.3 +/- 0.3 (jitter = 1.0) 8.216 0.000 0.000 +40 images/sec: 216.3 +/- 0.2 (jitter = 1.0) 8.188 0.000 0.000 +50 images/sec: 216.1 +/- 0.2 (jitter = 1.2) 8.177 0.000 0.000 +60 images/sec: 216.3 +/- 0.2 (jitter = 1.1) 8.107 0.000 0.000 +70 images/sec: 216.4 +/- 0.2 (jitter = 1.0) 8.189 0.000 0.000 +80 images/sec: 216.4 +/- 0.2 (jitter = 0.9) 8.114 0.000 0.000 +90 images/sec: 216.4 +/- 0.2 (jitter = 0.8) 8.246 0.000 0.000 +100 images/sec: 216.3 +/- 0.2 (jitter = 1.1) 8.081 0.000 0.000 +110 images/sec: 216.0 +/- 0.2 (jitter = 1.4) 8.363 0.000 0.000 +120 images/sec: 215.7 +/- 0.2 (jitter = 1.8) 8.027 0.000 0.000 +130 images/sec: 215.5 +/- 0.2 (jitter = 2.1) 8.323 0.000 0.000 +140 images/sec: 215.3 +/- 0.2 (jitter = 2.4) 8.440 0.000 0.000 +150 images/sec: 215.2 +/- 0.2 (jitter = 2.8) 8.038 0.000 0.000 +160 images/sec: 215.0 +/- 0.2 (jitter = 2.9) 8.059 0.000 0.000 +170 images/sec: 214.8 +/- 0.2 (jitter = 3.1) 8.318 0.000 0.000 +180 images/sec: 214.7 +/- 0.2 (jitter = 3.0) 8.113 0.000 0.000 +190 images/sec: 214.6 +/- 0.2 (jitter = 3.0) 8.010 0.031 0.062 +200 images/sec: 214.5 +/- 0.2 (jitter = 2.9) 8.379 0.000 0.000 +---------------------------------------------------------------- +total images/sec: 214.32 +---------------------------------------------------------------- diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt new file mode 100644 index 00000000..9e311243 --- /dev/null +++ b/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt @@ -0,0 +1,57 @@ +W0328 16:15:34.093370 140329842439936 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. +Instructions for updating: +Please switch to tf.train.MonitoredTrainingSession +2019-03-28 16:15:34.516200: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA +2019-03-28 16:15:36.297633: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: +name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 +pciBusID: 0000:0c:00.0 +totalMemory: 11.91GiB freeMemory: 11.75GiB +2019-03-28 16:15:36.297676: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 +2019-03-28 16:15:36.626773: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: +2019-03-28 16:15:36.626811: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 +2019-03-28 16:15:36.626818: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N +2019-03-28 16:15:36.627131: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) +I0328 16:15:37.193925 140329842439936 tf_logging.py:115] Running local_init_op. +I0328 16:15:37.224961 140329842439936 tf_logging.py:115] Done running local_init_op. +TensorFlow: 1.12 +Model: resnet50 +Dataset: imagenet (synthetic) +Mode: training +SingleSess: False +Batch size: 32 global + 32.0 per device +Num batches: 200 +Num epochs: 0.00 +Devices: ['/gpu:0'] +Data format: NCHW +Optimizer: sgd +Variables: parameter_server +========== +Generating model +Running warm up +Done warm up +Step Img/sec total_loss top_1_accuracy top_5_accuracy +1 images/sec: 219.2 +/- 0.0 (jitter = 0.0) 8.229 0.000 0.000 +10 images/sec: 218.4 +/- 0.3 (jitter = 1.1) 8.305 0.000 0.000 +20 images/sec: 218.8 +/- 0.3 (jitter = 1.2) 7.921 0.000 0.000 +30 images/sec: 218.7 +/- 0.3 (jitter = 1.4) 8.055 0.000 0.000 +40 images/sec: 218.0 +/- 0.3 (jitter = 2.3) 8.293 0.000 0.000 +50 images/sec: 217.9 +/- 0.3 (jitter = 1.9) 8.092 0.000 0.000 +60 images/sec: 218.0 +/- 0.2 (jitter = 1.8) 8.082 0.000 0.000 +70 images/sec: 218.1 +/- 0.2 (jitter = 1.7) 8.270 0.000 0.000 +80 images/sec: 218.2 +/- 0.2 (jitter = 1.5) 8.177 0.000 0.000 +90 images/sec: 218.3 +/- 0.2 (jitter = 1.5) 7.983 0.031 0.031 +100 images/sec: 218.3 +/- 0.2 (jitter = 1.5) 8.488 0.000 0.000 +110 images/sec: 218.1 +/- 0.1 (jitter = 1.6) 8.207 0.000 0.000 +120 images/sec: 218.1 +/- 0.1 (jitter = 1.5) 7.931 0.000 0.000 +130 images/sec: 218.0 +/- 0.1 (jitter = 1.5) 8.370 0.000 0.000 +140 images/sec: 218.0 +/- 0.1 (jitter = 1.3) 8.345 0.000 0.000 +150 images/sec: 217.9 +/- 0.1 (jitter = 1.2) 8.192 0.000 0.031 +160 images/sec: 217.9 +/- 0.1 (jitter = 1.2) 8.313 0.031 0.031 +170 images/sec: 217.9 +/- 0.1 (jitter = 1.1) 8.381 0.000 0.000 +180 images/sec: 217.9 +/- 0.1 (jitter = 1.0) 8.061 0.031 0.031 +190 images/sec: 217.8 +/- 0.1 (jitter = 1.0) 8.239 0.000 0.031 +200 images/sec: 217.8 +/- 0.1 (jitter = 1.0) 8.045 0.000 0.000 +---------------------------------------------------------------- +total images/sec: 217.64 +---------------------------------------------------------------- diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt new file mode 100644 index 00000000..6e76d433 --- /dev/null +++ b/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt @@ -0,0 +1,77 @@ +W0328 16:17:53.676301 139959331411712 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. +Instructions for updating: +Please switch to tf.train.MonitoredTrainingSession +2019-03-28 16:17:54.142863: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA +2019-03-28 16:17:55.908861: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: +name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 +pciBusID: 0000:0c:00.0 +totalMemory: 11.91GiB freeMemory: 11.75GiB +2019-03-28 16:17:55.908898: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 +2019-03-28 16:17:56.273486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: +2019-03-28 16:17:56.273528: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 +2019-03-28 16:17:56.273536: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N +2019-03-28 16:17:56.273912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) +I0328 16:17:56.896172 139959331411712 tf_logging.py:115] Running local_init_op. +I0328 16:17:56.923924 139959331411712 tf_logging.py:115] Done running local_init_op. +TensorFlow: 1.12 +Model: resnet50 +Dataset: imagenet (synthetic) +Mode: training +SingleSess: False +Batch size: 32 global + 32.0 per device +Num batches: 400 +Num epochs: 0.01 +Devices: ['/gpu:0'] +Data format: NCHW +Optimizer: sgd +Variables: parameter_server +========== +Generating model +Running warm up +Done warm up +Step Img/sec total_loss top_1_accuracy top_5_accuracy +1 images/sec: 219.2 +/- 0.0 (jitter = 0.0) 8.229 0.000 0.000 +10 images/sec: 219.0 +/- 0.3 (jitter = 0.5) 8.305 0.000 0.000 +20 images/sec: 217.8 +/- 0.4 (jitter = 1.4) 7.921 0.000 0.000 +30 images/sec: 217.5 +/- 0.3 (jitter = 1.7) 8.055 0.000 0.000 +40 images/sec: 217.3 +/- 0.2 (jitter = 1.7) 8.293 0.000 0.000 +50 images/sec: 217.1 +/- 0.2 (jitter = 1.4) 8.093 0.000 0.000 +60 images/sec: 217.1 +/- 0.2 (jitter = 1.3) 8.081 0.000 0.000 +70 images/sec: 217.0 +/- 0.2 (jitter = 1.4) 8.270 0.000 0.000 +80 images/sec: 216.9 +/- 0.2 (jitter = 1.3) 8.175 0.000 0.000 +90 images/sec: 216.8 +/- 0.2 (jitter = 1.4) 7.983 0.031 0.031 +100 images/sec: 216.8 +/- 0.1 (jitter = 1.3) 8.486 0.000 0.000 +110 images/sec: 216.8 +/- 0.1 (jitter = 1.3) 8.206 0.000 0.000 +120 images/sec: 216.8 +/- 0.1 (jitter = 1.2) 7.932 0.000 0.000 +130 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.368 0.000 0.000 +140 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.339 0.000 0.000 +150 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.186 0.000 0.031 +160 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.316 0.031 0.031 +170 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.388 0.000 0.000 +180 images/sec: 216.6 +/- 0.1 (jitter = 1.2) 8.070 0.031 0.031 +190 images/sec: 216.6 +/- 0.1 (jitter = 1.2) 8.227 0.000 0.031 +200 images/sec: 216.6 +/- 0.1 (jitter = 1.2) 8.052 0.000 0.000 +210 images/sec: 216.6 +/- 0.1 (jitter = 1.1) 8.206 0.000 0.000 +220 images/sec: 216.5 +/- 0.1 (jitter = 1.1) 8.453 0.000 0.000 +230 images/sec: 216.5 +/- 0.1 (jitter = 1.1) 8.413 0.000 0.000 +240 images/sec: 216.5 +/- 0.1 (jitter = 1.1) 8.397 0.000 0.062 +250 images/sec: 216.4 +/- 0.1 (jitter = 1.2) 8.183 0.000 0.000 +260 images/sec: 216.4 +/- 0.1 (jitter = 1.2) 8.074 0.000 0.000 +270 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.176 0.000 0.000 +280 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.451 0.000 0.000 +290 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.096 0.000 0.000 +300 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.439 0.000 0.000 +310 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.028 0.000 0.000 +320 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.105 0.000 0.000 +330 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.293 0.000 0.000 +340 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.113 0.000 0.000 +350 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.254 0.000 0.000 +360 images/sec: 216.2 +/- 0.1 (jitter = 1.1) 8.432 0.000 0.000 +370 images/sec: 216.2 +/- 0.1 (jitter = 1.2) 8.156 0.000 0.000 +380 images/sec: 216.2 +/- 0.1 (jitter = 1.2) 8.179 0.000 0.000 +390 images/sec: 216.1 +/- 0.1 (jitter = 1.2) 8.138 0.000 0.031 +400 images/sec: 216.1 +/- 0.1 (jitter = 1.2) 8.299 0.000 0.000 +---------------------------------------------------------------- +total images/sec: 215.97 +---------------------------------------------------------------- diff --git a/test/validate_resnet50/validation.py b/test/validate_resnet50/validation.py new file mode 100644 index 00000000..385946ad --- /dev/null +++ b/test/validate_resnet50/validation.py @@ -0,0 +1,226 @@ +from subprocess import check_output, call, Popen, PIPE +import numpy as np +import os + +''' + This script will run resnet50 training validation with synthetic data and real data +and compare the results with the desired reference run. + Assumed this validation.py script is under a tensorflow/benchmarks/ repo + with git head at commit ab01ecc. +''' + +validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' +validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' +validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' +validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' +reference_file_name_realData = "/tfGPU_results/resnet50_tf_realData" +reference_file_name_syntheticData = "/tfGPU_results/resnet50_tf_syntheticData" + + +def command_executor(cmd, verbose=False, msg=None, stdout=None): + if verbose or msg is not None: + tag = 'Running COMMAND: ' if msg is None else msg + print(tag + cmd) + + p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, + stderr=PIPE, close_fds=True, bufsize=1) + output = p.stdout.read() + error_output = p.stderr.read() + + return output, error_output + + +def write_to_file(filename, content): + with open(filename, "w") as text_file: + text_file.write(content) + + +def parse_training_output(output): + to_parse = False + total_loss = [] + top1_acc = [] + top5_acc = [] + + for line in output.strip().split("\n"): + if line.split()[0] == 'Step': + to_parse = True + continue + + elif line.startswith('-----'): + to_parse = False + continue + + if to_parse: + total_loss.append(line.split()[-3]) + top1_acc.append(line.split()[-2]) + top5_acc.append(line.split()[-1]) + + return total_loss, top1_acc, top5_acc + + +def parse_reference_file(filename): + to_parse = False + total_loss = [] + top1_acc = [] + top5_acc = [] + + with open(filename) as reference_result: + for line in reference_result: + if line.split()[0] == 'Step': + to_parse = True + continue + + elif line.startswith('-----'): + to_parse = False + continue + + if to_parse: + total_loss.append(line.split()[-3]) + top1_acc.append(line.split()[-2]) + top5_acc.append(line.split()[-1]) + + return total_loss, top1_acc, top5_acc + + +def check_validation_results(norm_dict, metric): + test_pass = True + for norm in norm_dict: + if norm_dict[norm] > 0.1: + print(metric + " " + norm + + " is greater than the threshold 0.1, validation failed") + test_pass = False + return test_pass + + +# Return L1, L2, inf norm of the input arrays +def calculate_norm_values(result1, result2): + l1_norm = np.linalg.norm((np.array(result1, dtype=np.float) - + np.array(result2, dtype=np.float)), 1) + + l2_norm = np.linalg.norm((np.array(result1, dtype=np.float) - + np.array(result2, dtype=np.float)), 2) + + inf_norm = np.linalg.norm((np.array(result1, dtype=np.float) - + np.array(result2, dtype=np.float)), np.inf) + return {"l1_norm": l1_norm, "l2_norm": l2_norm, "inf_norm": inf_norm} + + +def run_validation(data_format, reference_file_name, batch_size): + # Apply the patch to make input data loader deterministic for real data validation + # Assume the current directory already has the required patch + if os.path.isfile('./datasets_make_deterministic.patch'): + output, error_output = command_executor( + 'git apply --check --whitespace=nowarn ' + './datasets_make_deterministic.patch') + if error_output: + print("Warning: datasets_make_determinitic.patch is already applied") + else: + command_executor('git apply --whitespace=nowarn ' + + './datasets_make_deterministic.patch') + + # Run the validation command on NGraph + if(data_format == "real_data"): + command_to_run = validate_with_real_data_command_NG + str(batch_size) + elif(data_format == "synthetic_data"): + command_to_run = validate_with_synthetic_data_command_NG + \ + str(batch_size) + + print("Running: ", command_to_run) + output, error_output = command_executor(command_to_run) + output_string = str(output, 'utf-8') + + if output: + ngraph_outputs_total_loss, ngraph_outputs_top1_acc, ngraph_outputs_top5_acc = parse_training_output( + output_string) + + elif error_output: + print("Something went wrong executing the command ", + validate_with_real_data_command_NG) + print(str(error_output, 'utf-8')) + exit(1) + + print("ngraph total loss ", ngraph_outputs_total_loss) + print("ngraph top1 Accuracy ", ngraph_outputs_top1_acc) + print("ngraph top5 Accuracy ", ngraph_outputs_top5_acc) + + write_to_file("resnet50_validationResult_NG_" + data_format + + "_BS" + str(batch_size) + ".txt", output_string) + + # Get TF output: Either from a reference file or from actual run command + # check if already has some TF result file + cwd = os.getcwd() + reference_file_path = cwd + reference_file_name + \ + '_BS' + str(batch_size) + ".txt" + print("Finding reference file ", reference_file_path) + if os.path.isfile(reference_file_path): + # parse the text file directly + reference_outputs_total_loss, reference_outputs_top1_acc, reference_outputs_top5_acc = parse_reference_file( + reference_file_path) + else: + # Run the validation command on TF + # This requires the TF needs to build with GPU + print("No reference output file found, begin running reference command") + print("Running: ", validate_with_real_data_command_TF) + output, error_output = command_executor( + validate_with_real_data_command_TF) + output_string = str(output, 'utf-8') + + if output: + reference_outputs_total_loss, reference_outputs_top1_acc, reference_outputs_top5_acc = parse_training_output( + output_string) + elif error_output: + print("Something went wrong executing the command ", + validate_with_real_data_command_NG) + print(str(error_output, 'utf-8')) + exit(1) + + write_to_file("resnet50_validaionResultReference" + + str(batch_size) + ".txt", output_string) + + print("reference total loss ", reference_outputs_total_loss) + print("reference top1Acc ", reference_outputs_top1_acc) + print("reference top5Acc ", reference_outputs_top5_acc) + + # Compare the TF output and NG output + # TF CPU results and GPU results are not the same, so for TF results + # Need to run with TF GPU + assert len(ngraph_outputs_total_loss) == len( + reference_outputs_total_loss), "Number of total_loss values mismatch" + assert len(ngraph_outputs_top1_acc) == len( + reference_outputs_top1_acc), "Number of top1_accuracy values mismatch" + assert len(ngraph_outputs_top5_acc) == len( + reference_outputs_top5_acc), "Number of top5_accuracy values mismatch" + + loss_norms = calculate_norm_values( + ngraph_outputs_total_loss, reference_outputs_total_loss) + top1Acc_norms = calculate_norm_values( + ngraph_outputs_top1_acc, reference_outputs_top1_acc) + top5Acc_norms = calculate_norm_values( + ngraph_outputs_top5_acc, reference_outputs_top5_acc) + + print("loss norms are %f %f %f " % + (loss_norms["l1_norm"], loss_norms["l2_norm"], loss_norms["inf_norm"])) + print("top1Acc norms are %f %f %f " % ( + top1Acc_norms["l1_norm"], top1Acc_norms["l2_norm"], top1Acc_norms["inf_norm"])) + print("top5Acc norms are %f %f %f " % ( + top5Acc_norms["l1_norm"], top5Acc_norms["l2_norm"], top5Acc_norms["inf_norm"])) + + loss_result = check_validation_results(loss_norms, "total_loss") + top1Acc_result = check_validation_results(loss_norms, "top1 Accuracy") + top5Acc_result = check_validation_results(loss_norms, "top5 Accuracy") + + if((loss_result and top1Acc_result and top5Acc_result)): + print("Validation test pass") + + # reapply the patch + output, error_output = command_executor( + 'git apply -R ' + './datasets_make_deterministic.patch') + +# Validation with synthetic data + + +if __name__ == "__main__": + batch_size = 100 + run_validation("real_data", reference_file_name_realData, batch_size) + batch_size = 200 + run_validation("synthetic_data", + reference_file_name_syntheticData, batch_size) From b86aae2dbd3e25839c66179edf0a4e81daa66630 Mon Sep 17 00:00:00 2001 From: shresthamalik <shrestha.malik@intel.com> Date: Wed, 10 Apr 2019 11:38:15 -0700 Subject: [PATCH 2/7] Code Formatting --- test/validate_resnet50/validation.py | 83 ++++++++++++++++------------ 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/test/validate_resnet50/validation.py b/test/validate_resnet50/validation.py index 385946ad..30b952a1 100644 --- a/test/validate_resnet50/validation.py +++ b/test/validate_resnet50/validation.py @@ -1,7 +1,6 @@ from subprocess import check_output, call, Popen, PIPE import numpy as np import os - ''' This script will run resnet50 training validation with synthetic data and real data and compare the results with the desired reference run. @@ -22,8 +21,14 @@ def command_executor(cmd, verbose=False, msg=None, stdout=None): tag = 'Running COMMAND: ' if msg is None else msg print(tag + cmd) - p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, - stderr=PIPE, close_fds=True, bufsize=1) + p = Popen( + cmd, + shell=True, + stdin=PIPE, + stdout=PIPE, + stderr=PIPE, + close_fds=True, + bufsize=1) output = p.stdout.read() error_output = p.stderr.read() @@ -94,14 +99,17 @@ def check_validation_results(norm_dict, metric): # Return L1, L2, inf norm of the input arrays def calculate_norm_values(result1, result2): - l1_norm = np.linalg.norm((np.array(result1, dtype=np.float) - - np.array(result2, dtype=np.float)), 1) + l1_norm = np.linalg.norm( + (np.array(result1, dtype=np.float) - np.array(result2, dtype=np.float)), + 1) - l2_norm = np.linalg.norm((np.array(result1, dtype=np.float) - - np.array(result2, dtype=np.float)), 2) + l2_norm = np.linalg.norm( + (np.array(result1, dtype=np.float) - np.array(result2, dtype=np.float)), + 2) - inf_norm = np.linalg.norm((np.array(result1, dtype=np.float) - - np.array(result2, dtype=np.float)), np.inf) + inf_norm = np.linalg.norm( + (np.array(result1, dtype=np.float) - np.array(result2, dtype=np.float)), + np.inf) return {"l1_norm": l1_norm, "l2_norm": l2_norm, "inf_norm": inf_norm} @@ -110,17 +118,19 @@ def run_validation(data_format, reference_file_name, batch_size): # Assume the current directory already has the required patch if os.path.isfile('./datasets_make_deterministic.patch'): output, error_output = command_executor( - 'git apply --check --whitespace=nowarn ' + './datasets_make_deterministic.patch') + 'git apply --check --whitespace=nowarn ' + + './datasets_make_deterministic.patch') if error_output: - print("Warning: datasets_make_determinitic.patch is already applied") + print( + "Warning: datasets_make_determinitic.patch is already applied") else: command_executor('git apply --whitespace=nowarn ' + './datasets_make_deterministic.patch') # Run the validation command on NGraph - if(data_format == "real_data"): + if (data_format == "real_data"): command_to_run = validate_with_real_data_command_NG + str(batch_size) - elif(data_format == "synthetic_data"): + elif (data_format == "synthetic_data"): command_to_run = validate_with_synthetic_data_command_NG + \ str(batch_size) @@ -142,8 +152,9 @@ def run_validation(data_format, reference_file_name, batch_size): print("ngraph top1 Accuracy ", ngraph_outputs_top1_acc) print("ngraph top5 Accuracy ", ngraph_outputs_top5_acc) - write_to_file("resnet50_validationResult_NG_" + data_format + - "_BS" + str(batch_size) + ".txt", output_string) + write_to_file( + "resnet50_validationResult_NG_" + data_format + "_BS" + str(batch_size) + + ".txt", output_string) # Get TF output: Either from a reference file or from actual run command # check if already has some TF result file @@ -173,8 +184,9 @@ def run_validation(data_format, reference_file_name, batch_size): print(str(error_output, 'utf-8')) exit(1) - write_to_file("resnet50_validaionResultReference" + - str(batch_size) + ".txt", output_string) + write_to_file( + "resnet50_validaionResultReference" + str(batch_size) + ".txt", + output_string) print("reference total loss ", reference_outputs_total_loss) print("reference top1Acc ", reference_outputs_top1_acc) @@ -190,37 +202,40 @@ def run_validation(data_format, reference_file_name, batch_size): assert len(ngraph_outputs_top5_acc) == len( reference_outputs_top5_acc), "Number of top5_accuracy values mismatch" - loss_norms = calculate_norm_values( - ngraph_outputs_total_loss, reference_outputs_total_loss) - top1Acc_norms = calculate_norm_values( - ngraph_outputs_top1_acc, reference_outputs_top1_acc) - top5Acc_norms = calculate_norm_values( - ngraph_outputs_top5_acc, reference_outputs_top5_acc) - - print("loss norms are %f %f %f " % - (loss_norms["l1_norm"], loss_norms["l2_norm"], loss_norms["inf_norm"])) - print("top1Acc norms are %f %f %f " % ( - top1Acc_norms["l1_norm"], top1Acc_norms["l2_norm"], top1Acc_norms["inf_norm"])) - print("top5Acc norms are %f %f %f " % ( - top5Acc_norms["l1_norm"], top5Acc_norms["l2_norm"], top5Acc_norms["inf_norm"])) + loss_norms = calculate_norm_values(ngraph_outputs_total_loss, + reference_outputs_total_loss) + top1Acc_norms = calculate_norm_values(ngraph_outputs_top1_acc, + reference_outputs_top1_acc) + top5Acc_norms = calculate_norm_values(ngraph_outputs_top5_acc, + reference_outputs_top5_acc) + + print( + "loss norms are %f %f %f " % + (loss_norms["l1_norm"], loss_norms["l2_norm"], loss_norms["inf_norm"])) + print("top1Acc norms are %f %f %f " % + (top1Acc_norms["l1_norm"], top1Acc_norms["l2_norm"], + top1Acc_norms["inf_norm"])) + print("top5Acc norms are %f %f %f " % + (top5Acc_norms["l1_norm"], top5Acc_norms["l2_norm"], + top5Acc_norms["inf_norm"])) loss_result = check_validation_results(loss_norms, "total_loss") top1Acc_result = check_validation_results(loss_norms, "top1 Accuracy") top5Acc_result = check_validation_results(loss_norms, "top5 Accuracy") - if((loss_result and top1Acc_result and top5Acc_result)): + if ((loss_result and top1Acc_result and top5Acc_result)): print("Validation test pass") # reapply the patch output, error_output = command_executor( 'git apply -R ' + './datasets_make_deterministic.patch') -# Validation with synthetic data +# Validation with synthetic data if __name__ == "__main__": batch_size = 100 run_validation("real_data", reference_file_name_realData, batch_size) batch_size = 200 - run_validation("synthetic_data", - reference_file_name_syntheticData, batch_size) + run_validation("synthetic_data", reference_file_name_syntheticData, + batch_size) From 3745b0d2d3fb58ab1dd747bc6e7de6470578ee9e Mon Sep 17 00:00:00 2001 From: shresthamalik <shrestha.malik@intel.com> Date: Fri, 12 Apr 2019 17:58:42 -0700 Subject: [PATCH 3/7] Removed the GPU logs --- .../resnet50_tf_realData_BS100.txt | 53 ------------- .../resnet50_tf_realData_BS200.txt | 63 --------------- .../resnet50_tf_syntheticData_BS200.txt | 57 -------------- .../resnet50_tf_syntheticData_BS400.txt | 77 ------------------- 4 files changed, 250 deletions(-) delete mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt delete mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt delete mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt delete mode 100644 test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt deleted file mode 100644 index 8f0cce0b..00000000 --- a/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS100.txt +++ /dev/null @@ -1,53 +0,0 @@ -W0328 15:31:38.178014 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:121: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version. -Instructions for updating: -Use `tf.data.experimental.parallel_interleave(...)`. -W0328 15:31:38.197099 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:136: map_and_batch (from tensorflow.contrib.data.python.ops.batching) is deprecated and will be removed in a future version. -Instructions for updating: -Use `tf.data.experimental.map_and_batch(...)`. -W0328 15:31:41.318058 140647503410944 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. -Instructions for updating: -Please switch to tf.train.MonitoredTrainingSession -2019-03-28 15:31:41.734261: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA -2019-03-28 15:31:42.684223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: -name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 -pciBusID: 0000:0c:00.0 -totalMemory: 11.91GiB freeMemory: 11.75GiB -2019-03-28 15:31:42.684259: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 -2019-03-28 15:31:42.996125: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: -2019-03-28 15:31:42.996179: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 -2019-03-28 15:31:42.996185: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N -2019-03-28 15:31:42.996527: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) -I0328 15:31:43.550181 140647503410944 tf_logging.py:115] Running local_init_op. -I0328 15:31:49.378333 140647503410944 tf_logging.py:115] Done running local_init_op. -TensorFlow: 1.12 -Model: resnet50 -Dataset: imagenet -Mode: training -SingleSess: False -Batch size: 32 global - 32.0 per device -Num batches: 100 -Num epochs: 0.00 -Devices: ['/gpu:0'] -Data format: NCHW -Optimizer: sgd -Variables: parameter_server -========== -Generating model -Running warm up -Done warm up -Step Img/sec total_loss top_1_accuracy top_5_accuracy -1 images/sec: 217.4 +/- 0.0 (jitter = 0.0) 8.360 0.000 0.000 -10 images/sec: 215.6 +/- 0.4 (jitter = 1.2) 8.027 0.000 0.000 -20 images/sec: 215.0 +/- 0.3 (jitter = 1.0) 8.333 0.000 0.031 -30 images/sec: 214.9 +/- 0.3 (jitter = 1.0) 8.215 0.000 0.000 -40 images/sec: 214.8 +/- 0.2 (jitter = 1.1) 8.189 0.000 0.000 -50 images/sec: 214.6 +/- 0.2 (jitter = 1.3) 8.177 0.000 0.000 -60 images/sec: 214.3 +/- 0.2 (jitter = 1.4) 8.112 0.000 0.000 -70 images/sec: 214.3 +/- 0.2 (jitter = 1.5) 8.185 0.000 0.000 -80 images/sec: 214.2 +/- 0.2 (jitter = 1.5) 8.120 0.000 0.000 -90 images/sec: 214.2 +/- 0.2 (jitter = 1.5) 8.254 0.000 0.000 -100 images/sec: 214.3 +/- 0.2 (jitter = 1.6) 8.093 0.000 0.000 ----------------------------------------------------------------- -total images/sec: 213.97 ----------------------------------------------------------------- diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt deleted file mode 100644 index fcb83c9e..00000000 --- a/test/validate_resnet50/tfGPU_results/resnet50_tf_realData_BS200.txt +++ /dev/null @@ -1,63 +0,0 @@ -W0328 15:32:46.495650 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:121: parallel_interleave (from tensorflow.contrib.data.python.ops.interleave_ops) is deprecated and will be removed in a future version. -Instructions for updating: -Use `tf.data.experimental.parallel_interleave(...)`. -W0328 15:32:46.520555 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/data_utils.py:136: map_and_batch (from tensorflow.contrib.data.python.ops.batching) is deprecated and will be removed in a future version. -Instructions for updating: -Use `tf.data.experimental.map_and_batch(...)`. -W0328 15:32:49.878959 140049874691840 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. -Instructions for updating: -Please switch to tf.train.MonitoredTrainingSession -2019-03-28 15:32:50.317750: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA -2019-03-28 15:32:51.857166: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: -name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 -pciBusID: 0000:0c:00.0 -totalMemory: 11.91GiB freeMemory: 11.75GiB -2019-03-28 15:32:51.857212: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 -2019-03-28 15:32:52.273544: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: -2019-03-28 15:32:52.273603: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 -2019-03-28 15:32:52.273610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N -2019-03-28 15:32:52.273926: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) -I0328 15:32:52.851403 140049874691840 tf_logging.py:115] Running local_init_op. -I0328 15:32:58.568639 140049874691840 tf_logging.py:115] Done running local_init_op. -TensorFlow: 1.12 -Model: resnet50 -Dataset: imagenet -Mode: training -SingleSess: False -Batch size: 32 global - 32.0 per device -Num batches: 200 -Num epochs: 0.00 -Devices: ['/gpu:0'] -Data format: NCHW -Optimizer: sgd -Variables: parameter_server -========== -Generating model -Running warm up -Done warm up -Step Img/sec total_loss top_1_accuracy top_5_accuracy -1 images/sec: 217.6 +/- 0.0 (jitter = 0.0) 8.360 0.000 0.000 -10 images/sec: 216.7 +/- 0.4 (jitter = 0.3) 8.027 0.000 0.000 -20 images/sec: 216.6 +/- 0.3 (jitter = 0.6) 8.333 0.000 0.031 -30 images/sec: 216.3 +/- 0.3 (jitter = 1.0) 8.216 0.000 0.000 -40 images/sec: 216.3 +/- 0.2 (jitter = 1.0) 8.188 0.000 0.000 -50 images/sec: 216.1 +/- 0.2 (jitter = 1.2) 8.177 0.000 0.000 -60 images/sec: 216.3 +/- 0.2 (jitter = 1.1) 8.107 0.000 0.000 -70 images/sec: 216.4 +/- 0.2 (jitter = 1.0) 8.189 0.000 0.000 -80 images/sec: 216.4 +/- 0.2 (jitter = 0.9) 8.114 0.000 0.000 -90 images/sec: 216.4 +/- 0.2 (jitter = 0.8) 8.246 0.000 0.000 -100 images/sec: 216.3 +/- 0.2 (jitter = 1.1) 8.081 0.000 0.000 -110 images/sec: 216.0 +/- 0.2 (jitter = 1.4) 8.363 0.000 0.000 -120 images/sec: 215.7 +/- 0.2 (jitter = 1.8) 8.027 0.000 0.000 -130 images/sec: 215.5 +/- 0.2 (jitter = 2.1) 8.323 0.000 0.000 -140 images/sec: 215.3 +/- 0.2 (jitter = 2.4) 8.440 0.000 0.000 -150 images/sec: 215.2 +/- 0.2 (jitter = 2.8) 8.038 0.000 0.000 -160 images/sec: 215.0 +/- 0.2 (jitter = 2.9) 8.059 0.000 0.000 -170 images/sec: 214.8 +/- 0.2 (jitter = 3.1) 8.318 0.000 0.000 -180 images/sec: 214.7 +/- 0.2 (jitter = 3.0) 8.113 0.000 0.000 -190 images/sec: 214.6 +/- 0.2 (jitter = 3.0) 8.010 0.031 0.062 -200 images/sec: 214.5 +/- 0.2 (jitter = 2.9) 8.379 0.000 0.000 ----------------------------------------------------------------- -total images/sec: 214.32 ----------------------------------------------------------------- diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt deleted file mode 100644 index 9e311243..00000000 --- a/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS200.txt +++ /dev/null @@ -1,57 +0,0 @@ -W0328 16:15:34.093370 140329842439936 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. -Instructions for updating: -Please switch to tf.train.MonitoredTrainingSession -2019-03-28 16:15:34.516200: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA -2019-03-28 16:15:36.297633: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: -name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 -pciBusID: 0000:0c:00.0 -totalMemory: 11.91GiB freeMemory: 11.75GiB -2019-03-28 16:15:36.297676: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 -2019-03-28 16:15:36.626773: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: -2019-03-28 16:15:36.626811: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 -2019-03-28 16:15:36.626818: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N -2019-03-28 16:15:36.627131: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) -I0328 16:15:37.193925 140329842439936 tf_logging.py:115] Running local_init_op. -I0328 16:15:37.224961 140329842439936 tf_logging.py:115] Done running local_init_op. -TensorFlow: 1.12 -Model: resnet50 -Dataset: imagenet (synthetic) -Mode: training -SingleSess: False -Batch size: 32 global - 32.0 per device -Num batches: 200 -Num epochs: 0.00 -Devices: ['/gpu:0'] -Data format: NCHW -Optimizer: sgd -Variables: parameter_server -========== -Generating model -Running warm up -Done warm up -Step Img/sec total_loss top_1_accuracy top_5_accuracy -1 images/sec: 219.2 +/- 0.0 (jitter = 0.0) 8.229 0.000 0.000 -10 images/sec: 218.4 +/- 0.3 (jitter = 1.1) 8.305 0.000 0.000 -20 images/sec: 218.8 +/- 0.3 (jitter = 1.2) 7.921 0.000 0.000 -30 images/sec: 218.7 +/- 0.3 (jitter = 1.4) 8.055 0.000 0.000 -40 images/sec: 218.0 +/- 0.3 (jitter = 2.3) 8.293 0.000 0.000 -50 images/sec: 217.9 +/- 0.3 (jitter = 1.9) 8.092 0.000 0.000 -60 images/sec: 218.0 +/- 0.2 (jitter = 1.8) 8.082 0.000 0.000 -70 images/sec: 218.1 +/- 0.2 (jitter = 1.7) 8.270 0.000 0.000 -80 images/sec: 218.2 +/- 0.2 (jitter = 1.5) 8.177 0.000 0.000 -90 images/sec: 218.3 +/- 0.2 (jitter = 1.5) 7.983 0.031 0.031 -100 images/sec: 218.3 +/- 0.2 (jitter = 1.5) 8.488 0.000 0.000 -110 images/sec: 218.1 +/- 0.1 (jitter = 1.6) 8.207 0.000 0.000 -120 images/sec: 218.1 +/- 0.1 (jitter = 1.5) 7.931 0.000 0.000 -130 images/sec: 218.0 +/- 0.1 (jitter = 1.5) 8.370 0.000 0.000 -140 images/sec: 218.0 +/- 0.1 (jitter = 1.3) 8.345 0.000 0.000 -150 images/sec: 217.9 +/- 0.1 (jitter = 1.2) 8.192 0.000 0.031 -160 images/sec: 217.9 +/- 0.1 (jitter = 1.2) 8.313 0.031 0.031 -170 images/sec: 217.9 +/- 0.1 (jitter = 1.1) 8.381 0.000 0.000 -180 images/sec: 217.9 +/- 0.1 (jitter = 1.0) 8.061 0.031 0.031 -190 images/sec: 217.8 +/- 0.1 (jitter = 1.0) 8.239 0.000 0.031 -200 images/sec: 217.8 +/- 0.1 (jitter = 1.0) 8.045 0.000 0.000 ----------------------------------------------------------------- -total images/sec: 217.64 ----------------------------------------------------------------- diff --git a/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt b/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt deleted file mode 100644 index 6e76d433..00000000 --- a/test/validate_resnet50/tfGPU_results/resnet50_tf_syntheticData_BS400.txt +++ /dev/null @@ -1,77 +0,0 @@ -W0328 16:17:53.676301 139959331411712 tf_logging.py:125] From /localdisk/mingshan/benchmarks/scripts/tf_cnn_benchmarks/benchmark_cnn.py:1842: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version. -Instructions for updating: -Please switch to tf.train.MonitoredTrainingSession -2019-03-28 16:17:54.142863: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA -2019-03-28 16:17:55.908861: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: -name: TITAN Xp major: 6 minor: 1 memoryClockRate(GHz): 1.582 -pciBusID: 0000:0c:00.0 -totalMemory: 11.91GiB freeMemory: 11.75GiB -2019-03-28 16:17:55.908898: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0 -2019-03-28 16:17:56.273486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix: -2019-03-28 16:17:56.273528: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 -2019-03-28 16:17:56.273536: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N -2019-03-28 16:17:56.273912: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11364 MB memory) -> physical GPU (device: 0, name: TITAN Xp, pci bus id: 0000:0c:00.0, compute capability: 6.1) -I0328 16:17:56.896172 139959331411712 tf_logging.py:115] Running local_init_op. -I0328 16:17:56.923924 139959331411712 tf_logging.py:115] Done running local_init_op. -TensorFlow: 1.12 -Model: resnet50 -Dataset: imagenet (synthetic) -Mode: training -SingleSess: False -Batch size: 32 global - 32.0 per device -Num batches: 400 -Num epochs: 0.01 -Devices: ['/gpu:0'] -Data format: NCHW -Optimizer: sgd -Variables: parameter_server -========== -Generating model -Running warm up -Done warm up -Step Img/sec total_loss top_1_accuracy top_5_accuracy -1 images/sec: 219.2 +/- 0.0 (jitter = 0.0) 8.229 0.000 0.000 -10 images/sec: 219.0 +/- 0.3 (jitter = 0.5) 8.305 0.000 0.000 -20 images/sec: 217.8 +/- 0.4 (jitter = 1.4) 7.921 0.000 0.000 -30 images/sec: 217.5 +/- 0.3 (jitter = 1.7) 8.055 0.000 0.000 -40 images/sec: 217.3 +/- 0.2 (jitter = 1.7) 8.293 0.000 0.000 -50 images/sec: 217.1 +/- 0.2 (jitter = 1.4) 8.093 0.000 0.000 -60 images/sec: 217.1 +/- 0.2 (jitter = 1.3) 8.081 0.000 0.000 -70 images/sec: 217.0 +/- 0.2 (jitter = 1.4) 8.270 0.000 0.000 -80 images/sec: 216.9 +/- 0.2 (jitter = 1.3) 8.175 0.000 0.000 -90 images/sec: 216.8 +/- 0.2 (jitter = 1.4) 7.983 0.031 0.031 -100 images/sec: 216.8 +/- 0.1 (jitter = 1.3) 8.486 0.000 0.000 -110 images/sec: 216.8 +/- 0.1 (jitter = 1.3) 8.206 0.000 0.000 -120 images/sec: 216.8 +/- 0.1 (jitter = 1.2) 7.932 0.000 0.000 -130 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.368 0.000 0.000 -140 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.339 0.000 0.000 -150 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.186 0.000 0.031 -160 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.316 0.031 0.031 -170 images/sec: 216.7 +/- 0.1 (jitter = 1.2) 8.388 0.000 0.000 -180 images/sec: 216.6 +/- 0.1 (jitter = 1.2) 8.070 0.031 0.031 -190 images/sec: 216.6 +/- 0.1 (jitter = 1.2) 8.227 0.000 0.031 -200 images/sec: 216.6 +/- 0.1 (jitter = 1.2) 8.052 0.000 0.000 -210 images/sec: 216.6 +/- 0.1 (jitter = 1.1) 8.206 0.000 0.000 -220 images/sec: 216.5 +/- 0.1 (jitter = 1.1) 8.453 0.000 0.000 -230 images/sec: 216.5 +/- 0.1 (jitter = 1.1) 8.413 0.000 0.000 -240 images/sec: 216.5 +/- 0.1 (jitter = 1.1) 8.397 0.000 0.062 -250 images/sec: 216.4 +/- 0.1 (jitter = 1.2) 8.183 0.000 0.000 -260 images/sec: 216.4 +/- 0.1 (jitter = 1.2) 8.074 0.000 0.000 -270 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.176 0.000 0.000 -280 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.451 0.000 0.000 -290 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.096 0.000 0.000 -300 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.439 0.000 0.000 -310 images/sec: 216.4 +/- 0.1 (jitter = 1.1) 8.028 0.000 0.000 -320 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.105 0.000 0.000 -330 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.293 0.000 0.000 -340 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.113 0.000 0.000 -350 images/sec: 216.3 +/- 0.1 (jitter = 1.1) 8.254 0.000 0.000 -360 images/sec: 216.2 +/- 0.1 (jitter = 1.1) 8.432 0.000 0.000 -370 images/sec: 216.2 +/- 0.1 (jitter = 1.2) 8.156 0.000 0.000 -380 images/sec: 216.2 +/- 0.1 (jitter = 1.2) 8.179 0.000 0.000 -390 images/sec: 216.1 +/- 0.1 (jitter = 1.2) 8.138 0.000 0.031 -400 images/sec: 216.1 +/- 0.1 (jitter = 1.2) 8.299 0.000 0.000 ----------------------------------------------------------------- -total images/sec: 215.97 ----------------------------------------------------------------- From 4957bd53e732b7b8f82bddbcf61f832fe694af29 Mon Sep 17 00:00:00 2001 From: shresthamalik <shrestha.malik@intel.com> Date: Fri, 12 Apr 2019 18:11:29 -0700 Subject: [PATCH 4/7] Added TODOs in validation.py --- test/validate_resnet50/validation.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/test/validate_resnet50/validation.py b/test/validate_resnet50/validation.py index 30b952a1..b05df225 100644 --- a/test/validate_resnet50/validation.py +++ b/test/validate_resnet50/validation.py @@ -1,19 +1,25 @@ from subprocess import check_output, call, Popen, PIPE import numpy as np import os -''' - This script will run resnet50 training validation with synthetic data and real data -and compare the results with the desired reference run. - Assumed this validation.py script is under a tensorflow/benchmarks/ repo - with git head at commit ab01ecc. -''' + +# This script will run resnet50 training validation with synthetic data and real data +# and compare the results with the desired reference run. +# Assumed this validation.py script is under a tensorflow/benchmarks/ repo +# with git head at commit ab01ecc. +# TODO: +# 1. num_bathces are not set in validate_commands +# 2. Makes certain assumptions about the reference_file 's name and the batch size +# 3. Add Arguments to take in the backend, the reference log files, the number of iterations/batches, the data type (real or synthetic) +# 4. Automate the cloning of benchmarks repo and running the script validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' -reference_file_name_realData = "/tfGPU_results/resnet50_tf_realData" -reference_file_name_syntheticData = "/tfGPU_results/resnet50_tf_syntheticData" + +# +#reference_file_name_realData = "/tfGPU_results/resnet50_tf_realData" +#reference_file_name_syntheticData = "/tfGPU_results/resnet50_tf_syntheticData" def command_executor(cmd, verbose=False, msg=None, stdout=None): From e10d87b3aba59928634fb198a1822b51a0387477 Mon Sep 17 00:00:00 2001 From: shresthamalik <shrestha.malik@intel.com> Date: Tue, 16 Apr 2019 17:48:08 -0700 Subject: [PATCH 5/7] Minor change --- test/validate_resnet50/validation.py | 17 ++++++----------- tools/build_utils.py | 4 ++-- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/test/validate_resnet50/validation.py b/test/validate_resnet50/validation.py index b05df225..048bccfc 100644 --- a/test/validate_resnet50/validation.py +++ b/test/validate_resnet50/validation.py @@ -7,20 +7,15 @@ # Assumed this validation.py script is under a tensorflow/benchmarks/ repo # with git head at commit ab01ecc. # TODO: -# 1. num_bathces are not set in validate_commands +# 1. num_bathces set to 100 # 2. Makes certain assumptions about the reference_file 's name and the batch size # 3. Add Arguments to take in the backend, the reference log files, the number of iterations/batches, the data type (real or synthetic) # 4. Automate the cloning of benchmarks repo and running the script -validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' -validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' -validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' -validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=' - -# -#reference_file_name_realData = "/tfGPU_results/resnet50_tf_realData" -#reference_file_name_syntheticData = "/tfGPU_results/resnet50_tf_syntheticData" - +validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' def command_executor(cmd, verbose=False, msg=None, stdout=None): if verbose or msg is not None: @@ -242,6 +237,6 @@ def run_validation(data_format, reference_file_name, batch_size): if __name__ == "__main__": batch_size = 100 run_validation("real_data", reference_file_name_realData, batch_size) - batch_size = 200 + batch_size = 100 run_validation("synthetic_data", reference_file_name_syntheticData, batch_size) diff --git a/tools/build_utils.py b/tools/build_utils.py index 10aa75f3..1e8891d7 100755 --- a/tools/build_utils.py +++ b/tools/build_utils.py @@ -145,9 +145,9 @@ def setup_venv(venv_dir): "termcolor>=1.1.0", "protobuf>=3.6.1", "keras_applications>=1.0.6", - "--no-deps", + "--no-deps --no-cache-dir", "keras_preprocessing==1.0.5", - "--no-deps", + "--no-deps --no-cache-dir", "yapf==0.26.0", ] command_executor(package_list) From 398ebf908d59117a2362da7254a1403b18e7c4d3 Mon Sep 17 00:00:00 2001 From: shresthamalik <shrestha.malik@intel.com> Date: Tue, 16 Apr 2019 18:02:25 -0700 Subject: [PATCH 6/7] formatting change --- test/validate_resnet50/validation.py | 31 +++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/test/validate_resnet50/validation.py b/test/validate_resnet50/validation.py index 048bccfc..275b26ce 100644 --- a/test/validate_resnet50/validation.py +++ b/test/validate_resnet50/validation.py @@ -4,18 +4,37 @@ # This script will run resnet50 training validation with synthetic data and real data # and compare the results with the desired reference run. +# If the reference files are not provided it runs on TF(w/o nGraph) and uses its output +# as reference # Assumed this validation.py script is under a tensorflow/benchmarks/ repo # with git head at commit ab01ecc. # TODO: # 1. num_bathces set to 100 # 2. Makes certain assumptions about the reference_file 's name and the batch size -# 3. Add Arguments to take in the backend, the reference log files, the number of iterations/batches, the data type (real or synthetic) +# 3. Add Arguments to take in the backend, the reference log files, the number of iterations/batches, +# the data type (real or synthetic) # 4. Automate the cloning of benchmarks repo and running the script -validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' -validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' -validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' -validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 ' \ + + '--num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet ' \ + + '--datasets_use_prefetch=False --print_training_accuracy=True ' \ + + '--num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 ' \ + + '--num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet ' \ + + '--datasets_use_prefetch=False --print_training_accuracy=True ' \ + + '--num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW ' \ + + '--model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet ' \ + + '--datasets_use_prefetch=False --print_training_accuracy=True ' \ + + '--num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 ' \ + + '--batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False ' \ + + '--print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' + def command_executor(cmd, verbose=False, msg=None, stdout=None): if verbose or msg is not None: @@ -235,6 +254,8 @@ def run_validation(data_format, reference_file_name, batch_size): # Validation with synthetic data if __name__ == "__main__": + reference_file_name_realData = '' + reference_file_name_syntheticData = '' batch_size = 100 run_validation("real_data", reference_file_name_realData, batch_size) batch_size = 100 From cd351b0a6f2ec1e641f69eb3eadfdf7cba39fcbc Mon Sep 17 00:00:00 2001 From: shresthamalik <shrestha.malik@intel.com> Date: Tue, 16 Apr 2019 18:02:25 -0700 Subject: [PATCH 7/7] formatting change --- test/validate_resnet50/validation.py | 31 +++++++++++++++++++++++----- tools/build_utils.py | 4 ++-- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/test/validate_resnet50/validation.py b/test/validate_resnet50/validation.py index 048bccfc..275b26ce 100644 --- a/test/validate_resnet50/validation.py +++ b/test/validate_resnet50/validation.py @@ -4,18 +4,37 @@ # This script will run resnet50 training validation with synthetic data and real data # and compare the results with the desired reference run. +# If the reference files are not provided it runs on TF(w/o nGraph) and uses its output +# as reference # Assumed this validation.py script is under a tensorflow/benchmarks/ repo # with git head at commit ab01ecc. # TODO: # 1. num_bathces set to 100 # 2. Makes certain assumptions about the reference_file 's name and the batch size -# 3. Add Arguments to take in the backend, the reference log files, the number of iterations/batches, the data type (real or synthetic) +# 3. Add Arguments to take in the backend, the reference log files, the number of iterations/batches, +# the data type (real or synthetic) # 4. Automate the cloning of benchmarks repo and running the script -validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' -validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' -validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' -validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py --num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False --print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_real_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --data_format=NCHW --model=resnet50 --batch_size=32 ' \ + + '--num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet ' \ + + '--datasets_use_prefetch=False --print_training_accuracy=True ' \ + + '--num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_real_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --data_format=NHWC --model=resnet50 --batch_size=32 ' \ + + '--num_gpus=1 --data_dir /mnt/data/TF_ImageNet_latest/ --data_name=imagenet ' \ + + '--datasets_use_prefetch=False --print_training_accuracy=True ' \ + + '--num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_synthetic_data_command_NG = 'NGRAPH_TF_BACKEND=GPU python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --tf_random_seed=1234 --data_format=NCHW ' \ + + '--model=resnet50 --batch_size=32 --num_gpus=1 --data_name=imagenet ' \ + + '--datasets_use_prefetch=False --print_training_accuracy=True ' \ + + '--num_learning_rate_warmup_epochs=0 --num_batches=100' +validate_with_synthetic_data_command_TF = 'NGRAPH_TF_DISABLE=1 python tf_cnn_benchmarks.py ' \ + + '--num_inter_threads=2 --tf_random_seed=1234 --data_format=NHWC --model=resnet50 ' \ + + '--batch_size=32 --num_gpus=1 --data_name=imagenet --datasets_use_prefetch=False ' \ + + '--print_training_accuracy=True --num_learning_rate_warmup_epochs=0 --num_batches=100' + def command_executor(cmd, verbose=False, msg=None, stdout=None): if verbose or msg is not None: @@ -235,6 +254,8 @@ def run_validation(data_format, reference_file_name, batch_size): # Validation with synthetic data if __name__ == "__main__": + reference_file_name_realData = '' + reference_file_name_syntheticData = '' batch_size = 100 run_validation("real_data", reference_file_name_realData, batch_size) batch_size = 100 diff --git a/tools/build_utils.py b/tools/build_utils.py index 1e8891d7..10aa75f3 100755 --- a/tools/build_utils.py +++ b/tools/build_utils.py @@ -145,9 +145,9 @@ def setup_venv(venv_dir): "termcolor>=1.1.0", "protobuf>=3.6.1", "keras_applications>=1.0.6", - "--no-deps --no-cache-dir", + "--no-deps", "keras_preprocessing==1.0.5", - "--no-deps --no-cache-dir", + "--no-deps", "yapf==0.26.0", ] command_executor(package_list)