From 59aef83328cbc1df07f6d234720837b8f0e05adc Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 22:43:34 +0200
Subject: [PATCH 01/72] Attempt to compute all base and grad-cam class
 operations using torch and not numpy :scientist:

---
 pytorch_grad_cam/base_cam.py | 32 ++++++++++++++++----------------
 pytorch_grad_cam/grad_cam.py |  2 +-
 setup.py                     |  2 +-
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index 7ee192971..640cec851 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -36,7 +36,7 @@ def get_cam_weights(self,
                         target_layers: List[torch.nn.Module],
                         targets: List[torch.nn.Module],
                         activations: torch.Tensor,
-                        grads: torch.Tensor) -> np.ndarray:
+                        grads: torch.Tensor) -> torch.Tensor:
         raise Exception("Not Implemented")
 
     def get_cam_image(self,
@@ -45,7 +45,7 @@ def get_cam_image(self,
                       targets: List[torch.nn.Module],
                       activations: torch.Tensor,
                       grads: torch.Tensor,
-                      eigen_smooth: bool = False) -> np.ndarray:
+                      eigen_smooth: bool = False) -> torch.Tensor:
 
         weights = self.get_cam_weights(input_tensor,
                                        target_layer,
@@ -62,7 +62,7 @@ def get_cam_image(self,
     def forward(self,
                 input_tensor: torch.Tensor,
                 targets: List[torch.nn.Module],
-                eigen_smooth: bool = False) -> np.ndarray:
+                eigen_smooth: bool = False) -> torch.Tensor:
 
         if self.cuda:
             input_tensor = input_tensor.cuda()
@@ -73,7 +73,7 @@ def forward(self,
 
         outputs = self.activations_and_grads(input_tensor)
         if targets is None:
-            target_categories = np.argmax(outputs.cpu().data.numpy(), axis=-1)
+            target_categories = torch.argmax(outputs.data, axis=-1)
             targets = [ClassifierOutputTarget(
                 category) for category in target_categories]
 
@@ -106,10 +106,10 @@ def compute_cam_per_layer(
             self,
             input_tensor: torch.Tensor,
             targets: List[torch.nn.Module],
-            eigen_smooth: bool) -> np.ndarray:
-        activations_list = [a.cpu().data.numpy()
+            eigen_smooth: bool) -> torch.Tensor:
+        activations_list = [a.data
                             for a in self.activations_and_grads.activations]
-        grads_list = [g.cpu().data.numpy()
+        grads_list = [g.data
                       for g in self.activations_and_grads.gradients]
         target_size = self.get_target_width_height(input_tensor)
 
@@ -130,7 +130,7 @@ def compute_cam_per_layer(
                                      layer_activations,
                                      layer_grads,
                                      eigen_smooth)
-            cam = np.maximum(cam, 0)
+            cam = torch.maximum(cam, 0)
             scaled = scale_cam_image(cam, target_size)
             cam_per_target_layer.append(scaled[:, None, :])
 
@@ -138,16 +138,16 @@ def compute_cam_per_layer(
 
     def aggregate_multi_layers(
             self,
-            cam_per_target_layer: np.ndarray) -> np.ndarray:
-        cam_per_target_layer = np.concatenate(cam_per_target_layer, axis=1)
-        cam_per_target_layer = np.maximum(cam_per_target_layer, 0)
-        result = np.mean(cam_per_target_layer, axis=1)
+            cam_per_target_layer: torch.Tensor) -> torch.Tensor:
+        cam_per_target_layer = torch.concatenate(cam_per_target_layer, axis=1)
+        cam_per_target_layer = torch.maximum(cam_per_target_layer, 0)
+        result = torch.mean(cam_per_target_layer, axis=1)
         return scale_cam_image(result)
 
     def forward_augmentation_smoothing(self,
                                        input_tensor: torch.Tensor,
                                        targets: List[torch.nn.Module],
-                                       eigen_smooth: bool = False) -> np.ndarray:
+                                       eigen_smooth: bool = False) -> torch.Tensor:
         transforms = tta.Compose(
             [
                 tta.HorizontalFlip(),
@@ -167,18 +167,18 @@ def forward_augmentation_smoothing(self,
             cam = transform.deaugment_mask(cam)
 
             # Back to numpy float32, HxW
-            cam = cam.numpy()
+            # cam = cam.numpy()
             cam = cam[:, 0, :, :]
             cams.append(cam)
 
-        cam = np.mean(np.float32(cams), axis=0)
+        cam = torch.mean(torch.float32(cams), axis=0)
         return cam
 
     def __call__(self,
                  input_tensor: torch.Tensor,
                  targets: List[torch.nn.Module] = None,
                  aug_smooth: bool = False,
-                 eigen_smooth: bool = False) -> np.ndarray:
+                 eigen_smooth: bool = False) -> torch.Tensor:
 
         # Smooth the CAM result with test time augmentation
         if aug_smooth is True:
diff --git a/pytorch_grad_cam/grad_cam.py b/pytorch_grad_cam/grad_cam.py
index 025bf45dd..1b9c93b5f 100644
--- a/pytorch_grad_cam/grad_cam.py
+++ b/pytorch_grad_cam/grad_cam.py
@@ -19,4 +19,4 @@ def get_cam_weights(self,
                         target_category,
                         activations,
                         grads):
-        return np.mean(grads, axis=(2, 3))
+        return torch.mean(grads, axis=(2, 3))
diff --git a/setup.py b/setup.py
index 1d8ace600..ea87b563d 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setuptools.setup(
     name='grad-cam',
-    version='1.4.6',
+    version='1.4.7',
     author='Jacob Gildenblat',
     author_email='jacob.gildenblat@gmail.com',
     description='Many Class Activation Map methods implemented in Pytorch for classification, segmentation, object detection and more',
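
Note on PATCH 01: the conversions above rely on torch having one-to-one replacements
for the numpy calls. A minimal sketch of the mapping this series depends on, with
illustrative shapes only (not taken from the library):

    import torch

    grads = torch.rand(2, 512, 7, 7)                 # stand-in gradients (B, C, H, W)
    logits = torch.rand(2, 1000)                     # stand-in model outputs

    weights = torch.mean(grads, dim=(2, 3))          # np.mean(grads, axis=(2, 3))
    categories = torch.argmax(logits, dim=-1)        # np.argmax(outputs, axis=-1)
    relu_cam = torch.clamp(grads.sum(dim=1), min=0)  # np.maximum(cam, 0)

Note that torch.maximum(cam, 0), as introduced in this patch, raises a TypeError
because both arguments must be tensors; PATCH 04 revisits this.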

From 8bf752483962624be24f7ae8e2ee2facbf0746be Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 22:54:48 +0200
Subject: [PATCH 02/72] Bump other version :cop:

---
 setup.cfg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 203e6a636..dceb4f5bc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = grad-cam
-version = 1.1.0
+version = 1.4.7
 author = Jacob Gildenblat
 author_email = jacob.gildenblat@gmail.com
 description = Many Class Activation Map methods implemented in Pytorch. Including Grad-CAM, Grad-CAM++, Score-CAM, Ablation-CAM and XGrad-CAM
@@ -16,4 +16,4 @@ classifiers =
 
 [options]
 packages = find:
-python_requires = >=3.6
\ No newline at end of file
+python_requires = >=3.6

From fa8c8d7e80b8ab93703eaadca1594529b6a2770c Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 22:57:42 +0200
Subject: [PATCH 03/72] Fix import to use torch over Numpy :cop:

---
 pytorch_grad_cam/grad_cam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/grad_cam.py b/pytorch_grad_cam/grad_cam.py
index 1b9c93b5f..efb66e76e 100644
--- a/pytorch_grad_cam/grad_cam.py
+++ b/pytorch_grad_cam/grad_cam.py
@@ -1,4 +1,4 @@
-import numpy as np
+import torch
 from pytorch_grad_cam.base_cam import BaseCAM
 
 

From ce52619809fc4c186ca1feb57a23dd65c33c9e5a Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:02:49 +0200
Subject: [PATCH 04/72] Convert torch.maximum's second argument to a tensor :cop:

---
 pytorch_grad_cam/base_cam.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index 640cec851..d81adee50 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -130,7 +130,7 @@ def compute_cam_per_layer(
                                      layer_activations,
                                      layer_grads,
                                      eigen_smooth)
-            cam = torch.maximum(cam, 0)
+            cam = torch.maximum(cam, torch.tensor(0))
             scaled = scale_cam_image(cam, target_size)
             cam_per_target_layer.append(scaled[:, None, :])
 
@@ -140,7 +140,7 @@ def aggregate_multi_layers(
             self,
             cam_per_target_layer: torch.Tensor) -> torch.Tensor:
         cam_per_target_layer = torch.concatenate(cam_per_target_layer, axis=1)
-        cam_per_target_layer = torch.maximum(cam_per_target_layer, 0)
+        cam_per_target_layer = torch.maximum(cam_per_target_layer, torch.tensor(0))
         result = torch.mean(cam_per_target_layer, axis=1)
         return scale_cam_image(result)
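
Note on PATCH 04: torch.maximum requires a tensor for its second argument, so
torch.tensor(0) satisfies the API but allocates an int64 scalar on every call.
torch.clamp expresses the same ReLU without the temporary; a minimal sketch with
illustrative values, not library code:

    import torch

    cam = torch.randn(2, 7, 7)
    a = torch.maximum(cam, torch.tensor(0.0))   # explicit zero tensor
    b = torch.clamp(cam, min=0)                 # same result, no extra allocation
    assert torch.equal(a, b)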
 

From 73f720bd5ab34d4d441cf293b4eca95eb3b3ff01 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:07:09 +0200
Subject: [PATCH 05/72] Begin to find and migrate more numpy calls to torch
 calls. Also fix some minor bugs :cop:

---
 pytorch_grad_cam/base_cam.py    | 4 ++--
 pytorch_grad_cam/utils/image.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index d81adee50..b9f3c3e63 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -169,9 +169,9 @@ def forward_augmentation_smoothing(self,
             # Back to numpy float32, HxW
             # cam = cam.numpy()
             cam = cam[:, 0, :, :]
-            cams.append(cam)
+            cams.append(cam) # TODO: Handle this for torch tensors
 
-        cam = torch.mean(torch.float32(cams), axis=0)
+        cam = torch.mean(cams.to(torch.float32), axis=0)
         return cam
 
     def __call__(self,
diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 34d92ba6f..57c4a4f18 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -160,12 +160,12 @@ def show_factorization_on_image(img: np.ndarray,
 def scale_cam_image(cam, target_size=None):
     result = []
     for img in cam:
-        img = img - np.min(img)
-        img = img / (1e-7 + np.max(img))
+        img = img - torch.min(img)
+        img = img / (1e-7 + torch.max(img))
         if target_size is not None:
-            img = cv2.resize(img, target_size)
+            img = cv2.resize(img, target_size) # TODO: Change this to handle torch tensors via a convert
         result.append(img)
-    result = np.float32(result)
+    result = result.to(torch.float32)
 
     return result
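
Note on PATCH 05: the TODOs flag the underlying problem. At this point `cams` and
`result` are still Python lists, which have no .to() method. A minimal sketch of
collecting per-image CAMs as a tensor instead (assumes every entry has the same
shape):

    import torch

    per_image = [torch.rand(7, 7) for _ in range(4)]     # stand-in loop output
    stacked = torch.stack(per_image).to(torch.float32)   # (4, 7, 7) float32 tensor
    mean_cam = stacked.mean(dim=0)                        # analogue of np.mean(cams, axis=0)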
 

From f96e21fb54c11e94f43a1af2148821e5f46eb711 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:11:05 +0200
Subject: [PATCH 06/72] Use a single tensor resizing strategy instead of the cv2
 call :cop:

---
 pytorch_grad_cam/utils/image.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 57c4a4f18..025db23be 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -4,6 +4,7 @@
 import cv2
 import numpy as np
 import torch
+import torchvision.transforms.functional as F
 from torchvision.transforms import Compose, Normalize, ToTensor
 from typing import List, Dict
 import math
@@ -163,7 +164,7 @@ def scale_cam_image(cam, target_size=None):
         img = img - torch.min(img)
         img = img / (1e-7 + torch.max(img))
         if target_size is not None:
-            img = cv2.resize(img, target_size) # TODO: Change this to handle torch tensors via a convert
+            img = F.resize(img, target_size) # TODO: Investigate better resizing techniques - Keeping defaults for now
         result.append(img)
     result = result.to(torch.float32)
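
Note on PATCH 06: torchvision.transforms.functional.resize does accept tensors, but
it expects a (..., H, W) layout and a [height, width] size, whereas cv2.resize takes
a (width, height) tuple - a likely source of the "varying results" mentioned in the
next patch. A minimal sketch under those assumptions:

    import torch
    import torchvision.transforms.functional as F

    cam = torch.rand(8, 7, 7)              # (batch, H, W) heatmaps
    resized = F.resize(cam, [224, 224])    # bilinear by default for tensor input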
 

From ef3dcf5f960cace552ff8bd43e43d6dc2e2f1524 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:16:14 +0200
Subject: [PATCH 07/72] Go back to CPU for cv2 resizing :cop:

---
 pytorch_grad_cam/utils/image.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 025db23be..9b22b638a 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -164,7 +164,16 @@ def scale_cam_image(cam, target_size=None):
         img = img - torch.min(img)
         img = img / (1e-7 + torch.max(img))
         if target_size is not None:
-            img = F.resize(img, target_size) # TODO: Investigate better resizing techniques - Keeping defaults for now
+            # There seem to be many different ways to resize a torch tensor
+            # with varying results
+            # TODO: Investigate these
+            # For now going to convert to cpu numpy and back just to get
+            # the crude experiment working - and then begin to tune and refine
+            # Possible way:
+            # img = F.resize(img, target_size) # TODO: Investigate better resizing techniques - Keeping defaults for now
+
+            # Convert to numpy
+            img = torch.tensor(cv2.resize(img.cpu().numpy(), target_size))
         result.append(img)
     result = result.to(torch.float32)
 

From bf9b9dbc6b7bc5aa5ef591e947696c267d6d3b00 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:19:50 +0200
Subject: [PATCH 08/72] Attempt to get the image scaling function working with
 torch :cop:

---
 pytorch_grad_cam/utils/image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 9b22b638a..918214b20 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -175,7 +175,7 @@ def scale_cam_image(cam, target_size=None):
             # Convert to numpy
             img = torch.tensor(cv2.resize(img.cpu().numpy(), target_size))
         result.append(img)
-    result = result.to(torch.float32)
+    result = torch.tensor(np.array(result)).to(torch.float32) # TODO: Optimise this to use pre-initialised torch tensor
 
     return result
 

From df9b0358fb7d0ab9ac5a81ffd9651c4a930281f4 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:30:23 +0200
Subject: [PATCH 09/72] Use torch tensor only on the list of numpy arrays :cop:

---
 pytorch_grad_cam/utils/image.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 918214b20..883150ae2 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -173,9 +173,10 @@ def scale_cam_image(cam, target_size=None):
             # img = F.resize(img, target_size) # TODO: Investigate better resizing techniques - Keeping defaults for now
 
             # Convert to numpy
-            img = torch.tensor(cv2.resize(img.cpu().numpy(), target_size))
+            # img = torch.tensor(cv2.resize(img.cpu().numpy(), target_size))
+            img = cv2.resize(img.cpu().numpy(), target_size)
         result.append(img)
-    result = torch.tensor(np.array(result)).to(torch.float32) # TODO: Optimise this to use pre-initialised torch tensor
+    result = torch.tensor(np.array(result).astype('float32')) # TODO: Optimise this to use pre-initialised torch tensor
 
     return result
 

From fe6cb924fce1d9770d451baf572ae936602595e3 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:32:58 +0200
Subject: [PATCH 10/72] Use the correct torch function :cop:

---
 pytorch_grad_cam/base_cam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index b9f3c3e63..547f001e4 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -139,7 +139,7 @@ def compute_cam_per_layer(
     def aggregate_multi_layers(
             self,
             cam_per_target_layer: torch.Tensor) -> torch.Tensor:
-        cam_per_target_layer = torch.concatenate(cam_per_target_layer, axis=1)
+        cam_per_target_layer = torch.cat(cam_per_target_layer, axis=1)
         cam_per_target_layer = torch.maximum(cam_per_target_layer, torch.tensor(0))
         result = torch.mean(cam_per_target_layer, axis=1)
         return scale_cam_image(result)
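
Note on PATCH 10: torch.cat is the long-standing name; torch.concatenate is a newer
NumPy-style alias that is not present in older PyTorch releases, so this change keeps
the code portable. Sketch with illustrative shapes:

    import torch

    a = torch.rand(4, 1, 7, 7)
    b = torch.rand(4, 1, 7, 7)
    both = torch.cat([a, b], dim=1)   # (4, 2, 7, 7); dim= is torch's spelling of axis=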

From f58e88cd5c50c5508af31e4633cb852f5fd707d3 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:43:49 +0200
Subject: [PATCH 11/72] Attempt to fix torch resizing :cop:

---
 pytorch_grad_cam/utils/image.py | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 883150ae2..5d7681a44 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -4,8 +4,7 @@
 import cv2
 import numpy as np
 import torch
-import torchvision.transforms.functional as F
-from torchvision.transforms import Compose, Normalize, ToTensor
+from torchvision.transforms import Compose, Normalize, ToTensor, Resize
 from typing import List, Dict
 import math
 
@@ -159,26 +158,22 @@ def show_factorization_on_image(img: np.ndarray,
 
 
 def scale_cam_image(cam, target_size=None):
-    result = []
-    for img in cam:
+    if target_size is not None:
+        result = torch.zeros([cam.shape[0], target_size[0], target_size[1]])
+    else:
+        result = torch.zeros(cam.shape)
+
+    for i in range(cam.shape[0]):
+        img = cam[i]
         img = img - torch.min(img)
         img = img / (1e-7 + torch.max(img))
+
         if target_size is not None:
-            # There seem to be many different ways to resize a torch tensor
-            # with varying results
-            # TODO: Investigate these
-            # For now going to convert to cpu numpy and back just to get
-            # the crude experiment working - and then begin to tune and refine
-            # Possible way:
-            # img = F.resize(img, target_size) # TODO: Investigate better resizing techniques - Keeping defaults for now
-
-            # Convert to numpy
-            # img = torch.tensor(cv2.resize(img.cpu().numpy(), target_size))
-            img = cv2.resize(img.cpu().numpy(), target_size)
-        result.append(img)
-    result = torch.tensor(np.array(result).astype('float32')) # TODO: Optimise this to use pre-initialised torch tensor
+            img = Resize(img, target_size)
 
-    return result
+        result[i] = img
+
+    return result.to(torch.float32)
 
 
 def scale_accross_batch_and_channels(tensor, target_size):
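
Note on PATCH 11: torchvision.transforms.Resize is a transform class. It is
constructed with the target size and then called on the image, so the
Resize(img, target_size) call above puts the image where the size is expected and
does not do what it looks like, which PATCH 12 then works around. A minimal sketch
of the two usual options, assuming a (N, H, W) CAM batch:

    import torch
    import torch.nn.functional as F
    from torchvision.transforms import Resize

    cam = torch.rand(8, 7, 7)
    size = (224, 224)

    out1 = Resize(size)(cam)                        # transform object, (..., H, W) input
    out2 = F.interpolate(cam.unsqueeze(1), size=size,
                         mode='bilinear', align_corners=False).squeeze(1)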

From 594bb0c771a594c4b36e738ba10f49a74e5683ec Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:52:36 +0200
Subject: [PATCH 12/72] Use a transpose for the experiment. Investigate a
 proper resize later :cop:

---
 pytorch_grad_cam/utils/image.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 5d7681a44..e571d7470 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -169,7 +169,9 @@ def scale_cam_image(cam, target_size=None):
         img = img / (1e-7 + torch.max(img))
 
         if target_size is not None:
-            img = Resize(img, target_size)
+            # transform = Resize(target_size)
+            # img = Resize(size = target_size)(img)
+            img = img.T # Swap axes around for now. TODO: Investigate a better solution
 
         result[i] = img
 

From f4739c2d17afa4f4ed10a92d7d0f479edcb0b84b Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:56:30 +0200
Subject: [PATCH 13/72] Remove the resize for now :cop:

---
 pytorch_grad_cam/utils/image.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index e571d7470..2d7d416e0 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -168,10 +168,10 @@ def scale_cam_image(cam, target_size=None):
         img = img - torch.min(img)
         img = img / (1e-7 + torch.max(img))
 
-        if target_size is not None:
+        # if target_size is not None:
             # transform = Resize(target_size)
             # img = Resize(size = target_size)(img)
-            img = img.T # Swap axes around for now. TODO: Investigate a better solution
+
 
         result[i] = img
 

From 995550e53348a9a4456978b4cecb48d3870b9682 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sat, 5 Nov 2022 23:59:28 +0200
Subject: [PATCH 14/72] Stop the scaling function from changing dimensions (for
 now)

---
 pytorch_grad_cam/utils/image.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 2d7d416e0..8b83deb56 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -158,10 +158,16 @@ def show_factorization_on_image(img: np.ndarray,
 
 
 def scale_cam_image(cam, target_size=None):
-    if target_size is not None:
-        result = torch.zeros([cam.shape[0], target_size[0], target_size[1]])
-    else:
-        result = torch.zeros(cam.shape)
+    # Disabled the target_size scaling for now
+    # It appears to swap the axes dimensions and needs further work for the
+    # proof of concept
+
+    # if target_size is not None:
+    #     result = torch.zeros([cam.shape[0], target_size[0], target_size[1]])
+    # else:
+    #     result = torch.zeros(cam.shape)
+
+    result = torch.zeros(cam.shape)
 
     for i in range(cam.shape[0]):
         img = cam[i]

From 02b94515511a7b87ec0d1f30375925a0915f32d9 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:17:18 +0200
Subject: [PATCH 15/72] Create a simple benchmark :cop:

---
 benchmarks/torch_benchmark.py | 40 +++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 benchmarks/torch_benchmark.py

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
new file mode 100644
index 000000000..8595c6a69
--- /dev/null
+++ b/benchmarks/torch_benchmark.py
@@ -0,0 +1,40 @@
+import argparse
+import cv2
+import numpy as np
+import torch
+
+from pytorch_grad_cam import GradCAM, \
+    ScoreCAM, \
+    GradCAMPlusPlus, \
+    AblationCAM, \
+    XGradCAM, \
+    EigenCAM, \
+    EigenGradCAM, \
+    LayerCAM, \
+    FullGrad
+
+import torchvision # You may need to install separately
+from torchvision import models
+
+from torch.profiler import profile, record_function, ProfilerActivity
+
+model =  models.resnet50()
+random_tensor = torch.rand((256, 60, 3)) # TODO: Use real data?
+
+# Test with numpy v1.4.6 (master)
+# Test with torch v1.4.7 (wip)
+
+# Run on CPU with profiler (save the profile to print later)
+dev = torch.device('cpu')
+model.to(dev)
+
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+
+print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15))
+breakpoint() # For now as I write this
+
+# Run on CUDA with profiler (save the profile to print later)
+
+# Run on CPU x100 (get min, max, and avg times)
+
+# Run on CUDA x100
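
Note on PATCH 15: as committed, the `with profile(...)` block has an empty body,
which is a SyntaxError (PATCH 16 adds the CAM call). A minimal self-contained sketch
of the profiler pattern the benchmark is building towards, with an illustrative
model and input:

    import torch
    from torch.profiler import profile, ProfilerActivity
    from torchvision import models

    model = models.resnet50()
    x = torch.rand(1, 3, 224, 224)

    with profile(activities=[ProfilerActivity.CPU],
                 profile_memory=True, record_shapes=True) as prof:
        model(x)

    print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15))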

From cc557d1d2974a8cc44b50fd44f9a4de9c204f26e Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:20:08 +0200
Subject: [PATCH 16/72] Add in basic GradCAM :cop:

---
 benchmarks/torch_benchmark.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 8595c6a69..7634b4b1b 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -26,10 +26,12 @@
 
 # Run on CPU with profiler (save the profile to print later)
 dev = torch.device('cpu')
+use_cuda = False
 model.to(dev)
+target_layers = [model.blocks[-1].norm1]
 
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-
+    GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
 print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15))
 breakpoint() # For now as I write this
 

From 71f51d017ef9f196880fc74d8d30d08a8d9d9ea0 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:23:55 +0200
Subject: [PATCH 17/72] Continue to write a simple GradCAM :cop:

---
 benchmarks/torch_benchmark.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 7634b4b1b..5aafc8b1a 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -21,17 +21,29 @@
 model =  models.resnet50()
 random_tensor = torch.rand((256, 60, 3)) # TODO: Use real data?
 
+# TODOs:
 # Test with numpy v1.4.6 (master)
 # Test with torch v1.4.7 (wip)
+# Test other CAMs besides GradCAM
 
 # Run on CPU with profiler (save the profile to print later)
 dev = torch.device('cpu')
 use_cuda = False
+
 model.to(dev)
-target_layers = [model.blocks[-1].norm1]
+random_tensor.to(dev)
+
+# Some defaults I use in research code
+target_layers = [model.fc]
+batch_size = 8
+targets = None # [ClassifierOutputTarget(None)]
 
+# Profile the CPU call
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
+    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
+    cam_function.batch_size = batch_size
+    heatmap = cam_function(input_tensor=input_tensor, targets=targets)
+
 print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15))
 breakpoint() # For now as I write this
 

From 06efbc4fb1decabf932ff4f8d1cea9548e1fe974 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:24:58 +0200
Subject: [PATCH 18/72] Properly name the variable :cop:

---
 benchmarks/torch_benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 5aafc8b1a..97683bce3 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -19,7 +19,7 @@
 from torch.profiler import profile, record_function, ProfilerActivity
 
 model =  models.resnet50()
-random_tensor = torch.rand((256, 60, 3)) # TODO: Use real data?
+input_tensor = torch.rand((256, 60, 3)) # TODO: Use real data?
 
 # TODOs:
 # Test with numpy v1.4.6 (master)
@@ -31,7 +31,7 @@
 use_cuda = False
 
 model.to(dev)
-random_tensor.to(dev)
+input_tensor.to(dev)
 
 # Some defaults I use in research code
 target_layers = [model.fc]

From f2578d7d1395745c43937b3033f9dcfa385a6c1d Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:25:47 +0200
Subject: [PATCH 19/72] Fix the tensor stack :cop:

---
 benchmarks/torch_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 97683bce3..82c648549 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -19,7 +19,7 @@
 from torch.profiler import profile, record_function, ProfilerActivity
 
 model =  models.resnet50()
-input_tensor = torch.rand((256, 60, 3)) # TODO: Use real data?
+input_tensor = torch.rand((1, 256, 60, 3)) # TODO: Use real data?
 
 # TODOs:
 # Test with numpy v1.4.6 (master)

From a4d2750f06bac92dd6283f9a77cdfceba1c21a11 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:28:04 +0200
Subject: [PATCH 20/72] Fix the dimensions needed for Resnet :cop:

---
 benchmarks/torch_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 82c648549..129d3157b 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -19,7 +19,7 @@
 from torch.profiler import profile, record_function, ProfilerActivity
 
 model =  models.resnet50()
-input_tensor = torch.rand((1, 256, 60, 3)) # TODO: Use real data?
+input_tensor = torch.rand((1, 3, 256, 60)) # TODO: Use real data?
 
 # TODOs:
 # Test with numpy v1.4.6 (master)

From 705812289b20fd24116a3068395a598f2eb8d457 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:32:55 +0200
Subject: [PATCH 21/72] Change target layer :cop:

---
 benchmarks/torch_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 129d3157b..38b5334ac 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -34,7 +34,7 @@
 input_tensor.to(dev)
 
 # Some defaults I use in research code
-target_layers = [model.fc]
+target_layers = [model.layer4]
 batch_size = 8
 targets = None # [ClassifierOutputTarget(None)]
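
Note on PATCH 21: the switch away from model.fc matters because Grad-CAM needs a
layer with spatial activations, and fc is a 1-D Linear head. For torchvision's
ResNet50 the last convolutional stage is the usual target; a sketch:

    from torchvision import models

    model = models.resnet50()
    target_layers = [model.layer4[-1]]   # last bottleneck block; [model.layer4] as in
                                         # the patch hooks the whole stage and also works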
 

From 1a77f74a5867021780b12d971ba627fe5a122b17 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:36:00 +0200
Subject: [PATCH 22/72] Add in cuda profiling :cop:

---
 benchmarks/torch_benchmark.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 38b5334ac..ea9abb57b 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -2,6 +2,7 @@
 import cv2
 import numpy as np
 import torch
+import time
 
 from pytorch_grad_cam import GradCAM, \
     ScoreCAM, \
@@ -44,11 +45,25 @@
     cam_function.batch_size = batch_size
     heatmap = cam_function(input_tensor=input_tensor, targets=targets)
 
-print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15))
-breakpoint() # For now as I write this
+cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
+dev = torch.device('cuda')
+use_cuda = True
+
+model.to(dev)
+input_tensor.to(dev)
+
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
+    cam_function.batch_size = batch_size
+    heatmap = cam_function(input_tensor=input_tensor, targets=targets)
+
+cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+
+breakpoint()
 
 # Run on CPU x100 (get min, max, and avg times)
 
+
 # Run on CUDA x100
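
Note on PATCH 22: both profiler runs request only ProfilerActivity.CPU, so GPU kernel
time will not show up in the "cuda" table. A sketch of a GPU-aware profile, assuming a
CUDA device is available (the matmul is a stand-in for the CAM call):

    import torch
    from torch.profiler import profile, ProfilerActivity

    dev = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    activities = [ProfilerActivity.CPU]
    if dev.type == 'cuda':
        activities.append(ProfilerActivity.CUDA)

    with profile(activities=activities, profile_memory=True) as prof:
        torch.rand(1024, 1024, device=dev) @ torch.rand(1024, 1024, device=dev)

    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=15))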

From f4b759ad12b66d4fcece2d38f9fdcb297d96c4ab Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:40:16 +0200
Subject: [PATCH 23/72] Create the large loop :cop:

---
 benchmarks/torch_benchmark.py | 43 +++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index ea9abb57b..4d4a6166e 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -19,13 +19,15 @@
 
 from torch.profiler import profile, record_function, ProfilerActivity
 
+number_of_inputs = 1000
 model =  models.resnet50()
-input_tensor = torch.rand((1, 3, 256, 60)) # TODO: Use real data?
+input_tensor = torch.rand((number_of_inputs, 3, 256, 60)) # TODO: Use real data?
 
 # TODOs:
 # Test with numpy v1.4.6 (master)
 # Test with torch v1.4.7 (wip)
 # Test other CAMs besides GradCAM
+# Nice output
 
 # Run on CPU with profiler (save the profile to print later)
 dev = torch.device('cpu')
@@ -61,9 +63,42 @@
 
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
-breakpoint()
+# Run on CPU x1000 (get min, max, and avg times)
+cpu_min_time = 10000000000000
+cpu_max_time = 0
+cpu_sum_of_times = 0
+
+for i in range(number_of_inputs):
+    start_time = time.time()
+
+    input_tensor = torch.rand((number_of_inputs, 3, 256, 60)) # TODO: Use real data?
+
+    dev = torch.device('cpu')
+    use_cuda = False
 
-# Run on CPU x100 (get min, max, and avg times)
+    model.to(dev)
+    input_tensor.to(dev)
 
+    # Some defaults I use in research code
+    target_layers = [model.layer4]
+    batch_size = 8
+    targets = None # [ClassifierOutputTarget(None)]
 
-# Run on CUDA x100
+    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
+    cam_function.batch_size = batch_size
+    heatmap = cam_function(input_tensor=input_tensor, targets=targets)
+
+    end_time = time.time()
+    time_difference = end_time - start_time
+
+    cpu_sum_of_times += time_difference
+
+    if time_difference > cpu_max_time:
+        cpu_max_time = time_difference
+
+    if time_difference < cpu_min_time:
+        cpu_min_time = time_difference
+
+cpu_avg_time = cpu_sum_of_times / number_of_inputs
+breakpoint()
+# Run on CUDA x1000

From 15ca2becdb16c7c67f4c32abc174285d25fbc6aa Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:47:57 +0200
Subject: [PATCH 24/72] Refactor code to share the common benchmark logic :cop:

---
 benchmarks/torch_benchmark.py | 109 +++++++++++++++-------------------
 1 file changed, 48 insertions(+), 61 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 4d4a6166e..7d4c78a7a 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -19,86 +19,73 @@
 
 from torch.profiler import profile, record_function, ProfilerActivity
 
-number_of_inputs = 1000
-model =  models.resnet50()
-input_tensor = torch.rand((number_of_inputs, 3, 256, 60)) # TODO: Use real data?
+def run_gradcam(model, number_of_inputs, use_cuda=False):
+    min_time = 10000000000000
+    max_time = 0
+    sum_of_times = 0
 
-# TODOs:
-# Test with numpy v1.4.6 (master)
-# Test with torch v1.4.7 (wip)
-# Test other CAMs besides GradCAM
-# Nice output
+    dev = torch.device('cpu')
+    if use_cuda:
+        dev = torch.device('cuda:0')
 
-# Run on CPU with profiler (save the profile to print later)
-dev = torch.device('cpu')
-use_cuda = False
+    # TODO: Use real data?
+    # TODO: Configurable dimensions?
 
-model.to(dev)
-input_tensor.to(dev)
+    # Some defaults I use in research code
+    input_tensor = torch.rand((number_of_inputs, 3, 256, 60))
+    batch_size = 8
+    targets = None # [ClassifierOutputTarget(None)]
 
-# Some defaults I use in research code
-target_layers = [model.layer4]
-batch_size = 8
-targets = None # [ClassifierOutputTarget(None)]
+    model.to(dev)
+    target_layers = [model.layer4] # Last CNN layer of ResNet50
 
-# Profile the CPU call
-with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
     cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
     cam_function.batch_size = batch_size
-    heatmap = cam_function(input_tensor=input_tensor, targets=targets)
-
-cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
-
-# Run on CUDA with profiler (save the profile to print later)
-dev = torch.device('cuda')
-use_cuda = True
 
-model.to(dev)
-input_tensor.to(dev)
+    for i in range(number_of_inputs):
+        start_time = time.time()
 
-with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
-    cam_function.batch_size = batch_size
-    heatmap = cam_function(input_tensor=input_tensor, targets=targets)
+        # Actual code to benchmark
+        input_image = input_tensor[i].to(dev)
+        heatmap = cam_function(input_tensor=input_image, targets=targets)
 
-cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+        end_time = time.time()
+        time_difference = end_time - start_time
 
-# Run on CPU x1000 (get min, max, and avg times)
-cpu_min_time = 10000000000000
-cpu_max_time = 0
-cpu_sum_of_times = 0
+        sum_of_times += time_difference
 
-for i in range(number_of_inputs):
-    start_time = time.time()
+        if time_difference > max_time:
+            max_time = time_difference
 
-    input_tensor = torch.rand((number_of_inputs, 3, 256, 60)) # TODO: Use real data?
+        if time_difference < min_time:
+            min_time = time_difference
 
-    dev = torch.device('cpu')
-    use_cuda = False
+    avg_time = sum_of_times / number_of_inputs
+    return [min_time, max_time, avg_time]
 
-    model.to(dev)
-    input_tensor.to(dev)
-
-    # Some defaults I use in research code
-    target_layers = [model.layer4]
-    batch_size = 8
-    targets = None # [ClassifierOutputTarget(None)]
+number_of_inputs = 1000
+model =  models.resnet50()
 
-    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
-    cam_function.batch_size = batch_size
-    heatmap = cam_function(input_tensor=input_tensor, targets=targets)
+# TODOs:
+# Test with numpy v1.4.6 (master)
+# Test with torch v1.4.7 (wip)
+# Test other CAMs besides GradCAM
+# Nice output
 
-    end_time = time.time()
-    time_difference = end_time - start_time
+# Run on CPU with profiler (save the profile to print later)
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = run_gradcam(model, number_of_inputs, use_cuda=False)
+cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
-    cpu_sum_of_times += time_difference
+# Run on CUDA with profiler (save the profile to print later)
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, use_cuda=True)
+cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
-    if time_difference > cpu_max_time:
-        cpu_max_time = time_difference
+# Run on CPU x1000 (get min, max, and avg times)
+cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, use_cuda=False)
 
-    if time_difference < cpu_min_time:
-        cpu_min_time = time_difference
+# Run on CUDA x1000
+cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, use_cuda=True)
 
-cpu_avg_time = cpu_sum_of_times / number_of_inputs
 breakpoint()
-# Run on CUDA x1000
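
Note on PATCH 24: one caveat for the timing loop is that CUDA kernels launch
asynchronously, so time.time() around the CAM call can under-report GPU work unless
the device is synchronised. A hedged sketch of the pattern (not part of the patch):

    import time
    import torch

    def timed(fn, use_cuda=False):
        if use_cuda:
            torch.cuda.synchronize()
        start = time.time()
        out = fn()
        if use_cuda:
            torch.cuda.synchronize()   # wait for queued kernels before stopping the clock
        return out, time.time() - start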

From fb1b50d3e280b47a1ff843a7872f6a13119531e0 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:49:21 +0200
Subject: [PATCH 25/72] Fix batching :cop:

---
 benchmarks/torch_benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 7d4c78a7a..7235b00cd 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -42,11 +42,11 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
     cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
     cam_function.batch_size = batch_size
 
-    for i in range(number_of_inputs):
+    for i in range(0, number_of_inputs, batch_size):
         start_time = time.time()
 
         # Actual code to benchmark
-        input_image = input_tensor[i].to(dev)
+        input_image = input_tensor[i:i+batch_size].to(dev)
         heatmap = cam_function(input_tensor=input_image, targets=targets)
 
         end_time = time.time()

From 62c17096270fc15b67e9e8f28d1021b9871aca87 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 11:53:49 +0200
Subject: [PATCH 26/72] Add in proper output :cop:

---
 benchmarks/torch_benchmark.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 7235b00cd..f78f28b1e 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -66,6 +66,8 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
 number_of_inputs = 1000
 model =  models.resnet50()
 
+print(f'Benchmarking GradCAM using {number_of_inputs} images for ResNet50...')
+
 # TODOs:
 # Test with numpy v1.4.6 (master)
 # Test with torch v1.4.7 (wip)
@@ -73,19 +75,46 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
 # Nice output
 
 # Run on CPU with profiler (save the profile to print later)
+print('Profile list of images on CPU...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
     cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = run_gradcam(model, number_of_inputs, use_cuda=False)
 cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
+print('Profile list of images on Cuda...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
     cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, use_cuda=True)
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CPU x1000 (get min, max, and avg times)
+print('Run list of images on CPU...')
 cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, use_cuda=False)
 
 # Run on CUDA x1000
+print('Run list of images on Cuda...')
 cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, use_cuda=True)
 
-breakpoint()
+print('Complete!')
+
+print('==============================================================================\n\n')
+print('CPU Profile:\n')
+print(cpu_profile)
+
+print('==============================================================================\n\n')
+print('Cuda Profile:\n')
+print(cuda_profile)
+
+print('==============================================================================\n\n')
+print('CPU Timing (No Profiler):\n')
+print(f'Min time: {cpu_min_time}\n')
+print(f'Max time: {cpu_max_time}\n')
+print(f'Avg time: {cpu_avg_time}\n')
+
+print('==============================================================================\n\n')
+print('Cuda Timing (No Profiler):\n')
+print(f'Min time: {cuda_min_time}\n')
+print(f'Max time: {cuda_max_time}\n')
+print(f'Avg time: {cuda_avg_time}\n')
+
+print('==============================================================================\n\n')
+print('Done!')

From 045d200e651195ab7e3cf40e5b07550ef311f34d Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 12:00:49 +0200
Subject: [PATCH 27/72] Add in loading bar :cop:

---
 benchmarks/torch_benchmark.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index f78f28b1e..4aeef745f 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -3,6 +3,7 @@
 import numpy as np
 import torch
 import time
+import tqdm
 
 from pytorch_grad_cam import GradCAM, \
     ScoreCAM, \
@@ -42,6 +43,8 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
     cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
     cam_function.batch_size = batch_size
 
+    pbar = tqdm.tqdm(total=number_of_inputs)
+
     for i in range(0, number_of_inputs, batch_size):
         start_time = time.time()
 
@@ -60,6 +63,8 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
         if time_difference < min_time:
             min_time = time_difference
 
+        pbar.update(batch_size)
+
     avg_time = sum_of_times / number_of_inputs
     return [min_time, max_time, avg_time]
 

From 20d7ebd5dc3d3ed8cf1ec8d881d1acfdfe836e94 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 12:01:02 +0200
Subject: [PATCH 28/72] Reduce to 100 images :cop:

---
 benchmarks/torch_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 4aeef745f..39015f611 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -68,7 +68,7 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
     avg_time = sum_of_times / number_of_inputs
     return [min_time, max_time, avg_time]
 
-number_of_inputs = 1000
+number_of_inputs = 100
 model =  models.resnet50()
 
 print(f'Benchmarking GradCAM using {number_of_inputs} images for ResNet50...')

From d9dbc85727fead2cdb55a63dfbec728474e3abed Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 21:34:16 +0200
Subject: [PATCH 29/72] Attempt using a bigger batch size :cop:

---
 benchmarks/torch_benchmark.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 39015f611..3e054f63a 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -20,7 +20,7 @@
 
 from torch.profiler import profile, record_function, ProfilerActivity
 
-def run_gradcam(model, number_of_inputs, use_cuda=False):
+def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
     min_time = 10000000000000
     max_time = 0
     sum_of_times = 0
@@ -34,7 +34,6 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
 
     # Some defaults I use in research code
     input_tensor = torch.rand((number_of_inputs, 3, 256, 60))
-    batch_size = 8
     targets = None # [ClassifierOutputTarget(None)]
 
     model.to(dev)
@@ -82,22 +81,22 @@ def run_gradcam(model, number_of_inputs, use_cuda=False):
 # Run on CPU with profiler (save the profile to print later)
 print('Profile list of images on CPU...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = run_gradcam(model, number_of_inputs, use_cuda=False)
+    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=False)
 cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
 print('Profile list of images on Cuda...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, use_cuda=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=True)
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CPU x1000 (get min, max, and avg times)
 print('Run list of images on CPU...')
-cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, use_cuda=False)
+cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=False)
 
 # Run on CUDA x1000
 print('Run list of images on Cuda...')
-cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, use_cuda=True)
+cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=True)
 
 print('Complete!')
 

From 2756d71c5791afcedeace35e5ae767ada931e33d Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Sun, 6 Nov 2022 22:22:34 +0200
Subject: [PATCH 30/72] Bump batch_size :cop:

---
 benchmarks/torch_benchmark.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 3e054f63a..2b93f4e57 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -20,6 +20,11 @@
 
 from torch.profiler import profile, record_function, ProfilerActivity
 
+number_of_inputs = 1000
+model =  models.resnet50()
+
+print(f'Benchmarking GradCAM using {number_of_inputs} images for ResNet50...')
+
 def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
     min_time = 10000000000000
     max_time = 0
@@ -67,11 +72,6 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
     avg_time = sum_of_times / number_of_inputs
     return [min_time, max_time, avg_time]
 
-number_of_inputs = 100
-model =  models.resnet50()
-
-print(f'Benchmarking GradCAM using {number_of_inputs} images for ResNet50...')
-
 # TODOs:
 # Test with numpy v1.4.6 (master)
 # Test with torch v1.4.7 (wip)
@@ -81,22 +81,22 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
 # Run on CPU with profiler (save the profile to print later)
 print('Profile list of images on CPU...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=False)
+    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
 print('Profile list of images on Cuda...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CPU x1000 (get min, max, and avg times)
 print('Run list of images on CPU...')
-cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=False)
+cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 
 # Run on CUDA x1000
 print('Run list of images on Cuda...')
-cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=32, use_cuda=True)
+cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 
 print('Complete!')
 

From 4200b99038f68db34c3377fedefe2b518a53f6c4 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Tue, 8 Nov 2022 18:11:09 +0200
Subject: [PATCH 31/72] Add workflow test :cop:

---
 benchmarks/torch_benchmark.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 2b93f4e57..fb2fb285e 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -25,7 +25,7 @@
 
 print(f'Benchmarking GradCAM using {number_of_inputs} images for ResNet50...')
 
-def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
+def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=False):
     min_time = 10000000000000
     max_time = 0
     sum_of_times = 0
@@ -56,6 +56,11 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
         input_image = input_tensor[i:i+batch_size].to(dev)
         heatmap = cam_function(input_tensor=input_image, targets=targets)
 
+        if workflow_test:
+            # Create a binary map
+            threshold_plot = torch.where(heatmap > 0.5, 1, 0)
+            output_image = input_image * threshold_plot
+
         end_time = time.time()
         time_difference = end_time - start_time
 
@@ -90,6 +95,12 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
     cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
+# Run on CUDA with extra workflow
+print('Profile list of images on Cuda and then run workflow...')
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+
 # Run on CPU x1000 (get min, max, and avg times)
 print('Run list of images on CPU...')
 cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
@@ -98,6 +109,10 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
 print('Run list of images on Cuda...')
 cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 
+# Run Workflow
+print('Run list of images on Cuda with a workflow...')
+workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+
 print('Complete!')
 
 print('==============================================================================\n\n')
@@ -108,6 +123,10 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
 print('Cuda Profile:\n')
 print(cuda_profile)
 
+print('==============================================================================\n\n')
+print('Workflow Cuda Profile:\n')
+print(work_flow_cuda_profile)
+
 print('==============================================================================\n\n')
 print('CPU Timing (No Profiler):\n')
 print(f'Min time: {cpu_min_time}\n')
@@ -120,5 +139,11 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False):
 print(f'Max time: {cuda_max_time}\n')
 print(f'Avg time: {cuda_avg_time}\n')
 
+print('==============================================================================\n\n')
+print('Workflow Cuda Timing (No Profiler):\n')
+print(f'Min time: {workflow_cuda_min_time}\n')
+print(f'Max time: {workflow_cuda_max_time}\n')
+print(f'Avg time: {workflow_cuda_avg_time}\n')
+
 print('==============================================================================\n\n')
 print('Done!')
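
Note on PATCH 31: the workflow step multiplies a (batch, H, W) heatmap into a
(batch, 3, H, W) input, which does not broadcast without an explicit channel
dimension - presumably why patches 32-35 wrap the heatmap in torch.tensor (the
released 1.4.6 CAM returns a NumPy array) and fall back to a per-image loop. A
sketch of the intended masking with illustrative shapes:

    import torch

    input_image = torch.rand(8, 3, 224, 224)
    heatmap = torch.rand(8, 224, 224)               # CAM output per image

    mask = (heatmap > 0.5).to(input_image.dtype)    # binary map
    masked = input_image * mask.unsqueeze(1)        # (8, 1, 224, 224) broadcasts over RGB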

From 1489ea3170f9ca00781d377e6effb1e9fb386a95 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Tue, 8 Nov 2022 18:14:04 +0200
Subject: [PATCH 32/72] Fix tensor issue in 1.4.6 :cop:

---
 benchmarks/torch_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index fb2fb285e..034e99fd0 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -58,7 +58,7 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
 
         if workflow_test:
             # Create a binary map
-            threshold_plot = torch.where(heatmap > 0.5, 1, 0)
+            threshold_plot = torch.where(torch.tensor(heatmap) > 0.5, 1, 0)
             output_image = input_image * threshold_plot
 
         end_time = time.time()

From 8833ee1df54c3218461dab0e1063296c8a1a4e5c Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Tue, 8 Nov 2022 18:17:24 +0200
Subject: [PATCH 33/72] Add inner loop :cop:

---
 benchmarks/torch_benchmark.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 034e99fd0..c8836713d 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -57,9 +57,10 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
         heatmap = cam_function(input_tensor=input_image, targets=targets)
 
         if workflow_test:
-            # Create a binary map
-            threshold_plot = torch.where(torch.tensor(heatmap) > 0.5, 1, 0)
-            output_image = input_image * threshold_plot
+            for j in range(batch_size):
+                # Create a binary map
+                threshold_plot = torch.where(torch.tensor(heatmap[j]) > 0.5, 1, 0)
+                output_image = input_image * threshold_plot
 
         end_time = time.time()
         time_difference = end_time - start_time

From d492c36262ec2fbf19b76dc30515ee23f3a3c405 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Tue, 8 Nov 2022 18:19:12 +0200
Subject: [PATCH 34/72] Force cuda device :cop:

---
 benchmarks/torch_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index c8836713d..0012eaf77 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -59,7 +59,7 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
         if workflow_test:
             for j in range(batch_size):
                 # Create a binary map
-                threshold_plot = torch.where(torch.tensor(heatmap[j]) > 0.5, 1, 0)
+                threshold_plot = torch.where(torch.tensor(heatmap[j]).to(torch.device('cuda:0')) > 0.5, 1, 0)
                 output_image = input_image * threshold_plot
 
         end_time = time.time()

From 5bbdf8f153fc338fbfc544e45a8c8f1e1efc56fc Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Tue, 8 Nov 2022 18:21:01 +0200
Subject: [PATCH 35/72] Fix loop range :cop:

---
 benchmarks/torch_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 0012eaf77..531c20703 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -57,7 +57,7 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
         heatmap = cam_function(input_tensor=input_image, targets=targets)
 
         if workflow_test:
-            for j in range(batch_size):
+            for j in range(heatmap.shape[0]):
                 # Create a binary map
                 threshold_plot = torch.where(torch.tensor(heatmap[j]).to(torch.device('cuda:0')) > 0.5, 1, 0)
                 output_image = input_image * threshold_plot
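
Taken together, patches 32-35 implement the extra workflow step as: threshold each CAM heatmap into a binary map and multiply it into the input batch. A minimal vectorised sketch of that step, assuming heatmap is already a torch tensor of shape (N, H, W) on the same device as input_image (N, C, H, W):

    import torch

    def apply_cam_mask(input_image: torch.Tensor, heatmap: torch.Tensor,
                       threshold: float = 0.5) -> torch.Tensor:
        # Binary map from the CAM heatmap, broadcast across the channel dimension
        mask = (heatmap > threshold).to(input_image.dtype)   # (N, H, W)
        return input_image * mask.unsqueeze(1)                # (N, C, H, W)

This is only an illustration of the intended workflow, not the benchmark code itself; it avoids the per-sample loop and the extra torch.tensor copy.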

From b8a8a4669cde9bd8302d32c6dc3cab3a769f307f Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 10 Nov 2022 15:23:16 +0200
Subject: [PATCH 36/72] Make use of the tensor resize transform :cop:

---
 pytorch_grad_cam/utils/image.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 8b83deb56..4aba1bf24 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -174,10 +174,8 @@ def scale_cam_image(cam, target_size=None):
         img = img - torch.min(img)
         img = img / (1e-7 + torch.max(img))
 
-        # if target_size is not None:
-            # transform = Resize(target_size)
-            # img = Resize(size = target_size)(img)
-
+        if target_size is not None:
+            img = img.resize_(target_size)
 
         result[i] = img
 

From 77b19da5ee6e78ddc658b04162924e4865fa21b3 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 10 Nov 2022 15:31:24 +0200
Subject: [PATCH 37/72] Add a different model to the benchmark :cop:

---
 benchmarks/torch_benchmark.py | 60 +++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 531c20703..1eb3adfa6 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -15,6 +15,8 @@
     LayerCAM, \
     FullGrad
 
+from torch import nn
+
 import torchvision # You may need to install separately
 from torchvision import models
 
@@ -25,6 +27,43 @@
 
 print(f'Benchmarking GradCAM using {number_of_inputs} images for ResNet50...')
 
+# Simple model to test
+class SimpleCNN(nn.Module):
+  def __init__(self):
+    super(SimpleCNN, self).__init__()
+
+    # Grad-CAM interface
+    self.target_layer = nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1)
+    self.target_layers = [self.target_layer]
+
+    self.cnn_stack = nn.Sequential(
+      nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
+      nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
+      nn.ReLU(inplace=True),
+      self.target_layer,
+      nn.ReLU(inplace=True),
+      nn.MaxPool2d((2, 2)),
+      nn.Flatten(),
+      nn.Linear(122880, 10),
+      nn.Linear(10, 1)
+    )
+
+  def forward(self, x):
+    logits = self.cnn_stack(x)
+    logits = F.normalize(logits, dim = 0)
+
+    return logits
+
+def xavier_uniform_init(layer):
+  if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
+    gain = nn.init.calculate_gain('relu')
+
+    if layer.bias is not None:
+      nn.init.zeros_(layer.bias)
+
+    nn.init.xavier_uniform_(layer.weight, gain=gain)
+
+# Code to run benchmark
 def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=False):
     min_time = 10000000000000
     max_time = 0
@@ -102,6 +141,12 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
     cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
+# Run on CUDA with extra workflow
+print('Profile list of images on Cuda and then run workflow with a simple CNN...')
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+simple_work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+
 # Run on CPU x1000 (get min, max, and avg times)
 print('Run list of images on CPU...')
 cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
@@ -114,6 +159,11 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
 print('Run list of images on Cuda with a workflow...')
 workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
+print('Run list of images on Cuda with a workflow using simple CNN...')
+model = SimpleCNN()
+model.apply(xavier_uniform_init) # Randomise more weights
+simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+
 print('Complete!')
 
 print('==============================================================================\n\n')
@@ -128,6 +178,10 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
 print('Workflow Cuda Profile:\n')
 print(work_flow_cuda_profile)
 
+print('==============================================================================\n\n')
+print('Simple Workflow Cuda Profile:\n')
+print(simple_work_flow_cuda_profile)
+
 print('==============================================================================\n\n')
 print('CPU Timing (No Profiler):\n')
 print(f'Min time: {cpu_min_time}\n')
@@ -146,5 +200,11 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
 print(f'Max time: {workflow_cuda_max_time}\n')
 print(f'Avg time: {workflow_cuda_avg_time}\n')
 
+print('==============================================================================\n\n')
+print('Simple Workflow Cuda Timing (No Profiler):\n')
+print(f'Min time: {workflow_cuda_min_time}\n')
+print(f'Max time: {workflow_cuda_max_time}\n')
+print(f'Avg time: {workflow_cuda_avg_time}\n')
+
 print('==============================================================================\n\n')
 print('Done!')
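
As a sanity check on SimpleCNN's hard-coded nn.Linear(122880, 10): with the (3, 256, 60) inputs that run_gradcam generates, the stride-1, padding-1 convolutions preserve the spatial size and the 2x2 max-pool halves it, so the flattened feature map is 32 x 128 x 30:

    assert 32 * (256 // 2) * (60 // 2) == 122880  # matches nn.Linear(122880, 10)

Any other input resolution would require changing that layer.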

From 85f196badd5f1db6510192a2881ed0841bf24198 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 10 Nov 2022 15:36:35 +0200
Subject: [PATCH 38/72] Handle the tensor list size :cop:

---
 pytorch_grad_cam/utils/image.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 4aba1bf24..9a6714d17 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -162,12 +162,10 @@ def scale_cam_image(cam, target_size=None):
     # It appears to swap the axes dimensions and needs further work for the
     # proof of concept
 
-    # if target_size is not None:
-    #     result = torch.zeros([cam.shape[0], target_size[0], target_size[1]])
-    # else:
-    #     result = torch.zeros(cam.shape)
-
-    result = torch.zeros(cam.shape)
+    if target_size is not None:
+        result = torch.zeros([cam.shape[0], target_size[0], target_size[1]])
+    else:
+        result = torch.zeros(cam.shape)
 
     for i in range(cam.shape[0]):
         img = cam[i]
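
The allocation above sizes the result buffer, but the per-image scaling added in patch 36 still relies on Tensor.resize_, which reshapes storage without interpolating (hence the axis-swap comment). A hedged alternative sketch using torch.nn.functional.interpolate, not what the patch does, with target_size assumed to be (width, height) as elsewhere in the library:

    import torch
    import torch.nn.functional as F

    def scale_cam_image_interp(cam: torch.Tensor, target_size=None) -> torch.Tensor:
        # cam: (N, H, W) stack of heatmaps; target_size: (width, height) or None
        cam = cam - cam.amin(dim=(1, 2), keepdim=True)
        cam = cam / (1e-7 + cam.amax(dim=(1, 2), keepdim=True))
        if target_size is not None:
            cam = F.interpolate(cam.unsqueeze(1), size=tuple(target_size)[::-1],
                                mode='bilinear', align_corners=False).squeeze(1)
        return cam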

From a56647d29334866b7dcbd2ab54110082a9e2e588 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 10 Nov 2022 15:43:45 +0200
Subject: [PATCH 39/72] Correct the dimensions in the resize :cop:

---
 benchmarks/torch_benchmark.py   | 1 +
 pytorch_grad_cam/utils/image.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 1eb3adfa6..47c5db03e 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -35,6 +35,7 @@ def __init__(self):
     # Grad-CAM interface
     self.target_layer = nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1)
     self.target_layers = [self.target_layer]
+    self.layer4 = self.target_layer
 
     self.cnn_stack = nn.Sequential(
       nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
diff --git a/pytorch_grad_cam/utils/image.py b/pytorch_grad_cam/utils/image.py
index 9a6714d17..fd9a6f3a3 100644
--- a/pytorch_grad_cam/utils/image.py
+++ b/pytorch_grad_cam/utils/image.py
@@ -163,7 +163,7 @@ def scale_cam_image(cam, target_size=None):
     # proof of concept
 
     if target_size is not None:
-        result = torch.zeros([cam.shape[0], target_size[0], target_size[1]])
+        result = torch.zeros([cam.shape[0], target_size[1], target_size[0]])
     else:
         result = torch.zeros(cam.shape)
 
@@ -173,7 +173,7 @@ def scale_cam_image(cam, target_size=None):
         img = img / (1e-7 + torch.max(img))
 
         if target_size is not None:
-            img = img.resize_(target_size)
+            img = img.resize_(target_size).T
 
         result[i] = img
 

From 922d2d31964a3852013cf75ff0d445737b0d6546 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 10 Nov 2022 15:47:48 +0200
Subject: [PATCH 40/72] Update using the correct models in the benchmark :cop:

---
 benchmarks/torch_benchmark.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 47c5db03e..7fbb77bc1 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -16,6 +16,7 @@
     FullGrad
 
 from torch import nn
+import torch.nn.functional as F
 
 import torchvision # You may need to install separately
 from torchvision import models
@@ -144,10 +145,13 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
 
 # Run on CUDA with extra workflow
 print('Profile list of images on Cuda and then run workflow with a simple CNN...')
+model = SimpleCNN()
+model.apply(xavier_uniform_init) # Randomise more weights
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
     cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 simple_work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
+model =  models.resnet50()
 # Run on CPU x1000 (get min, max, and avg times)
 print('Run list of images on CPU...')
 cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)

From 5dbc8bcd71cf819c859ac08bb0b26a683aa0e9f1 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 10 Nov 2022 15:50:54 +0200
Subject: [PATCH 41/72] Fix output :cop:

---
 benchmarks/torch_benchmark.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 7fbb77bc1..15187eded 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -207,9 +207,9 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
 
 print('==============================================================================\n\n')
 print('Simple Workflow Cuda Timing (No Profiler):\n')
-print(f'Min time: {workflow_cuda_min_time}\n')
-print(f'Max time: {workflow_cuda_max_time}\n')
-print(f'Avg time: {workflow_cuda_avg_time}\n')
+print(f'Min time: {simple_workflow_cuda_min_time}\n')
+print(f'Max time: {simple_workflow_cuda_max_time}\n')
+print(f'Avg time: {simple_workflow_cuda_avg_time}\n')
 
 print('==============================================================================\n\n')
 print('Done!')

From 20ab49fd79dc91cdb8601d89d704f59a7db319c7 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 13:56:01 +0200
Subject: [PATCH 42/72] Improve benchmarking and make a functions file to store
 reusable components :cop:

---
 benchmarks/benchmark_functions.py | 107 ++++++++++++++++++++++++++++
 benchmarks/models_benchmark.py    |  53 ++++++++++++++
 benchmarks/torch_benchmark.py     | 113 ++++--------------------------
 3 files changed, 172 insertions(+), 101 deletions(-)
 create mode 100644 benchmarks/benchmark_functions.py
 create mode 100644 benchmarks/models_benchmark.py

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
new file mode 100644
index 000000000..f9ab50d04
--- /dev/null
+++ b/benchmarks/benchmark_functions.py
@@ -0,0 +1,107 @@
+import argparse
+import cv2
+import numpy as np
+import torch
+import time
+import tqdm
+
+from pytorch_grad_cam import GradCAM
+
+from torch import nn
+import torch.nn.functional as F
+
+import torchvision # You may need to install separately
+from torchvision import models
+
+from torch.profiler import profile, record_function, ProfilerActivity
+
+# Simple model to test
+class SimpleCNN(nn.Module):
+  def __init__(self):
+    super(SimpleCNN, self).__init__()
+
+    # Grad-CAM interface
+    self.target_layer = nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1)
+    self.target_layers = [self.target_layer]
+    self.layer4 = self.target_layer
+
+    self.cnn_stack = nn.Sequential(
+      nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
+      nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
+      nn.ReLU(inplace=True),
+      self.target_layer,
+      nn.ReLU(inplace=True),
+      nn.MaxPool2d((2, 2)),
+      nn.Flatten(),
+      nn.Linear(122880, 10),
+      nn.Linear(10, 1)
+    )
+
+  def forward(self, x):
+    logits = self.cnn_stack(x)
+    logits = F.normalize(logits, dim = 0)
+
+    return logits
+
+def xavier_uniform_init(layer):
+  if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
+    gain = nn.init.calculate_gain('relu')
+
+    if layer.bias is not None:
+      nn.init.zeros_(layer.bias)
+
+    nn.init.xavier_uniform_(layer.weight, gain=gain)
+
+# Code to run benchmark
+def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False):
+    min_time = 10000000000000
+    max_time = 0
+    sum_of_times = 0
+
+    dev = torch.device('cpu')
+    if use_cuda:
+        dev = torch.device('cuda:0')
+
+    # TODO: Use real data?
+    # TODO: Configurable dimensions?
+
+    # Some defaults I use in research code
+    input_tensor = torch.rand((number_of_inputs, 3, 256, 60))
+    targets = None # [ClassifierOutputTarget(None)]
+
+    model.to(dev)
+    target_layers = [model.layer4] # Last CNN layer of ResNet50
+
+    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
+    cam_function.batch_size = batch_size
+
+    pbar = tqdm.tqdm(total=number_of_inputs)
+
+    for i in range(0, number_of_inputs, batch_size):
+        start_time = time.time()
+
+        # Actual code to benchmark
+        input_image = input_tensor[i:i+batch_size].to(dev)
+        heatmap = cam_function(input_tensor=input_image, targets=targets)
+
+        if workflow_test:
+            for j in range(heatmap.shape[0]):
+                # Create a binary map
+                threshold_plot = torch.where(torch.tensor(heatmap[j]).to(torch.device('cuda:0')) > 0.5, 1, 0)
+                output_image = input_image * threshold_plot
+
+        end_time = time.time()
+        time_difference = end_time - start_time
+
+        sum_of_times += time_difference
+
+        if time_difference > max_time:
+            max_time = time_difference
+
+        if time_difference < min_time:
+            min_time = time_difference
+
+        pbar.update(batch_size)
+
+    avg_time = sum_of_times / number_of_inputs
+    return [min_time, max_time, avg_time]
diff --git a/benchmarks/models_benchmark.py b/benchmarks/models_benchmark.py
new file mode 100644
index 000000000..561af7f22
--- /dev/null
+++ b/benchmarks/models_benchmark.py
@@ -0,0 +1,53 @@
+import argparse
+import cv2
+import numpy as np
+import torch
+import time
+import tqdm
+
+from pytorch_grad_cam import GradCAM
+
+from torch import nn
+import torch.nn.functional as F
+
+import torchvision # You may need to install separately
+from torchvision import models
+
+from torch.profiler import profile, record_function, ProfilerActivity
+
+import benchmark_functions
+
+number_of_inputs = 1000
+
+print(f'Benchmarking GradCAM using {number_of_inputs} images for multiple models...')
+
+models_to_benchmark = [
+    ["SimpleCNN", benchmark_functions.SimpleCNN()],
+    ["resnet18", models.resnet18()],
+    ["resnet34", models.resnet34()],
+    ["resnet50", models.resnet50()],
+    ["alexnet", models.alexnet()],
+    ["vgg16", models.vgg16()],
+    ["googlenet", models.googlenet()],
+    ["mobilenet_v2", models.mobilenet_v2()],
+    ["densenet161", models.densenet161()]
+]
+
+for model_name, model in tqdm.tqdm(models_to_benchmark):
+    print('==============================================================================\n\n')
+    print(f'Simple Workflow for model #{model_name}:\n')
+
+    model.apply(xavier_uniform_init) # Randomise more weights
+    cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True)
+    cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True)
+
+    print(f'Cuda Min time: {cuda_min_time}\n')
+    print(f'Cuda Max time: {cuda_max_time}\n')
+    print(f'Cuda Avg time: {cuda_avg_time}\n\n')
+    print(f'CPU Min time: {cpu_min_time}\n')
+    print(f'CPU Max time: {cpu_max_time}\n')
+    print(f'CPU Avg time: {cpu_avg_time}\n')
+
+
+print('==============================================================================\n\n')
+print('Done!')
diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 15187eded..1528a1fbf 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -23,102 +23,13 @@
 
 from torch.profiler import profile, record_function, ProfilerActivity
 
+import benchmark_functions
+
 number_of_inputs = 1000
 model =  models.resnet50()
 
 print(f'Benchmarking GradCAM using {number_of_inputs} images for ResNet50...')
 
-# Simple model to test
-class SimpleCNN(nn.Module):
-  def __init__(self):
-    super(SimpleCNN, self).__init__()
-
-    # Grad-CAM interface
-    self.target_layer = nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1)
-    self.target_layers = [self.target_layer]
-    self.layer4 = self.target_layer
-
-    self.cnn_stack = nn.Sequential(
-      nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
-      nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
-      nn.ReLU(inplace=True),
-      self.target_layer,
-      nn.ReLU(inplace=True),
-      nn.MaxPool2d((2, 2)),
-      nn.Flatten(),
-      nn.Linear(122880, 10),
-      nn.Linear(10, 1)
-    )
-
-  def forward(self, x):
-    logits = self.cnn_stack(x)
-    logits = F.normalize(logits, dim = 0)
-
-    return logits
-
-def xavier_uniform_init(layer):
-  if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
-    gain = nn.init.calculate_gain('relu')
-
-    if layer.bias is not None:
-      nn.init.zeros_(layer.bias)
-
-    nn.init.xavier_uniform_(layer.weight, gain=gain)
-
-# Code to run benchmark
-def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=False):
-    min_time = 10000000000000
-    max_time = 0
-    sum_of_times = 0
-
-    dev = torch.device('cpu')
-    if use_cuda:
-        dev = torch.device('cuda:0')
-
-    # TODO: Use real data?
-    # TODO: Configurable dimensions?
-
-    # Some defaults I use in research code
-    input_tensor = torch.rand((number_of_inputs, 3, 256, 60))
-    targets = None # [ClassifierOutputTarget(None)]
-
-    model.to(dev)
-    target_layers = [model.layer4] # Last CNN layer of ResNet50
-
-    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
-    cam_function.batch_size = batch_size
-
-    pbar = tqdm.tqdm(total=number_of_inputs)
-
-    for i in range(0, number_of_inputs, batch_size):
-        start_time = time.time()
-
-        # Actual code to benchmark
-        input_image = input_tensor[i:i+batch_size].to(dev)
-        heatmap = cam_function(input_tensor=input_image, targets=targets)
-
-        if workflow_test:
-            for j in range(heatmap.shape[0]):
-                # Create a binary map
-                threshold_plot = torch.where(torch.tensor(heatmap[j]).to(torch.device('cuda:0')) > 0.5, 1, 0)
-                output_image = input_image * threshold_plot
-
-        end_time = time.time()
-        time_difference = end_time - start_time
-
-        sum_of_times += time_difference
-
-        if time_difference > max_time:
-            max_time = time_difference
-
-        if time_difference < min_time:
-            min_time = time_difference
-
-        pbar.update(batch_size)
-
-    avg_time = sum_of_times / number_of_inputs
-    return [min_time, max_time, avg_time]
-
 # TODOs:
 # Test with numpy v1.4.6 (master)
 # Test with torch v1.4.7 (wip)
@@ -128,46 +39,46 @@ def run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_
 # Run on CPU with profiler (save the profile to print later)
 print('Profile list of images on CPU...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
 print('Profile list of images on Cuda...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with extra workflow
 print('Profile list of images on Cuda and then run workflow...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with extra workflow
 print('Profile list of images on Cuda and then run workflow with a simple CNN...')
-model = SimpleCNN()
+model = benchmark_functions.SimpleCNN()
 model.apply(xavier_uniform_init) # Randomise more weights
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 simple_work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 model =  models.resnet50()
 # Run on CPU x1000 (get min, max, and avg times)
 print('Run list of images on CPU...')
-cpu_min_time, cpu_max_time, cpu_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 
 # Run on CUDA x1000
 print('Run list of images on Cuda...')
-cuda_min_time, cuda_max_time, cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 
 # Run Workflow
 print('Run list of images on Cuda with a workflow...')
-workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Run list of images on Cuda with a workflow using simple CNN...')
-model = SimpleCNN()
+model = benchmark_functions.SimpleCNN()
 model.apply(xavier_uniform_init) # Randomise more weights
-simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time = run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Complete!')
 

From 3eceb8439bd94caff7491bb399d33639e05f27f1 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 13:59:36 +0200
Subject: [PATCH 43/72] Make use of shared functions :cop:

---
 benchmarks/models_benchmark.py | 2 +-
 benchmarks/torch_benchmark.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/models_benchmark.py b/benchmarks/models_benchmark.py
index 561af7f22..a97887f26 100644
--- a/benchmarks/models_benchmark.py
+++ b/benchmarks/models_benchmark.py
@@ -37,7 +37,7 @@
     print('==============================================================================\n\n')
     print(f'Simple Workflow for model #{model_name}:\n')
 
-    model.apply(xavier_uniform_init) # Randomise more weights
+    model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
     cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True)
     cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True)
 
diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index 1528a1fbf..a7c58ffbf 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -57,7 +57,7 @@
 # Run on CUDA with extra workflow
 print('Profile list of images on Cuda and then run workflow with a simple CNN...')
 model = benchmark_functions.SimpleCNN()
-model.apply(xavier_uniform_init) # Randomise more weights
+model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
     cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 simple_work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
@@ -77,7 +77,7 @@
 
 print('Run list of images on Cuda with a workflow using simple CNN...')
 model = benchmark_functions.SimpleCNN()
-model.apply(xavier_uniform_init) # Randomise more weights
+model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
 simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Complete!')
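
One caveat on the numbers these scripts print: in run_gradcam (benchmark_functions.py) min_time and max_time are measured per batch, while avg_time divides sum_of_times by number_of_inputs, i.e. it is a per-image average. When number_of_inputs divides evenly by batch_size, a per-batch average can be recovered as:

    per_batch_avg = avg_time * batch_size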

From 901391e359debd443087efe0136d15bb8ffcc3a6 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 14:01:20 +0200
Subject: [PATCH 44/72] Attempt to fix device memory issues :cop:

---
 benchmarks/benchmark_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index f9ab50d04..2b1f4997d 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -87,7 +87,7 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
         if workflow_test:
             for j in range(heatmap.shape[0]):
                 # Create a binary map
-                threshold_plot = torch.where(torch.tensor(heatmap[j]).to(torch.device('cuda:0')) > 0.5, 1, 0)
+                threshold_plot = torch.where(torch.tensor(heatmap[j]).to(torch.device('cuda:0')) > 0.5, 1, 0).to(dev)
                 output_image = input_image * threshold_plot
 
         end_time = time.time()
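
Forcing cuda:0 here means the workflow branch will still fail on a CPU-only machine even though run_gradcam accepts use_cuda=False. A device-agnostic sketch that keeps the thresholding on whatever device the benchmark selected (dev, as defined at the top of run_gradcam):

    threshold_plot = torch.where(torch.as_tensor(heatmap[j]).to(dev) > 0.5, 1, 0)

torch.as_tensor accepts either a NumPy array or an existing tensor and avoids an extra copy where possible.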

From 2748c5ca361ce846953f366b15f620df69419cec Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 14:18:10 +0200
Subject: [PATCH 45/72] Select the last CNN layer as the GradCAM target layer
 :scientist:

---
 benchmarks/benchmark_functions.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 2b1f4997d..2c9f7f60c 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -36,6 +36,7 @@ def __init__(self):
       nn.Linear(122880, 10),
       nn.Linear(10, 1)
     )
+    self.features = slef.cnn_stack
 
   def forward(self, x):
     logits = self.cnn_stack(x)
@@ -52,6 +53,13 @@ def xavier_uniform_init(layer):
 
     nn.init.xavier_uniform_(layer.weight, gain=gain)
 
+def last_cnn_layer(features):
+  for feature in features:
+    if isinstance(feature, nn.Conv2d):
+      return feature
+
+  return None
+
 # Code to run benchmark
 def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False):
     min_time = 10000000000000
@@ -70,7 +78,7 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
     targets = None # [ClassifierOutputTarget(None)]
 
     model.to(dev)
-    target_layers = [model.layer4] # Last CNN layer of ResNet50
+    target_layers = [last_cnn_layer(model.features)] # Last CNN layer of ResNet50
 
     cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
     cam_function.batch_size = batch_size

From b77aa5b0fcd372373f0c40d51adbcfe5b429bd04 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 14:18:43 +0200
Subject: [PATCH 46/72] Fix spelling mistake :cop:

---
 benchmarks/benchmark_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 2c9f7f60c..776a4c990 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -36,7 +36,7 @@ def __init__(self):
       nn.Linear(122880, 10),
       nn.Linear(10, 1)
     )
-    self.features = slef.cnn_stack
+    self.features = self.cnn_stack
 
   def forward(self, x):
     logits = self.cnn_stack(x)

From 65f1b1fbd8d6493d5daed0ddc1031009abf7ddfa Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 14:25:40 +0200
Subject: [PATCH 47/72] Attempt another way to iterate through model params
 :cop:

---
 benchmarks/benchmark_functions.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 776a4c990..097e65423 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -53,9 +53,9 @@ def xavier_uniform_init(layer):
 
     nn.init.xavier_uniform_(layer.weight, gain=gain)
 
-def last_cnn_layer(features):
-  for feature in features:
-    if isinstance(feature, nn.Conv2d):
+def last_cnn_layer(model):
+  for name, param in model.named_parameters():
+    if isinstance(param, nn.Conv2d):
       return feature
 
   return None
@@ -78,7 +78,7 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
     targets = None # [ClassifierOutputTarget(None)]
 
     model.to(dev)
-    target_layers = [last_cnn_layer(model.features)] # Last CNN layer of ResNet50
+    target_layers = [last_cnn_layer(model)] # Last CNN layer of ResNet50
 
     cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
     cam_function.batch_size = batch_size

From 9c274bec8f7a462314fbfad9b454a98ebdcc27dc Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 21:59:46 +0200
Subject: [PATCH 48/72] Handle multiple models :cop:

---
 benchmarks/benchmark_functions.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 097e65423..30855ce4b 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -54,7 +54,13 @@ def xavier_uniform_init(layer):
     nn.init.xavier_uniform_(layer.weight, gain=gain)
 
 def last_cnn_layer(model):
-  for name, param in model.named_parameters():
+  if hasattr(model, 'layer4'):
+    return model.layer4
+
+  if hasattr(model, 'conv3'):
+    return model.conv3
+
+  for param in model.features:
     if isinstance(param, nn.Conv2d):
       return feature
 

From eaaf0a945e46181cbcbb4c92e712c899c899e992 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 22:03:26 +0200
Subject: [PATCH 49/72] Fix feature bug :cop:

---
 benchmarks/benchmark_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 30855ce4b..5804bbbe4 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -60,8 +60,8 @@ def last_cnn_layer(model):
   if hasattr(model, 'conv3'):
     return model.conv3
 
-  for param in model.features:
-    if isinstance(param, nn.Conv2d):
+  for feature in model.features:
+    if isinstance(feature, nn.Conv2d):
       return feature
 
   return None
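
With this fix the fallback still returns the first Conv2d it encounters in model.features, while the call site comments describe it as the last CNN layer. A hedged sketch of a variant that genuinely picks the last convolution, assuming only that the model is an nn.Module (last_conv_layer is a hypothetical helper, not part of the benchmark code):

    from torch import nn

    def last_conv_layer(model: nn.Module):
        # Walk every submodule in definition order and keep the last Conv2d seen
        last = None
        for module in model.modules():
            if isinstance(module, nn.Conv2d):
                last = module
        return last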

From dc5db2edc744a0247228e20f25244dfaa35105b0 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 22:05:11 +0200
Subject: [PATCH 50/72] Make the progress bar optional :cop:

---
 benchmarks/benchmark_functions.py | 5 +++--
 benchmarks/models_benchmark.py    | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 5804bbbe4..85f4ed6f1 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -67,7 +67,7 @@ def last_cnn_layer(model):
   return None
 
 # Code to run benchmark
-def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False):
+def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False, progress_bar=True):
     min_time = 10000000000000
     max_time = 0
     sum_of_times = 0
@@ -115,7 +115,8 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
         if time_difference < min_time:
             min_time = time_difference
 
-        pbar.update(batch_size)
+        if progress_bar:
+          pbar.update(batch_size)
 
     avg_time = sum_of_times / number_of_inputs
     return [min_time, max_time, avg_time]
diff --git a/benchmarks/models_benchmark.py b/benchmarks/models_benchmark.py
index a97887f26..08be44206 100644
--- a/benchmarks/models_benchmark.py
+++ b/benchmarks/models_benchmark.py
@@ -38,8 +38,8 @@
     print(f'Simple Workflow for model #{model_name}:\n')
 
     model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
-    cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True)
-    cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True)
+    cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True, progress_bar=False)
+    cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True, progress_bar=False)
 
     print(f'Cuda Min time: {cuda_min_time}\n')
     print(f'Cuda Max time: {cuda_max_time}\n')

From c75bbef2bfd320fca2fa0a0885b439552a9fcaa1 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 18 Nov 2022 23:14:17 +0200
Subject: [PATCH 51/72] Add a CAM method benchmark :cop:

---
 benchmarks/benchmark_functions.py |  4 +-
 benchmarks/methods_benchmark.py   | 64 +++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/methods_benchmark.py

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 85f4ed6f1..d06060d3e 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -67,7 +67,7 @@ def last_cnn_layer(model):
   return None
 
 # Code to run benchmark
-def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False, progress_bar=True):
+def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False, progress_bar=True, method=GradCAM):
     min_time = 10000000000000
     max_time = 0
     sum_of_times = 0
@@ -86,7 +86,7 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
     model.to(dev)
     target_layers = [last_cnn_layer(model)] # Last CNN layer of ResNet50
 
-    cam_function = GradCAM(model=model, target_layers=target_layers, use_cuda=use_cuda)
+    cam_function = method(model=model, target_layers=target_layers, use_cuda=use_cuda)
     cam_function.batch_size = batch_size
 
     pbar = tqdm.tqdm(total=number_of_inputs)
diff --git a/benchmarks/methods_benchmark.py b/benchmarks/methods_benchmark.py
new file mode 100644
index 000000000..8c52229b5
--- /dev/null
+++ b/benchmarks/methods_benchmark.py
@@ -0,0 +1,64 @@
+import argparse
+import cv2
+import numpy as np
+import torch
+import time
+import tqdm
+
+from pytorch_grad_cam import GradCAM, \
+    ScoreCAM, \
+    GradCAMPlusPlus, \
+    AblationCAM, \
+    XGradCAM, \
+    EigenCAM, \
+    EigenGradCAM, \
+    LayerCAM, \
+    FullGrad
+
+from torch import nn
+import torch.nn.functional as F
+
+import torchvision # You may need to install separately
+from torchvision import models
+
+from torch.profiler import profile, record_function, ProfilerActivity
+
+import benchmark_functions
+
+number_of_inputs = 1000
+
+print(f'Benchmarking GradCAM using {number_of_inputs} images for multiple CAM methods...')
+
+methods_to_benchmark = [
+    ['GradCAM', GradCAM],
+    ['ScoreCAM', ScoreCAM],
+    ['GradCAMPlusPlus', GradCAMPlusPlus],
+    ['AblationCAM', AblationCAM],
+    ['XGradCAM', XGradCAM],
+    ['EigenCAM', EigenCAM],
+    ['EigenGradCAM', EigenGradCAM],
+    ['LayerCAM', LayerCAM],
+    ['FullGrad', FullGrad]
+]
+
+model = benchmark_functions.SimpleCNN()
+# model = models.resnet18()
+
+model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
+
+for method_name, method in tqdm.tqdm(methods_to_benchmark):
+    print('==============================================================================\n\n')
+    print(f'Simple Workflow for method #{method_name}:\n')
+
+    cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True, progress_bar=False, method=method)
+    cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True, progress_bar=False, method=method)
+
+    print(f'Cuda Min time: {cuda_min_time}\n')
+    print(f'Cuda Max time: {cuda_max_time}\n')
+    print(f'Cuda Avg time: {cuda_avg_time}\n\n')
+    print(f'CPU Min time: {cpu_min_time}\n')
+    print(f'CPU Max time: {cpu_max_time}\n')
+    print(f'CPU Avg time: {cpu_avg_time}\n')
+
+print('==============================================================================\n\n')
+print('Done!')

From 915b99f14ef572de9d9dde188a5e7922acd0b0e3 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 17 Feb 2023 22:36:01 +0200
Subject: [PATCH 52/72] Patch in cuda device support :cop:

---
 pytorch_grad_cam/base_cam.py     | 13 +++++++++++--
 pytorch_grad_cam/fullgrad_cam.py |  3 ++-
 pytorch_grad_cam/grad_cam.py     |  3 ++-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index 547f001e4..62cbca659 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -13,14 +13,21 @@ def __init__(self,
                  model: torch.nn.Module,
                  target_layers: List[torch.nn.Module],
                  use_cuda: bool = False,
+                 cuda_device: None,
                  reshape_transform: Callable = None,
                  compute_input_gradient: bool = False,
                  uses_gradients: bool = True) -> None:
         self.model = model.eval()
         self.target_layers = target_layers
+
         self.cuda = use_cuda
-        if self.cuda:
+        self.cuda_device = cuda_device
+
+        if self.cuda_device and self.cuda:
+            self.model.to(self.cuda_device)
+        elif self.cuda:
             self.model = model.cuda()
+
         self.reshape_transform = reshape_transform
         self.compute_input_gradient = compute_input_gradient
         self.uses_gradients = uses_gradients
@@ -64,7 +71,9 @@ def forward(self,
                 targets: List[torch.nn.Module],
                 eigen_smooth: bool = False) -> torch.Tensor:
 
-        if self.cuda:
+        if self.cuda_device and self.cuda:
+            input_tensor = input_tensor.to(self.cuda_device)
+        elif self.cuda:
             input_tensor = input_tensor.cuda()
 
         if self.compute_input_gradient:
diff --git a/pytorch_grad_cam/fullgrad_cam.py b/pytorch_grad_cam/fullgrad_cam.py
index 1a2685eff..f1e289094 100644
--- a/pytorch_grad_cam/fullgrad_cam.py
+++ b/pytorch_grad_cam/fullgrad_cam.py
@@ -9,7 +9,7 @@
 
 
 class FullGrad(BaseCAM):
-    def __init__(self, model, target_layers, use_cuda=False,
+    def __init__(self, model, target_layers, use_cuda=False, cuda_device: None,
                  reshape_transform=None):
         if len(target_layers) > 0:
             print(
@@ -28,6 +28,7 @@ def layer_with_2D_bias(layer):
             model,
             target_layers,
             use_cuda,
+            cuda_device,
             reshape_transform,
             compute_input_gradient=True)
         self.bias_data = [self.get_bias_data(
diff --git a/pytorch_grad_cam/grad_cam.py b/pytorch_grad_cam/grad_cam.py
index efb66e76e..718481484 100644
--- a/pytorch_grad_cam/grad_cam.py
+++ b/pytorch_grad_cam/grad_cam.py
@@ -3,7 +3,7 @@
 
 
 class GradCAM(BaseCAM):
-    def __init__(self, model, target_layers, use_cuda=False,
+    def __init__(self, model, target_layers, use_cuda=False, cuda_device: None,
                  reshape_transform=None):
         super(
             GradCAM,
@@ -11,6 +11,7 @@ def __init__(self, model, target_layers, use_cuda=False,
             model,
             target_layers,
             use_cuda,
+            cuda_device,
             reshape_transform)
 
     def get_cam_weights(self,

From 0305eecc9fd34ea44108f3e5f1e03a29151db976 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 17 Feb 2023 22:43:35 +0200
Subject: [PATCH 53/72] Fix cuda device call :cop:

---
 pytorch_grad_cam/fullgrad_cam.py | 2 +-
 pytorch_grad_cam/grad_cam.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_grad_cam/fullgrad_cam.py b/pytorch_grad_cam/fullgrad_cam.py
index f1e289094..d2b438a4c 100644
--- a/pytorch_grad_cam/fullgrad_cam.py
+++ b/pytorch_grad_cam/fullgrad_cam.py
@@ -9,7 +9,7 @@
 
 
 class FullGrad(BaseCAM):
-    def __init__(self, model, target_layers, use_cuda=False, cuda_device: None,
+    def __init__(self, model, target_layers, use_cuda=False, cuda_device=None,
                  reshape_transform=None):
         if len(target_layers) > 0:
             print(
diff --git a/pytorch_grad_cam/grad_cam.py b/pytorch_grad_cam/grad_cam.py
index 718481484..2a48bdd8b 100644
--- a/pytorch_grad_cam/grad_cam.py
+++ b/pytorch_grad_cam/grad_cam.py
@@ -3,7 +3,7 @@
 
 
 class GradCAM(BaseCAM):
-    def __init__(self, model, target_layers, use_cuda=False, cuda_device: None,
+    def __init__(self, model, target_layers, use_cuda=False, cuda_device=None,
                  reshape_transform=None):
         super(
             GradCAM,

From 0da12a0ae047f0a69acb43ccd91bc3507dbb0573 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Fri, 17 Feb 2023 22:46:17 +0200
Subject: [PATCH 54/72] Fix cuda device call :cop:

---
 pytorch_grad_cam/base_cam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index 62cbca659..055111fbc 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -13,7 +13,7 @@ def __init__(self,
                  model: torch.nn.Module,
                  target_layers: List[torch.nn.Module],
                  use_cuda: bool = False,
-                 cuda_device: None,
+                 cuda_device = None,
                  reshape_transform: Callable = None,
                  compute_input_gradient: bool = False,
                  uses_gradients: bool = True) -> None:
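
With patches 52-54 applied, a CAM object can be pinned to a specific GPU instead of the default cuda device. A usage sketch, assuming a machine with a second GPU and the ResNet50 model used elsewhere in the benchmarks (otherwise leave cuda_device as None):

    import torch
    from torchvision import models
    from pytorch_grad_cam import GradCAM

    model = models.resnet50()
    cam = GradCAM(model=model,
                  target_layers=[model.layer4],
                  use_cuda=True,
                  cuda_device=torch.device('cuda:1'))  # assumed second GPU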

From 82f71e728e9670307480801e428ac01dd4a94165 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 11:48:46 +0200
Subject: [PATCH 55/72] Work on a single image benchmark :cop:

---
 benchmarks/single_image_benchmark.py | 122 +++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 benchmarks/single_image_benchmark.py

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
new file mode 100644
index 000000000..cb9568ffb
--- /dev/null
+++ b/benchmarks/single_image_benchmark.py
@@ -0,0 +1,122 @@
+import argparse
+import cv2
+import numpy as np
+import torch
+import time
+import tqdm
+
+from pytorch_grad_cam import GradCAM, \
+    ScoreCAM, \
+    GradCAMPlusPlus, \
+    AblationCAM, \
+    XGradCAM, \
+    EigenCAM, \
+    EigenGradCAM, \
+    LayerCAM, \
+    FullGrad
+
+from torch import nn
+import torch.nn.functional as F
+
+import torchvision # You may need to install separately
+from torchvision import models
+
+from torch.profiler import profile, record_function, ProfilerActivity
+
+import benchmark_functions
+
+number_of_inputs = 1
+model =  models.resnet50()
+
+# TODO: Load image
+
+print(f'Benchmarking GradCAM using {number_of_inputs} image for ResNet50...')
+
+# Run on CPU with profiler (save the profile to print later)
+print('Profile list of images on CPU...')
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+
+# Run on CUDA with profiler (save the profile to print later)
+print('Profile list of images on Cuda...')
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+
+# Run on CUDA with extra workflow
+print('Profile list of images on Cuda and then run workflow...')
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+
+# Run on CUDA with extra workflow
+print('Profile list of images on Cuda and then run workflow with a simple CNN...')
+model = benchmark_functions.SimpleCNN()
+model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
+with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+simple_work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+
+model =  models.resnet50()
+# Run on CPU x1000 (get min, max, and avg times)
+print('Run list of images on CPU...')
+cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+
+# Run on CUDA x1000
+print('Run list of images on Cuda...')
+cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+
+# Run Workflow
+print('Run list of images on Cuda with a workflow...')
+workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+
+print('Run list of images on Cuda with a workflow using simple CNN...')
+model = benchmark_functions.SimpleCNN()
+model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
+simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+
+print('Complete!')
+
+print('==============================================================================\n\n')
+print('CPU Profile:\n')
+print(cpu_profile)
+
+print('==============================================================================\n\n')
+print('Cuda Profile:\n')
+print(cuda_profile)
+
+print('==============================================================================\n\n')
+print('Workflow Cuda Profile:\n')
+print(work_flow_cuda_profile)
+
+print('==============================================================================\n\n')
+print('Simple Workflow Cuda Profile:\n')
+print(simple_work_flow_cuda_profile)
+
+print('==============================================================================\n\n')
+print('CPU Timing (No Profiler):\n')
+print(f'Min time: {cpu_min_time}\n')
+print(f'Max time: {cpu_max_time}\n')
+print(f'Avg time: {cpu_avg_time}\n')
+
+print('==============================================================================\n\n')
+print('Cuda Timing (No Profiler):\n')
+print(f'Min time: {cuda_min_time}\n')
+print(f'Max time: {cuda_max_time}\n')
+print(f'Avg time: {cuda_avg_time}\n')
+
+print('==============================================================================\n\n')
+print('Workflow Cuda Timing (No Profiler):\n')
+print(f'Min time: {workflow_cuda_min_time}\n')
+print(f'Max time: {workflow_cuda_max_time}\n')
+print(f'Avg time: {workflow_cuda_avg_time}\n')
+
+print('==============================================================================\n\n')
+print('Simple Workflow Cuda Timing (No Profiler):\n')
+print(f'Min time: {simple_workflow_cuda_min_time}\n')
+print(f'Max time: {simple_workflow_cuda_max_time}\n')
+print(f'Avg time: {simple_workflow_cuda_avg_time}\n')
+
+print('==============================================================================\n\n')
+print('Done!')
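
The "# TODO: Load image" above still needs an actual loader. A hedged sketch that produces a (1, 3, 256, 60) tensor matching the synthetic inputs used in run_gradcam (the helper name and path handling are hypothetical, not part of the benchmark yet):

    import cv2
    import torch

    def load_image_as_tensor(path: str, size=(60, 256)) -> torch.Tensor:
        # size is (width, height) for OpenCV, giving a 3 x 256 x 60 tensor below
        img = cv2.imread(path)                           # H x W x 3, BGR, uint8
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, size)
        tensor = torch.from_numpy(img).float() / 255.0   # scale to [0, 1]
        return tensor.permute(2, 0, 1).unsqueeze(0)      # (1, 3, H, W)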

From 199815c849aa30d43667732f6b08dc768681a8c3 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 11:54:27 +0200
Subject: [PATCH 56/72] Disable CPU benchmarking :cop:

---
 benchmarks/single_image_benchmark.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index cb9568ffb..1f9f870ee 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -33,10 +33,10 @@
 print(f'Benchmarking GradCAM using {number_of_inputs} image for ResNet50...')
 
 # Run on CPU with profiler (save the profile to print later)
-print('Profile list of images on CPU...')
-with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
-cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
+# print('Profile list of images on CPU...')
+# with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
+#     cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+# cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
 print('Profile list of images on Cuda...')
@@ -60,8 +60,8 @@
 
 model =  models.resnet50()
 # Run on CPU x1000 (get min, max, and avg times)
-print('Run list of images on CPU...')
-cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+# print('Run list of images on CPU...')
+# cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 
 # Run on CUDA x1000
 print('Run list of images on Cuda...')
@@ -78,9 +78,9 @@
 
 print('Complete!')
 
-print('==============================================================================\n\n')
-print('CPU Profile:\n')
-print(cpu_profile)
+# print('==============================================================================\n\n')
+# print('CPU Profile:\n')
+# print(cpu_profile)
 
 print('==============================================================================\n\n')
 print('Cuda Profile:\n')
@@ -94,11 +94,11 @@
 print('Simple Workflow Cuda Profile:\n')
 print(simple_work_flow_cuda_profile)
 
-print('==============================================================================\n\n')
-print('CPU Timing (No Profiler):\n')
-print(f'Min time: {cpu_min_time}\n')
-print(f'Max time: {cpu_max_time}\n')
-print(f'Avg time: {cpu_avg_time}\n')
+# print('==============================================================================\n\n')
+# print('CPU Timing (No Profiler):\n')
+# print(f'Min time: {cpu_min_time}\n')
+# print(f'Max time: {cpu_max_time}\n')
+# print(f'Avg time: {cpu_avg_time}\n')
 
 print('==============================================================================\n\n')
 print('Cuda Timing (No Profiler):\n')

From cf020cfe8e24e421e7f95754b17b90e36299c9e5 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:11:32 +0200
Subject: [PATCH 57/72] Output the resultant image and allow inputting an image
 :cop:

---
 benchmarks/benchmark_functions.py    |  9 ++++++---
 benchmarks/methods_benchmark.py      |  4 ++--
 benchmarks/models_benchmark.py       |  4 ++--
 benchmarks/single_image_benchmark.py | 16 ++++++++--------
 benchmarks/torch_benchmark.py        | 16 ++++++++--------
 5 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index d06060d3e..ce9ed05e3 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -67,7 +67,7 @@ def last_cnn_layer(model):
   return None
 
 # Code to run benchmark
-def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False, progress_bar=True, method=GradCAM):
+def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False, progress_bar=True, method=GradCAM, input_image=None):
     min_time = 10000000000000
     max_time = 0
     sum_of_times = 0
@@ -95,7 +95,10 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
         start_time = time.time()
 
         # Actual code to benchmark
-        input_image = input_tensor[i:i+batch_size].to(dev)
+        if input_image is None:
+          input_image = input_tensor[i:i+batch_size]
+        input_image = input_image.to(dev)
+
         heatmap = cam_function(input_tensor=input_image, targets=targets)
 
         if workflow_test:
@@ -119,4 +122,4 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
           pbar.update(batch_size)
 
     avg_time = sum_of_times / number_of_inputs
-    return [min_time, max_time, avg_time]
+    return [min_time, max_time, avg_time, output_image]
diff --git a/benchmarks/methods_benchmark.py b/benchmarks/methods_benchmark.py
index 8c52229b5..5660e3a79 100644
--- a/benchmarks/methods_benchmark.py
+++ b/benchmarks/methods_benchmark.py
@@ -50,8 +50,8 @@
     print('==============================================================================\n\n')
     print(f'Simple Workflow for method #{method_name}:\n')
 
-    cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True, progress_bar=False, method=method)
-    cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True, progress_bar=False, method=method)
+    cpu_min_time, cpu_max_time, cpu_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True, progress_bar=False, method=method)
+    cuda_min_time, cuda_max_time, cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True, progress_bar=False, method=method)
 
     print(f'Cuda Min time: {cuda_min_time}\n')
     print(f'Cuda Max time: {cuda_max_time}\n')
diff --git a/benchmarks/models_benchmark.py b/benchmarks/models_benchmark.py
index 08be44206..9c6fdfa89 100644
--- a/benchmarks/models_benchmark.py
+++ b/benchmarks/models_benchmark.py
@@ -38,8 +38,8 @@
     print(f'Simple Workflow for model #{model_name}:\n')
 
     model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
-    cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True, progress_bar=False)
-    cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True, progress_bar=False)
+    cpu_min_time, cpu_max_time, cpu_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=False, workflow_test=True, progress_bar=False)
+    cuda_min_time, cuda_max_time, cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=8, use_cuda=True, workflow_test=True, progress_bar=False)
 
     print(f'Cuda Min time: {cuda_min_time}\n')
     print(f'Cuda Max time: {cuda_max_time}\n')
diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index 1f9f870ee..ca86da669 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -35,19 +35,19 @@
 # Run on CPU with profiler (save the profile to print later)
 # print('Profile list of images on CPU...')
 # with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-#     cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+#     cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 # cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
 print('Profile list of images on Cuda...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with extra workflow
 print('Profile list of images on Cuda and then run workflow...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with extra workflow
@@ -55,26 +55,26 @@
 model = benchmark_functions.SimpleCNN()
 model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 simple_work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 model =  models.resnet50()
 # Run on CPU x1000 (get min, max, and avg times)
 # print('Run list of images on CPU...')
-# cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+# cpu_min_time, cpu_max_time, cpu_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 
 # Run on CUDA x1000
 print('Run list of images on Cuda...')
-cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+cuda_min_time, cuda_max_time, cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 
 # Run Workflow
 print('Run list of images on Cuda with a workflow...')
-workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Run list of images on Cuda with a workflow using simple CNN...')
 model = benchmark_functions.SimpleCNN()
 model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
-simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Complete!')
 
diff --git a/benchmarks/torch_benchmark.py b/benchmarks/torch_benchmark.py
index a7c58ffbf..793f099c5 100644
--- a/benchmarks/torch_benchmark.py
+++ b/benchmarks/torch_benchmark.py
@@ -39,19 +39,19 @@
 # Run on CPU with profiler (save the profile to print later)
 print('Profile list of images on CPU...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+    cpu_profile_min_time, cpu_profile_max_time, cpu_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 cpu_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with profiler (save the profile to print later)
 print('Profile list of images on Cuda...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with extra workflow
 print('Profile list of images on Cuda and then run workflow...')
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 # Run on CUDA with extra workflow
@@ -59,26 +59,26 @@
 model = benchmark_functions.SimpleCNN()
 model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
 with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
-    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+    cuda_profile_min_time, cuda_profile_max_time, cuda_profile_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 simple_work_flow_cuda_profile = prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=15)
 
 model =  models.resnet50()
 # Run on CPU x1000 (get min, max, and avg times)
 print('Run list of images on CPU...')
-cpu_min_time, cpu_max_time, cpu_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
+cpu_min_time, cpu_max_time, cpu_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=False)
 
 # Run on CUDA x1000
 print('Run list of images on Cuda...')
-cuda_min_time, cuda_max_time, cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
+cuda_min_time, cuda_max_time, cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True)
 
 # Run Workflow
 print('Run list of images on Cuda with a workflow...')
-workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+workflow_cuda_min_time, workflow_cuda_max_time, workflow_cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Run list of images on Cuda with a workflow using simple CNN...')
 model = benchmark_functions.SimpleCNN()
 model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
-simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Complete!')
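
With this patch run_gradcam takes an optional input_image and returns a fourth value. A minimal usage sketch as the interface stands at this point in the series, assuming the benchmarks directory is on the import path and that output_image is populated inside run_gradcam (a later patch fills it in with a placeholder):

    import torch
    import torchvision.models as models
    import benchmark_functions

    model = models.resnet50()
    single_image = torch.rand(1, 3, 224, 224)  # stand-in for a real image

    min_time, max_time, avg_time, output_image = benchmark_functions.run_gradcam(
        model, number_of_inputs=1, batch_size=1,
        use_cuda=torch.cuda.is_available(), input_image=single_image)

    print(f'Min: {min_time}, Max: {max_time}, Avg: {avg_time}')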
 

From 9513c866ee474bea61232277a7f5f5118f124706 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:18:59 +0200
Subject: [PATCH 58/72] Allow for output saving for a sanity check :cop:

---
 benchmarks/benchmark_functions.py    | 7 +++++--
 benchmarks/single_image_benchmark.py | 9 ++++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index ce9ed05e3..661f15896 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -66,6 +66,9 @@ def last_cnn_layer(model):
 
   return None
 
+def save_image(image, path):
+  return torchvision.utils.save_image(tensor: image, fp: path)
+
 # Code to run benchmark
 def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False, progress_bar=True, method=GradCAM, input_image=None):
     min_time = 10000000000000
@@ -86,7 +89,7 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
     model.to(dev)
     target_layers = [last_cnn_layer(model)] # Last CNN layer of ResNet50
 
-    cam_function = method(model=model, target_layers=target_layers, use_cuda=use_cuda)
+    cam_function = method(model=model, target_layers=target_layers, cuda_device=dev, use_cuda=use_cuda)
     cam_function.batch_size = batch_size
 
     pbar = tqdm.tqdm(total=number_of_inputs)
@@ -122,4 +125,4 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
           pbar.update(batch_size)
 
     avg_time = sum_of_times / number_of_inputs
-    return [min_time, max_time, avg_time, output_image]
+    return [min_time, max_time, avg_time, [threshold_plot, output_image]]
diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index ca86da669..c7c3959cc 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -74,7 +74,7 @@
 print('Run list of images on Cuda with a workflow using simple CNN...')
 model = benchmark_functions.SimpleCNN()
 model.apply(benchmark_functions.xavier_uniform_init) # Randomise more weights
-simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time, _output_image = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
+simple_workflow_cuda_min_time, simple_workflow_cuda_max_time, simple_workflow_cuda_avg_time, output = benchmark_functions.run_gradcam(model, number_of_inputs, batch_size=64, use_cuda=True, workflow_test=True)
 
 print('Complete!')
 
@@ -118,5 +118,12 @@
 print(f'Max time: {simple_workflow_cuda_max_time}\n')
 print(f'Avg time: {simple_workflow_cuda_avg_time}\n')
 
+print('==============================================================================\n\n')
+print('Output the resultant heat-map')
+threshold_plot, output_image = output
+
+benchmark_functions.save_image(threshold_plot, '~/threshold.png')
+benchmark_functions.save_image(output_image, '~/output_image.png')
+
 print('==============================================================================\n\n')
 print('Done!')

From 186c14bdffedd92e1e643ffb525c0f0d2489ff2a Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:19:43 +0200
Subject: [PATCH 59/72] Allow for output saving for a sanity check :cop:

---
 benchmarks/benchmark_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 661f15896..77ba3e297 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -67,7 +67,7 @@ def last_cnn_layer(model):
   return None
 
 def save_image(image, path):
-  return torchvision.utils.save_image(tensor: image, fp: path)
+  return torchvision.utils.save_image(image, path)
 
 # Code to run benchmark
 def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_test=False, progress_bar=True, method=GradCAM, input_image=None):
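
With the keyword-argument syntax error removed, the helper is a thin wrapper over torchvision.utils.save_image. A minimal sketch of the underlying call, assuming a float image tensor with values in [0, 1]; the file name is illustrative:

    import torch
    import torchvision

    heatmap = torch.rand(3, 224, 224)  # stand-in for a CAM overlay
    torchvision.utils.save_image(heatmap, 'output_image.png')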

From b6d220230d9dac964b5b5e54f13fcb19ca17d7e1 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:27:54 +0200
Subject: [PATCH 60/72] Allow for output saving for a sanity check :cop:

---
 benchmarks/benchmark_functions.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmarks/benchmark_functions.py b/benchmarks/benchmark_functions.py
index 77ba3e297..4ebca6111 100644
--- a/benchmarks/benchmark_functions.py
+++ b/benchmarks/benchmark_functions.py
@@ -97,6 +97,9 @@ def run_gradcam(model, number_of_inputs, batch_size=1, use_cuda=False, workflow_
     for i in range(0, number_of_inputs, batch_size):
         start_time = time.time()
 
+        threshold_plot = torch.rand((number_of_inputs, 3, 256, 60))
+        output_image = torch.rand((number_of_inputs, 3, 256, 60))
+
         # Actual code to benchmark
         if input_image is None:
           input_image = input_tensor[i:i+batch_size]

From 84a0689fb3cfecbae606eaac23b0959eb7551442 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:28:47 +0200
Subject: [PATCH 61/72] Allow for output saving for a sanity check :cop:

---
 benchmarks/single_image_benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index c7c3959cc..450e3afed 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -122,8 +122,8 @@
 print('Output the resultant heat-map')
 threshold_plot, output_image = output
 
-benchmark_functions.save_image(threshold_plot, '~/threshold.png')
-benchmark_functions.save_image(output_image, '~/output_image.png')
+benchmark_functions.save_image(threshold_plot.to("cpu", torch.uint8), '~/threshold.png')
+benchmark_functions.save_image(output_image.to("cpu", torch.uint8), '~/output_image.png')
 
 print('==============================================================================\n\n')
 print('Done!')
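
One caveat, stated as an assumption about torchvision rather than a tested fact: save_image scales its input by 255 internally, so it generally expects float values in [0, 1], and the uint8 cast above may blank the saved file. If that happens, passing the float tensor (optionally with normalize=True) is a possible alternative, sketched here with an illustrative path:

    import torch
    import torchvision

    threshold_plot = torch.rand(1, 3, 256, 60)  # same shape as the placeholders above
    torchvision.utils.save_image(threshold_plot.cpu().float(), 'threshold.png',
                                 normalize=True)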

From 18f8d8e6a6b2c83529577c7711e25cca9d994753 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:36:56 +0200
Subject: [PATCH 62/72] Open image :cop:

---
 benchmarks/single_image_benchmark.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index 450e3afed..da56a0618 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -28,7 +28,9 @@
 number_of_inputs = 1
 model =  models.resnet50()
 
-# TODO: Load image
+# Just hard-coding a path for now
+image_path = '~/image.jpg'
+input_tensor = torch.read_image(image_path)
 
 print(f'Benchmarking GradCAM using {number_of_inputs} image for ResNet50...')
 

From 8eaf1b772c7767e431176fa782d9bfdb54aa0c14 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:38:14 +0200
Subject: [PATCH 63/72] Open image :cop:

---
 benchmarks/single_image_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index da56a0618..21b81faf9 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -30,7 +30,7 @@
 
 # Just hard-coding a path for now
 image_path = '~/image.jpg'
-input_tensor = torch.read_image(image_path)
+input_tensor = torchvision.io.read_image(image_path)
 
 print(f'Benchmarking GradCAM using {number_of_inputs} image for ResNet50...')
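
torchvision.io.read_image returns a uint8 tensor of shape (C, H, W), while the CAM pipeline works on a float batch. A minimal sketch of preparing the loaded image; the resize size and the os.path.expanduser call for the '~' path are assumptions, not part of the patch:

    import os
    import torch
    from torchvision.io import read_image
    from torchvision.transforms.functional import resize

    image_path = os.path.expanduser('~/image.jpg')
    img = read_image(image_path)                      # uint8, (C, H, W)
    img = resize(img, [224, 224])
    input_tensor = img.unsqueeze(0).float() / 255.0   # (1, C, H, W), float in [0, 1]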
 

From b8054654b9c2475b5ff93fd920e342e5fbfd472c Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:48:59 +0200
Subject: [PATCH 64/72] Change to simple model :cop:

---
 benchmarks/single_image_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index 21b81faf9..bb5d54df2 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -26,7 +26,7 @@
 import benchmark_functions
 
 number_of_inputs = 1
-model =  models.resnet50()
+model =  benchmark_functions.SimpleCNN()
 
 # Just hard-coding a path for now
 image_path = '~/image.jpg'

From 7393e77030ad9b2236aca1484d6d0bf2924a9e1c Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:50:56 +0200
Subject: [PATCH 65/72] Try with trained weights :cop:

---
 benchmarks/single_image_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index bb5d54df2..09f0a9e09 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -26,7 +26,7 @@
 import benchmark_functions
 
 number_of_inputs = 1
-model =  benchmark_functions.SimpleCNN()
+model =  models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
 
 # Just hard-coding a path for now
 image_path = '~/image.jpg'

From 193c9f29d8fa3735b44e7a057c2b91d83f35d26d Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Thu, 9 Mar 2023 12:52:08 +0200
Subject: [PATCH 66/72] Try with trained weights :cop:

---
 benchmarks/single_image_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_image_benchmark.py b/benchmarks/single_image_benchmark.py
index 09f0a9e09..2d8368442 100644
--- a/benchmarks/single_image_benchmark.py
+++ b/benchmarks/single_image_benchmark.py
@@ -26,7 +26,7 @@
 import benchmark_functions
 
 number_of_inputs = 1
-model =  models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
+model =  models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
 
 # Just hard-coding a path for now
 image_path = '~/image.jpg'
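
The weights enum used above also exposes the preprocessing that matches the pretrained ResNet-50. A short sketch, assuming torchvision >= 0.13; the file name is illustrative:

    import torchvision.models as models
    from torchvision.io import read_image

    weights = models.ResNet50_Weights.IMAGENET1K_V2
    model = models.resnet50(weights=weights).eval()

    preprocess = weights.transforms()   # resize, crop and normalise for these weights
    # input_tensor = preprocess(read_image('image.jpg')).unsqueeze(0)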

From a3d327b5953edbfdc64ff3d000fdb1dd4d26131f Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Mon, 17 Apr 2023 15:00:42 +0200
Subject: [PATCH 67/72] Add CUDA device support for gradients and
 activations :cop:

---
 pytorch_grad_cam/activations_and_gradients.py | 19 ++++++++++++++++---
 pytorch_grad_cam/base_cam.py                  |  5 ++++-
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/pytorch_grad_cam/activations_and_gradients.py b/pytorch_grad_cam/activations_and_gradients.py
index 0c2071e59..957c976e1 100644
--- a/pytorch_grad_cam/activations_and_gradients.py
+++ b/pytorch_grad_cam/activations_and_gradients.py
@@ -2,12 +2,16 @@ class ActivationsAndGradients:
     """ Class for extracting activations and
     registering gradients from targetted intermediate layers """
 
-    def __init__(self, model, target_layers, reshape_transform):
+    def __init__(self, model, target_layers, reshape_transform, use_cuda: bool = False, cuda_device = None):
         self.model = model
         self.gradients = []
         self.activations = []
         self.reshape_transform = reshape_transform
         self.handles = []
+
+        self.use_cuda = use_cuda
+        self.cuda_device = cuda_device
+
         for target_layer in target_layers:
             self.handles.append(
                 target_layer.register_forward_hook(self.save_activation))
@@ -21,7 +25,11 @@ def save_activation(self, module, input, output):
 
         if self.reshape_transform is not None:
             activation = self.reshape_transform(activation)
-        self.activations.append(activation.cpu().detach())
+
+        if self.use_cuda:
+            self.activations.append(activation.to(self.cuda_device))
+        else:
+            self.activations.append(activation.cpu().detach())
 
     def save_gradient(self, module, input, output):
         if not hasattr(output, "requires_grad") or not output.requires_grad:
@@ -32,13 +40,18 @@ def save_gradient(self, module, input, output):
         def _store_grad(grad):
             if self.reshape_transform is not None:
                 grad = self.reshape_transform(grad)
-            self.gradients = [grad.cpu().detach()] + self.gradients
+
+            if self.use_cuda:
+                self.gradients = [grad.to(self.cuda_device)] + self.gradients
+            else:
+                self.gradients = [grad.cpu().detach()] + self.gradients
 
         output.register_hook(_store_grad)
 
     def __call__(self, x):
         self.gradients = []
         self.activations = []
+
         return self.model(x)
 
     def release(self):
diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index 055111fbc..5c45be2a8 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -32,7 +32,7 @@ def __init__(self,
         self.compute_input_gradient = compute_input_gradient
         self.uses_gradients = uses_gradients
         self.activations_and_grads = ActivationsAndGradients(
-            self.model, target_layers, reshape_transform)
+            self.model, target_layers, reshape_transfor, use_cuda = use_cuda, cuda_device = cuda_device)
 
     """ Get a vector of weights for every channel in the target layer.
         Methods that return weights channels,
@@ -126,8 +126,10 @@ def compute_cam_per_layer(
         # Loop over the saliency image from every layer
         for i in range(len(self.target_layers)):
             target_layer = self.target_layers[i]
+
             layer_activations = None
             layer_grads = None
+
             if i < len(activations_list):
                 layer_activations = activations_list[i]
             if i < len(grads_list):
@@ -139,6 +141,7 @@ def compute_cam_per_layer(
                                      layer_activations,
                                      layer_grads,
                                      eigen_smooth)
+
             cam = torch.maximum(cam, torch.tensor(0))
             scaled = scale_cam_image(cam, target_size)
             cam_per_target_layer.append(scaled[:, None, :])
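
A minimal sketch of driving the updated hook helper directly, keeping activations and gradients on the GPU. The choice of model.layer4[-1] as target layer and the device selection are illustrative, and the reshape_transfor typo in the BaseCAM call above is corrected in the next patch:

    import torch
    import torchvision.models as models
    from pytorch_grad_cam.activations_and_gradients import ActivationsAndGradients

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = models.resnet50().to(device).eval()

    acts_and_grads = ActivationsAndGradients(
        model, target_layers=[model.layer4[-1]], reshape_transform=None,
        use_cuda=(device.type == 'cuda'), cuda_device=device)

    output = acts_and_grads(torch.rand(1, 3, 224, 224, device=device))
    print(len(acts_and_grads.activations), output.shape)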

From 15de5ed4c34dfc42d14a038fdb48b26886e35ba0 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Mon, 17 Apr 2023 15:11:52 +0200
Subject: [PATCH 68/72] Fix typo :cop:

---
 pytorch_grad_cam/base_cam.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index 5c45be2a8..73553def0 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -31,8 +31,9 @@ def __init__(self,
         self.reshape_transform = reshape_transform
         self.compute_input_gradient = compute_input_gradient
         self.uses_gradients = uses_gradients
+
         self.activations_and_grads = ActivationsAndGradients(
-            self.model, target_layers, reshape_transfor, use_cuda = use_cuda, cuda_device = cuda_device)
+            self.model, target_layers, reshape_transform, use_cuda = use_cuda, cuda_device = cuda_device)
 
     """ Get a vector of weights for every channel in the target layer.
         Methods that return weights channels,

From 9e3859948ee49b49fd09d89e04822469adf1a0e9 Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Mon, 17 Apr 2023 17:32:06 +0200
Subject: [PATCH 69/72] Attempt to force a different FakeTensorMode
 :scientist:

---
 pytorch_grad_cam/activations_and_gradients.py | 24 +++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/pytorch_grad_cam/activations_and_gradients.py b/pytorch_grad_cam/activations_and_gradients.py
index 957c976e1..0cf9129a1 100644
--- a/pytorch_grad_cam/activations_and_gradients.py
+++ b/pytorch_grad_cam/activations_and_gradients.py
@@ -1,3 +1,12 @@
+# from torch._subclasses import fake_tensor
+
+# from torch._subclasses.fake_tensor import (
+#     FakeTensor,
+#     FakeTensorMode,
+#     FakeTensorConverter)
+
+from torch._subclasses.fake_tensor import FakeTensorMode
+
 class ActivationsAndGradients:
     """ Class for extracting activations and
     registering gradients from targetted intermediate layers """
@@ -12,13 +21,14 @@ def __init__(self, model, target_layers, reshape_transform, use_cuda: bool = Fal
         self.use_cuda = use_cuda
         self.cuda_device = cuda_device
 
-        for target_layer in target_layers:
-            self.handles.append(
-                target_layer.register_forward_hook(self.save_activation))
-            # Because of https://github.com/pytorch/pytorch/issues/61519,
-            # we don't use backward hook to record gradients.
-            self.handles.append(
-                target_layer.register_forward_hook(self.save_gradient))
+        with FakeTensorMode(allow_non_fake_inputs=True):
+            for target_layer in target_layers:
+                self.handles.append(
+                    target_layer.register_forward_hook(self.save_activation))
+                # Because of https://github.com/pytorch/pytorch/issues/61519,
+                # we don't use backward hook to record gradients.
+                self.handles.append(
+                    target_layer.register_forward_hook(self.save_gradient))
 
     def save_activation(self, module, input, output):
         activation = output

From ed64d06d201fe7dd94ac16882786808c20c50dda Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Mon, 17 Apr 2023 19:23:42 +0200
Subject: [PATCH 70/72] Attempt to patch an issue with PyTorch 2.0 :cop:

---
 pytorch_grad_cam/activations_and_gradients.py | 24 ++++++-------------
 pytorch_grad_cam/base_cam.py                  |  4 ++--
 2 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/pytorch_grad_cam/activations_and_gradients.py b/pytorch_grad_cam/activations_and_gradients.py
index 0cf9129a1..957c976e1 100644
--- a/pytorch_grad_cam/activations_and_gradients.py
+++ b/pytorch_grad_cam/activations_and_gradients.py
@@ -1,12 +1,3 @@
-# from torch._subclasses import fake_tensor
-
-# from torch._subclasses.fake_tensor import (
-#     FakeTensor,
-#     FakeTensorMode,
-#     FakeTensorConverter)
-
-from torch._subclasses.fake_tensor import FakeTensorMode
-
 class ActivationsAndGradients:
     """ Class for extracting activations and
     registering gradients from targetted intermediate layers """
@@ -21,14 +12,13 @@ def __init__(self, model, target_layers, reshape_transform, use_cuda: bool = Fal
         self.use_cuda = use_cuda
         self.cuda_device = cuda_device
 
-        with FakeTensorMode(allow_non_fake_inputs=True):
-            for target_layer in target_layers:
-                self.handles.append(
-                    target_layer.register_forward_hook(self.save_activation))
-                # Because of https://github.com/pytorch/pytorch/issues/61519,
-                # we don't use backward hook to record gradients.
-                self.handles.append(
-                    target_layer.register_forward_hook(self.save_gradient))
+        for target_layer in target_layers:
+            self.handles.append(
+                target_layer.register_forward_hook(self.save_activation))
+            # Because of https://github.com/pytorch/pytorch/issues/61519,
+            # we don't use backward hook to record gradients.
+            self.handles.append(
+                target_layer.register_forward_hook(self.save_gradient))
 
     def save_activation(self, module, input, output):
         activation = output
diff --git a/pytorch_grad_cam/base_cam.py b/pytorch_grad_cam/base_cam.py
index 73553def0..73ee0b47d 100644
--- a/pytorch_grad_cam/base_cam.py
+++ b/pytorch_grad_cam/base_cam.py
@@ -6,7 +6,7 @@
 from pytorch_grad_cam.utils.svd_on_activations import get_2d_projection
 from pytorch_grad_cam.utils.image import scale_cam_image
 from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
-
+from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
 
 class BaseCAM:
     def __init__(self,
@@ -143,7 +143,7 @@ def compute_cam_per_layer(
                                      layer_grads,
                                      eigen_smooth)
 
-            cam = torch.maximum(cam, torch.tensor(0))
+            with FakeTensorMode(allow_non_fake_inputs=True): cam = torch.maximum(cam.cpu(), torch.tensor(0))
             scaled = scale_cam_image(cam, target_size)
             cam_per_target_layer.append(scaled[:, None, :])
 

From 273bd81ecdebda2618177cc7f1a7fed7372d28aa Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Wed, 26 Apr 2023 18:20:00 +0200
Subject: [PATCH 71/72] Make more meaningful changes extracted from another
 branch :cop:

---
 pytorch_grad_cam/utils/svd_on_activations.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_grad_cam/utils/svd_on_activations.py b/pytorch_grad_cam/utils/svd_on_activations.py
index a406aeea8..2370cbc0d 100644
--- a/pytorch_grad_cam/utils/svd_on_activations.py
+++ b/pytorch_grad_cam/utils/svd_on_activations.py
@@ -1,9 +1,9 @@
-import numpy as np
+import torch
 
 
 def get_2d_projection(activation_batch):
     # TBD: use pytorch batch svd implementation
-    activation_batch[np.isnan(activation_batch)] = 0
+    activation_batch[torch.isnan(activation_batch)] = 0
     projections = []
     for activations in activation_batch:
         reshaped_activations = (activations).reshape(
@@ -12,8 +12,8 @@ def get_2d_projection(activation_batch):
         # Otherwise the image returned is negative
         reshaped_activations = reshaped_activations - \
             reshaped_activations.mean(axis=0)
-        U, S, VT = np.linalg.svd(reshaped_activations, full_matrices=True)
+        U, S, VT = torch.linalg.svd(reshaped_activations, full_matrices=True)
         projection = reshaped_activations @ VT[0, :]
         projection = projection.reshape(activations.shape[1:])
         projections.append(projection)
-    return np.float32(projections)
+    return torch.tensor(projections).to(torch.float32)
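
The remaining per-sample loop is what the TBD comment (removed in the next patch) refers to. torch.linalg.svd accepts batched input, so a fully batched variant is possible; a sketch under that assumption, with full_matrices=False used purely to keep the decomposition small:

    import torch

    def get_2d_projection_batched(activation_batch: torch.Tensor) -> torch.Tensor:
        # activation_batch: (B, C, H, W) -> one 2D projection per sample, (B, H, W)
        b, c, h, w = activation_batch.shape
        flat = activation_batch.reshape(b, c, h * w).transpose(1, 2)  # (B, H*W, C)
        flat = torch.nan_to_num(flat)
        flat = flat - flat.mean(dim=1, keepdim=True)
        _, _, vh = torch.linalg.svd(flat, full_matrices=False)        # vh: (B, C, C)
        projection = torch.matmul(flat, vh[:, 0, :].unsqueeze(-1))    # (B, H*W, 1)
        return projection.reshape(b, h, w).to(torch.float32)

In the loop version above, torch.stack(projections) would also be a drop-in replacement for torch.tensor(projections), since torch.tensor generally does not accept a list of multi-element tensors.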

From 9ed4c9be91c562374d30b5a4c12d85dbfd05118f Mon Sep 17 00:00:00 2001
From: trex22 <contact@jasonchalom.com>
Date: Wed, 26 Apr 2023 18:21:09 +0200
Subject: [PATCH 72/72] Remove TODO :cop:

---
 pytorch_grad_cam/utils/svd_on_activations.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_grad_cam/utils/svd_on_activations.py b/pytorch_grad_cam/utils/svd_on_activations.py
index 2370cbc0d..91bfab0e1 100644
--- a/pytorch_grad_cam/utils/svd_on_activations.py
+++ b/pytorch_grad_cam/utils/svd_on_activations.py
@@ -2,7 +2,6 @@
 
 
 def get_2d_projection(activation_batch):
-    # TBD: use pytorch batch svd implementation
     activation_batch[torch.isnan(activation_batch)] = 0
     projections = []
     for activations in activation_batch: