pytorch
diff --git a/Diff for: ‎.github/scripts/generate_binary_build_matrix.py
-1 b/Diff for: ‎.github/scripts/generate_binary_build_matrix.py
-1
diff --git a/Diff for: ‎core/conversion/converters/impl/batch_norm.cpp
+11-4 b/Diff for: ‎core/conversion/converters/impl/batch_norm.cpp
+11-4
diff --git a/Diff for: ‎core/util/prelude.h
+1 b/Diff for: ‎core/util/prelude.h
+1
diff --git a/Diff for: ‎docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
-4 b/Diff for: ‎docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
-4
diff --git a/Diff for: ‎examples/dynamo/custom_kernel_plugins.py
-4 b/Diff for: ‎examples/dynamo/custom_kernel_plugins.py
-4
diff --git a/Diff for: ‎notebooks/CitriNet-example.ipynb
+4-10 b/Diff for: ‎notebooks/CitriNet-example.ipynb
+4-10
diff --git a/Diff for: ‎notebooks/EfficientNet-example.ipynb
+9-9 b/Diff for: ‎notebooks/EfficientNet-example.ipynb
+9-9
diff --git a/Diff for: ‎notebooks/Hugging-Face-BERT.ipynb
+6-6 b/Diff for: ‎notebooks/Hugging-Face-BERT.ipynb
+6-6
diff --git a/Diff for: ‎notebooks/Resnet50-CPP.ipynb
-1 b/Diff for: ‎notebooks/Resnet50-CPP.ipynb
-1
diff --git a/Diff for: ‎notebooks/Resnet50-example.ipynb
+10-11 b/Diff for: ‎notebooks/Resnet50-example.ipynb
+10-11
@@ -469,7 +469,6 @@ def generate_wheels_matrix(
     ret: List[Dict[str, Any]] = []
     for python_version in python_versions:
         for arch_version in arches:
-
             # TODO: Enable Python 3.13 support for ROCM
             if arch_version in ROCM_ARCHES and python_version == "3.13":
                 continue
 
@@ -134,9 +134,14 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
 
               auto eps = static_cast<float>(args[7].unwrapToDouble(1e-5f));
 
-              auto scales = args[1].unwrapToTensor(at::ones(shape[1], options)).cpu().contiguous();
-              auto bias = args[2].unwrapToTensor(at::zeros(shape[1], options)).cpu().contiguous();
-
+              auto scales = at::ones(shape[1], options);
+              if (!args[1].IValue()->isNone()) {
+                scales = args[1].unwrapToTensor(at::ones(shape[1], options)).cpu().contiguous();
+              }
+              auto bias = at::zeros(shape[1], options);
+              if (!args[2].IValue()->isNone()) {
+                bias = args[2].unwrapToTensor(at::zeros(shape[1], options)).cpu().contiguous();
+              }
               // track_running_stats=True
               if (!args[3].IValue()->isNone() || !args[4].IValue()->isNone()) {
                 auto running_mean = args[3].unwrapToTensor();
@@ -154,6 +159,8 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
                 return true;
               }
 
+              // Unclear whether this check has any effect: cudnn_enabled comes from the PyTorch context,
+              // and this converter needs cuDNN to run either way.
               auto cudnn_enabled = static_cast<bool>(args[8].unwrapToBool(false));
               if (!cudnn_enabled) {
                 LOG_DEBUG(
@@ -162,7 +169,7 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
                     so for some functionalities, users need to install correct \
                     cuDNN version by themselves. Please see our support matrix \
                     here: https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html.");
-                return false;
+                // return false;
               }
 
               const int relu = 0;
 
@@ -2,6 +2,7 @@
 
 // A collection of headers from util that will typically get included in most
 // files
+#include <cstdint>
 #include "core/util/Exception.h"
 #include "core/util/build_info.h"
 #include "core/util/jit_util.h"
 
@@ -316,7 +316,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 import cupy as cp  # Needed to work around API gaps in PyTorch to build torch.Tensors around preallocated CUDA memory
 import numpy as np
-
 import tensorrt as trt
 
 
@@ -348,7 +347,6 @@ def get_output_dimensions(
         inputs: List[trt.DimsExprs],
         exprBuilder: trt.IExprBuilder,
     ) -> trt.DimsExprs:
-
         output_dims = trt.DimsExprs(inputs[0])
 
         for i in range(np.size(self.pads) // 2):
@@ -404,7 +402,6 @@ def enqueue(
         workspace: int,
         stream: int,
     ) -> None:
-
         # Host code is slightly different as this will be run as part of the TRT execution
         in_dtype = torchtrt.dtype.try_from(input_desc[0].type).to(np.dtype)
 
@@ -528,7 +525,6 @@ def circular_padding_converter(
     kwargs: Dict[str, Argument],
     name: str,
 ):
-
     # How to retrieve a plugin if it is defined elsewhere (e.g. linked library)
     plugin_registry = trt.get_plugin_registry()
     plugin_creator = plugin_registry.get_plugin_creator(
 
@@ -316,7 +316,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 import cupy as cp  # Needed to work around API gaps in PyTorch to build torch.Tensors around preallocated CUDA memory
 import numpy as np
-
 import tensorrt as trt
 
 
@@ -348,7 +347,6 @@ def get_output_dimensions(
         inputs: List[trt.DimsExprs],
         exprBuilder: trt.IExprBuilder,
     ) -> trt.DimsExprs:
-
         output_dims = trt.DimsExprs(inputs[0])
 
         for i in range(np.size(self.pads) // 2):
@@ -404,7 +402,6 @@ def enqueue(
         workspace: int,
         stream: int,
     ) -> None:
-
         # Host code is slightly different as this will be run as part of the TRT execution
         in_dtype = torchtrt.dtype.try_from(input_desc[0].type).to(np.dtype)
 
@@ -528,7 +525,6 @@ def circular_padding_converter(
     kwargs: Dict[str, Argument],
     name: str,
 ):
-
     # How to retrieve a plugin if it is defined elsewhere (e.g. linked library)
     plugin_registry = trt.get_plugin_registry()
     plugin_creator = plugin_registry.get_plugin_creator(
 
@@ -384,12 +384,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import nemo\n",
     "import torch\n",
     "\n",
     "import nemo.collections.asr as nemo_asr\n",
     "from nemo.core import typecheck\n",
-    "typecheck.set_typecheck_enabled(False) "
+    "typecheck.set_typecheck_enabled(False)"
    ]
   },
   {
@@ -572,11 +571,8 @@
     "from __future__ import absolute_import\n",
     "from __future__ import division\n",
     "\n",
-    "import argparse\n",
     "import timeit\n",
     "import numpy as np\n",
-    "import torch\n",
-    "import torch_tensorrt as trtorch\n",
     "import torch.backends.cudnn as cudnn\n",
     "\n",
     "def benchmark(model, input_tensor, num_loops, model_name, batch_size):\n",
@@ -632,7 +628,7 @@
     "    else:\n",
     "        model_name = f\"{variant}.ts\"\n",
     "\n",
-    "    print(f\"Loading model: {model_name}\") \n",
+    "    print(f\"Loading model: {model_name}\")\n",
     "    # Load traced model to CPU first\n",
     "    model = torch.jit.load(model_name).cuda()\n",
     "    cudnn.benchmark = True\n",
@@ -727,9 +723,7 @@
    ],
    "source": [
     "import torch\n",
-    "import torch.nn as nn\n",
     "import torch_tensorrt as torchtrt\n",
-    "import argparse\n",
     "\n",
     "variant = \"stt_en_citrinet_256\"\n",
     "precisions = [torch.float, torch.half]\n",
@@ -827,7 +821,7 @@
     "    else:\n",
     "        model_name = f\"{variant}.ts\"\n",
     "\n",
-    "    print(f\"Loading model: {model_name}\") \n",
+    "    print(f\"Loading model: {model_name}\")\n",
     "    # Load traced model to CPU first\n",
     "    model = torch.jit.load(model_name).cuda()\n",
     "    cudnn.benchmark = True\n",
@@ -906,7 +900,7 @@
     "    else:\n",
     "        model_name = f\"{variant}.ts\"\n",
     "\n",
-    "    print(f\"Loading model: {model_name}\") \n",
+    "    print(f\"Loading model: {model_name}\")\n",
     "    # Load traced model to CPU first\n",
     "    model = torch.jit.load(model_name).cuda()\n",
     "    cudnn.benchmark = True\n",
 
@@ -167,7 +167,7 @@
     "import torch.backends.cudnn as cudnn\n",
     "from timm.data import resolve_data_config\n",
     "from timm.data.transforms_factory import create_transform\n",
-    "import json \n",
+    "import json\n",
     "\n",
     "efficientnet_b0_model = timm.create_model('efficientnet_b0',pretrained=True)\n",
     "model = efficientnet_b0_model.eval().to(\"cuda\")"
@@ -305,13 +305,13 @@
     "        transforms.ToTensor(),\n",
     "        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
     "    ])\n",
-    "    input_tensor = preprocess(img)      \n",
+    "    input_tensor = preprocess(img)\n",
     "    plt.subplot(2,2,i+1)\n",
     "    plt.imshow(img)\n",
     "    plt.axis('off')\n",
     "\n",
     "# loading labels\n",
-    "with open(\"./data/imagenet_class_index.json\") as json_file: \n",
+    "with open(\"./data/imagenet_class_index.json\") as json_file:\n",
     "    d = json.load(json_file)"
    ]
   },
@@ -341,7 +341,7 @@
     "    preprocess = efficientnet_preprocess()\n",
     "    input_tensor = preprocess(img)\n",
     "    input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n",
-    "    \n",
+    "\n",
     "    # move the input and model to GPU for speed if available\n",
     "    if torch.cuda.is_available():\n",
     "        input_batch = input_batch.to('cuda')\n",
@@ -351,7 +351,7 @@
     "        output = model(input_batch)\n",
     "        # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n",
     "        sm_output = torch.nn.functional.softmax(output[0], dim=0)\n",
-    "        \n",
+    "\n",
     "    ind = torch.argmax(sm_output)\n",
     "    return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n",
     "\n",
@@ -360,7 +360,7 @@
     "    input_data = input_data.to(\"cuda\")\n",
     "    if dtype=='fp16':\n",
     "        input_data = input_data.half()\n",
-    "        \n",
+    "\n",
     "    print(\"Warm up ...\")\n",
     "    with torch.no_grad():\n",
     "        for _ in range(nwarmup):\n",
@@ -430,13 +430,13 @@
     "for i in range(4):\n",
     "    img_path = './data/img%d.JPG'%i\n",
     "    img = Image.open(img_path)\n",
-    "    \n",
+    "\n",
     "    pred, prob = predict(img_path, efficientnet_b0_model)\n",
     "    print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n",
     "\n",
     "    plt.subplot(2,2,i+1)\n",
-    "    plt.imshow(img);\n",
-    "    plt.axis('off');\n",
+    "    plt.imshow(img)\n",
+    "    plt.axis('off')\n",
     "    plt.title(pred[1])"
    ]
   },
 
@@ -233,9 +233,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "masked_sentences = ['Paris is the [MASK] of France.', \n",
-    "                    'The primary [MASK] of the United States is English.', \n",
-    "                    'A baseball game consists of at least nine [MASK].', \n",
+    "masked_sentences = ['Paris is the [MASK] of France.',\n",
+    "                    'The primary [MASK] of the United States is English.',\n",
+    "                    'A baseball game consists of at least nine [MASK].',\n",
     "                    'Topology is a branch of [MASK] concerned with the properties of geometric objects that remain unchanged under continuous transformations.']\n",
     "pos_masks = [4, 3, 9, 6]"
    ]
@@ -357,7 +357,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "trt_model = torch_tensorrt.compile(traced_mlm_model, \n",
+    "trt_model = torch_tensorrt.compile(traced_mlm_model,\n",
     "    inputs= [torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32),  # input_ids\n",
     "             torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32),  # token_type_ids\n",
     "             torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32)], # attention_mask\n",
@@ -396,7 +396,7 @@
     "enc_inputs = enc(masked_sentences, return_tensors='pt', padding='max_length', max_length=128)\n",
     "enc_inputs = {k: v.type(torch.int32).cuda() for k, v in enc_inputs.items()}\n",
     "output_trt = trt_model(enc_inputs['input_ids'], enc_inputs['token_type_ids'], enc_inputs['attention_mask'])\n",
-    "most_likely_token_ids_trt = [torch.argmax(output_trt[i, pos, :]) for i, pos in enumerate(pos_masks)] \n",
+    "most_likely_token_ids_trt = [torch.argmax(output_trt[i, pos, :]) for i, pos in enumerate(pos_masks)]\n",
     "unmasked_tokens_trt = enc.decode(most_likely_token_ids_trt).split(' ')\n",
     "unmasked_sentences_trt = [masked_sentences[i].replace('[MASK]', token) for i, token in enumerate(unmasked_tokens_trt)]\n",
     "for sentence in unmasked_sentences_trt:\n",
@@ -418,7 +418,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "trt_model_fp16 = torch_tensorrt.compile(traced_mlm_model, \n",
+    "trt_model_fp16 = torch_tensorrt.compile(traced_mlm_model,\n",
     "    inputs= [torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32),  # input_ids\n",
     "             torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32),  # token_type_ids\n",
     "             torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32)], # attention_mask\n",
 
@@ -70,7 +70,6 @@
    "outputs": [],
    "source": [
     "import torch\n",
-    "import torchvision\n",
     "\n",
     "torch.hub._validate_not_a_forked_repo=lambda a,b,c: True\n",
     "\n",
 
@@ -428,7 +428,6 @@
    ],
    "source": [
     "import torch\n",
-    "import torchvision\n",
     "\n",
     "torch.hub._validate_not_a_forked_repo=lambda a,b,c: True\n",
     "\n",
@@ -558,7 +557,7 @@
     "from PIL import Image\n",
     "from torchvision import transforms\n",
     "import matplotlib.pyplot as plt\n",
-    "import json \n",
+    "import json\n",
     "\n",
     "fig, axes = plt.subplots(nrows=2, ncols=2)\n",
     "\n",
@@ -571,13 +570,13 @@
     "        transforms.ToTensor(),\n",
     "        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
     "    ])\n",
-    "    input_tensor = preprocess(img)      \n",
+    "    input_tensor = preprocess(img)\n",
     "    plt.subplot(2,2,i+1)\n",
     "    plt.imshow(img)\n",
     "    plt.axis('off')\n",
     "\n",
-    "# loading labels    \n",
-    "with open(\"./data/imagenet_class_index.json\") as json_file: \n",
+    "# loading labels\n",
+    "with open(\"./data/imagenet_class_index.json\") as json_file:\n",
     "    d = json.load(json_file)"
    ]
   },
@@ -614,7 +613,7 @@
     "    preprocess = rn50_preprocess()\n",
     "    input_tensor = preprocess(img)\n",
     "    input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n",
-    "    \n",
+    "\n",
     "    # move the input and model to GPU for speed if available\n",
     "    if torch.cuda.is_available():\n",
     "        input_batch = input_batch.to('cuda')\n",
@@ -624,7 +623,7 @@
     "        output = model(input_batch)\n",
     "        # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n",
     "        sm_output = torch.nn.functional.softmax(output[0], dim=0)\n",
-    "        \n",
+    "\n",
     "    ind = torch.argmax(sm_output)\n",
     "    return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n",
     "\n",
@@ -633,7 +632,7 @@
     "    input_data = input_data.to(\"cuda\")\n",
     "    if dtype=='fp16':\n",
     "        input_data = input_data.half()\n",
-    "        \n",
+    "\n",
     "    print(\"Warm up ...\")\n",
     "    with torch.no_grad():\n",
     "        for _ in range(nwarmup):\n",
@@ -695,13 +694,13 @@
     "for i in range(4):\n",
     "    img_path = './data/img%d.JPG'%i\n",
     "    img = Image.open(img_path)\n",
-    "    \n",
+    "\n",
     "    pred, prob = predict(img_path, resnet50_model)\n",
     "    print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n",
     "\n",
     "    plt.subplot(2,2,i+1)\n",
-    "    plt.imshow(img);\n",
-    "    plt.axis('off');\n",
+    "    plt.imshow(img)\n",
+    "    plt.axis('off')\n",
     "    plt.title(pred[1])"
    ]
   },