From 84dd6ed18c2ec7a2d8a199ec1a437d09284d86bd Mon Sep 17 00:00:00 2001 From: Balarama Raju Buddharaju Date: Wed, 7 Aug 2024 12:11:46 -0700 Subject: [PATCH] Updates for Release 10.3-GA. Signed-off-by: Balarama Raju Buddharaju --- .clang-format | 2 +- CHANGELOG.md | 18 +- LICENSE | 21 +- README.md | 18 +- VERSION | 2 +- demo/BERT/README.md | 2 +- demo/Diffusion/README.md | 26 +- demo/Diffusion/demo_img2vid.py | 117 ++++ demo/Diffusion/models.py | 389 +++++++++-- demo/Diffusion/requirements.txt | 2 +- demo/Diffusion/stable_diffusion_3_pipeline.py | 18 +- demo/Diffusion/stable_diffusion_pipeline.py | 23 +- .../stable_video_diffusion_pipeline.py | 641 ++++++++++++++++++ demo/Diffusion/utilities.py | 170 ++++- demo/Diffusion/utils_sd3/mmdit.py | 1 - docker/rockylinux8.Dockerfile | 18 +- docker/rockylinux9.Dockerfile | 18 +- docker/ubuntu-20.04.Dockerfile | 18 +- docker/ubuntu-22.04-aarch64.Dockerfile | 2 +- docker/ubuntu-22.04.Dockerfile | 18 +- docker/ubuntu-cross-aarch64.Dockerfile | 2 +- include/NvInfer.h | 85 ++- include/NvInferImpl.h | 3 + include/NvInferRuntime.h | 57 ++ include/NvInferVersion.h | 6 +- parsers/onnx | 2 +- plugin/README.md | 6 +- plugin/api/inferPlugin.cpp | 4 +- plugin/gridAnchorPlugin/README.md | 2 +- plugin/instanceNormalizationPlugin/README.md | 2 +- .../multiscaleDeformableAttnPlugin/README.md | 2 +- plugin/nvFasterRCNN/README.md | 2 +- plugin/priorBoxPlugin/README.md | 2 +- plugin/scatterElementsPlugin/README.md | 14 +- .../ScatterElementsPlugin_PluginConfig.yaml | 16 +- .../scatterElementsCommon.h | 41 ++ .../scatterElementsPlugin.cpp | 243 ++++--- .../scatterElementsPlugin.h | 119 ++-- .../scatterElementsPluginKernel.h | 1 + .../scatterElementsPluginLegacy.cpp | 316 +++++++++ .../scatterElementsPluginLegacy.h | 123 ++++ python/CMakeLists.txt | 6 + python/docstrings/infer/pyCoreDoc.h | 6 +- python/docstrings/infer/pyGraphDoc.h | 27 +- python/docstrings/infer/pyPluginDoc.h | 33 + .../bindings_wheel/tensorrt/__init__.py | 1 - 
python/src/infer/pyCore.cpp | 12 +- python/src/infer/pyGraph.cpp | 5 +- python/src/infer/pyPlugin.cpp | 113 ++- quickstart/Makefile.config | 5 +- .../SemanticSegmentation/tutorial-runtime.cpp | 47 +- .../tutorial-runtime.ipynb | 136 +++- quickstart/common/logging.h | 2 +- quickstart/common/util.h | 15 - requirements.txt | 2 +- samples/common/sampleEngines.cpp | 28 +- samples/common/sampleOptions.cpp | 12 +- samples/common/sampleOptions.h | 1 + samples/python/detectron2/requirements.txt | 3 +- samples/python/downloader.py | 32 +- samples/python/efficientdet/requirements.txt | 6 +- samples/python/efficientnet/requirements.txt | 3 +- .../python/non_zero_plugin/requirements.txt | 6 +- samples/python/python_plugin/requirements.txt | 6 +- .../tensorflow_object_detection_api/README.md | 2 + .../requirements.txt | 8 +- samples/sampleDynamicReshape/README.md | 2 +- samples/sampleINT8API/README.md | 19 +- scripts/stubify.sh | 2 + tools/Polygraphy/CHANGELOG.md | 5 + .../05_inspecting_inference_outputs/README.md | 2 +- .../06_inspecting_input_data/README.md | 2 +- .../extension_module/setup.py | 1 + tools/Polygraphy/polygraphy/README.md | 11 + tools/Polygraphy/polygraphy/__init__.py | 2 +- .../Polygraphy/polygraphy/comparator/util.py | 8 +- tools/Polygraphy/polygraphy/datatype/numpy.py | 2 +- tools/Polygraphy/polygraphy/logger/logger.py | 28 +- tools/Polygraphy/tests/conftest.py | 18 + tools/Polygraphy/tests/logger/test_logger.py | 43 ++ 80 files changed, 2701 insertions(+), 533 deletions(-) mode change 100644 => 100755 demo/Diffusion/README.md create mode 100644 demo/Diffusion/demo_img2vid.py mode change 100644 => 100755 demo/Diffusion/requirements.txt mode change 100755 => 100644 demo/Diffusion/stable_diffusion_pipeline.py create mode 100644 demo/Diffusion/stable_video_diffusion_pipeline.py create mode 100644 plugin/scatterElementsPlugin/scatterElementsCommon.h create mode 100644 plugin/scatterElementsPlugin/scatterElementsPluginLegacy.cpp create mode 100644 
plugin/scatterElementsPlugin/scatterElementsPluginLegacy.h diff --git a/.clang-format b/.clang-format index da4c3928a..4e34c05e2 100644 --- a/.clang-format +++ b/.clang-format @@ -74,7 +74,7 @@ SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 -StatementMacros: [API_ENTRY_TRY] +StatementMacros: [API_ENTRY_TRY,TRT_TRY] TabWidth: 4 UseTab: Never ... diff --git a/CHANGELOG.md b/CHANGELOG.md index ebb29fa9f..a67bb266c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,22 @@ # TensorRT OSS Release Changelog -## 10.2.0 GA - 2024-07-10 +## 10.3.0 GA - 2024-08-07 + +Key Features and Updates: + + - Demo changes + - Added [Stable Video Diffusion](demo/Diffusion)(`SVD`) pipeline. + - Plugin changes + - Deprecated Version 1 of [ScatterElements plugin](plugin/scatterElementsPlugin). It is superseded by Version 2, which implements the `IPluginV3` interface. + - Quickstart guide + - Updated the [SemanticSegmentation](quickstart/SemanticSegmentation) guide with latest APIs. + - Parser changes + - Added support for tensor `axes` inputs for `Slice` node. + - Updated `ScatterElements` importer to use Version 2 of [ScatterElements plugin](plugin/scatterElementsPlugin), which implements the `IPluginV3` interface. + - Updated tooling + - Polygraphy v0.49.13 + +## 10.2.0 GA - 2024-07-09 Key Features and Updates: diff --git a/LICENSE b/LICENSE index e80db763a..1ac1dc1f8 100644 --- a/LICENSE +++ b/LICENSE @@ -337,10 +337,11 @@ limitations under the License. > demo/Diffusion/utilities.py + > demo/Diffusion/stable_video_diffusion_pipeline.py HuggingFace diffusers library. - Copyright 2022 The HuggingFace Team. + Copyright 2024 The HuggingFace Team. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -380,3 +381,21 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + > demo/Diffusion/utilities.py + + ModelScope library. + + Copyright (c) Alibaba, Inc. and its affiliates. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 991048b96..4a120aabd 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ You can skip the **Build** section to enjoy TensorRT with Python. To build the TensorRT-OSS components, you will first need the following software packages. **TensorRT GA build** -* TensorRT v10.2.0.19 +* TensorRT v10.3.0.26 * Available from direct download links listed below **System Packages** @@ -73,25 +73,25 @@ To build the TensorRT-OSS components, you will first need the following software If using the TensorRT OSS build container, TensorRT libraries are preinstalled under `/usr/lib/x86_64-linux-gnu` and you may skip this step. 
Else download and extract the TensorRT GA build from [NVIDIA Developer Zone](https://developer.nvidia.com) with the direct links below: - - [TensorRT 10.2.0.19 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz) - - [TensorRT 10.2.0.19 for CUDA 12.5, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz) - - [TensorRT 10.2.0.19 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows.win10.cuda-11.8.zip) - - [TensorRT 10.2.0.19 for CUDA 12.5, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows.win10.cuda-12.5.zip) + - [TensorRT 10.3.0.26 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz) + - [TensorRT 10.3.0.26 for CUDA 12.5, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz) + - [TensorRT 10.3.0.26 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/zip/TensorRT-10.3.0.26.Windows.win10.cuda-11.8.zip) + - [TensorRT 10.3.0.26 for CUDA 12.5, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/zip/TensorRT-10.3.0.26.Windows.win10.cuda-12.5.zip) **Example: Ubuntu 20.04 on x86-64 with cuda-12.5** ```bash cd ~/Downloads - tar -xvzf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz - export TRT_LIBPATH=`pwd`/TensorRT-10.2.0.19 + tar -xvzf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz + export TRT_LIBPATH=`pwd`/TensorRT-10.3.0.26 ``` **Example: Windows on x86-64 with cuda-12.5** 
```powershell - Expand-Archive -Path TensorRT-10.2.0.19.Windows.win10.cuda-12.5.zip - $env:TRT_LIBPATH="$pwd\TensorRT-10.2.0.19\lib" + Expand-Archive -Path TensorRT-10.3.0.26.Windows.win10.cuda-12.5.zip + $env:TRT_LIBPATH="$pwd\TensorRT-10.3.0.26\lib" ``` ## Setting Up The Build Environment diff --git a/VERSION b/VERSION index 0afb8eb4b..92bc5b535 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -10.2.0.19 +10.3.0.26 diff --git a/demo/BERT/README.md b/demo/BERT/README.md index 68cc327ce..b48bd8be5 100755 --- a/demo/BERT/README.md +++ b/demo/BERT/README.md @@ -75,7 +75,7 @@ The following software version configuration has been tested: |Software|Version| |--------|-------| |Python|>=3.8| -|TensorRT|10.2.0.19| +|TensorRT|10.3.0.26| |CUDA|12.5| ## Setup diff --git a/demo/Diffusion/README.md b/demo/Diffusion/README.md old mode 100644 new mode 100755 index 5016a12e3..469f0b268 --- a/demo/Diffusion/README.md +++ b/demo/Diffusion/README.md @@ -48,14 +48,14 @@ onnx 1.15.0 onnx-graphsurgeon 0.5.2 onnxruntime 1.16.3 polygraphy 0.49.9 -tensorrt 10.2.0.19 +tensorrt 10.3.0.26 tokenizers 0.13.3 torch 2.2.0 transformers 4.33.1 controlnet-aux 0.0.6 nvidia-modelopt 0.11.2 ``` -> NOTE: optionally install HuggingFace [accelerate](https://pypi.org/project/accelerate/) package for faster and less memory-intense model loading. +> NOTE: optionally install HuggingFace [accelerate](https://pypi.org/project/accelerate/) package for faster and less memory-intense model loading. Note that installing accelerate is known to cause failures while running certain pipelines in Torch Compile mode ([known issue](https://github.com/huggingface/diffusers/issues/9091)) # Running demoDiffusion @@ -178,6 +178,28 @@ python3 demo_txt2img_sd3.py "dog wearing a sweater and a blue collar" --version Note that a denosing-percentage is applied to the number of denoising-steps when an input image conditioning is provided. Its default value is set to 0.6. 
This parameter can be updated using `--denoising-percentage` +### Image-to-video using SVD (Stable Video Diffusion) + +Download the pre-exported ONNX model + +```bash +git lfs install +git clone https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1-tensorrt onnx-svd-xt-1-1 +cd onnx-svd-xt-1-1 && git lfs pull && cd .. +``` + +SVD-XT-1.1 (25 frames at resolution 576x1024) +```bash +python3 demo_img2vid.py --version svd-xt-1.1 --onnx-dir onnx-svd-xt-1-1 --engine-dir engine-svd-xt-1-1 --hf-token=$HF_TOKEN +``` + +You may also specify a custom conditioning image using `--input-image`: +```bash +python3 demo_img2vid.py --version svd-xt-1.1 --onnx-dir onnx-svd-xt-1-1 --engine-dir engine-svd-xt-1-1 --input-image https://www.hdcarwallpapers.com/walls/2018_chevrolet_camaro_zl1_nascar_race_car_2-HD.jpg --hf-token=$HF_TOKEN +``` + +NOTE: The min and max guidance scales are configured using --min-guidance-scale and --max-guidance-scale respectively. + ## Configuration options - Noise scheduler can be set using `--scheduler `. Note: not all schedulers are available for every version. - To accelerate engine building time use `--timing-cache `. The cache file will be created if it does not already exist. Note that performance may degrade if cache files are used across multiple GPU targets. It is recommended to use timing caches only during development. To achieve the best perfromance in deployment, please build engines without timing cache. diff --git a/demo/Diffusion/demo_img2vid.py b/demo/Diffusion/demo_img2vid.py new file mode 100644 index 000000000..7d7b2b048 --- /dev/null +++ b/demo/Diffusion/demo_img2vid.py @@ -0,0 +1,117 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse + +from PIL import Image + +from stable_video_diffusion_pipeline import StableVideoDiffusionPipeline +from utilities import ( + PIPELINE_TYPE, + add_arguments, + download_image, +) + +def parseArgs(): + parser = argparse.ArgumentParser(description="Options for Stable Diffusion Img2Vid Demo", conflict_handler='resolve') + parser = add_arguments(parser) + parser.add_argument('--version', type=str, default="svd-xt-1.1", choices=["svd-xt-1.1"], help="Version of Stable Video Diffusion") + parser.add_argument('--input-image', type=str, default="", help="Path to the input image") + parser.add_argument('--height', type=int, default=576, help="Height of image to generate (must be multiple of 8)") + parser.add_argument('--width', type=int, default=1024, help="Width of image to generate (must be multiple of 8)") + parser.add_argument('--min-guidance-scale', type=float, default=1.0, help="The minimum guidance scale. Used for the classifier free guidance with first frame") + parser.add_argument('--max-guidance-scale', type=float, default=3.0, help="The maximum guidance scale. 
Used for the classifier free guidance with last frame") + parser.add_argument('--denoising-steps', type=int, default=25, help="Number of denoising steps") + parser.add_argument('--num-warmup-runs', type=int, default=1, help="Number of warmup runs before benchmarking performance") + return parser.parse_args() + +def process_pipeline_args(args): + + if not args.input_image: + args.input_image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png?download=true" + if isinstance(args.input_image, str): + input_image = download_image(args.input_image).resize((args.width, args.height)) + elif isinstance(args.input_image, Image.Image): + input_image = Image.open(args.input_image) + else: + raise ValueError(f"Input image(s) must be of type `PIL.Image.Image` or `str` (URL) but is {type(args.input_image)}") + + if args.height % 8 != 0 or args.width % 8 != 0: + raise ValueError(f"Image height and width have to be divisible by 8 but are: {args.image_height} and {args.width}.") + + # TODO enable BS>1 + max_batch_size = 1 + args.build_static_batch = True + + if args.batch_size > max_batch_size: + raise ValueError(f"Batch size {args.batch_size} is larger than allowed {max_batch_size}.") + + if not args.build_static_batch or args.build_dynamic_shape: + raise ValueError(f"Dynamic shapes not supported. 
Do not specify `--build-dynamic-shape`") + + kwargs_init_pipeline = { + 'version': args.version, + 'max_batch_size': max_batch_size, + 'denoising_steps': args.denoising_steps, + 'scheduler': args.scheduler, + 'min_guidance_scale': args.min_guidance_scale, + 'max_guidance_scale': args.max_guidance_scale, + 'output_dir': args.output_dir, + 'hf_token': args.hf_token, + 'verbose': args.verbose, + 'nvtx_profile': args.nvtx_profile, + 'use_cuda_graph': args.use_cuda_graph, + 'framework_model_dir': args.framework_model_dir, + 'torch_inference': args.torch_inference, + } + + kwargs_load_engine = { + 'onnx_opset': args.onnx_opset, + 'opt_batch_size': args.batch_size, + 'opt_image_height': args.height, + 'opt_image_width': args.width, + 'static_batch': args.build_static_batch, + 'static_shape': not args.build_dynamic_shape, + 'enable_all_tactics': args.build_all_tactics, + 'enable_refit': args.build_enable_refit, + 'timing_cache': args.timing_cache, + } + + args_run_demo = (input_image, args.height, args.width, args.batch_size, args.batch_count, args.num_warmup_runs, args.use_cuda_graph) + + return kwargs_init_pipeline, kwargs_load_engine, args_run_demo + +if __name__ == "__main__": + print("[I] Initializing StableDiffusion img2vid demo using TensorRT") + args = parseArgs() + kwargs_init_pipeline, kwargs_load_engine, args_run_demo = process_pipeline_args(args) + + # Initialize demo + demo = StableVideoDiffusionPipeline( + pipeline_type=PIPELINE_TYPE.IMG2VID, + **kwargs_init_pipeline) + demo.loadEngines( + args.engine_dir, + args.framework_model_dir, + args.onnx_dir, + **kwargs_load_engine) + demo.loadResources(args.height, args.width, args.batch_size, args.seed) + + # Run inference + demo.run(*args_run_demo) + + demo.teardown() diff --git a/demo/Diffusion/models.py b/demo/Diffusion/models.py index 162eb6adc..d9d82b69d 100644 --- a/demo/Diffusion/models.py +++ b/demo/Diffusion/models.py @@ -19,8 +19,10 @@ from diffusers.loaders import LoraLoaderMixin from diffusers.models 
import ( AutoencoderKL, + AutoencoderKLTemporalDecoder, ControlNetModel, - UNet2DConditionModel + UNet2DConditionModel, + UNetSpatioTemporalConditionModel, ) import json import numpy as np @@ -35,9 +37,11 @@ import torch import torch.nn.functional as F from transformers import ( + CLIPImageProcessor, CLIPTextModel, CLIPTextModelWithProjection, - CLIPTokenizer + CLIPTokenizer, + CLIPVisionModelWithProjection, ) from huggingface_hub import hf_hub_download from utilities import merge_loras @@ -168,49 +172,38 @@ def fuse_mha_qkv_int8_sq(self): def get_path(version, pipeline, controlnets=None): if controlnets is not None: return ["lllyasviel/sd-controlnet-" + modality for modality in controlnets] - - if version == "1.4": - if pipeline.is_inpaint(): - return "runwayml/stable-diffusion-inpainting" - else: - return "CompVis/stable-diffusion-v1-4" + + if version in ("1.4", "1.5") and pipeline.is_inpaint(): + return "runwayml/stable-diffusion-inpainting" + elif version == "1.4": + return "CompVis/stable-diffusion-v1-4" elif version == "1.5": - if pipeline.is_inpaint(): - return "runwayml/stable-diffusion-inpainting" - else: - return "runwayml/stable-diffusion-v1-5" + return "runwayml/stable-diffusion-v1-5" elif version == 'dreamshaper-7': return 'Lykon/dreamshaper-7' + elif version in ("2.0-base", "2.0") and pipeline.is_inpaint(): + return "stabilityai/stable-diffusion-2-inpainting" elif version == "2.0-base": - if pipeline.is_inpaint(): - return "stabilityai/stable-diffusion-2-inpainting" - else: - return "stabilityai/stable-diffusion-2-base" + return "stabilityai/stable-diffusion-2-base" elif version == "2.0": - if pipeline.is_inpaint(): - return "stabilityai/stable-diffusion-2-inpainting" - else: - return "stabilityai/stable-diffusion-2" - elif version == "2.1": - return "stabilityai/stable-diffusion-2-1" + return "stabilityai/stable-diffusion-2" elif version == "2.1-base": return "stabilityai/stable-diffusion-2-1-base" - elif version == 'xl-1.0': - if 
pipeline.is_sd_xl_base(): - return "stabilityai/stable-diffusion-xl-base-1.0" - elif pipeline.is_sd_xl_refiner(): - return "stabilityai/stable-diffusion-xl-refiner-1.0" - else: - raise ValueError(f"Unsupported SDXL 1.0 pipeline {pipeline.name}") - elif version == 'xl-turbo': - if pipeline.is_sd_xl_base(): - return "stabilityai/sdxl-turbo" - else: - raise ValueError(f"Unsupported SDXL Turbo pipeline {pipeline.name}") + elif version == "2.1": + return "stabilityai/stable-diffusion-2-1" + elif version == 'xl-1.0' and pipeline.is_sd_xl_base(): + return "stabilityai/stable-diffusion-xl-base-1.0" + elif version == 'xl-1.0' and pipeline.is_sd_xl_refiner(): + return "stabilityai/stable-diffusion-xl-refiner-1.0" + # TODO SDXL turbo with refiner + elif version == 'xl-turbo' and pipeline.is_sd_xl_base(): + return "stabilityai/sdxl-turbo" elif version == 'sd3': return "stabilityai/stable-diffusion-3-medium" + elif version == 'svd-xt-1.1' and pipeline.is_img2vid(): + return "stabilityai/stable-video-diffusion-img2vid-xt-1-1" else: - raise ValueError(f"Incorrect version {version}") + raise ValueError(f"Unsupported version {version} + pipeline {pipeline.name}") def get_clip_embedding_dim(version, pipeline): if version in ("1.4", "1.5", "dreamshaper-7"): @@ -239,11 +232,13 @@ def get_unet_embedding_dim(version, pipeline): return 2048 elif version in ("xl-1.0", "xl-turbo") and pipeline.is_sd_xl_refiner(): return 1280 + elif pipeline.is_img2vid(): + return 1024 else: raise ValueError(f"Invalid version {version} + pipeline {pipeline}") -# FIXME after serialization support for torch.compile is added -def get_checkpoint_dir(framework_model_dir, version, pipeline, subfolder, torch_inference): +# FIXME serialization not supported for torch.compile +def get_checkpoint_dir(framework_model_dir, version, pipeline, subfolder): return os.path.join(framework_model_dir, version, pipeline, subfolder) torch_inference_modes = ['default', 'reduce-overhead', 'max-autotune'] @@ -400,14 +395,14 @@ def 
export_onnx(model): model = merge_loras(model, self.lora_dict, self.lora_alphas, self.lora_scales) inputs = self.get_sample_input(1, opt_image_height, opt_image_width, static_shape) torch.onnx.export(model, - inputs, - onnx_path, - export_params=True, - opset_version=onnx_opset, - do_constant_folding=True, - input_names=self.get_input_names(), - output_names=self.get_output_names(), - dynamic_axes=self.get_dynamic_axes(), + inputs, + onnx_path, + export_params=True, + opset_version=onnx_opset, + do_constant_folding=True, + input_names=self.get_input_names(), + output_names=self.get_output_names(), + dynamic_axes=self.get_dynamic_axes(), ) if custom_model: with torch.inference_mode(): @@ -544,7 +539,7 @@ def __init__(self, self.extra_output_names = ['hidden_states'] def get_model(self, torch_inference=''): - clip_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + clip_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) if not os.path.exists(clip_model_dir): model = CLIPTextModel.from_pretrained(self.path, subfolder=self.subfolder, @@ -552,7 +547,7 @@ def get_model(self, torch_inference=''): use_auth_token=self.hf_token).to(self.device) model.save_pretrained(clip_model_dir) else: - print(f"[I] Load CLIP pytorch model from: {clip_model_dir}") + print(f"[I] Load CLIPTextModel model from: {clip_model_dir}") model = CLIPTextModel.from_pretrained(clip_model_dir).to(self.device) model = optimize_checkpoint(model, torch_inference) return model @@ -630,7 +625,7 @@ def __init__(self, self.subfolder = subfolder def get_model(self, torch_inference=''): - clip_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + clip_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) if not os.path.exists(clip_model_dir): model = 
CLIPTextModelWithProjection.from_pretrained(self.path, subfolder=self.subfolder, @@ -638,7 +633,7 @@ def get_model(self, torch_inference=''): use_auth_token=self.hf_token).to(self.device) model.save_pretrained(clip_model_dir) else: - print(f"[I] Load CLIP pytorch model from: {clip_model_dir}") + print(f"[I] Load CLIPTextModelWithProjection model from: {clip_model_dir}") model = CLIPTextModelWithProjection.from_pretrained(clip_model_dir).to(self.device) model = optimize_checkpoint(model, torch_inference) return model @@ -664,10 +659,10 @@ def __init__(self, verbose, framework_model_dir, max_batch_size, - embedding_dim, + embedding_dim=None, fp16=False, + pooled_output=False, ): - super(SD3_CLIPGModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, max_batch_size=max_batch_size, embedding_dim=embedding_dim) self.CLIPG_CONFIG = { "hidden_act": "gelu", "hidden_size": 1280, @@ -675,17 +670,20 @@ def __init__(self, "num_attention_heads": 20, "num_hidden_layers": 32 } + super(SD3_CLIPGModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, max_batch_size=max_batch_size, embedding_dim=self.CLIPG_CONFIG["hidden_size"] if embedding_dim is None else embedding_dim) self.subfolder = 'text_encoders' + if pooled_output: + self.extra_output_names = ['pooled_output'] def get_model(self, torch_inference=''): - clip_g_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + clip_g_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) clip_g_filename="clip_g.safetensors" clip_g_model_path = f"{clip_g_model_dir}/{clip_g_filename}" if not os.path.exists(clip_g_model_path): hf_hub_download( repo_id=self.path, filename=clip_g_filename, - local_dir=get_checkpoint_dir(self.framework_model_dir, 
self.version, self.pipeline, '', torch_inference), + local_dir=get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, ''), subfolder=self.subfolder ) with safe_open(clip_g_model_path, framework="pt", device=self.device) as f: @@ -695,7 +693,33 @@ def get_model(self, torch_inference=''): model = optimize_checkpoint(model, torch_inference) return model -class SD3_CLIPLModel(CLIPModel): + def get_shape_dict(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + output = { + 'input_ids': (batch_size, self.text_maxlen), + 'text_embeddings': (batch_size, self.text_maxlen, self.embedding_dim) + } + if 'pooled_output' in self.extra_output_names: + output["pooled_output"] = (batch_size, self.embedding_dim) + + return output + + def optimize(self, onnx_graph): + opt = Optimizer(onnx_graph, verbose=self.verbose) + opt.info(self.name + ': original') + opt.select_outputs([0, 1]) + opt.cleanup() + opt.fold_constants() + opt.info(self.name + ': fold constants') + opt.infer_shapes() + opt.info(self.name + ': shape inference') + opt.select_outputs([0, 1], names=['text_embeddings', 'pooled_output']) # rename network output + opt.info(self.name + ': rename output[0] and output[1]') + opt_onnx_graph = opt.cleanup(return_onnx=True) + opt.info(self.name + ': finished') + return opt_onnx_graph + +class SD3_CLIPLModel(SD3_CLIPGModel): def __init__(self, version, pipeline, @@ -704,10 +728,9 @@ def __init__(self, verbose, framework_model_dir, max_batch_size, - embedding_dim, fp16=False, + pooled_output=False, ): - super(SD3_CLIPLModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, max_batch_size=max_batch_size, embedding_dim=embedding_dim) self.CLIPL_CONFIG = { "hidden_act": "quick_gelu", "hidden_size": 768, @@ -715,17 +738,20 @@ def __init__(self, "num_attention_heads": 12, "num_hidden_layers": 12 } + super(SD3_CLIPLModel, 
self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, max_batch_size=max_batch_size, embedding_dim=self.CLIPL_CONFIG["hidden_size"]) self.subfolder = 'text_encoders' + if pooled_output: + self.extra_output_names = ['pooled_output'] def get_model(self, torch_inference=''): - clip_l_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + clip_l_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) clip_l_filename="clip_l.safetensors" clip_l_model_path = f"{clip_l_model_dir}/{clip_l_filename}" if not os.path.exists(clip_l_model_path): hf_hub_download( repo_id=self.path, filename=clip_l_filename, - local_dir=get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, '', torch_inference), + local_dir=get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, ''), subfolder=self.subfolder ) with safe_open(clip_l_model_path, framework="pt", device=self.device) as f: @@ -758,14 +784,14 @@ def __init__(self, self.subfolder = 'text_encoders' def get_model(self, torch_inference=''): - t5xxl_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + t5xxl_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) t5xxl_filename="t5xxl_fp16.safetensors" t5xxl_model_path = f"{t5xxl_model_dir}/{t5xxl_filename}" if not os.path.exists(t5xxl_model_path): hf_hub_download( repo_id=self.path, filename=t5xxl_filename, - local_dir=get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, '', torch_inference), + local_dir=get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, ''), subfolder=self.subfolder ) with safe_open(t5xxl_model_path, framework="pt", device=self.device) as f: @@ -775,6 +801,67 @@ def get_model(self, 
torch_inference=''): model = optimize_checkpoint(model, torch_inference) return model +class CLIPVisionWithProjModel(BaseModel): + def __init__(self, + version, + pipeline, + device, + hf_token, + verbose, + framework_model_dir, + max_batch_size=1, + subfolder="image_encoder", + ): + + super(CLIPVisionWithProjModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, max_batch_size=max_batch_size) + self.subfolder = subfolder + + def get_model(self, torch_inference=''): + clip_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) + if not os.path.exists(clip_model_dir): + model = CLIPVisionModelWithProjection.from_pretrained(self.path, + subfolder=self.subfolder, + use_safetensors=self.hf_safetensor, + use_auth_token=self.hf_token).to(self.device) + model.save_pretrained(clip_model_dir) + else: + print(f"[I] Load CLIPVisionModelWithProjection model from: {clip_model_dir}") + model = CLIPVisionModelWithProjection.from_pretrained(clip_model_dir).to(self.device) + model = optimize_checkpoint(model, torch_inference) + return model + + +class CLIPImageProcessorModel(BaseModel): + def __init__(self, + version, + pipeline, + device, + hf_token, + verbose, + framework_model_dir, + max_batch_size=1, + subfolder="feature_extractor", + ): + + super(CLIPImageProcessorModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, max_batch_size=max_batch_size) + self.subfolder = subfolder + + def get_model(self, torch_inference=''): + clip_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) + # NOTE to(device) not supported + if not os.path.exists(clip_model_dir): + model = CLIPImageProcessor.from_pretrained(self.path, + subfolder=self.subfolder, + use_safetensors=self.hf_safetensor, + use_auth_token=self.hf_token) + 
model.save_pretrained(clip_model_dir) + else: + print(f"[I] Load CLIPImageProcessor model from: {clip_model_dir}") + model = CLIPImageProcessor.from_pretrained(clip_model_dir) + model = optimize_checkpoint(model, torch_inference) + return model + + class UNet2DConditionControlNetModel(torch.nn.Module): def __init__(self, unet, controlnets) -> None: super().__init__() @@ -858,7 +945,7 @@ def get_model(self, torch_inference=''): # FIXME - cache UNet2DConditionControlNetModel model = UNet2DConditionControlNetModel(unet_model, controlnets) else: - unet_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + unet_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) unet_path = self.get_model_path(unet_model_dir, model_opts) if not os.path.exists(unet_path): model = UNet2DConditionModel.from_pretrained(self.path, @@ -868,7 +955,7 @@ def get_model(self, torch_inference=''): **model_opts).to(self.device) model.save_pretrained(unet_model_dir, **model_opts) else: - print(f"[I] Load UNet pytorch model from: {unet_path}") + print(f"[I] Load UNet2DConditionModel model from: {unet_path}") model = UNet2DConditionModel.from_pretrained(unet_model_dir, **model_opts).to(self.device) if torch_inference: model.to(memory_format=torch.channels_last) @@ -996,7 +1083,7 @@ def __init__(self, def get_model(self, torch_inference=''): model_opts = {'variant': 'fp16', 'torch_dtype': torch.float16} if self.fp16 else {} - unet_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + unet_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) unet_path = self.get_model_path(unet_model_dir, model_opts) if not os.path.exists(unet_path): model = UNet2DConditionModel.from_pretrained(self.path, @@ -1009,7 +1096,7 @@ def get_model(self, torch_inference=''): 
model.set_default_attn_processor() model.save_pretrained(unet_model_dir, **model_opts) else: - print(f"[I] Load UNet pytorch model from: {unet_path}") + print(f"[I] Load UNet2DConditionModel model from: {unet_path}") model = UNet2DConditionModel.from_pretrained(unet_model_dir, **model_opts).to(self.device) model = optimize_checkpoint(model, torch_inference) return model @@ -1100,7 +1187,7 @@ def __init__(self, self.xB = 2 def get_model(self, torch_inference=''): - sd3_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + sd3_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) sd3_filename="sd3_medium.safetensors" sd3_model_path = f"{sd3_model_dir}/{sd3_filename}" if not os.path.exists(sd3_model_path): @@ -1160,6 +1247,92 @@ def get_sample_input(self, batch_size, image_height, image_width, static_shape): } ) + +class UNetTemporalModel(BaseModel): + def __init__(self, + version, + pipeline, + device, + hf_token, + verbose, + framework_model_dir, + fp16 = False, + max_batch_size = 16, + num_frames = 14, + do_classifier_free_guidance = True, + ): + super(UNetTemporalModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, max_batch_size=max_batch_size, embedding_dim=get_unet_embedding_dim(version, pipeline)) + self.subfolder = 'unet' + self.unet_dim = 4 + self.num_frames = num_frames + self.out_channels = 4 + self.cross_attention_dim = 1024 + self.xB = 2 if do_classifier_free_guidance else 1 # batch multiplier + + def get_model(self, torch_inference=''): + model_opts = {'variant': 'fp16', 'torch_dtype': torch.float16} if self.fp16 else {} + unet_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) + unet_path = self.get_model_path(unet_model_dir, model_opts) + if not os.path.exists(unet_path): + model = 
UNetSpatioTemporalConditionModel.from_pretrained(self.path, + subfolder=self.subfolder, + use_safetensors=self.hf_safetensor, + use_auth_token=self.hf_token, + **model_opts).to(self.device) + model.save_pretrained(unet_model_dir, **model_opts) + else: + print(f"[I] Load UNetSpatioTemporalConditionModel model from: {unet_path}") + model = UNetSpatioTemporalConditionModel.from_pretrained(unet_model_dir, **model_opts).to(self.device) + model = optimize_checkpoint(model, torch_inference) + return model + + def get_input_names(self): + return ['sample', 'timestep', 'encoder_hidden_states', 'added_time_ids'] + + def get_output_names(self): + return ['latent'] + + def get_dynamic_axes(self): + xB = str(self.xB)+'B' + return { + 'sample': {0: xB, 1: 'num_frames', 3: 'H', 4: 'W'}, + 'encoder_hidden_states': {0: xB}, + 'added_time_ids': {0: xB} + } + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width, min_latent_height, max_latent_height, min_latent_width, max_latent_width = \ + self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + 'sample': [(self.xB*min_batch, self.num_frames, 2*self.out_channels, min_latent_height, min_latent_width), (self.xB*batch_size, self.num_frames, 2*self.out_channels, latent_height, latent_width), (self.xB*max_batch, self.num_frames, 2*self.out_channels, max_latent_height, max_latent_width)], + 'encoder_hidden_states': [(self.xB*min_batch, 1, self.cross_attention_dim), (self.xB*batch_size, 1, self.cross_attention_dim), (self.xB*max_batch, 1, self.cross_attention_dim)], + 'added_time_ids': [(self.xB*min_batch, 3), (self.xB*batch_size, 3), (self.xB*max_batch, 3)], + } + + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = 
self.check_dims(batch_size, image_height, image_width) + return { + 'sample': (self.xB*batch_size, self.num_frames, 2*self.out_channels, latent_height, latent_width), + 'timestep': (1,), + 'encoder_hidden_states': (self.xB*batch_size, 1, self.cross_attention_dim), + 'added_time_ids': (self.xB*batch_size, 3), + } + + def get_sample_input(self, batch_size, image_height, image_width): + # TODO chunk_size if forward_chunking is used + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + + dtype = torch.float16 if self.fp16 else torch.float32 + return ( + torch.randn(self.xB*batch_size, self.num_frames, 2*self.out_channels, latent_height, latent_width, dtype=dtype, device=self.device), + torch.tensor([1.], dtype=torch.float32, device=self.device), + torch.randn(self.xB*batch_size, 1, self.cross_attention_dim, dtype=dtype, device=self.device), + torch.randn(self.xB*batch_size, 3, dtype=dtype, device=self.device), + ) + + class VAEModel(BaseModel): def __init__(self, version, @@ -1175,7 +1348,7 @@ def __init__(self, self.subfolder = 'vae' def get_model(self, torch_inference=''): - vae_decoder_model_path = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + vae_decoder_model_path = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) if not os.path.exists(vae_decoder_model_path): model = AutoencoderKL.from_pretrained(self.path, subfolder=self.subfolder, @@ -1183,7 +1356,7 @@ def get_model(self, torch_inference=''): use_auth_token=self.hf_token).to(self.device) model.save_pretrained(vae_decoder_model_path) else: - print(f"[I] Load VAE decoder pytorch model from: {vae_decoder_model_path}") + print(f"[I] Load AutoencoderKL (decoder) model from: {vae_decoder_model_path}") model = AutoencoderKL.from_pretrained(vae_decoder_model_path).to(self.device) model.forward = model.decode model = optimize_checkpoint(model, torch_inference) @@ -1236,7 +1409,7 @@ 
def __init__(self, def get_model(self, torch_inference=''): dtype = torch.float16 if self.fp16 else torch.float32 - sd3_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + sd3_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) sd3_filename="sd3_medium.safetensors" sd3_model_path = f"{sd3_model_dir}/{sd3_filename}" if not os.path.exists(sd3_model_path): @@ -1271,10 +1444,80 @@ def get_sample_input(self, batch_size, image_height, image_width, static_shape): dtype = torch.float16 if self.fp16 else torch.float32 return torch.randn(batch_size, 16, latent_height, latent_width, dtype=dtype, device=self.device) +class VAEDecTemporalModel(BaseModel): + def __init__(self, + version, + pipeline, + device, + hf_token, + verbose, + framework_model_dir, + max_batch_size = 16, + decode_chunk_size = 14, + ): + super(VAEDecTemporalModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, max_batch_size=max_batch_size) + self.subfolder = 'vae' + self.decode_chunk_size = decode_chunk_size + + def get_model(self, torch_inference=''): + vae_decoder_model_path = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) + if not os.path.exists(vae_decoder_model_path): + model = AutoencoderKLTemporalDecoder.from_pretrained(self.path, + subfolder=self.subfolder, + use_safetensors=self.hf_safetensor, + use_auth_token=self.hf_token).to(self.device) + model.save_pretrained(vae_decoder_model_path) + else: + print(f"[I] Load AutoencoderKLTemporalDecoder model from: {vae_decoder_model_path}") + model = AutoencoderKLTemporalDecoder.from_pretrained(vae_decoder_model_path).to(self.device) + model.forward = model.decode + model = optimize_checkpoint(model, torch_inference) + return model + + def get_input_names(self): + return ['latent', 'num_frames_in'] + + def 
get_output_names(self): + return ['frames'] + + def get_dynamic_axes(self): + return { + 'latent': {0: 'num_frames_in', 2: 'H', 3: 'W'}, + 'frames': {0: 'num_frames_in', 2: '8H', 3: '8W'} + } + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + assert batch_size == 1 + _, _, _, _, _, _, min_latent_height, max_latent_height, min_latent_width, max_latent_width = \ + self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + 'latent': [(1, 4, min_latent_height, min_latent_width), (self.decode_chunk_size, 4, latent_height, latent_width), (self.decode_chunk_size, 4, max_latent_height, max_latent_width)], + 'num_frames_in': [(1,), (1,), (1,)], + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + assert batch_size == 1 + return { + 'latent': (self.decode_chunk_size, 4, latent_height, latent_width), + #'num_frames_in': (1,), + 'frames': (self.decode_chunk_size, 3, image_height, image_width) + } + + def get_sample_input(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + assert batch_size == 1 + return ( + torch.randn(self.decode_chunk_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device), + self.decode_chunk_size, + ) + + class TorchVAEEncoder(torch.nn.Module): def __init__(self, version, pipeline, hf_token, device, path, framework_model_dir, hf_safetensor=False): super().__init__() - vae_encoder_model_dir = get_checkpoint_dir(framework_model_dir, version, pipeline, 'vae_encoder', '') + vae_encoder_model_dir = get_checkpoint_dir(framework_model_dir, version, pipeline, 'vae_encoder') if not os.path.exists(vae_encoder_model_dir): self.vae_encoder = AutoencoderKL.from_pretrained(path, 
subfolder='vae', @@ -1282,7 +1525,7 @@ def __init__(self, version, pipeline, hf_token, device, path, framework_model_di use_auth_token=hf_token).to(device) self.vae_encoder.save_pretrained(vae_encoder_model_dir) else: - print(f"[I] Load VAE encoder pytorch model from: {vae_encoder_model_dir}") + print(f"[I] Load AutoencoderKL (encoder) model from: {vae_encoder_model_dir}") self.vae_encoder = AutoencoderKL.from_pretrained(vae_encoder_model_dir).to(device) def forward(self, x): @@ -1357,7 +1600,7 @@ def __init__(self, def get_model(self, torch_inference=''): dtype = torch.float16 if self.fp16 else torch.float32 - sd3_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder, torch_inference) + sd3_model_dir = get_checkpoint_dir(self.framework_model_dir, self.version, self.pipeline, self.subfolder) sd3_filename="sd3_medium.safetensors" sd3_model_path = f"{sd3_model_dir}/{sd3_filename}" if not os.path.exists(sd3_model_path): @@ -1391,7 +1634,7 @@ def get_sample_input(self, batch_size, image_height, image_width, static_shape): return torch.randn(batch_size, 3, image_height, image_width, dtype=dtype, device=self.device) def make_tokenizer(version, pipeline, hf_token, framework_model_dir, subfolder="tokenizer", **kwargs): - tokenizer_model_dir = get_checkpoint_dir(framework_model_dir, version, pipeline.name, subfolder, '') + tokenizer_model_dir = get_checkpoint_dir(framework_model_dir, version, pipeline.name, subfolder) if not os.path.exists(tokenizer_model_dir): model = CLIPTokenizer.from_pretrained(get_path(version, pipeline), subfolder=subfolder, @@ -1399,6 +1642,6 @@ def make_tokenizer(version, pipeline, hf_token, framework_model_dir, subfolder=" use_auth_token=hf_token) model.save_pretrained(tokenizer_model_dir) else: - print(f"[I] Load tokenizer pytorch model from: {tokenizer_model_dir}") + print(f"[I] Load CLIPTokenizer model from: {tokenizer_model_dir}") model = CLIPTokenizer.from_pretrained(tokenizer_model_dir) return 
model diff --git a/demo/Diffusion/requirements.txt b/demo/Diffusion/requirements.txt old mode 100644 new mode 100755 index 8874c6618..fc2979b51 --- a/demo/Diffusion/requirements.txt +++ b/demo/Diffusion/requirements.txt @@ -9,7 +9,7 @@ onnx==1.15.0 onnxruntime==1.16.3 opencv-python==4.8.0.74 scipy -transformers==4.33.1 +transformers==4.36.2 --extra-index-url https://pypi.nvidia.com nvidia-modelopt==0.11.2 onnx-graphsurgeon diff --git a/demo/Diffusion/stable_diffusion_3_pipeline.py b/demo/Diffusion/stable_diffusion_3_pipeline.py index 33059ea43..ea691e966 100644 --- a/demo/Diffusion/stable_diffusion_3_pipeline.py +++ b/demo/Diffusion/stable_diffusion_3_pipeline.py @@ -250,15 +250,14 @@ def loadEngines( self.tokenizer = SD3Tokenizer() # Load text encoders - embedding_dim = get_clip_embedding_dim(self.version, self.pipeline_type) if 'clip_g' in self.stages: - self.models['clip_g'] = SD3_CLIPGModel(**models_args, fp16=True, embedding_dim=embedding_dim) + self.models['clip_g'] = SD3_CLIPGModel(**models_args, fp16=True, pooled_output=True) if 'clip_l' in self.stages: - self.models['clip_l'] = SD3_CLIPLModel(**models_args, fp16=True, embedding_dim=embedding_dim) + self.models['clip_l'] = SD3_CLIPLModel(**models_args, fp16=True, pooled_output=True) if 't5xxl' in self.stages: - self.models['t5xxl'] = SD3_T5XXLModel(**models_args, fp16=True, embedding_dim=embedding_dim) + self.models['t5xxl'] = SD3_T5XXLModel(**models_args, fp16=True, embedding_dim=get_clip_embedding_dim(self.version, self.pipeline_type)) # Load MMDiT model if 'mmdit' in self.stages: @@ -275,7 +274,7 @@ def loadEngines( # Configure pipeline models to load model_names = self.models.keys() # Torch fallback - self.torch_fallback = dict(zip(model_names, [self.torch_inference or model_name in ('clip_g', 'clip_l', 't5xxl') for model_name in model_names])) + self.torch_fallback = dict(zip(model_names, [self.torch_inference or model_name in ('t5xxl') for model_name in model_names])) onnx_path = dict(zip(model_names, 
[self.getOnnxPath(model_name, onnx_dir, opt=False) for model_name in model_names])) onnx_opt_path = dict(zip(model_names, [self.getOnnxPath(model_name, onnx_dir) for model_name in model_names])) @@ -334,7 +333,7 @@ def activateEngines(self, shared_device_memory=None): self.shared_device_memory = shared_device_memory # Load and activate TensorRT engines for engine in self.engine.values(): - engine.activate(reuse_device_memory=self.shared_device_memory) + engine.activate(device_memory=self.shared_device_memory) def runEngine(self, model_name, feed_dict): engine = self.engine[model_name] @@ -374,7 +373,8 @@ def print_summary(self, denoising_steps, walltime_ms, batch_size): def save_image(self, images, pipeline, prompt, seed): # Save image image_name_prefix = pipeline+''.join(set(['-'+prompt[i].replace(' ','_')[:10] for i in range(len(prompt))]))+'-'+str(seed)+'-' - save_image(images, self.output_dir, image_name_prefix) + image_name_suffix = 'torch' if self.torch_inference else 'trt' + save_image(images, self.output_dir, image_name_prefix, image_name_suffix) def encode_prompt(self, prompt, negative_prompt): def encode_token_weights(model_name, token_weight_pairs): @@ -385,8 +385,8 @@ def encode_token_weights(model_name, token_weight_pairs): if self.torch_inference or self.torch_fallback[model_name]: out, pooled = self.torch_models[model_name](tokens) else: - out = self.runEngine('t5xxl', {'input_ids': tokens})['text_embeddings'] - pooled = None + trt_out = self.runEngine(model_name, {'input_ids': tokens}) + out, pooled = trt_out['text_embeddings'], trt_out["pooled_output"] self.profile_stop(model_name) diff --git a/demo/Diffusion/stable_diffusion_pipeline.py b/demo/Diffusion/stable_diffusion_pipeline.py old mode 100755 new mode 100644 index 9a1761ca0..c1316c66f --- a/demo/Diffusion/stable_diffusion_pipeline.py +++ b/demo/Diffusion/stable_diffusion_pipeline.py @@ -169,6 +169,8 @@ def __init__( self.stages.append('vae') elif self.pipeline_type.is_sd_xl_refiner(): 
self.stages = ['clip2', 'unetxl', 'vae'] + elif self.pipeline_type.is_img2vid(): + self.stages = ['clip-vis', 'clip-imgfe', 'unet-temp', 'vae-temp'] else: raise ValueError(f"Unsupported pipeline {self.pipeline_type.name}.") self.return_latents = return_latents @@ -183,7 +185,8 @@ def __init__( '2.1-base': 'PNDM', '2.1': 'DDIM', 'xl-1.0' : 'Euler', - 'xl-turbo': 'EulerA' + 'xl-turbo': 'EulerA', + 'svd-xt-1.1': 'Euler' } if not scheduler: @@ -273,6 +276,9 @@ def teardown(self): if self.shared_device_memory: cudart.cudaFree(self.shared_device_memory) + for torch_model in self.torch_models.values(): + del torch_model + cudart.cudaStreamDestroy(self.stream) del self.stream @@ -324,7 +330,6 @@ def loadEngines( quantization_alpha=0.8, calibration_size=32, calib_batch_size=2, - denoising_steps=30, ): """ Build and load engines for TensorRT accelerated inference. @@ -370,9 +375,6 @@ def loadEngines( Recommendation: 32, 64, 128 for SDXL calib_batch_size (int): The batch size to use for calibration. Defaults to 2. - denoising_steps (int): - The number of denoising steps. - More denoising steps usually lead to a higher quality image at the expense of slower inference. 
""" # Create directories if missing for directory in [engine_dir, onnx_dir]: @@ -435,7 +437,7 @@ def loadEngines( if int8: assert self.pipeline_type.is_sd_xl_base(), "int8 quantization only supported for SDXL pipeline" use_int8['unetxl'] = True - model_suffix['unetxl'] += f"-int8.l{quantization_level}.bs2.s{denoising_steps}.c{calibration_size}.p{quantization_percentile}.a{quantization_alpha}" + model_suffix['unetxl'] += f"-int8.l{quantization_level}.bs2.s{self.denoising_steps}.c{calibration_size}.p{quantization_percentile}.a{quantization_alpha}" onnx_path = dict(zip(model_names, [self.getOnnxPath(model_name, onnx_dir, opt=False, suffix=model_suffix[model_name]) for model_name in model_names])) onnx_opt_path = dict(zip(model_names, [self.getOnnxPath(model_name, onnx_dir, suffix=model_suffix[model_name]) for model_name in model_names])) engine_path = dict(zip(model_names, [self.getEnginePath(model_name, engine_dir, do_engine_refit[model_name], suffix=model_suffix[model_name]) for model_name in model_names])) @@ -465,7 +467,7 @@ def loadEngines( quantization_level, quantization_alpha, quantization_percentile, - denoising_steps + self.denoising_steps ) def do_calibrate(base, calibration_prompts, **kwargs): @@ -487,7 +489,7 @@ def calibration_loop(unet): base=pipeline, calibration_prompts=calibration_prompts, calib_size=calibration_size // calib_batch_size, - n_steps=denoising_steps, + n_steps=self.denoising_steps, ) print(f"[I] Performing int8 calibration for {calibration_size} steps.") @@ -580,7 +582,7 @@ def activateEngines(self, shared_device_memory=None): self.shared_device_memory = shared_device_memory # Load and activate TensorRT engines for engine in self.engine.values(): - engine.activate(reuse_device_memory=self.shared_device_memory) + engine.activate(device_memory=self.shared_device_memory) def runEngine(self, model_name, feed_dict): engine = self.engine[model_name] @@ -830,7 +832,8 @@ def print_summary(self, denoising_steps, walltime_ms, batch_size): def 
save_image(self, images, pipeline, prompt, seed): # Save image image_name_prefix = pipeline+''.join(set(['-'+prompt[i].replace(' ','_')[:10] for i in range(len(prompt))]))+'-'+str(seed)+'-' - save_image(images, self.output_dir, image_name_prefix) + image_name_suffix = 'torch' if self.torch_inference else 'trt' + save_image(images, self.output_dir, image_name_prefix, image_name_suffix) def infer( self, diff --git a/demo/Diffusion/stable_video_diffusion_pipeline.py b/demo/Diffusion/stable_video_diffusion_pipeline.py new file mode 100644 index 000000000..e2370539e --- /dev/null +++ b/demo/Diffusion/stable_video_diffusion_pipeline.py @@ -0,0 +1,641 @@ +# +# Copyright 2024 The HuggingFace Inc. team. +# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from cuda import cudart +from diffusers.image_processor import VaeImageProcessor +from diffusers.utils.torch_utils import randn_tensor +from models import ( + CLIPImageProcessorModel, + CLIPVisionWithProjModel, + UNetTemporalModel, + VAEDecTemporalModel, +) +import os +import pathlib +import tensorrt as trt +import time +import torch +import random +from tqdm.auto import tqdm +from typing import Optional +from utilities import ( + GiB, + PIPELINE_TYPE, + TRT_LOGGER, + Engine, + _append_dims, + _resize_with_antialiasing, + tensor2vid, +) +from stable_diffusion_pipeline import StableDiffusionPipeline + +class StableVideoDiffusionPipeline(StableDiffusionPipeline): + """ + Application showcasing the acceleration of Stable Video Diffusion pipelines using NVidia TensorRT. + """ + def __init__( + self, + version='svd-xt-1.1', + pipeline_type=PIPELINE_TYPE.IMG2VID, + min_guidance_scale: float = 1.0, + max_guidance_scale: float = 3.0, + decode_chunk_size: Optional[int] = None, + **kwargs + ): + """ + Initializes the Diffusion pipeline. + + Args: + version (str): + The version of the pipeline. Should be one of [svd-xt-1.1] + pipeline_type (PIPELINE_TYPE): + Type of current pipeline. + min_guidance_scale (`float`, *optional*, defaults to 1.0): + The minimum guidance scale. Used for the classifier free guidance with first frame. + max_guidance_scale (`float`, *optional*, defaults to 3.0): + The maximum guidance scale. Used for the classifier free guidance with last frame. + `max_guidance_scale = 1` corresponds to doing no classifier free guidance. + decode_chunk_size (`int`, *optional*): + The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency + between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once + for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. 
+ """ + super().__init__( + version=version, + pipeline_type=pipeline_type, + **kwargs + ) + self.min_guidance_scale = min_guidance_scale + self.max_guidance_scale = max_guidance_scale + self.do_classifier_free_guidance = max_guidance_scale > 1 + # FIXME vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 8 + # FIXME num_frames = self.config.num_frames + select_num_frames = { + 'svd-xt-1.1': 25, + } + self.num_frames = select_num_frames.get(version, 14) + # TODO decode_chunk_size from args + self.decode_chunk_size = 8 if not decode_chunk_size else decode_chunk_size + # TODO: scaling_factor = vae.config.scaling_factor + self.scaling_factor = 0.18215 + + # TODO user configurable cuda_device_id + cuda_device_id = 0 + vram_size = cudart.cudaGetDeviceProperties(cuda_device_id)[1].totalGlobalMem + self.low_vram = vram_size < GiB(40) + if self.low_vram: + print(f"[W] WARNING low VRAM ({vram_size/GiB(1):.2f} GB) mode selected. Certain optimizations may be skipped.") + if self.use_cuda_graph and self.low_vram: + print(f"[W] WARNING CUDA graph disabled in low VRAM mode.") + self.use_cuda_graph = False + + self.config = {} + if self.pipeline_type.is_img2vid(): + self.config['clip_vis_torch_fallback'] = True + self.config['clip_imgfe_torch_fallback'] = True + self.config['vae_temp_torch_fallback'] = True + + # initialized in loadEngines() + self.max_shared_device_memory_size = 0 + + def loadResources(self, image_height, image_width, batch_size, seed): + # Initialize noise generator + self.seed = seed + self.generator = torch.Generator(device="cuda").manual_seed(seed) if seed else None + + # Create CUDA events and stream + for stage in ['clip', 'denoise', 'vae', 'vae_encoder']: + self.events[stage] = [cudart.cudaEventCreate()[1], cudart.cudaEventCreate()[1]] + self.stream = cudart.cudaStreamCreate()[1] + + # Allocate shared device memory for TensorRT engines + if not self.low_vram and not self.torch_inference: + for model_name in 
self.models.keys(): + if not self.torch_fallback[model_name]: + self.max_shared_device_memory_size = max(self.max_shared_device_memory_size, self.engine[model_name].engine.device_memory_size) + self.shared_device_memory = cudart.cudaMalloc(self.max_shared_device_memory_size)[1] + # Activate TensorRT engines + for model_name in self.models.keys(): + if not self.torch_fallback[model_name]: + self.engine[model_name].activate(device_memory=self.shared_device_memory) + alloc_shape = self.models[model_name].get_shape_dict(batch_size, image_height, image_width) + self.engine[model_name].allocate_buffers(shape_dict=alloc_shape, device=self.device) + + def loadEngines( + self, + engine_dir, + framework_model_dir, + onnx_dir, + onnx_opset, + opt_batch_size, + opt_image_height, + opt_image_width, + static_batch=False, + static_shape=True, + enable_refit=False, + enable_all_tactics=False, + timing_cache=None, + ): + """ + Build and load engines for TensorRT accelerated inference. + Export ONNX models first, if applicable. + + Args: + engine_dir (str): + Directory to store the TensorRT engines. + framework_model_dir (str): + Directory to store the framework model ckpt. + onnx_dir (str): + Directory to store the ONNX models. + onnx_opset (int): + ONNX opset version to export the models. + opt_batch_size (int): + Batch size to optimize for during engine building. + opt_image_height (int): + Image height to optimize for during engine building. Must be a multiple of 8. + opt_image_width (int): + Image width to optimize for during engine building. Must be a multiple of 8. + static_batch (bool): + Build engine only for specified opt_batch_size. + static_shape (bool): + Build engine only for specified opt_image_height & opt_image_width. Default = True. + enable_refit (bool): + Build engines with refit option enabled. + enable_all_tactics (bool): + Enable all tactic sources during TensorRT engine builds. + timing_cache (str): + Path to the timing cache to speed up TensorRT build. 
+ """ + # Create directories if missing + for directory in [engine_dir, onnx_dir]: + if not os.path.exists(directory): + print(f"[I] Create directory: {directory}") + pathlib.Path(directory).mkdir(parents=True) + + # Load pipeline models + models_args = {'version': self.version, 'pipeline': self.pipeline_type, 'device': self.device, + 'hf_token': self.hf_token, 'verbose': self.verbose, 'framework_model_dir': framework_model_dir, + 'max_batch_size': self.max_batch_size} + if 'clip-vis' in self.stages: + self.models['clip-vis'] = CLIPVisionWithProjModel(**models_args, subfolder='image_encoder') + if 'clip-imgfe' in self.stages: + self.models['clip-imgfe'] = CLIPImageProcessorModel(**models_args, subfolder='feature_extractor') + if 'unet-temp' in self.stages: + self.models['unet-temp'] = UNetTemporalModel(**models_args, fp16=True, num_frames=self.num_frames, do_classifier_free_guidance=self.do_classifier_free_guidance) + if 'vae-temp' in self.stages: + self.models['vae-temp'] = VAEDecTemporalModel(**models_args, decode_chunk_size=self.decode_chunk_size) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Configure pipeline models to load + model_names = self.models.keys() + self.torch_fallback = dict(zip(model_names, [self.torch_inference or self.config.get(model_name.replace('-','_')+'_torch_fallback', False) for model_name in model_names])) + onnx_path = dict(zip(model_names, [self.getOnnxPath(model_name, onnx_dir, opt=False) for model_name in model_names])) + onnx_opt_path = dict(zip(model_names, [self.getOnnxPath(model_name, onnx_dir) for model_name in model_names])) + engine_path = dict(zip(model_names, [self.getEnginePath(model_name, engine_dir) for model_name in model_names])) + do_engine_refit = dict(zip(model_names, [enable_refit and model_name.startswith('unet') for model_name in model_names])) + + # Export models to ONNX + for model_name, obj in self.models.items(): + if self.torch_fallback[model_name]: + continue + 
do_export_onnx = not os.path.exists(engine_path[model_name]) and not os.path.exists(onnx_opt_path[model_name]) + if do_export_onnx: + obj.export_onnx(onnx_path[model_name], onnx_opt_path[model_name], onnx_opset, opt_image_height, opt_image_width) + + # Build TensorRT engines + for model_name, obj in self.models.items(): + if self.torch_fallback[model_name]: + continue + engine = Engine(engine_path[model_name]) + if not os.path.exists(engine_path[model_name]): + update_output_names = obj.get_output_names() + obj.extra_output_names if obj.extra_output_names else None + engine.build(onnx_opt_path[model_name], + fp16=True, + input_profile=obj.get_input_profile( + opt_batch_size, opt_image_height, opt_image_width, + static_batch=static_batch, static_shape=static_shape + ), + enable_refit=do_engine_refit[model_name], + enable_all_tactics=enable_all_tactics, + timing_cache=timing_cache, + update_output_names=update_output_names, + native_instancenorm=False) + self.engine[model_name] = engine + + # Load TensorRT engines + for model_name in self.models.keys(): + if not self.torch_fallback[model_name]: + self.engine[model_name].load() + + def activateEngines(self, model_name, alloc_shape=None): + if not self.torch_fallback[model_name]: + device_memory_update = self.low_vram and not self.shared_device_memory + if device_memory_update: + assert not self.use_cuda_graph + # Reclaim GPU memory from torch cache + torch.cuda.empty_cache() + self.shared_device_memory = cudart.cudaMalloc(self.max_shared_device_memory_size)[1] + # Create TensorRT execution context + if not self.engine[model_name].context: + assert not self.use_cuda_graph + self.engine[model_name].activate(device_memory=self.shared_device_memory) + if device_memory_update: + self.engine[model_name].reactivate(device_memory=self.shared_device_memory) + if alloc_shape and not self.engine[model_name].tensors: + assert not self.use_cuda_graph + self.engine[model_name].allocate_buffers(shape_dict=alloc_shape, 
device=self.device) + else: + # Load torch model + if not model_name in self.torch_models: + self.torch_models[model_name] = self.models[model_name].get_model(torch_inference=self.torch_inference) + + def deactivateEngines(self, model_name, release_model=True): + if not release_model: + return + if not self.torch_fallback[model_name]: + assert not self.use_cuda_graph + self.engine[model_name].deallocate_buffers() + self.engine[model_name].deactivate() + # Shared device memory deallocated only in low VRAM mode + if self.low_vram and self.shared_device_memory: + cudart.cudaFree(self.shared_device_memory) + self.shared_device_memory = None + else: + del self.torch_models[model_name] + + def print_summary(self, denoising_steps, walltime_ms, batch_size, num_frames): + print('|-----------------|--------------|') + print('| {:^15} | {:^12} |'.format('Module', 'Latency')) + print('|-----------------|--------------|') + print('| {:^15} | {:>9.2f} ms |'.format('VAE-Enc', cudart.cudaEventElapsedTime(self.events['vae_encoder'][0], self.events['vae_encoder'][1])[1])) + print('| {:^15} | {:>9.2f} ms |'.format('CLIP', cudart.cudaEventElapsedTime(self.events['clip'][0], self.events['clip'][1])[1])) + print('| {:^15} | {:>9.2f} ms |'.format('UNet'+('+CNet' if self.pipeline_type.is_controlnet() else '')+' x '+str(denoising_steps), cudart.cudaEventElapsedTime(self.events['denoise'][0], self.events['denoise'][1])[1])) + print('| {:^15} | {:>9.2f} ms |'.format('VAE-Dec', cudart.cudaEventElapsedTime(self.events['vae'][0], self.events['vae'][1])[1])) + print('|-----------------|--------------|') + print('| {:^15} | {:>9.2f} ms |'.format('Pipeline', walltime_ms)) + print('|-----------------|--------------|') + print('Throughput: {:.2f} videos/min ({} frames)'.format(batch_size*60000./walltime_ms, num_frames)) + + def save_video(self, frames, pipeline, seed): + video_name_prefix = '-'.join([pipeline, 'fp16', str(seed), str(random.randint(1000,9999))]) + video_name_suffix = 'torch' if 
self.torch_inference else 'trt' + video_path = video_name_prefix+'-'+video_name_suffix+'.gif' + print(f"Saving video to: {video_path}") + frames[0].save(os.path.join(self.output_dir, video_path), save_all=True,optimize=False, append_images=frames[1:], loop=0) + + def _encode_image(self, image, num_videos_per_prompt, do_classifier_free_guidance): + dtype = next(self.torch_models['clip-vis'].parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.image_processor.pil_to_numpy(image) + image = self.image_processor.numpy_to_pt(image) + + # We normalize the image before resizing to match with the original implementation. + # Then we unnormalize it after resizing. + image = image * 2.0 - 1.0 + image = _resize_with_antialiasing(image, (224, 224)) + image = (image + 1.0) / 2.0 + + # Normalize the image with for CLIP input + image = self.torch_models['clip-imgfe']( + images=image, + do_normalize=True, + do_center_crop=False, + do_resize=False, + do_rescale=False, + return_tensors="pt", + ).pixel_values + + image = image.to(device=self.device, dtype=dtype) + image_embeddings = self.torch_models['clip-vis'](image).image_embeds + image_embeddings = image_embeddings.unsqueeze(1) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1) + image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + negative_image_embeddings = torch.zeros_like(image_embeddings) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeddings = torch.cat([negative_image_embeddings, image_embeddings]) + + return image_embeddings + + def _encode_vae_image( + self, + image: torch.Tensor, + device, + num_videos_per_prompt, + do_classifier_free_guidance, + ): + image = image.to(device=device) + image_latents = self.torch_models['vae-temp'].encode(image).latent_dist.mode() + + if do_classifier_free_guidance: + negative_image_latents = torch.zeros_like(image_latents) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_latents = torch.cat([negative_image_latents, image_latents]) + + # duplicate image_latents for each generation per prompt, using mps friendly method + image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) + + return image_latents + + def _get_add_time_ids( + self, + fps, + motion_bucket_id, + noise_aug_strength, + dtype, + batch_size, + num_videos_per_prompt, + do_classifier_free_guidance, + ): + add_time_ids = [fps, motion_bucket_id, noise_aug_strength] + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1) + + if do_classifier_free_guidance: + add_time_ids = torch.cat([add_time_ids, add_time_ids]) + + return add_time_ids + + def prepare_latents( + self, + batch_size, + num_frames, + num_channels_latents, + height, + width, + dtype, + device, + latents=None, + ): + shape = ( + batch_size, + num_frames, + num_channels_latents // 2, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + + if latents is None: + latents = randn_tensor(shape, generator=self.generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the 
scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def decode_latents(self, latents, num_frames, decode_chunk_size): + # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] + latents = latents.flatten(0, 1) + + latents = 1 / self.scaling_factor * latents + + # decode decode_chunk_size frames at a time to avoid OOM + frames = [] + for i in range(0, latents.shape[0], decode_chunk_size): + num_frames_in = latents[i : i + decode_chunk_size].shape[0] + # TODO only pass num_frames_in if it's expected + if self.torch_fallback['vae-temp']: + frame = self.torch_models['vae-temp'].decode(latents[i : i + decode_chunk_size], num_frames=num_frames_in).sample + else: + params = { + 'latent': latents[i : i + decode_chunk_size], + # FIXME segfault + #'num_frames_in': torch.Tensor([num_frames_in]).to(device=latents.device, dtype=torch.int64), + } + frame = self.runEngine('vae-temp', params)['frames'] + frames.append(frame) + frames = torch.cat(frames, dim=0) + + # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width] + frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + frames = frames.float() + return frames + + def infer( + self, + input_image, + image_height: int, + image_width: int, + fps: int = 7, + motion_bucket_id: int = 127, + noise_aug_strength: int = 0.02, + num_videos_per_prompt: Optional[int] = 1, + warmup: bool = False, + save_video: bool = True, + ): + """ + Run the video diffusion pipeline. + + Args: + input_image (image): + Input image used to initialize the latents or to be inpainted. + image_height (int): + Height (in pixels) of the image to be generated. Must be a multiple of 8. + image_width (int): + Width (in pixels) of the image to be generated. Must be a multiple of 8. 
+ fps (`int`, *optional*, defaults to 7): + Frames per second. The rate at which the generated images shall be exported to a video after generation. + Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training. + motion_bucket_id (`int`, *optional*, defaults to 127): + The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video. + noise_aug_strength (`float`, *optional*, defaults to 0.02): + The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + warmup (bool): + Indicate if this is a warmup run. + save_video (bool): + Whether to save the generated video. + """ + + if self.generator and self.seed: + self.generator.manual_seed(self.seed) + + # TODO + batch_size = 1 + # Fast warmup + denoising_steps = 1 if warmup else self.denoising_steps + + torch.cuda.synchronize() + e2e_tic = time.perf_counter() + + class LoadModelContext: + def __init__(ctx, model_name, alloc_shape=None, release_model=False): + ctx.model_name = model_name + ctx.release_model = release_model + ctx.alloc_shape = alloc_shape + def __enter__(ctx): + self.activateEngines(ctx.model_name, alloc_shape=ctx.alloc_shape) + def __exit__(ctx, exc_type, exc_val, exc_tb): + self.deactivateEngines(ctx.model_name, release_model=ctx.release_model) + + # Release model opportunistically in TensorRT pipeline only in low VRAM mode + release_model = self.low_vram and not self.torch_inference + with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER): + with LoadModelContext('clip-imgfe', release_model=release_model), LoadModelContext('clip-vis', release_model=release_model): + self.profile_start('clip', color='green') + image_embeddings = self._encode_image(input_image, num_videos_per_prompt, self.do_classifier_free_guidance) +
self.profile_stop('clip') + # NOTE Stable Diffusion Video was conditioned on fps - 1 + fps = fps - 1 + + self.profile_start('preprocess', color='pink') + input_image = self.image_processor.preprocess(input_image, height=image_height, width=image_width).to(self.device) + noise = randn_tensor(input_image.shape, generator=self.generator, device=input_image.device, dtype=input_image.dtype) + input_image = input_image + noise_aug_strength * noise + self.profile_stop('preprocess') + + # TODO + # assert self.torch_models['vae-temp'].dtype == torch.float32 + + with LoadModelContext('vae-temp'): + self.profile_start('vae_encoder', color='red') + image_latents = self._encode_vae_image(input_image, self.device, num_videos_per_prompt, self.do_classifier_free_guidance) + image_latents = image_latents.to(image_embeddings.dtype) + self.profile_stop('vae_encoder') + + # Repeat the image latents for each frame so we can concatenate them with the noise + # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width] + image_latents = image_latents.unsqueeze(1).repeat(1, self.num_frames, 1, 1, 1) + + # Get Added Time IDs + added_time_ids = self._get_add_time_ids( + fps, + motion_bucket_id, + noise_aug_strength, + image_embeddings.dtype, + batch_size, + num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + added_time_ids = added_time_ids.to(self.device) + + # Prepare timesteps + self.scheduler.set_timesteps(denoising_steps, device=self.device) + timesteps = self.scheduler.timesteps + + # Prepare latent variables + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + self.num_frames, + 8, # TODO: num_channels_latents = unet.config.in_channels + image_height, + image_width, + image_embeddings.dtype, + input_image.device, + None, # pre-generated latents + ) + + # Prepare guidance scale + guidance_scale = torch.linspace(self.min_guidance_scale, self.max_guidance_scale, self.num_frames).unsqueeze(0) + guidance_scale = 
guidance_scale.to(self.device, latents.dtype) + guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1) + guidance_scale = _append_dims(guidance_scale, latents.ndim) + + # Denoising loop + num_warmup_steps = len(timesteps) - denoising_steps * self.scheduler.order + unet_shape_dict = self.models['unet-temp'].get_shape_dict(batch_size, image_height, image_width) + with LoadModelContext('unet-temp', alloc_shape=unet_shape_dict, release_model=release_model), tqdm(total=denoising_steps) as progress_bar: + self.profile_start('denoise', color='blue') + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # Concatenate image_latents over channels dimention + latent_model_input = torch.cat([latent_model_input, image_latents], dim=2) + + # predict the noise residual + if self.torch_fallback['unet-temp']: + noise_pred = self.torch_models['unet-temp']( + latent_model_input, + t, + encoder_hidden_states=image_embeddings, + added_time_ids=added_time_ids, + return_dict=False, + )[0] + else: + params = { + "sample": latent_model_input, + "timestep": t, + "encoder_hidden_states": image_embeddings, + "added_time_ids": added_time_ids, + } + noise_pred = self.runEngine('unet-temp', params)['latent'] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + self.profile_stop('denoise') + + with torch.inference_mode(), trt.Runtime(TRT_LOGGER), 
LoadModelContext('vae-temp'): + self.profile_start('vae', color='red') + self.torch_models['vae-temp'].to(dtype=torch.float16) + frames = self.decode_latents(latents, self.num_frames, self.decode_chunk_size) + frames = tensor2vid(frames, self.image_processor, output_type='pil') + self.profile_stop('vae') + + torch.cuda.synchronize() + + if warmup: + return + + e2e_toc = time.perf_counter() + walltime_ms = (e2e_toc - e2e_tic) * 1000. + self.print_summary(denoising_steps, walltime_ms, batch_size, len(frames[0])) + if save_video: + self.save_video(frames[0], self.pipeline_type.name.lower(), self.seed) + + return frames, walltime_ms + + def run(self, input_image, height, width, batch_size, batch_count, num_warmup_runs, use_cuda_graph, **kwargs): + num_warmup_runs = max(1, num_warmup_runs) if use_cuda_graph else num_warmup_runs + if num_warmup_runs > 0: + print("[I] Warming up ..") + for _ in range(num_warmup_runs): + self.infer(input_image, height, width, warmup=True) + + for _ in range(batch_count): + print("[I] Running StableDiffusion pipeline") + if self.nvtx_profile: + cudart.cudaProfilerStart() + self.infer(input_image, height, width, warmup=False) + if self.nvtx_profile: + cudart.cudaProfilerStop() diff --git a/demo/Diffusion/utilities.py b/demo/Diffusion/utilities.py index 11f368079..6dece14f4 100644 --- a/demo/Diffusion/utilities.py +++ b/demo/Diffusion/utilities.py @@ -1,4 +1,5 @@ # +# Copyright (c) Alibaba, Inc. and its affiliates. # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # @@ -46,6 +47,9 @@ TRT_LOGGER = trt.Logger(trt.Logger.ERROR) +def GiB(val): + return val * 1 << 30 + # Map of numpy dtype -> torch dtype numpy_to_torch_dtype_dict = { np.uint8 : torch.uint8, @@ -151,6 +155,7 @@ def CUASSERT(cuda_ret): class PIPELINE_TYPE(Enum): TXT2IMG = auto() IMG2IMG = auto() + IMG2VID = auto() INPAINT = auto() CONTROLNET = auto() XL_BASE = auto() @@ -162,6 +167,9 @@ def is_txt2img(self): def is_img2img(self): return self == self.IMG2IMG + def is_img2vid(self): + return self == self.IMG2VID + def is_inpaint(self): return self == self.INPAINT @@ -236,6 +244,7 @@ def build(self, enable_all_tactics=False, timing_cache=None, update_output_names=None, + native_instancenorm=True, verbose=False, **extra_build_args ): @@ -249,7 +258,10 @@ def build(self, if not enable_all_tactics: extra_build_args['tactic_sources'] = [] - network = network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]) + flags = [] + if native_instancenorm: + flags.append(trt.OnnxParserFlag.NATIVE_INSTANCENORM) + network = network_from_onnx_path(onnx_path, flags=flags) if update_output_names: print(f"Updating network outputs to {update_output_names}") network = ModifyNetworkOutputs(network, update_output_names) @@ -272,13 +284,21 @@ def load(self): print(f"Loading TensorRT engine: {self.engine_path}") self.engine = engine_from_bytes(bytes_from_path(self.engine_path)) - def activate(self, reuse_device_memory=None): - if reuse_device_memory: + def activate(self, device_memory=None): + if device_memory: self.context = self.engine.create_execution_context_without_device_memory() - self.context.device_memory = reuse_device_memory + self.context.device_memory = device_memory else: self.context = self.engine.create_execution_context() + def reactivate(self, device_memory): + assert self.context + self.context.device_memory = device_memory + + def deactivate(self): + del self.context + self.context = None + def 
allocate_buffers(self, shape_dict=None, device='cuda'): for binding in range(self.engine.num_io_tensors): name = self.engine.get_tensor_name(binding) @@ -293,8 +313,12 @@ def allocate_buffers(self, shape_dict=None, device='cuda'): self.tensors[name] = tensor - def infer(self, feed_dict, stream, use_cuda_graph=False): + def deallocate_buffers(self): + for idx in range(self.engine.num_io_tensors): + binding = self.engine[idx] + del self.tensors[binding] + def infer(self, feed_dict, stream, use_cuda_graph=False): for name, buf in feed_dict.items(): self.tensors[name].copy_(buf) @@ -322,13 +346,13 @@ def infer(self, feed_dict, stream, use_cuda_graph=False): return self.tensors -def save_image(images, image_path_dir, image_name_prefix): +def save_image(images, image_path_dir, image_name_prefix, image_name_suffix): """ Save the generated images to png files. """ images = ((images + 1) * 255 / 2).clamp(0, 255).detach().permute(0, 2, 3, 1).round().type(torch.uint8).cpu().numpy() for i in range(images.shape[0]): - image_path = os.path.join(image_path_dir, image_name_prefix+str(i+1)+'-'+str(random.randint(1000,9999))+'.png') + image_path = os.path.join(image_path_dir, image_name_prefix+str(i+1)+'-'+str(random.randint(1000,9999))+'-'+image_name_suffix+'.png') print(f"Saving image {i+1} / {images.shape[0]} to: {image_path}") Image.fromarray(images[i]).save(image_path) @@ -344,6 +368,137 @@ def preprocess_image(image): image = torch.from_numpy(image).contiguous() return 2.0 * image - 1.0 +# Taken from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py#L620 +def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True): + h, w = input.shape[-2:] + factors = (h / size[0], w / size[1]) + + # First, we have to determine sigma + # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171 + sigmas = ( + max((factors[0] - 1.0) 
/ 2.0, 0.001), + max((factors[1] - 1.0) / 2.0, 0.001), + ) + + # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma + # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206 + # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now + ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + + # Make sure it is odd + if (ks[0] % 2) == 0: + ks = ks[0] + 1, ks[1] + + if (ks[1] % 2) == 0: + ks = ks[0], ks[1] + 1 + + input = _gaussian_blur2d(input, ks, sigmas) + + output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners) + return output + + +def _compute_padding(kernel_size): + """Compute padding tuple.""" + # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom) + # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad + if len(kernel_size) < 2: + raise AssertionError(kernel_size) + computed = [k - 1 for k in kernel_size] + + # for even kernels we need to do asymmetric padding :( + out_padding = 2 * len(kernel_size) * [0] + + for i in range(len(kernel_size)): + computed_tmp = computed[-(i + 1)] + + pad_front = computed_tmp // 2 + pad_rear = computed_tmp - pad_front + + out_padding[2 * i + 0] = pad_front + out_padding[2 * i + 1] = pad_rear + + return out_padding + + +def _filter2d(input, kernel): + # prepare kernel + b, c, h, w = input.shape + tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype) + + tmp_kernel = tmp_kernel.expand(-1, c, -1, -1) + + height, width = tmp_kernel.shape[-2:] + + padding_shape: list[int] = _compute_padding([height, width]) + input = torch.nn.functional.pad(input, padding_shape, mode="reflect") + + # kernel and input tensor reshape to align element-wise or batch-wise params + tmp_kernel = tmp_kernel.reshape(-1, 1, height, width) + input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1)) + + # convolve the 
tensor with the kernel. + output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1) + + out = output.view(b, c, h, w) + return out + + +def _gaussian(window_size: int, sigma): + if isinstance(sigma, float): + sigma = torch.tensor([[sigma]]) + + batch_size = sigma.shape[0] + + x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1) + + if window_size % 2 == 0: + x = x + 0.5 + + gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0))) + + return gauss / gauss.sum(-1, keepdim=True) + + +def _gaussian_blur2d(input, kernel_size, sigma): + if isinstance(sigma, tuple): + sigma = torch.tensor([sigma], dtype=input.dtype) + else: + sigma = sigma.to(dtype=input.dtype) + + ky, kx = int(kernel_size[0]), int(kernel_size[1]) + bs = sigma.shape[0] + kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1)) + kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1)) + out_x = _filter2d(input, kernel_x[..., None, :]) + out = _filter2d(out_x, kernel_y[..., None]) + + return out + +def _append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") + return x[(...,) + (None,) * dims_to_append] + +# Not a contribution +# Changes made by NVIDIA CORPORATION & AFFILIATES enabling tensor2vid or otherwise documented as +# NVIDIA-proprietary are not a contribution and subject to the terms and conditions at the top of the file +def tensor2vid(video: torch.Tensor, processor, output_type="np"): + # Based on: + # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 + + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = 
video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + return outputs + def prepare_mask_and_masked_image(image, mask): """ image: PIL.Image.Image @@ -509,7 +664,6 @@ def process_pipeline_args(args): 'timing_cache': args.timing_cache, 'int8': args.int8, 'quantization_level': args.quantization_level, - 'denoising_steps': args.denoising_steps, } args_run_demo = (args.prompt, args.negative_prompt, args.height, args.width, args.batch_size, args.batch_count, args.num_warmup_runs, args.use_cuda_graph) diff --git a/demo/Diffusion/utils_sd3/mmdit.py b/demo/Diffusion/utils_sd3/mmdit.py index c346b483b..2fe322636 100644 --- a/demo/Diffusion/utils_sd3/mmdit.py +++ b/demo/Diffusion/utils_sd3/mmdit.py @@ -511,7 +511,6 @@ def __init__( device = None, ): super().__init__() - print(f"mmdit initializing with: {input_size=}, {patch_size=}, {in_channels=}, {depth=}, {mlp_ratio=}, {learn_sigma=}, {adm_in_channels=}, {context_embedder_config=}, {register_length=}, {attn_mode=}, {rmsnorm=}, {scale_mod_only=}, {swiglu=}, {out_channels=}, {pos_embed_scaling_factor=}, {pos_embed_offset=}, {pos_embed_max_size=}, {num_patches=}, {qk_norm=}, {qkv_bias=}, {dtype=}, {device=}") self.dtype = dtype self.learn_sigma = learn_sigma self.in_channels = in_channels diff --git a/docker/rockylinux8.Dockerfile b/docker/rockylinux8.Dockerfile index 1ff359e2d..2ad1caf95 100644 --- a/docker/rockylinux8.Dockerfile +++ b/docker/rockylinux8.Dockerfile @@ -25,7 +25,7 @@ ENV NV_CUDNN_VERSION 8.9.6.50-1 ENV NV_CUDNN_PACKAGE libcudnn8-${NV_CUDNN_VERSION}.cuda12.2 ENV NV_CUDNN_PACKAGE_DEV libcudnn8-devel-${NV_CUDNN_VERSION}.cuda12.2 -ENV TRT_VERSION 10.2.0.19 +ENV TRT_VERSION 10.3.0.26 SHELL ["/bin/bash", "-c"] RUN dnf install -y \ @@ -62,15 +62,15 @@ RUN dnf install -y python38 python38-devel &&\ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget 
https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp38-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp38-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ diff --git a/docker/rockylinux9.Dockerfile b/docker/rockylinux9.Dockerfile index 815b608e1..8741977bc 100644 --- a/docker/rockylinux9.Dockerfile +++ b/docker/rockylinux9.Dockerfile @@ -25,7 +25,7 @@ ENV NV_CUDNN_VERSION 8.9.6.50-1 ENV NV_CUDNN_PACKAGE libcudnn8-${NV_CUDNN_VERSION}.cuda12.2 ENV NV_CUDNN_PACKAGE_DEV libcudnn8-devel-${NV_CUDNN_VERSION}.cuda12.2 -ENV TRT_VERSION 10.2.0.19 +ENV TRT_VERSION 10.3.0.26 SHELL ["/bin/bash", "-c"] RUN dnf install -y \ @@ -67,15 +67,15 @@ 
RUN dnf -y install \ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp39-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp39-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp39-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp39-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ diff --git a/docker/ubuntu-20.04.Dockerfile b/docker/ubuntu-20.04.Dockerfile index 881139e3a..b481d945d 100644 --- a/docker/ubuntu-20.04.Dockerfile +++ b/docker/ubuntu-20.04.Dockerfile @@ -28,7 +28,7 @@ ENV CUDA_VERSION_MAJOR_MINOR=12.2 ENV NV_CUDNN_PACKAGE "libcudnn8=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" ENV NV_CUDNN_PACKAGE_DEV 
"libcudnn8-dev=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" -ENV TRT_VERSION 10.2.0.19 +ENV TRT_VERSION 10.3.0.26 SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -84,15 +84,15 @@ RUN apt-get install -y --no-install-recommends \ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp38-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp38-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ diff --git a/docker/ubuntu-22.04-aarch64.Dockerfile 
b/docker/ubuntu-22.04-aarch64.Dockerfile index 783193d24..e6991c4cc 100644 --- a/docker/ubuntu-22.04-aarch64.Dockerfile +++ b/docker/ubuntu-22.04-aarch64.Dockerfile @@ -20,7 +20,7 @@ ARG CUDA_VERSION=12.5.0 # Multi-arch container support available in non-cudnn containers. FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 -ENV TRT_VERSION 10.2.0.19 +ENV TRT_VERSION 10.3.0.26 SHELL ["/bin/bash", "-c"] # Setup user account diff --git a/docker/ubuntu-22.04.Dockerfile b/docker/ubuntu-22.04.Dockerfile index 924f3c02d..43f872a68 100644 --- a/docker/ubuntu-22.04.Dockerfile +++ b/docker/ubuntu-22.04.Dockerfile @@ -28,7 +28,7 @@ ENV CUDA_VERSION_MAJOR_MINOR=12.2 ENV NV_CUDNN_PACKAGE "libcudnn8=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" ENV NV_CUDNN_PACKAGE_DEV "libcudnn8-dev=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" -ENV TRT_VERSION 10.2.0.19 +ENV TRT_VERSION 10.3.0.26 SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -84,15 +84,15 @@ RUN apt-get install -y --no-install-recommends \ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp310-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp310-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget 
https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && tar -xf TensorRT-10.2.0.19.Linux.x86_64-gnu.cuda-12.5.tar.gz \ - && cp -a TensorRT-10.2.0.19/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.2.0.19/python/tensorrt-10.2.0-cp310-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.3.0/tars/TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && tar -xf TensorRT-10.3.0.26.Linux.x86_64-gnu.cuda-12.5.tar.gz \ + && cp -a TensorRT-10.3.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.3.0.26/python/tensorrt-10.3.0-cp310-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ diff --git a/docker/ubuntu-cross-aarch64.Dockerfile b/docker/ubuntu-cross-aarch64.Dockerfile index 253dd0421..6a03c874c 100644 --- a/docker/ubuntu-cross-aarch64.Dockerfile +++ b/docker/ubuntu-cross-aarch64.Dockerfile @@ -21,7 +21,7 @@ ARG OS_VERSION=22.04 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${OS_VERSION} LABEL maintainer="NVIDIA CORPORATION" -ENV TRT_VERSION 10.2.0.19 +ENV TRT_VERSION 10.3.0.26 ENV DEBIAN_FRONTEND=noninteractive ARG uid=1000 diff --git a/include/NvInfer.h b/include/NvInfer.h index 11c6994f3..fc8d4ec81 100644 --- a/include/NvInfer.h +++ b/include/NvInfer.h @@ -102,7 +102,7 @@ enum class LayerType : int32_t kNMS = 43, //!< NMS layer kREVERSE_SEQUENCE = 44, //!< Reverse sequence layer kNORMALIZATION = 45, //!< Normalization layer - kPLUGIN_V3 = 46 //!< PluginV3 layer. + kPLUGIN_V3 = 46, //!< PluginV3 layer. }; //! @@ -841,8 +841,8 @@ class ILayer : public INoCopy //! \code //! Shorthand: //! I = dimensions of input image. -//! B = prePadding, before the image data. For deconvolution, prePadding is set before output. -//! A = postPadding, after the image data. For deconvolution, postPadding is set after output. +//! B = prePadding, before the image data. +//! 
A = postPadding, after the image data. //! P = delta between input and output //! S = stride //! F = filter @@ -2202,7 +2202,6 @@ class IDeconvolutionLayer : public ILayer //! //! Default: (0, 0, ..., 0) //! - //! If executing this layer on DLA, padding must be 0. //! //! \see getPrePadding() //! @@ -2230,7 +2229,6 @@ class IDeconvolutionLayer : public ILayer //! //! Default: (0, 0, ..., 0) //! - //! If executing this layer on DLA, padding must be 0. //! //! \see getPostPadding() //! @@ -2949,8 +2947,10 @@ class IReduceLayer : public ILayer //! //! \brief Layer that represents a padding operation. //! -//! The padding layer adds zero-padding at the start and end of the input tensor. It only supports padding along the two -//! innermost dimensions. Applying negative padding results in cropping of the input. +//! The padding layer adds zero-padding at the start and end of the input tensor. It supports padding +//! only the last two dimensions. Applying negative padding results in cropping of the input. +//! +//! To pad across any subset of dimensions, use ISliceLayer with SampleMode::kFILL. //! //! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. //! @@ -3237,12 +3237,13 @@ constexpr inline int32_t EnumMax() noexcept //! //! The slice layer has two variants, static and dynamic. Static slice specifies the start, size, and stride //! dimensions at layer creation time via Dims and can use the get/set accessor functions of the ISliceLayer. -//! Dynamic slice specifies one or more of start, size or stride as ITensors, by using ILayer::setInput to add -//! a second, third, or fourth input respectively. The corresponding Dims are used if an input +//! Static slice layers can also optionally specify axes through the get/set accessor functions of the ISliceLayer. +//! Dynamic slice specifies one or more of start, size, stride, or axes as ITensors, by using ILayer::setInput to add +//! 
a second, third, fourth, or sixth input respectively. The corresponding Dims are used if an input //! is missing or null. //! //! An application can determine if the ISliceLayer has a dynamic output shape based on whether -//! the size input (third input) is present and non-null. +//! the size or axes input is present and non-null. //! //! The slice layer selects for each dimension a start location from within the input tensor, and //! copies elements to the output tensor using the specified stride across the input tensor. @@ -3255,18 +3256,35 @@ constexpr inline int32_t EnumMax() noexcept //! stride = {1, 2} //! output = {{1, 5}} //! +//! If axes are provided, then start, size, and stride must have the same length as axes, +//! and axes specifies the subset of dimensions to slice. If axes are not provided, start, size, and stride +//! must have the same length as the rank of the input tensor. +//! +//! An example of using slice on a tensor with axes specified: +//! input = {{0, 2, 4}, {1, 3, 5}} +//! start = {1} +//! size = {2} +//! stride = {1} +//! axes = {1} +//! output = {{2, 4}, {3, 5}} +//! //! When the sampleMode is kCLAMP or kREFLECT, for each input dimension, if its size is 0 then the corresponding output //! dimension must be 0 too. //! +//! When the sampleMode is kFILL, the fifth input to the slice layer is used to determine the value to fill in out-of-bound +//! indices. It is an error to specify the fifth input in any other sampleMode. +//! //! A slice layer can produce a shape tensor if the following conditions are met: //! //! * start, size, and stride are build time constants, either as static Dims or as constant input tensors. +//! * axes, if provided, are build time constants, either as static Dims or as a constant input tensor. //! * The number of elements in the output tensor does not exceed 2 * Dims::MAX_DIMS. //! //! The input tensor is a shape tensor if the output is a shape tensor. //! //!
The following constraints must be satisfied to execute this layer on DLA: //! * start, size, and stride are build time constants, either as static Dims or as constant input tensors. +//! * axes, if provided, are build time constants, either as static Dims or as a constant input tensor. //! * sampleMode is kSTRICT_BOUNDS. //! * Strides are 1 for all dimensions. //! * Slicing is not performed on the first dimension @@ -3401,6 +3419,10 @@ class ISliceLayer : public ILayer //! or be implicitly convertible to the input data type. //! Implicit data type conversion is supported among kFLOAT, kHALF, kINT8, and kFP8 data types. //! This input is disallowed for other modes. + //! - 5: The axes tensor indicating the corresponding axes that start, size, and stride + //! should apply to, as a 1D Int32 shape tensor. Negative values for axes + //! indicate indexing from the back of the input tensor. Values must be unique and be + //! within the interval of [-rank(input), rank(input)-1]. //! //! Using the corresponding setter resets the input to null. //! @@ -3409,6 +3431,35 @@ class ISliceLayer : public ILayer //! using ILayer::setInput; + //! + //! \brief Set the axes for this ISliceLayer. + //! + //! \param axes The axes to which the start, size, and stride parameters of the slice apply. + //! + //! If a sixth input had been used to create this layer, that input is reset to null by this method. + //! + //! \see getAxes + //! + void setAxes(Dims const& axes) noexcept + { + mImpl->setAxes(axes); + } + + //! + //! \brief Get the axes for this ISliceLayer. + //! + //! \return The axes to which the start, size, and stride parameters of this slice apply. + //! + //! If the sixth input is present and non-null, + //! this function returns a Dims with nbDims = -1. + //! + //! \see setAxes + //!
+ Dims getAxes() const noexcept + { + return mImpl->getAxes(); + } + protected: apiv::VSliceLayer* mImpl; virtual ~ISliceLayer() noexcept = default; @@ -4007,8 +4058,8 @@ struct EnumMaxImpl //! Resize layer can be used for resizing a N-D tensor. //! //! Resize layer currently supports the following configurations: -//! - InterpolationMode::kNEAREST - resizes innermost `m` dimensions of N-D, where 0 < m <= min(8, N) and N > 0 -//! - InterpolationMode::kLINEAR - resizes innermost `m` dimensions of N-D, where 0 < m <= min(3, N) and N > 0 +//! - InterpolationMode::kNEAREST - resizes last `m` dimensions of N-D, where 0 < m <= min(8, N) and N > 0 +//! - InterpolationMode::kLINEAR - resizes last `m` dimensions of N-D, where 0 < m <= min(3, N) and N > 0 //! //! Default resize mode is InterpolationMode::kNEAREST. //! @@ -5519,6 +5570,7 @@ class IDequantizeLayer : public ILayer apiv::VDequantizeLayer* mImpl; }; + //! //! \class IEinsumLayer //! @@ -7435,6 +7487,7 @@ class INetworkDefinition : public INoCopy return mImpl->addQuantizeV2(input, scale, outputType); } + //! //! \brief Add an Einsum layer to the network. //! @@ -8549,7 +8602,13 @@ enum class PreviewFeature : int32_t //! \deprecated Deprecated in TensorRT 10.0. The default value for this flag is on and can not be changed. //! kPROFILE_SHARING_0806 TRT_DEPRECATED_ENUM = 0, + + //! + //! Allows plugin I/O to be aliased when using IPluginV3OneBuildV2 + //! + kALIASED_PLUGIN_IO_10_03 = 1 }; + namespace impl { //! 
@@ -8560,7 +8619,7 @@ namespace impl template <> struct EnumMaxImpl { - static constexpr int32_t kVALUE = 1; + static constexpr int32_t kVALUE = 2; }; } // namespace impl diff --git a/include/NvInferImpl.h b/include/NvInferImpl.h index b77cb1823..d202c3d1f 100644 --- a/include/NvInferImpl.h +++ b/include/NvInferImpl.h @@ -703,6 +703,8 @@ class VSliceLayer : public VRoot virtual Dims getStride() const noexcept = 0; virtual void setMode(SampleMode mode) noexcept = 0; virtual SampleMode getMode() const noexcept = 0; + virtual void setAxes(Dims const& axes) noexcept = 0; + virtual Dims getAxes() const noexcept = 0; }; class VShapeLayer : public VRoot @@ -909,6 +911,7 @@ class VDequantizeLayer : public VRoot virtual void setToType(DataType toType) noexcept = 0; }; + class VScatterLayer : public VRoot { public: diff --git a/include/NvInferRuntime.h b/include/NvInferRuntime.h index 93959c372..485628a68 100644 --- a/include/NvInferRuntime.h +++ b/include/NvInferRuntime.h @@ -1093,6 +1093,54 @@ class IPluginV3OneRuntime : public IPluginCapability }; } // namespace v_1_0 +namespace v_2_0 +{ + +class IPluginV3OneBuild : public v_1_0::IPluginV3OneBuild +{ +public: + InterfaceInfo getInterfaceInfo() const noexcept override + { + return InterfaceInfo{"PLUGIN_V3ONE_BUILD", 2, 0}; + } + + //! + //! \brief Communicates to TensorRT that the output at the specified output index is aliased to the input at the + //! returned index + //! + //! Enables read-modify-write behavior in plugins. TensorRT may insert copies to facilitate this capability. + //! + //! \return An integer denoting the index of the input which is aliased to the output at outputIndex. + //! Returning -1 indicates that the output is not aliased to any input. Otherwise, the valid range for + //! return value is [0, nbInputs - 1]. + //! + //! \note A given plugin input can only be aliased to a single plugin output. + //! + //! 
\note This API will only be called and have an effect when PreviewFeature::kALIASED_PLUGIN_IO_10_03 is turned + //! on. + //! + //! \warning If an input is not shallow copyable, a copy inserted by TensorRT may not work as intended. Therefore, + //! using this feature with tensors requiring deep copies is not supported. + //! + //! \warning If a given tensor is requested to be aliased by two different plugins, this may result in divergent + //! copies of the tensor after writes from each plugin. e.g. In the below example, t1 and t2 could be divergent. + //! + //! +-----+ +--------+ + //! +->|Copy +--> t* ---->|Plugin0 +--> t1 + //! | +-----+ +--------+ + //! t + //! | +-----+ +--------+ + //! +->|Copy +--> t** --->|Plugin1 +--> t2 + //! +-----+ +--------+ + //! + virtual int32_t getAliasedInput(int32_t outputIndex) noexcept + { + return -1; + } +}; + +} // namespace v_2_0 + //! //! \class IPluginV3OneCore //! @@ -1128,6 +1176,15 @@ using IPluginV3OneBuild = v_1_0::IPluginV3OneBuild; //! using IPluginV3OneRuntime = v_1_0::IPluginV3OneRuntime; +//! +//! \class IPluginV3OneBuildV2 +//! +//! \brief A plugin capability interface that extends IPluginV3OneBuild by providing I/O aliasing functionality. +//! +//! \see IPluginV3OneBuild +//! +using IPluginV3OneBuildV2 = v_2_0::IPluginV3OneBuild; + namespace v_1_0 { class IPluginCreatorV3One : public IPluginCreatorInterface diff --git a/include/NvInferVersion.h b/include/NvInferVersion.h index 378acb4c2..11b9cc6df 100644 --- a/include/NvInferVersion.h +++ b/include/NvInferVersion.h @@ -24,9 +24,9 @@ #define NV_INFER_VERSION_H #define NV_TENSORRT_MAJOR 10 //!< TensorRT major version. -#define NV_TENSORRT_MINOR 2 //!< TensorRT minor version. -#define NV_TENSORRT_PATCH 0 //!< TensorRT patch version. -#define NV_TENSORRT_BUILD 19 //!< TensorRT build number. +#define NV_TENSORRT_MINOR 3 //!< TensorRT minor version. +#define NV_TENSORRT_PATCH 0 //!< TensorRT patch version. +#define NV_TENSORRT_BUILD 26 //!< TensorRT build number. 
#define NV_TENSORRT_LWS_MAJOR 0 //!< TensorRT LWS major version. #define NV_TENSORRT_LWS_MINOR 0 //!< TensorRT LWS minor version. diff --git a/parsers/onnx b/parsers/onnx index f161f9588..62bdde2a0 160000 --- a/parsers/onnx +++ b/parsers/onnx @@ -1 +1 @@ -Subproject commit f161f95883b4ebd8cb789de5efc67b73c0a6e694 +Subproject commit 62bdde2a04fcd53c2409cb895ee18db445b7e755 diff --git a/plugin/README.md b/plugin/README.md index 6f619095b..0416ecc0d 100644 --- a/plugin/README.md +++ b/plugin/README.md @@ -25,7 +25,8 @@ | [gridAnchorPlugin](gridAnchorPlugin) | GridAnchor_TRT | 1 | | [gridAnchorRectPlugin](gridAnchorPlugin) | GridAnchorRect_TRT | 1 | | [groupNormalizationPlugin](groupNormalizationPlugin) | GroupNormalizationPlugin | 1 | -| [instanceNormalizationPlugin](instanceNormalizationPlugin) | InstanceNormalization_TRT | 1 | +| [instanceNormalizationPlugin](instanceNormalizationPlugin) [DEPRECATED] | InstanceNormalization_TRT | 1 | +| [instanceNormalizationPlugin](instanceNormalizationPlugin) | InstanceNormalization_TRT | 2 | | [leakyReluPlugin](leakyReluPlugin) [DEPRECATED] | LReLU_TRT | 1 | | [modulatedDeformConvPlugin](modulatedDeformConvPlugin) | ModulatedDeformConv2d | 1 | | [multilevelCropAndResizePlugin](multilevelCropAndResizePlugin) | MultilevelCropAndResize_TRT | 1 | @@ -46,7 +47,8 @@ | [roiAlignPlugin](roiAlignPlugin) [DEPRECATED] | ROIAlign_TRT | 1 | | [roiAlignPlugin](roiAlignPlugin) | ROIAlign_TRT | 2 | | [resizeNearestPlugin](resizeNearestPlugin) | ResizeNearest_TRT | 1 | -| [scatterElementsPlugin](scatterElementsPlugin) | ScatterElements | 1 | +| [scatterElementsPlugin](scatterElementsPlugin) [DEPRECATED] | ScatterElements | 1 | +| [scatterElementsPlugin](scatterElementsPlugin) | ScatterElements | 2 | | [scatterPlugin](scatterPlugin) | ScatterND | 1 | | [skipLayerNormPlugin](skipLayerNormPlugin) | CustomSkipLayerNormPluginDynamic | 1, 2, 3 | | [specialSlicePlugin](specialSlicePlugin) [DEPRECATED] | SpecialSlice_TRT | 1 | diff --git 
a/plugin/api/inferPlugin.cpp b/plugin/api/inferPlugin.cpp index ad1ad62a2..28c42cae2 100644 --- a/plugin/api/inferPlugin.cpp +++ b/plugin/api/inferPlugin.cpp @@ -52,6 +52,7 @@ #include "roiAlignPlugin/roiAlignPlugin.h" #include "roiAlignPlugin/roiAlignPluginLegacy.h" #include "scatterElementsPlugin/scatterElementsPlugin.h" +#include "scatterElementsPlugin/scatterElementsPluginLegacy.h" #include "scatterPlugin/scatterPlugin.h" #include "specialSlicePlugin/specialSlicePlugin.h" #include "splitPlugin/split.h" @@ -221,7 +222,8 @@ extern "C" initializePlugin(logger, libNamespace); initializePlugin(logger, libNamespace); initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); + initializePlugin(logger, libNamespace); initializePlugin(logger, libNamespace); initializePlugin(logger, libNamespace); initializePlugin(logger, libNamespace); diff --git a/plugin/gridAnchorPlugin/README.md b/plugin/gridAnchorPlugin/README.md index e9fc96732..7ae685011 100644 --- a/plugin/gridAnchorPlugin/README.md +++ b/plugin/gridAnchorPlugin/README.md @@ -97,4 +97,4 @@ This is the first release of this `README.md` file. ## Known issues -There are no known issues in this plugin. +There are no known issues in this plugin. \ No newline at end of file diff --git a/plugin/instanceNormalizationPlugin/README.md b/plugin/instanceNormalizationPlugin/README.md index d6d668b22..776d58cd4 100644 --- a/plugin/instanceNormalizationPlugin/README.md +++ b/plugin/instanceNormalizationPlugin/README.md @@ -13,7 +13,7 @@ The `InstanceNormalizePlugin` is used for the InstanceNormalization layer, which is generally used in deep learning models that perform image generation. This plugin is based off the [ONNX opset 6 definition](https://github.com/onnx/onnx/blob/master/docs/Operators.md#InstanceNormalization), and is used in any ONNX model that uses this operation. 
-Specifically, given an array of values `x = [x_0, x_1, ..., x_n]` , a scale factor, a bias factor, and an epislon, the InstanceNormalization of x is `scale * (x-mean) / sqrt(variance + epsilon) + bias` where the mean and variance are computed per instance per channel. +Specifically, given an array of values `x = [x_0, x_1, ..., x_n]` , a scale factor, a bias factor, and an epsilon, the InstanceNormalization of x is `scale * (x-mean) / sqrt(variance + epsilon) + bias` where the mean and variance are computed per instance per channel. ### Structure diff --git a/plugin/multiscaleDeformableAttnPlugin/README.md b/plugin/multiscaleDeformableAttnPlugin/README.md index 35f7796e8..4affdcca1 100644 --- a/plugin/multiscaleDeformableAttnPlugin/README.md +++ b/plugin/multiscaleDeformableAttnPlugin/README.md @@ -60,4 +60,4 @@ This is the first release of this `README.md` file. ## Known issues -There are no known issues in this plugin. \ No newline at end of file +There are no known issues in this plugin. diff --git a/plugin/nvFasterRCNN/README.md b/plugin/nvFasterRCNN/README.md index b2547a7c3..4f5a30842 100644 --- a/plugin/nvFasterRCNN/README.md +++ b/plugin/nvFasterRCNN/README.md @@ -101,4 +101,4 @@ This is the first release of this `README.md` file. ## Known issues -There are no known issues in this plugin. +There are no known issues in this plugin. \ No newline at end of file diff --git a/plugin/priorBoxPlugin/README.md b/plugin/priorBoxPlugin/README.md index ebb288850..11aa9aca9 100644 --- a/plugin/priorBoxPlugin/README.md +++ b/plugin/priorBoxPlugin/README.md @@ -95,4 +95,4 @@ This is the first release of this `README.md` file. ## Known issues -There are no known issues in this plugin. +There are no known issues in this plugin. 
\ No newline at end of file diff --git a/plugin/scatterElementsPlugin/README.md b/plugin/scatterElementsPlugin/README.md index 80736f34c..529bbd207 100644 --- a/plugin/scatterElementsPlugin/README.md +++ b/plugin/scatterElementsPlugin/README.md @@ -13,11 +13,12 @@ The scatterElements plugin implements the scatter operation described in (https://github.com/rusty1s/pytorch_scatter), in compliance with the [ONNX specification for ScatterElements](https://github.com/onnx/onnx/blob/main/docs/Operators.md#ScatterElements) -Note: ScatterElements with reduce="none" is implemented in TRT core, not this plugin. +Note: ScatterElements with reduce="none" is implemented in TRT core, not this plugin. ### Structure -This plugin has the plugin creator class `ScatterElementsPluginCreator` and the plugin class `ScatterElementsPlugin` which extends `IPluginV2DynamicExt`. +This plugin has two versions. The latest consists of the plugin creator class `ScatterElementsPluginV3Creator` and the plugin class `ScatterElementsPluginV3`, which extends `IPluginV3` (name: `ScatterElements`, version: 2). +The legacy plugin, which is deprecated, consists of the plugin creator class `ScatterElementsPluginV2Creator` and the plugin class `ScatterElementsPluginV2`, which extends `IPluginV2DynamicExt` (name: `ScatterElements`, version: 1). The `ScatterElements` plugin consumes the following inputs: @@ -30,13 +31,13 @@ The `ScatterElements` plugin produces the following output: 1. `output` - T: Tensor, same shape as `data`. ## Parameters - + The `ScatterElements` plugin has the following parameters: | Type | Parameter | Description |------------------|---------------------------------|-------------------------------------------------------- |`int` |`axis` | Which axis to scatter on. Default is 0. Negative value means counting dimensions from the back. Accepted range is [-r, r-1] where r = rank(data). -|`char` |`reduction` | Type of reduction to apply: add, mul, max, min. ‘add’: reduction using the addition operation.
‘mul’: reduction using the multiplication operation.‘max’: reduction using the maximum operation.‘min’: reduction using the minimum operation. +|`char` |`reduction` | Type of reduction to apply: add, mul, max, min. ‘add’: reduction using the addition operation. ‘mul’: reduction using the multiplication operation.‘max’: reduction using the maximum operation.‘min’: reduction using the minimum operation. The following resources provide a deeper understanding of the `scatterElements` plugin: @@ -46,12 +47,13 @@ The following resources provide a deeper understanding of the `scatterElements` ## License -For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) +For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation. ## Changelog -Oct 2023: This is the first release of this `README.md` file. +- July 2024: Version 2 of the plugin migrated to `IPluginV3` interface design. The legacy plugin (version 1) using `IPluginV2DynamicExt` interface is deprecated. +- Oct 2023: This is the first release of this `README.md` file. 
## Known issues diff --git a/plugin/scatterElementsPlugin/ScatterElementsPlugin_PluginConfig.yaml b/plugin/scatterElementsPlugin/ScatterElementsPlugin_PluginConfig.yaml index 7ad6e8dc4..123376776 100644 --- a/plugin/scatterElementsPlugin/ScatterElementsPlugin_PluginConfig.yaml +++ b/plugin/scatterElementsPlugin/ScatterElementsPlugin_PluginConfig.yaml @@ -17,9 +17,9 @@ --- name: ScatterElements -interface: "IPluginV2DynamicExt" +interface: "IPluginV3" versions: - "1": + "2": inputs: - data - indices @@ -60,7 +60,7 @@ versions: - "add" - "mul" - "min" - - "max" + - "max" config2: input_types: data: float16 @@ -75,7 +75,7 @@ versions: - "add" - "mul" - "min" - - "max" + - "max" config3: input_types: data: int32 @@ -90,7 +90,7 @@ versions: - "add" - "mul" - "min" - - "max" + - "max" config4: input_types: data: int64 @@ -105,7 +105,7 @@ versions: - "add" - "mul" - "min" - - "max" + - "max" config5: input_types: data: bfloat16 @@ -120,7 +120,7 @@ versions: - "add" - "mul" - "min" - - "max" + - "max" outputs: - output attributes: @@ -140,7 +140,7 @@ versions: - "add" - "mul" - "min" - - "max" + - "max" attributes_required: - reduction golden_io_path: "plugin/scatterElementsPlugin/ScatterElementsPlugin_PluginGoldenIO.json" diff --git a/plugin/scatterElementsPlugin/scatterElementsCommon.h b/plugin/scatterElementsPlugin/scatterElementsCommon.h new file mode 100644 index 000000000..a76579aca --- /dev/null +++ b/plugin/scatterElementsPlugin/scatterElementsCommon.h @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SCATTER_ELEMENTS_COMMON_H +#define SCATTER_ELEMENTS_COMMON_H + +#include +#include +#include +#include +#include + +#include "common/plugin.h" + +enum class ReductionType : int32_t +{ + kSUM, + kMUL, + kMEAN, + kMIN, + kMAX +}; + +extern std::unordered_map const kREDUCE_STR_TO_ENUM; +extern std::unordered_map const kREDUCE_ENUM_TO_STR; + +#endif // SCATTER_ELEMENTS_COMMON_H diff --git a/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp b/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp index babbaecc4..92e8d9e62 100644 --- a/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp +++ b/plugin/scatterElementsPlugin/scatterElementsPlugin.cpp @@ -29,82 +29,95 @@ namespace nvinfer1 namespace plugin { -std::map const gReduceToEnum{ +std::unordered_map const kREDUCE_STR_TO_ENUM{ {"add", ReductionType::kSUM}, {"mean", ReductionType::kMEAN}, {"mul", ReductionType::kMUL}, {"min", ReductionType::kMIN}, {"max", ReductionType::kMAX}, }; +std::unordered_map const kREDUCE_ENUM_TO_STR{ + {ReductionType::kSUM, "add"}, + {ReductionType::kMEAN, "mean"}, + {ReductionType::kMUL, "mul"}, + {ReductionType::kMIN, "min"}, + {ReductionType::kMAX, "max"}, +}; // Static class fields initialization -PluginFieldCollection ScatterElementsPluginCreator::gFC{}; -std::vector ScatterElementsPluginCreator::gPluginAttributes; +PluginFieldCollection ScatterElementsPluginV3Creator::gFC{}; +std::vector ScatterElementsPluginV3Creator::gPluginAttributes; namespace { -constexpr char const* kSCATTER_ELEMENTS_NAME{"ScatterElements"}; -constexpr char const* 
kSCATTER_ELEMENTS_VERSION{"1"}; +constexpr char const* kSCATTER_PLUGIN_VERSION{"2"}; +constexpr char const* kSCATTER_PLUGIN_NAME{"ScatterElements"}; } // namespace -ScatterElementsPlugin::ScatterElementsPlugin(ReductionType reduction, int32_t dim) +ScatterElementsPluginV3::ScatterElementsPluginV3(ReductionType reduction, int32_t dim) : mReduction(reduction) , mAxis(dim) { } -ScatterElementsPlugin::ScatterElementsPlugin(std::string const& reduction, int32_t dim) - : mReduction(gReduceToEnum.at(reduction)) +ScatterElementsPluginV3::ScatterElementsPluginV3(std::string const& reduction, int32_t dim) + : mReduction(kREDUCE_STR_TO_ENUM.at(reduction)) , mAxis(dim) { } -ScatterElementsPlugin::ScatterElementsPlugin(void const* serialData, size_t serialLength) -{ - deserialize_value(&serialData, &serialLength, &mReduction); - deserialize_value(&serialData, &serialLength, &mAxis); -} - -int32_t ScatterElementsPlugin::getNbOutputs() const noexcept +int32_t ScatterElementsPluginV3::getNbOutputs() const noexcept { return 1; } -int32_t ScatterElementsPlugin::initialize() noexcept +IPluginCapability* ScatterElementsPluginV3::getCapabilityInterface(PluginCapabilityType type) noexcept { - return 0; -} - -char const* ScatterElementsPlugin::getPluginType() const noexcept -{ - return kSCATTER_ELEMENTS_NAME; + try + { + if (type == PluginCapabilityType::kBUILD) + { + return static_cast(this); + } + if (type == PluginCapabilityType::kRUNTIME) + { + return static_cast(this); + } + PLUGIN_ASSERT(type == PluginCapabilityType::kCORE); + return static_cast(this); + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; } -char const* ScatterElementsPlugin::getPluginVersion() const noexcept +char const* ScatterElementsPluginV3::getPluginVersion() const noexcept { - return kSCATTER_ELEMENTS_VERSION; + return kSCATTER_PLUGIN_VERSION; } -DimsExprs ScatterElementsPlugin::getOutputDimensions( - int32_t index, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& 
exprBuilder) noexcept +int32_t ScatterElementsPluginV3::getOutputShapes(DimsExprs const* inputs, int32_t nbInputs, + DimsExprs const* shapeInputs, int32_t nbShapeInputs, DimsExprs* outputs, int32_t nbOutputs, + IExprBuilder& exprBuilder) noexcept { try { - PLUGIN_VALIDATE(nbInputs == 3); - PLUGIN_VALIDATE(inputs); - PLUGIN_VALIDATE(index <= kOUTPUT_TENSOR_IDX); - // both outputs are of the same size - DimsExprs out(inputs[kDATA_TENSOR_IDX]); - return out; + PLUGIN_ASSERT(nbInputs == 3); + PLUGIN_ASSERT(inputs != nullptr); + PLUGIN_ASSERT(nbOutputs == 1); + outputs[kOUTPUT_TENSOR_IDX] = inputs[kDATA_TENSOR_IDX]; + return pluginStatus_t::STATUS_SUCCESS; } catch (std::exception const& e) { caughtError(e); } - return DimsExprs(); + return pluginStatus_t::STATUS_FAILURE; } -int32_t ScatterElementsPlugin::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, +int32_t ScatterElementsPluginV3::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { try @@ -114,48 +127,67 @@ int32_t ScatterElementsPlugin::enqueue(PluginTensorDesc const* inputDesc, Plugin runScatterElementsKernel(outputs[kOUTPUT_TENSOR_IDX], inputs[kDATA_TENSOR_IDX], inputs[kUPDATES_TENSOR_IDX], inputs[kINDICES_TENSOR_IDX], outputDesc[kOUTPUT_TENSOR_IDX], inputDesc[kDATA_TENSOR_IDX], inputDesc[kUPDATES_TENSOR_IDX], inputDesc[kINDICES_TENSOR_IDX], mAxis, mReduction, stream); - return 0; + return pluginStatus_t::STATUS_SUCCESS; } catch (std::exception const& e) { caughtError(e); - return -1; } + return -1; } -size_t ScatterElementsPlugin::getSerializationSize() const noexcept +int32_t ScatterElementsPluginV3::onShapeChange( + PluginTensorDesc const* in, int32_t nbInputs, PluginTensorDesc const* out, int32_t nbOutputs) noexcept { - auto ret = serialized_size(mReduction) + serialized_size(mAxis); - return ret; + PLUGIN_ASSERT(in != nullptr); + PLUGIN_ASSERT(out != 
nullptr); + PLUGIN_ASSERT(nbOutputs == 1); + PLUGIN_ASSERT(nbInputs == 3); + auto rank = in[0].dims.nbDims; + // rank of input should be >=1 + PLUGIN_ASSERT(rank >= 1); + // rank of indices should be same as rank of data + PLUGIN_ASSERT(in[1].dims.nbDims == rank); + // rank and shape of updates should be same as indices + PLUGIN_ASSERT(in[2].dims.nbDims == rank); + PLUGIN_VALIDATE(std::equal(in[2].dims.d, in[2].dims.d + rank, in[1].dims.d)) + return pluginStatus_t::STATUS_SUCCESS; } -void ScatterElementsPlugin::serialize(void* buffer) const noexcept +PluginFieldCollection const* ScatterElementsPluginV3::getFieldsToSerialize() noexcept { - serialize_value(&buffer, mReduction); - serialize_value(&buffer, mAxis); + mDataToSerialize.clear(); + // "reduction" field is serialized as string + mDataToSerialize.emplace_back("reduction", kREDUCE_ENUM_TO_STR.at(mReduction).c_str(), PluginFieldType::kCHAR, + kREDUCE_ENUM_TO_STR.at(mReduction).size()); + mDataToSerialize.emplace_back("axis", &mAxis, PluginFieldType::kINT32, 1); + + mFCToSerialize.nbFields = mDataToSerialize.size(); + mFCToSerialize.fields = mDataToSerialize.data(); + return &mFCToSerialize; } -bool ScatterElementsPlugin::supportsFormatCombination( - int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept +bool ScatterElementsPluginV3::supportsFormatCombination( + int32_t pos, DynamicPluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept { try { PLUGIN_VALIDATE(inOut && pos < (nbInputs + nbOutputs)); - if (inOut[pos].format != PluginFormat::kLINEAR) + if (inOut[pos].desc.format != PluginFormat::kLINEAR) { return false; } - auto mytype = inOut[pos].type; - auto firsttype = inOut[kDATA_TENSOR_IDX].type; + auto currentType = inOut[pos].desc.type; + auto firstType = inOut[kDATA_TENSOR_IDX].desc.type; // Only INT64 is supported for indices - return pos == kINDICES_TENSOR_IDX ? 
(mytype == DataType::kINT64) - : (mytype == firsttype) - && (mytype == DataType::kFLOAT || mytype == DataType::kHALF - || (hasBfloat16AtomicAdd() && mytype == DataType::kBF16) || mytype == DataType::kINT32 - || mytype == DataType::kINT64); + return pos == kINDICES_TENSOR_IDX ? (currentType == DataType::kINT64) + : (currentType == firstType) + && (currentType == DataType::kFLOAT || currentType == DataType::kHALF + || (hasBfloat16AtomicAdd() && currentType == DataType::kBF16) || currentType == DataType::kINT32 + || currentType == DataType::kINT64); } catch (std::exception const& e) { @@ -164,70 +196,97 @@ bool ScatterElementsPlugin::supportsFormatCombination( } } -void ScatterElementsPlugin::terminate() noexcept {} - -void ScatterElementsPlugin::destroy() noexcept +ScatterElementsPluginV3* ScatterElementsPluginV3::clone() noexcept { - // This gets called when the network containing plugin is destroyed - delete this; + try + { + auto* plugin = new ScatterElementsPluginV3(mReduction, mAxis); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; + } + catch (std::exception const& e) + { + caughtError(e); + } + return nullptr; } -IPluginV2DynamicExt* ScatterElementsPlugin::clone() const noexcept +IPluginV3* ScatterElementsPluginV3::attachToContext(IPluginResourceContext* context) noexcept { - auto* plugin = new ScatterElementsPlugin(mReduction, mAxis); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; + ScatterElementsPluginV3* obj = clone(); + return obj; } -void ScatterElementsPlugin::configurePlugin( +int32_t ScatterElementsPluginV3::configurePlugin( DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { try { PLUGIN_VALIDATE(nbInputs == 3); + return pluginStatus_t::STATUS_SUCCESS; } catch (std::exception const& e) { caughtError(e); } + return pluginStatus_t::STATUS_FAILURE; } -DataType ScatterElementsPlugin::getOutputDataType( - int32_t index, DataType const* inputTypes, 
int32_t nbInputs) const noexcept +int32_t ScatterElementsPluginV3::getOutputDataTypes( + DataType* outputTypes, int32_t nbOutputs, DataType const* inputTypes, int32_t nbInputs) const noexcept { try { - PLUGIN_VALIDATE(inputTypes && nbInputs == 3 && index == kOUTPUT_TENSOR_IDX); + PLUGIN_ASSERT(inputTypes != nullptr); + PLUGIN_ASSERT(nbInputs == 3); + PLUGIN_ASSERT(nbOutputs == 1); + outputTypes[kOUTPUT_TENSOR_IDX] = inputTypes[kDATA_TENSOR_IDX]; + return pluginStatus_t::STATUS_SUCCESS; } catch (std::exception const& e) { caughtError(e); } - return inputTypes[kDATA_TENSOR_IDX]; + return pluginStatus_t::STATUS_FAILURE; } -size_t ScatterElementsPlugin::getWorkspaceSize( - PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept +size_t ScatterElementsPluginV3::getWorkspaceSize(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, + DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { return 0; } -void ScatterElementsPlugin::setPluginNamespace(char const* libNamespace) noexcept +void ScatterElementsPluginV3::setPluginNamespace(char const* libNamespace) noexcept { - mNamespace = libNamespace; + try + { + PLUGIN_ASSERT(libNamespace != nullptr); + mNamespace = libNamespace; + } + catch (std::exception const& e) + { + caughtError(e); + } +} + +char const* ScatterElementsPluginV3::getPluginName() const noexcept +{ + return kSCATTER_PLUGIN_NAME; } -char const* ScatterElementsPlugin::getPluginNamespace() const noexcept +char const* ScatterElementsPluginV3::getPluginNamespace() const noexcept { return mNamespace.c_str(); } // -// ScatterElementsPluginCreator +// ScatterElementsPluginV3Creator // -ScatterElementsPluginCreator::ScatterElementsPluginCreator() +ScatterElementsPluginV3Creator::ScatterElementsPluginV3Creator() { + static std::mutex sMutex; + std::lock_guard guard(sMutex); gPluginAttributes.clear(); gPluginAttributes.emplace_back(PluginField("reduction")); 
gPluginAttributes.emplace_back(PluginField("axis")); @@ -235,37 +294,38 @@ ScatterElementsPluginCreator::ScatterElementsPluginCreator() gFC.fields = gPluginAttributes.data(); } -char const* ScatterElementsPluginCreator::getPluginName() const noexcept +char const* ScatterElementsPluginV3Creator::getPluginName() const noexcept { - return kSCATTER_ELEMENTS_NAME; + return kSCATTER_PLUGIN_NAME; } -char const* ScatterElementsPluginCreator::getPluginVersion() const noexcept +char const* ScatterElementsPluginV3Creator::getPluginVersion() const noexcept { - return kSCATTER_ELEMENTS_VERSION; + return kSCATTER_PLUGIN_VERSION; } -PluginFieldCollection const* ScatterElementsPluginCreator::getFieldNames() noexcept +PluginFieldCollection const* ScatterElementsPluginV3Creator::getFieldNames() noexcept { return &gFC; } -char const* ScatterElementsPluginCreator::getPluginNamespace() const noexcept +char const* ScatterElementsPluginV3Creator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } -void ScatterElementsPluginCreator::setPluginNamespace(char const* libNamespace) noexcept +void ScatterElementsPluginV3Creator::setPluginNamespace(char const* libNamespace) noexcept { + PLUGIN_VALIDATE(libNamespace != nullptr); mNamespace = libNamespace; } -IPluginV2DynamicExt* ScatterElementsPluginCreator::createPlugin( - char const* name, PluginFieldCollection const* fc) noexcept +IPluginV3* ScatterElementsPluginV3Creator::createPlugin( + char const* name, PluginFieldCollection const* fc, TensorRTPhase phase) noexcept { std::string reductionArg; int32_t axisArg = 0; - ScatterElementsPlugin* plugin = nullptr; + ScatterElementsPluginV3* plugin = nullptr; try { @@ -287,29 +347,22 @@ IPluginV2DynamicExt* ScatterElementsPluginCreator::createPlugin( else if (strcmp(fields[i].name, "reduction") == 0) { auto data = static_cast(fields[i].data); - reductionArg = std::string(data); + reductionArg = fields[i].length != -1 ? 
std::string(data, fields[i].length) : std::string(data); } } - PLUGIN_VALIDATE(gReduceToEnum.find(reductionArg) != gReduceToEnum.end(), + PLUGIN_VALIDATE(kREDUCE_STR_TO_ENUM.find(reductionArg) != kREDUCE_STR_TO_ENUM.end(), (reductionArg + ": invalid value for 'reduction' plugin argument").c_str()); - plugin = new ScatterElementsPlugin(reductionArg, axisArg); + plugin = new ScatterElementsPluginV3(reductionArg, axisArg); plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; } catch (std::exception& e) { caughtError(e); } - return plugin; -} - -IPluginV2DynamicExt* ScatterElementsPluginCreator::deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept -{ - ScatterElementsPlugin* plugin = new ScatterElementsPlugin(serialData, serialLength); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; + return nullptr; } } // namespace plugin diff --git a/plugin/scatterElementsPlugin/scatterElementsPlugin.h b/plugin/scatterElementsPlugin/scatterElementsPlugin.h index 01c2a73df..c0c784b76 100644 --- a/plugin/scatterElementsPlugin/scatterElementsPlugin.h +++ b/plugin/scatterElementsPlugin/scatterElementsPlugin.h @@ -17,117 +17,122 @@ #ifndef TRT_SCATTER_ELEMENTS_PLUGIN_H #define TRT_SCATTER_ELEMENTS_PLUGIN_H - +#include "NvInfer.h" +#include "NvInferPlugin.h" #include "common/plugin.h" -#include -#include -#include -#include +#include "scatterElementsCommon.h" namespace nvinfer1 { namespace plugin { -enum class ReductionType +class ScatterElementsPluginV3 : public IPluginV3, + public IPluginV3OneCore, + public IPluginV3OneBuild, + public IPluginV3OneRuntime { - kSUM, - kMUL, - kMEAN, - kMIN, - kMAX -}; +public: + // ctor and dtor + ScatterElementsPluginV3() = delete; -extern std::map const gReduceToEnum; + ScatterElementsPluginV3(ScatterElementsPluginV3 const&) = delete; -class ScatterElementsPlugin final : public nvinfer1::IPluginV2DynamicExt -{ -public: - ScatterElementsPlugin() = delete; - 
ScatterElementsPlugin(ScatterElementsPlugin const&) = delete; - ScatterElementsPlugin(std::string const&, int32_t); - ScatterElementsPlugin(ReductionType, int32_t); - ScatterElementsPlugin(void const* serialData, size_t serialLength); - ~ScatterElementsPlugin() override = default; + ScatterElementsPluginV3(std::string const&, int32_t); - int32_t getNbOutputs() const noexcept override; + ScatterElementsPluginV3(ReductionType, int32_t); - nvinfer1::DimsExprs getOutputDimensions(int32_t index, nvinfer1::DimsExprs const* inputs, int32_t nbInputDims, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; + ~ScatterElementsPluginV3() override = default; - int32_t initialize() noexcept override; + // IPluginV3 Methods + IPluginCapability* getCapabilityInterface(PluginCapabilityType type) noexcept override; - void terminate() noexcept override; + ScatterElementsPluginV3* clone() noexcept; + // end IPluginV3 Methods - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; + // IPluginV3Core Methods + char const* getPluginVersion() const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; + char const* getPluginName() const noexcept override; + + char const* getPluginNamespace() const noexcept override; + + void setPluginNamespace(char const* pluginNamespace) noexcept; - size_t getSerializationSize() const noexcept override; + // end IPluginV3Core Methods - void serialize(void* buffer) const noexcept override; + // IPluginV3Build Methods + int32_t getNbOutputs() const noexcept override; bool supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; + int32_t pos, DynamicPluginTensorDesc const* 
inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; - char const* getPluginType() const noexcept override; + int32_t getOutputShapes(DimsExprs const* inputs, int32_t nbInputs, DimsExprs const* shapeInputs, + int32_t nbShapeInputs, DimsExprs* outputs, int32_t nbOutputs, IExprBuilder& exprBuilder) noexcept override; - char const* getPluginVersion() const noexcept override; + int32_t configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, + int32_t nbOutputs) noexcept override; - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + size_t getWorkspaceSize(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, + DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - void destroy() noexcept override; + int32_t getOutputDataTypes( + DataType* outputTypes, int32_t nbOutputs, DataType const* inputTypes, int32_t nbInputs) const noexcept override; + // end IPluginV3Build Methods - nvinfer1::DataType getOutputDataType( - int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; + // IPluginV3Runtime Methods + int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; + int32_t onShapeChange( + PluginTensorDesc const* in, int32_t nbInputs, PluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - char const* getPluginNamespace() const noexcept override; + IPluginV3* attachToContext(IPluginResourceContext* context) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; + PluginFieldCollection const* getFieldsToSerialize() noexcept override; + // end IPluginV3Runtime 
Methods private: ReductionType mReduction; int32_t mAxis; + std::vector mDataToSerialize; + nvinfer1::PluginFieldCollection mFCToSerialize; std::string mNamespace; - + // input metadata static constexpr int32_t kINDICES_TENSOR_IDX = 1; static constexpr int32_t kUPDATES_TENSOR_IDX = 2; static constexpr int32_t kDATA_TENSOR_IDX = 0; - // outputs + // output metadata static constexpr int32_t kOUTPUT_TENSOR_IDX = 0; }; -class ScatterElementsPluginCreator : public nvinfer1::IPluginCreator +class ScatterElementsPluginV3Creator : public nvinfer1::IPluginCreatorV3One { public: - ScatterElementsPluginCreator(); + // ctor and dtor + ScatterElementsPluginV3Creator(); - ~ScatterElementsPluginCreator() override = default; + ~ScatterElementsPluginV3Creator() override = default; + // get plugin metadata char const* getPluginName() const noexcept override; char const* getPluginVersion() const noexcept override; nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - nvinfer1::IPluginV2DynamicExt* createPlugin( - char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2DynamicExt* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; + char const* getPluginNamespace() const noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; + // setter + void setPluginNamespace(char const* libNamespace) noexcept; - char const* getPluginNamespace() const noexcept override; + // create plugin + IPluginV3* createPlugin( + char const* name, nvinfer1::PluginFieldCollection const* fc, TensorRTPhase phase) noexcept override; private: static nvinfer1::PluginFieldCollection gFC; - static std::vector gPluginAttributes; + static std::vector gPluginAttributes; std::string mNamespace; }; diff --git a/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h b/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h index 307ef355e..652ff2b58 100644 --- 
a/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h +++ b/plugin/scatterElementsPlugin/scatterElementsPluginKernel.h @@ -25,6 +25,7 @@ #include "common/plugin.h" #include "scatterElementsPlugin.h" +#include "scatterElementsPluginLegacy.h" namespace nvinfer1 { diff --git a/plugin/scatterElementsPlugin/scatterElementsPluginLegacy.cpp b/plugin/scatterElementsPlugin/scatterElementsPluginLegacy.cpp new file mode 100644 index 000000000..0dfa582f3 --- /dev/null +++ b/plugin/scatterElementsPlugin/scatterElementsPluginLegacy.cpp @@ -0,0 +1,316 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include "common/serialize.hpp" +#include "scatterElementsPluginKernel.h" +#include "scatterElementsPluginLegacy.h" + +namespace nvinfer1 +{ +namespace plugin +{ + +std::unordered_map const kREDUCE_STR_TO_ENUM{ + {"add", ReductionType::kSUM}, + {"mean", ReductionType::kMEAN}, + {"mul", ReductionType::kMUL}, + {"min", ReductionType::kMIN}, + {"max", ReductionType::kMAX}, +}; + +// Static class fields initialization +PluginFieldCollection ScatterElementsPluginV2Creator::gFC{}; +std::vector ScatterElementsPluginV2Creator::gPluginAttributes; + +namespace +{ +constexpr char const* kSCATTER_ELEMENTS_NAME{"ScatterElements"}; +constexpr char const* kSCATTER_ELEMENTS_VERSION{"1"}; +} // namespace + +ScatterElementsPluginV2::ScatterElementsPluginV2(ReductionType reduction, int32_t dim) + : mReduction(reduction) + , mAxis(dim) +{ +} + +ScatterElementsPluginV2::ScatterElementsPluginV2(std::string const& reduction, int32_t dim) + : mReduction(kREDUCE_STR_TO_ENUM.at(reduction)) + , mAxis(dim) +{ +} + +ScatterElementsPluginV2::ScatterElementsPluginV2(void const* serialData, size_t serialLength) +{ + deserialize_value(&serialData, &serialLength, &mReduction); + deserialize_value(&serialData, &serialLength, &mAxis); +} + +int32_t ScatterElementsPluginV2::getNbOutputs() const noexcept +{ + return 1; +} + +int32_t ScatterElementsPluginV2::initialize() noexcept +{ + return 0; +} + +char const* ScatterElementsPluginV2::getPluginType() const noexcept +{ + return kSCATTER_ELEMENTS_NAME; +} + +char const* ScatterElementsPluginV2::getPluginVersion() const noexcept +{ + return kSCATTER_ELEMENTS_VERSION; +} + +DimsExprs ScatterElementsPluginV2::getOutputDimensions( + int32_t index, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept +{ + try + { + PLUGIN_VALIDATE(nbInputs == 3); + PLUGIN_VALIDATE(inputs); + PLUGIN_VALIDATE(index <= kOUTPUT_TENSOR_IDX); + // both outputs are of the same size + DimsExprs 
out(inputs[kDATA_TENSOR_IDX]); + return out; + } + catch (std::exception const& e) + { + caughtError(e); + } + return DimsExprs(); +} + +int32_t ScatterElementsPluginV2::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept +{ + try + { + PLUGIN_VALIDATE(inputDesc[kINDICES_TENSOR_IDX].type == DataType::kINT64); + + runScatterElementsKernel(outputs[kOUTPUT_TENSOR_IDX], inputs[kDATA_TENSOR_IDX], inputs[kUPDATES_TENSOR_IDX], + inputs[kINDICES_TENSOR_IDX], outputDesc[kOUTPUT_TENSOR_IDX], inputDesc[kDATA_TENSOR_IDX], + inputDesc[kUPDATES_TENSOR_IDX], inputDesc[kINDICES_TENSOR_IDX], mAxis, mReduction, stream); + return 0; + } + catch (std::exception const& e) + { + caughtError(e); + return -1; + } +} + +size_t ScatterElementsPluginV2::getSerializationSize() const noexcept +{ + auto ret = serialized_size(mReduction) + serialized_size(mAxis); + return ret; +} + +void ScatterElementsPluginV2::serialize(void* buffer) const noexcept +{ + serialize_value(&buffer, mReduction); + serialize_value(&buffer, mAxis); +} + +bool ScatterElementsPluginV2::supportsFormatCombination( + int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept +{ + try + { + PLUGIN_VALIDATE(inOut && pos < (nbInputs + nbOutputs)); + + if (inOut[pos].format != PluginFormat::kLINEAR) + { + return false; + } + + auto mytype = inOut[pos].type; + auto firsttype = inOut[kDATA_TENSOR_IDX].type; + + // Only INT64 is supported for indices + return pos == kINDICES_TENSOR_IDX ? 
(mytype == DataType::kINT64) + : (mytype == firsttype) + && (mytype == DataType::kFLOAT || mytype == DataType::kHALF + || (hasBfloat16AtomicAdd() && mytype == DataType::kBF16) || mytype == DataType::kINT32 + || mytype == DataType::kINT64); + } + catch (std::exception const& e) + { + caughtError(e); + return false; + } +} + +void ScatterElementsPluginV2::terminate() noexcept {} + +void ScatterElementsPluginV2::destroy() noexcept +{ + // This gets called when the network containing plugin is destroyed + delete this; +} + +IPluginV2DynamicExt* ScatterElementsPluginV2::clone() const noexcept +{ + auto* plugin = new ScatterElementsPluginV2(mReduction, mAxis); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; +} + +void ScatterElementsPluginV2::configurePlugin( + DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept +{ + try + { + PLUGIN_VALIDATE(nbInputs == 3); + } + catch (std::exception const& e) + { + caughtError(e); + } +} + +DataType ScatterElementsPluginV2::getOutputDataType( + int32_t index, DataType const* inputTypes, int32_t nbInputs) const noexcept +{ + try + { + PLUGIN_VALIDATE(inputTypes && nbInputs == 3 && index == kOUTPUT_TENSOR_IDX); + } + catch (std::exception const& e) + { + caughtError(e); + } + return inputTypes[kDATA_TENSOR_IDX]; +} + +size_t ScatterElementsPluginV2::getWorkspaceSize( + PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept +{ + return 0; +} + +void ScatterElementsPluginV2::setPluginNamespace(char const* libNamespace) noexcept +{ + mNamespace = libNamespace; +} + +char const* ScatterElementsPluginV2::getPluginNamespace() const noexcept +{ + return mNamespace.c_str(); +} + +// +// ScatterElementsPluginV2Creator +// + +ScatterElementsPluginV2Creator::ScatterElementsPluginV2Creator() +{ + gPluginAttributes.clear(); + gPluginAttributes.emplace_back(PluginField("reduction")); + 
gPluginAttributes.emplace_back(PluginField("axis")); + gFC.nbFields = gPluginAttributes.size(); + gFC.fields = gPluginAttributes.data(); +} + +char const* ScatterElementsPluginV2Creator::getPluginName() const noexcept +{ + return kSCATTER_ELEMENTS_NAME; +} + +char const* ScatterElementsPluginV2Creator::getPluginVersion() const noexcept +{ + return kSCATTER_ELEMENTS_VERSION; +} + +PluginFieldCollection const* ScatterElementsPluginV2Creator::getFieldNames() noexcept +{ + return &gFC; +} + +char const* ScatterElementsPluginV2Creator::getPluginNamespace() const noexcept +{ + return mNamespace.c_str(); +} + +void ScatterElementsPluginV2Creator::setPluginNamespace(char const* libNamespace) noexcept +{ + mNamespace = libNamespace; +} + +IPluginV2DynamicExt* ScatterElementsPluginV2Creator::createPlugin( + char const* name, PluginFieldCollection const* fc) noexcept +{ + std::string reductionArg; + int32_t axisArg = 0; + ScatterElementsPluginV2* plugin = nullptr; + + try + { + PLUGIN_VALIDATE(fc != nullptr); + auto fields = fc->fields; + + std::set requiredFields{"reduction"}; + plugin::validateRequiredAttributesExist(requiredFields, fc); + + for (int32_t i = 0; i < fc->nbFields; ++i) + { + PLUGIN_VALIDATE(fields[i].name != nullptr); + PLUGIN_VALIDATE(fields[i].data != nullptr); + if (strcmp(fields[i].name, "axis") == 0) + { + auto data = static_cast(fields[i].data); + axisArg = *data; + } + else if (strcmp(fields[i].name, "reduction") == 0) + { + auto data = static_cast(fields[i].data); + reductionArg = std::string(data); + } + } + + PLUGIN_VALIDATE(kREDUCE_STR_TO_ENUM.find(reductionArg) != kREDUCE_STR_TO_ENUM.end(), + (reductionArg + ": invalid value for 'reduction' plugin argument").c_str()); + + plugin = new ScatterElementsPluginV2(reductionArg, axisArg); + plugin->setPluginNamespace(mNamespace.c_str()); + } + catch (std::exception& e) + { + caughtError(e); + } + return plugin; +} + +IPluginV2DynamicExt* ScatterElementsPluginV2Creator::deserializePlugin( + char const* 
name, void const* serialData, size_t serialLength) noexcept +{ + ScatterElementsPluginV2* plugin = new ScatterElementsPluginV2(serialData, serialLength); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/plugin/scatterElementsPlugin/scatterElementsPluginLegacy.h b/plugin/scatterElementsPlugin/scatterElementsPluginLegacy.h new file mode 100644 index 000000000..a2fc47782 --- /dev/null +++ b/plugin/scatterElementsPlugin/scatterElementsPluginLegacy.h @@ -0,0 +1,123 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TRT_SCATTER_ELEMENTS_PLUGIN_LEGACY_H +#define TRT_SCATTER_ELEMENTS_PLUGIN_LEGACY_H + +#include "common/plugin.h" +#include "scatterElementsCommon.h" + +namespace nvinfer1 +{ +namespace plugin +{ + +class ScatterElementsPluginV2 final : public nvinfer1::IPluginV2DynamicExt +{ +public: + ScatterElementsPluginV2() = delete; + ScatterElementsPluginV2(ScatterElementsPluginV2 const&) = delete; + ScatterElementsPluginV2(std::string const&, int32_t); + ScatterElementsPluginV2(ReductionType, int32_t); + ScatterElementsPluginV2(void const* serialData, size_t serialLength); + ~ScatterElementsPluginV2() override = default; + + int32_t getNbOutputs() const noexcept override; + + nvinfer1::DimsExprs getOutputDimensions(int32_t index, nvinfer1::DimsExprs const* inputs, int32_t nbInputDims, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; + + int32_t initialize() noexcept override; + + void terminate() noexcept override; + + size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, + nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; + + int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; + + size_t getSerializationSize() const noexcept override; + + void serialize(void* buffer) const noexcept override; + + bool supportsFormatCombination( + int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; + + char const* getPluginType() const noexcept override; + + char const* getPluginVersion() const noexcept override; + + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + + void destroy() noexcept override; + + nvinfer1::DataType getOutputDataType( + int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; + + void 
setPluginNamespace(char const* pluginNamespace) noexcept override; + + char const* getPluginNamespace() const noexcept override; + + void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, + nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; + +private: + ReductionType mReduction; + int32_t mAxis; + std::string mNamespace; + + static constexpr int32_t kINDICES_TENSOR_IDX = 1; + static constexpr int32_t kUPDATES_TENSOR_IDX = 2; + static constexpr int32_t kDATA_TENSOR_IDX = 0; + // outputs + static constexpr int32_t kOUTPUT_TENSOR_IDX = 0; +}; + +class ScatterElementsPluginV2Creator : public nvinfer1::IPluginCreator +{ +public: + ScatterElementsPluginV2Creator(); + + ~ScatterElementsPluginV2Creator() override = default; + + char const* getPluginName() const noexcept override; + + char const* getPluginVersion() const noexcept override; + + nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; + + nvinfer1::IPluginV2DynamicExt* createPlugin( + char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; + + nvinfer1::IPluginV2DynamicExt* deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept override; + + void setPluginNamespace(char const* pluginNamespace) noexcept override; + + char const* getPluginNamespace() const noexcept override; + +private: + static nvinfer1::PluginFieldCollection gFC; + static std::vector gPluginAttributes; + std::string mNamespace; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // TRT_SCATTER_ELEMENTS_PLUGIN_LEGACY_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 66034f8ba..49f5d8a11 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -113,6 +113,12 @@ message(STATUS "PY_CONFIG_INCLUDE: ${PY_CONFIG_INCLUDE}") include_directories(${TENSORRT_ROOT}/include ${PROJECT_SOURCE_DIR}/include ${CUDA_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/docstrings ${ONNX_INC_DIR} 
${PYBIND11_DIR}) link_directories(${TENSORRT_BUILD}) +if (MSVC) + # Prevent pybind11 from sharing resources with other, potentially ABI incompatible modules + # https://github.com/pybind/pybind11/issues/2898 + add_definitions(-DPYBIND11_COMPILER_TYPE="_${PROJECT_NAME}_abi") +endif() + if (MSVC) message(STATUS "include_dirs: ${MSVC_COMPILER_DIR}/include ${MSVC_COMPILER_DIR}/../ucrt/include ${NV_WDKSDK_INC}/um ${NV_WDKSDK_INC}/shared") message(STATUS "link dirs: ${PY_LIB_DIR} ${NV_WDKSDK_LIB}/um/x64 ${MSVC_COMPILER_DIR}/lib/amd64 ${MSVC_COMPILER_DIR}/../ucrt/lib/x64") diff --git a/python/docstrings/infer/pyCoreDoc.h b/python/docstrings/infer/pyCoreDoc.h index b59803a92..39a52a9ba 100644 --- a/python/docstrings/infer/pyCoreDoc.h +++ b/python/docstrings/infer/pyCoreDoc.h @@ -713,8 +713,7 @@ constexpr char const* descr = R"trtdoc( :ivar streamable_weights_size: Returns the size of the streamable weights in the engine. This may not include all the weights. :ivar weight_streaming_budget_v2: Set and get the current weight streaming budget for inference. The budget may be set any non-negative value. A value of 0 streams the most weights. Values equal to streamable_weights_size (default) or larger will disable weight streaming. :ivar weight_streaming_scratch_memory_size: The amount of scratch memory required by a TensorRT ExecutionContext to perform inference. This value may change based on the current weight streaming budget. Please use the V2 memory APIs, engine.device_memory_size_v2 and ExecutionContext.set_device_memory() to provide memory which includes the current weight streaming scratch memory. Not specifying these APIs or using the V1 APIs will not include this memory, so TensorRT will resort to allocating itself. - )trtdoc" - ; + )trtdoc"; // Documentation bug with parameters on these three functions because they are overloaded. 
constexpr char const* serialize = R"trtdoc( @@ -1074,6 +1073,9 @@ constexpr char const* descr = R"trtdoc( constexpr char const* PROFILE_SHARING_0806 = R"trtdoc( [DEPRECATED] Allows optimization profiles to be shared across execution contexts. The default value for this flag is on in TensorRT 10.0. Turning if off is deprecated. )trtdoc"; +constexpr char const* ALIASED_PLUGIN_IO_10_03 = R"trtdoc( + Allows plugin I/O to be aliased when using IPluginV3OneBuildV2. +)trtdoc"; } // namespace PreviewFeatureDoc namespace HardwareCompatibilityLevelDoc diff --git a/python/docstrings/infer/pyGraphDoc.h b/python/docstrings/infer/pyGraphDoc.h index 09ef6cb18..cfd51ed84 100644 --- a/python/docstrings/infer/pyGraphDoc.h +++ b/python/docstrings/infer/pyGraphDoc.h @@ -198,8 +198,8 @@ constexpr const char* descr = R"trtdoc( :ivar dynamic_range: :class:`Tuple[float, float]` [DEPRECATED] Deprecated in TensorRT 10.1. Superseded by explicit quantization. A tuple containing the [minimum, maximum] of the dynamic range, or :class:`None` if the range was not set. :ivar is_shape: :class:`bool` Whether the tensor is a shape tensor. :ivar allowed_formats: :class:`int32` The allowed set of TensorFormat candidates. This should be an integer consisting of one or more :class:`TensorFormat` s, combined via bitwise OR after bit shifting. For example, ``1 << int(TensorFormat.CHW4) | 1 << int(TensorFormat.CHW32)``. -)trtdoc" - ; +)trtdoc"; + constexpr const char* set_dynamic_range = R"trtdoc( [DEPRECATED] Deprecated in TensorRT 10.1. Superseded by explicit quantization. Set dynamic range for the tensor. @@ -771,13 +771,13 @@ constexpr const char* descr = R"trtdoc( The slice layer has two variants, static and dynamic. Static slice specifies the start, size, and stride dimensions at layer creation time via :class:`Dims` and can use the get/set accessor functions of the :class:`ISliceLayer` . 
- Dynamic slice specifies one or more of start, size or stride as :class:`ITensor`s, by using :func:`ILayer.set_input` to add a second, third, or fourth input respectively. + Dynamic slice specifies one or more of start, size, stride, or axes as :class:`ITensor`s, by using :func:`ILayer.set_input` to add a second, third, fourth, or sixth input respectively. The corresponding :class:`Dims` are used if an input is missing or null. - An application can determine if the :class:`ISliceLayer` has a dynamic output shape based on whether the size input (third input) is present and non-null. + An application can determine if the :class:`ISliceLayer` has a dynamic output shape based on whether the size or axes input is present and non-null. The slice layer selects for each dimension a start location from within the input tensor, and copies elements to the output tensor using the specified stride across the input tensor. - Start, size, and stride tensors must be 1-D :class:`int32` shape tensors if not specified via :class:`Dims` . + Start, size, and stride tensors must be 1-D integer-typed shape tensors if not specified via :class:`Dims` . An example of using slice on a tensor: input = {{0, 2, 4}, {1, 3, 5}} @@ -786,17 +786,32 @@ constexpr const char* descr = R"trtdoc( stride = {1, 2} output = {{1, 5}} + If axes is provided then starts, ends, and strides must have the same length as axes and specifies a subset of dimensions to slice. If axes is not provided, starts, ends, and strides + must be of the same length as the rank of the input tensor. + + An example of using slice on a tensor with axes specified: + input = {{0, 2, 4}, {1, 3, 5}} + start = {1} + size = {2} + stride = {1} + axes = {1} + output = {{2, 4}, {3, 5}} + When the sampleMode is :const:`SampleMode.CLAMP` or :const:`SampleMode.REFLECT` , for each input dimension, if its size is 0 then the corresponding output dimension must be 0 too. 
+ When the sampleMode is :const:`SampleMode.FILL`, the fifth input to the slice layer is used to determine the value to fill in out-of-bound indices. It is an error to specify the fifth input in any other sample mode. + A slice layer can produce a shape tensor if the following conditions are met: * ``start``, ``size``, and ``stride`` are build time constants, either as static :class:`Dims` or as constant input tensors. + * ``axes``, if provided, is a build time constant, either as static :class:`Dims` or as a constant input tensor. * The number of elements in the output tensor does not exceed 2 * :const:`Dims.MAX_DIMS` . The input tensor is a shape tensor if the output is a shape tensor. The following constraints must be satisfied to execute this layer on DLA: * ``start``, ``size``, and ``stride`` are build time constants, either as static :class:`Dims` or as constant input tensors. + * ``axes``, if provided, is a build time constant, either as static :class:`Dims` or as a constant input tensor. * sampleMode is :const:`SampleMode.STRICT_BOUNDS` . * Strides are 1 for all dimensions. * Slicing is not performed on the first dimension @@ -806,6 +821,7 @@ constexpr const char* descr = R"trtdoc( :ivar shape: :class:`Dims` The output dimensions. :ivar stride: :class:`Dims` The slicing stride. :ivar mode: :class:`SampleMode` Controls how :class:`ISliceLayer` handles out of bounds coordinates. + :ivar axes: :class:`Dims` The axes that starts, sizes, and strides correspond to. )trtdoc"; constexpr const char* set_input = R"trtdoc( @@ -823,6 +839,7 @@ constexpr const char* set_input = R"trtdoc( 2 The size tensor of the resulting slice, N-dimensional for Data, and 1-D for Shape. 3 The stride of the slicing operation, N-dimensional for Data, and 1-D for Shape. 4 Value for the :const:`SampleMode.FILL` slice mode. Disallowed for other modes. + 5 The axes tensor indicating the axes that starts, sizes, and strides correspond to. Must be a 1-D tensor. 
===== ================================================================================== If this function is called with a value greater than 0, then :attr:`num_inputs` changes diff --git a/python/docstrings/infer/pyPluginDoc.h b/python/docstrings/infer/pyPluginDoc.h index f541a2810..c392d9425 100644 --- a/python/docstrings/infer/pyPluginDoc.h +++ b/python/docstrings/infer/pyPluginDoc.h @@ -420,6 +420,10 @@ constexpr const char* ipluginv3onebuild_descr = R"trtdoc( :ivar num_outputs: :class:`int` The number of outputs from the plugin. This is used by the implementations of :class:`INetworkDefinition` and :class:`Builder`. )trtdoc"; +constexpr const char* ipluginv3onebuildv2_descr = R"trtdoc( + A plugin capability interface that extends IPluginV3OneBuild by providing I/O aliasing functionality. +)trtdoc"; + constexpr const char* ipluginv3oneruntime_descr = R"trtdoc( A plugin capability interface that enables the runtime capability (PluginCapabilityType.RUNTIME). @@ -616,6 +620,35 @@ constexpr const char* get_valid_tactics = R"trtdoc( )trtdoc"; +constexpr const char* get_aliased_input = R"trtdoc( + Communicates to TensorRT that the output at the specified output index is aliased to the input at the returned index + + Enables read-modify-write behavior in plugins. TensorRT may insert copies to facilitate this capability. + + .. note:: + A given plugin input can only be aliased to a single plugin output. + + .. note:: + This API will only be called and have an effect when PreviewFeature.ALIASED_PLUGIN_IO_10_03 is turned on. + + .. warning:: + If an input is not shallow copyable, a copy inserted by TensorRT may not work as intended. Therefore, using this feature with tensors requiring deep copies is not supported. + + .. warning:: + If a given tensor is requested to be aliased by two different plugins, this may result in divergent copies of the tensor after writes from each plugin. e.g. In the below example, t1 and t2 could be divergent. 
+ + +-----+ +--------+ + +->|Copy +--> t* ---->|Plugin0 +--> t1 + | +-----+ +--------+ + t + | +-----+ +--------+ + +->|Copy +--> t** --->|Plugin1 +--> t2 + +-----+ +--------+ + + :returns: An integer denoting the index of the input which is aliased to the output at output_index. Returning -1 indicates that the output is not aliased to any input. Otherwise, the valid range for return value is [0, nbInputs - 1]. + +)trtdoc"; + constexpr const char* attach_to_context = R"trtdoc( Clone the plugin, attach the cloned plugin object to a execution context and grant the cloned plugin access to some context resources. diff --git a/python/packaging/bindings_wheel/tensorrt/__init__.py b/python/packaging/bindings_wheel/tensorrt/__init__.py index 22022e9ea..ac1314d3a 100644 --- a/python/packaging/bindings_wheel/tensorrt/__init__.py +++ b/python/packaging/bindings_wheel/tensorrt/__init__.py @@ -195,7 +195,6 @@ def _itemsize(trt_type): fp8: 1, int4: 0.5, } - if trt_type in mapping: return mapping[trt_type] diff --git a/python/src/infer/pyCore.cpp b/python/src/infer/pyCore.cpp index e96597001..b92933f7e 100644 --- a/python/src/infer/pyCore.cpp +++ b/python/src/infer/pyCore.cpp @@ -1097,7 +1097,7 @@ void bindCore(py::module& m) .def("get_debug_state", &IExecutionContext::getDebugState, "name"_a, IExecutionContextDoc::get_debug_state) .def("set_all_tensors_debug_state", &IExecutionContext::setAllTensorsDebugState, "flag"_a, IExecutionContextDoc::set_all_tensors_debug_state) - ; + ; py::enum_(m, "ExecutionContextAllocationStrategy", py::arithmetic{}, ExecutionContextAllocationStrategyDoc::descr, py::module_local()) @@ -1293,7 +1293,7 @@ void bindCore(py::module& m) "weight_streaming_scratch_memory_size", &ICudaEngine::getWeightStreamingScratchMemorySize) // End weight streaming APIs .def("is_debug_tensor", &ICudaEngine::isDebugTensor, "name"_a, ICudaEngineDoc::is_debug_tensor) - .def("__del__", &utils::doNothingDel); + .def("__del__", &utils::doNothingDel); py::enum_(m, 
"AllocatorFlag", py::arithmetic{}, AllocatorFlagDoc::descr, py::module_local()) .value("RESIZABLE", AllocatorFlag::kRESIZABLE, AllocatorFlagDoc::RESIZABLE); @@ -1380,7 +1380,9 @@ void bindCore(py::module& m) QuantizationFlagDoc::CALIBRATE_BEFORE_FUSION); py::enum_(m, "PreviewFeature", PreviewFeatureDoc::descr, py::module_local()) - .value("PROFILE_SHARING_0806", PreviewFeature::kPROFILE_SHARING_0806, PreviewFeatureDoc::PROFILE_SHARING_0806); + .value("PROFILE_SHARING_0806", PreviewFeature::kPROFILE_SHARING_0806, PreviewFeatureDoc::PROFILE_SHARING_0806) + .value("ALIASED_PLUGIN_IO_10_03", PreviewFeature::kALIASED_PLUGIN_IO_10_03, + PreviewFeatureDoc::ALIASED_PLUGIN_IO_10_03); py::enum_( m, "HardwareCompatibilityLevel", HardwareCompatibilityLevelDoc::descr, py::module_local()) @@ -1493,7 +1495,7 @@ void bindCore(py::module& m) .def_property("max_aux_streams", &IBuilderConfig::getMaxAuxStreams, &IBuilderConfig::setMaxAuxStreams) .def_property("progress_monitor", &IBuilderConfig::getProgressMonitor, py::cpp_function(&IBuilderConfig::setProgressMonitor, py::keep_alive<1, 2>{})) - .def("__del__", &utils::doNothingDel); + .def("__del__", &utils::doNothingDel); py::enum_(m, "NetworkDefinitionCreationFlag", py::arithmetic{}, NetworkDefinitionCreationFlagDoc::descr, py::module_local()) @@ -1539,7 +1541,7 @@ void bindCore(py::module& m) .def("deserialize_cuda_engine", py::overload_cast(&IRuntime::deserializeCudaEngine), "stream_reader"_a, RuntimeDoc::deserialize_cuda_engine_reader, py::call_guard{}, py::keep_alive<0, 1>{}) - .def_property("DLA_core", &IRuntime::getDLACore, &IRuntime::setDLACore) + .def_property("DLA_core", &IRuntime::getDLACore, &IRuntime::setDLACore) .def_property_readonly("num_DLA_cores", &IRuntime::getNbDLACores) .def_property("gpu_allocator", nullptr, py::cpp_function(&IRuntime::setGpuAllocator, py::keep_alive<1, 2>{})) .def_property("error_recorder", &IRuntime::getErrorRecorder, diff --git a/python/src/infer/pyGraph.cpp b/python/src/infer/pyGraph.cpp 
index f6ebbfe0e..e2d8564a9 100644 --- a/python/src/infer/pyGraph.cpp +++ b/python/src/infer/pyGraph.cpp @@ -23,6 +23,7 @@ #if ENABLE_INETWORK_SERIALIZE #include "NvInferSerialize.h" #endif + #include "infer/pyGraphDoc.h" // clang-format off @@ -243,7 +244,6 @@ namespace tensorrt else return py::cast(self.getBeta()); }; - } /* lambdas */ void bindGraph(py::module& m) @@ -297,6 +297,7 @@ namespace tensorrt .value("REVERSE_SEQUENCE", LayerType::kREVERSE_SEQUENCE, LayerTypeDoc::REVERSE_SEQUENCE) .value("NORMALIZATION", LayerType::kNORMALIZATION, LayerTypeDoc::NORMALIZATION) .value("PLUGIN_V3", LayerType::kPLUGIN_V3, LayerTypeDoc::PLUGIN_V3) + ; // LayerType py::enum_(m, "TensorFormat", TensorFormatDoc::descr, py::arithmetic{}, py::module_local()) @@ -450,7 +451,6 @@ namespace tensorrt .def_property("axis", &IDequantizeLayer::getAxis, &IDequantizeLayer::setAxis) .def_property("to_type", &IDequantizeLayer::getToType, &IDequantizeLayer::setToType) ; - py::class_>(m, "ISoftMaxLayer", ISoftMaxLayerDoc::descr, py::module_local()) .def_property("axes", &ISoftMaxLayer::getAxes, &ISoftMaxLayer::setAxes) ; @@ -605,6 +605,7 @@ namespace tensorrt .def_property("stride", &ISliceLayer::getStride, &ISliceLayer::setStride) .def_property("mode", &ISliceLayer::getMode, &ISliceLayer::setMode) .def("set_input", &ISliceLayer::setInput, "index"_a, "tensor"_a, ISliceLayerDoc::set_input) + .def_property("axes", &ISliceLayer::getAxes, &ISliceLayer::setAxes) ; py::enum_(m, "InterpolationMode", InterpolationModeDoc::descr, py::module_local()) diff --git a/python/src/infer/pyPlugin.cpp b/python/src/infer/pyPlugin.cpp index 671a14c39..e7c8caa1b 100644 --- a/python/src/infer/pyPlugin.cpp +++ b/python/src/infer/pyPlugin.cpp @@ -38,7 +38,7 @@ } #define PLUGIN_API_CATCH_CAST(func, returnType) \ - catch (const py::cast_error& e) \ + catch (py::cast_error const& e) \ { \ std::cerr << "[ERROR] Return value of " << (func) << "() could not be interpreted as " << (returnType) \ << std::endl; \ @@ -786,7 
+786,18 @@ class PyIPluginV3Impl : public IPluginV3 } if (type == PluginCapabilityType::kBUILD) { - return pyResult.cast(); + try + { + return pyResult.cast(); + } + catch (py::cast_error const& e) + { + try + { + return pyResult.cast(); + } + PLUGIN_API_CATCH_CAST("get_capability_interface", " a valid build capability interface") + } } if (type == PluginCapabilityType::kRUNTIME) { @@ -937,13 +948,19 @@ class PyIPluginResourceImpl : public IPluginResource } }; -class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild +template +class PyIPluginV3OneBuildBaseImpl : public T { -public: - using IPluginV3OneBuild::IPluginV3OneBuild; - PyIPluginV3OneBuildImpl() = default; - PyIPluginV3OneBuildImpl(const IPluginV3OneBuild& a){}; +private: + T* mBuild{nullptr}; +protected: + PyIPluginV3OneBuildBaseImpl(T* build) + : mBuild{build} + { + } + +public: APILanguage getAPILanguage() const noexcept final { return APILanguage::kPYTHON; @@ -972,8 +989,7 @@ class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild try { - py::function pyGetValidTactics - = py::get_override(static_cast(this), "get_valid_tactics"); + py::function pyGetValidTactics = py::get_override(static_cast(mBuild), "get_valid_tactics"); mIsTacticsInitialized = true; @@ -1047,7 +1063,7 @@ class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild py::gil_scoped_acquire gil{}; py::function pySupportsFormatCombination - = utils::getOverride(static_cast(this), "supports_format_combination"); + = utils::getOverride(static_cast(mBuild), "supports_format_combination"); if (!pySupportsFormatCombination) { utils::throwPyError(PyExc_RuntimeError, "no implementation provided for supports_format_combination()"); @@ -1081,7 +1097,7 @@ class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild py::gil_scoped_acquire gil{}; py::function pyGetOutputDataTypes - = utils::getOverride(static_cast(this), "get_output_data_types"); + = utils::getOverride(static_cast(mBuild), "get_output_data_types"); if (!pyGetOutputDataTypes) { 
utils::throwPyError(PyExc_RuntimeError, "no implementation provided for get_output_data_types()"); @@ -1124,8 +1140,7 @@ class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild { py::gil_scoped_acquire gil{}; - py::function pyGetOutputShapes - = utils::getOverride(static_cast(this), "get_output_shapes"); + py::function pyGetOutputShapes = utils::getOverride(static_cast(mBuild), "get_output_shapes"); if (!pyGetOutputShapes) { utils::throwPyError(PyExc_RuntimeError, "no implementation provided for get_output_shapes()"); @@ -1174,8 +1189,7 @@ class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild { py::gil_scoped_acquire gil{}; - py::function pyConfigurePlugin - = utils::getOverride(static_cast(this), "configure_plugin"); + py::function pyConfigurePlugin = utils::getOverride(static_cast(mBuild), "configure_plugin"); if (!pyConfigurePlugin) { @@ -1215,8 +1229,7 @@ class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild { py::gil_scoped_acquire gil{}; - py::function pyGetWorkspaceSize - = py::get_override(static_cast(this), "get_workspace_size"); + py::function pyGetWorkspaceSize = py::get_override(static_cast(mBuild), "get_workspace_size"); if (!pyGetWorkspaceSize) { @@ -1333,6 +1346,57 @@ class PyIPluginV3OneBuildImpl : public IPluginV3OneBuild bool mIsTacticsInitialized{false}; }; +class PyIPluginV3OneBuildImpl : public PyIPluginV3OneBuildBaseImpl +{ +public: + PyIPluginV3OneBuildImpl() + : PyIPluginV3OneBuildBaseImpl(this) + { + } + PyIPluginV3OneBuildImpl(IPluginV3OneBuild const& a) + : PyIPluginV3OneBuildBaseImpl(this){}; +}; + +class PyIPluginV3OneBuildV2Impl : public PyIPluginV3OneBuildBaseImpl +{ +public: + PyIPluginV3OneBuildV2Impl() + : PyIPluginV3OneBuildBaseImpl(this) + { + } + PyIPluginV3OneBuildV2Impl(IPluginV3OneBuildV2 const& a) + : PyIPluginV3OneBuildBaseImpl(this){}; + + int32_t getAliasedInput(int32_t outputIndex) noexcept override + { + try + { + py::gil_scoped_acquire gil{}; + + py::function pyGetAliasedInput + = 
py::get_override(static_cast(this), "get_aliased_input"); + + if (!pyGetAliasedInput) + { + // if no implementation is provided for get_aliased_input(), default to no aliasing + return -1; + } + + py::object pyResult = pyGetAliasedInput(outputIndex); + + try + { + auto result = pyResult.cast(); + return result; + } + PLUGIN_API_CATCH_CAST("get_aliased_input", "int32_t") + return -1; // a failed cast must report "no aliasing" rather than an alias to input 0 + } + PLUGIN_API_CATCH("get_aliased_input") + return -1; + } +}; + class PyIPluginV3OneRuntimeImpl : public IPluginV3OneRuntime { public: @@ -2265,6 +2329,11 @@ IPluginResource* clonePluginResource(IPluginResource& self) return nullptr; } +int32_t getAliasedInput(int32_t outputIndex) +{ + return -1; +} + } // namespace pluginDoc void bindPlugin(py::module& m) @@ -2503,7 +2572,7 @@ void bindPlugin(py::module& m) .def_property("timing_cache_id", &IPluginV3OneBuild::getTimingCacheID, py::cpp_function(lambdas::IPluginV3_get_timing_cache_id, py::keep_alive<1, 2>{})) // The following defs are only for documenting the API for Python-based plugins - .def("get_output_datatypes", &pluginDoc::getOutputDataTypes, "input_types"_a, + .def("get_output_data_types", &pluginDoc::getOutputDataTypes, "input_types"_a, IPluginV3Doc::get_output_data_types) .def("get_output_shapes", &pluginDoc::getOutputShapes, "inputs"_a, "shape_inputs"_a, "expr_builder"_a, IPluginV3Doc::get_output_shapes) @@ -2513,6 +2582,14 @@ void bindPlugin(py::module& m) "num_inputs"_a, IPluginV3Doc::supports_format_combination) .def("get_valid_tactics", &pluginDoc::getValidTactics, IPluginV3Doc::get_valid_tactics); + py::class_>( + m, "IPluginV3OneBuildV2", IPluginV3Doc::ipluginv3onebuildv2_descr, py::module_local()) + .def(py::init<>()) + .def(py::init()) + // The following defs are only for documenting the API for Python-based plugins + .def("get_aliased_input", &pluginDoc::getAliasedInput, IPluginV3Doc::get_aliased_input); + py::class_>( m, "IPluginV3OneRuntime", IPluginV3Doc::ipluginv3oneruntime_descr, py::module_local()) diff --git
a/quickstart/Makefile.config b/quickstart/Makefile.config index 0d290ea59..330658997 100644 --- a/quickstart/Makefile.config +++ b/quickstart/Makefile.config @@ -106,7 +106,6 @@ CUDART_LIB = -lcudart CUDNN_LIB = -lcudnn CUBLAS_LIB = -lcublas NVINFER_LIB = -lnvinfer -NVPARSERS_LIB = -lnvparsers NVINFER_PLUGIN_LIB = -lnvinfer_plugin NVONNXPARSERS_LIB = -lnvonnxparser NVRTC_LIB = -lnvrtc @@ -168,8 +167,8 @@ ifeq ($(USE_CUDART_STATIC), 1) COMMON_LIBS_FOR_EXECUTABLE += $(CUDART_LIB) endif -LIBS = $(NVINFER_LIB) $(NVPARSERS_LIB) $(NVINFER_PLUGIN_LIB) $(NVONNXPARSERS_LIB) $(COMMON_LIBS_FOR_EXECUTABLE) $(PROTO_LIB) $(EXTRA_LIBS) -DLIBS = $(NVINFER_LIB) $(NVPARSERS_LIB) $(NVINFER_PLUGIN_LIB) $(NVONNXPARSERS_LIB) $(COMMON_LIBS_FOR_EXECUTABLE) $(PROTO_LIB) $(EXTRA_LIBS) +LIBS = $(NVINFER_LIB) $(NVINFER_PLUGIN_LIB) $(NVONNXPARSERS_LIB) $(COMMON_LIBS_FOR_EXECUTABLE) $(PROTO_LIB) $(EXTRA_LIBS) +DLIBS = $(NVINFER_LIB) $(NVINFER_PLUGIN_LIB) $(NVONNXPARSERS_LIB) $(COMMON_LIBS_FOR_EXECUTABLE) $(PROTO_LIB) $(EXTRA_LIBS) OBJS = $(patsubst %.cpp, $(OBJDIR)/%.o, $(wildcard *.cpp $(addsuffix /*.cpp, $(EXTRA_DIRECTORIES)))) DOBJS = $(patsubst %.cpp, $(DOBJDIR)/%.o, $(wildcard *.cpp $(addsuffix /*.cpp, $(EXTRA_DIRECTORIES)))) diff --git a/quickstart/SemanticSegmentation/tutorial-runtime.cpp b/quickstart/SemanticSegmentation/tutorial-runtime.cpp index c1f09197c..7d85663a9 100644 --- a/quickstart/SemanticSegmentation/tutorial-runtime.cpp +++ b/quickstart/SemanticSegmentation/tutorial-runtime.cpp @@ -54,7 +54,8 @@ class SampleSegmentation nvinfer1::Dims mInputDims; //!< The dimensions of the input to the network. nvinfer1::Dims mOutputDims; //!< The dimensions of the output to the network. 
- util::UniquePtr mEngine; //!< The TensorRT engine used to run the network + std::unique_ptr mRuntime; //!< The TensorRT runtime used to run the network + std::unique_ptr mEngine; //!< The TensorRT engine used to run the network }; SampleSegmentation::SampleSegmentation(const std::string& engineFilename) @@ -75,8 +76,8 @@ SampleSegmentation::SampleSegmentation(const std::string& engineFilename) std::vector engineData(fsize); engineFile.read(engineData.data(), fsize); - util::UniquePtr runtime{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())}; - mEngine.reset(runtime->deserializeCudaEngine(engineData.data(), fsize, nullptr)); + mRuntime.reset(nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())); + mEngine.reset(mRuntime->deserializeCudaEngine(engineData.data(), fsize)); assert(mEngine.get() != nullptr); } @@ -87,30 +88,22 @@ SampleSegmentation::SampleSegmentation(const std::string& engineFilename) //! bool SampleSegmentation::infer(const std::string& input_filename, int32_t width, int32_t height, const std::string& output_filename) { - auto context = util::UniquePtr(mEngine->createExecutionContext()); + auto context = std::unique_ptr(mEngine->createExecutionContext()); if (!context) { return false; } - auto input_idx = mEngine->getBindingIndex("input"); - if (input_idx == -1) - { - return false; - } - assert(mEngine->getBindingDataType(input_idx) == nvinfer1::DataType::kFLOAT); - auto input_dims = nvinfer1::Dims4{1, 3 /* channels */, height, width}; - context->setBindingDimensions(input_idx, input_dims); + char const* input_name = "input"; + assert(mEngine->getTensorDataType(input_name) == nvinfer1::DataType::kFLOAT); + auto input_dims = nvinfer1::Dims4{1, /* channels */ 3, height, width}; + context->setInputShape(input_name, input_dims); auto input_size = util::getMemorySize(input_dims, sizeof(float)); - auto output_idx = mEngine->getBindingIndex("output"); - if (output_idx == -1) - { - return false; - } - 
assert(mEngine->getBindingDataType(output_idx) == nvinfer1::DataType::kINT32); - auto output_dims = context->getBindingDimensions(output_idx); - auto output_size = util::getMemorySize(output_dims, sizeof(int32_t)); + char const* output_name = "output"; + assert(mEngine->getTensorDataType(output_name) == nvinfer1::DataType::kINT64); + auto output_dims = context->getTensorShape(output_name); + auto output_size = util::getMemorySize(output_dims, sizeof(int64_t)); // Allocate CUDA memory for input and output bindings void* input_mem{nullptr}; @@ -145,10 +138,11 @@ bool SampleSegmentation::infer(const std::string& input_filename, int32_t width, gLogError << "ERROR: CUDA memory copy of input failed, size = " << input_size << " bytes" << std::endl; return false; } + context->setTensorAddress(input_name, input_mem); + context->setTensorAddress(output_name, output_mem); // Run TensorRT inference - void* bindings[] = {input_mem, output_mem}; - bool status = context->enqueueV2(bindings, stream, nullptr); + bool status = context->enqueueV3(stream); if (!status) { gLogError << "ERROR: TensorRT inference failed" << std::endl; @@ -156,7 +150,7 @@ bool SampleSegmentation::infer(const std::string& input_filename, int32_t width, } // Copy predictions from output binding memory - auto output_buffer = std::unique_ptr{new int[output_size]}; + auto output_buffer = std::unique_ptr{new int64_t[output_size]}; if (cudaMemcpyAsync(output_buffer.get(), output_mem, output_size, cudaMemcpyDeviceToHost, stream) != cudaSuccess) { gLogError << "ERROR: CUDA memory copy of output failed, size = " << output_size << " bytes" << std::endl; @@ -168,7 +162,12 @@ bool SampleSegmentation::infer(const std::string& input_filename, int32_t width, const int num_classes{21}; const std::vector palette{(0x1 << 25) - 1, (0x1 << 15) - 1, (0x1 << 21) - 1}; auto output_image{util::ArgmaxImageWriter(output_filename, output_dims, palette, num_classes)}; - output_image.process(output_buffer.get()); + int64_t* output_ptr 
= output_buffer.get(); + std::vector output_buffer_casted(output_size); + for (size_t i = 0; i < output_size / sizeof(int64_t); ++i) { // output_size is a byte count; only this many elements were copied from the device + output_buffer_casted[i] = static_cast(output_ptr[i]); + } + output_image.process(output_buffer_casted.data()); output_image.write(); // Free CUDA resources diff --git a/quickstart/SemanticSegmentation/tutorial-runtime.ipynb b/quickstart/SemanticSegmentation/tutorial-runtime.ipynb index 3258bd28f..d0058498a 100644 --- a/quickstart/SemanticSegmentation/tutorial-runtime.ipynb +++ b/quickstart/SemanticSegmentation/tutorial-runtime.ipynb @@ -7,7 +7,7 @@ "outputs": [], "source": [ "#\n", - "# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\n", + "# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -30,6 +30,11 @@ "### Check the TensorRT version" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "code", "execution_count": 2, @@ -39,7 +44,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "TensorRT version: 7.2.2.1\r\n" + "TensorRT version: 10.2.0.post1\n" ] } ], @@ -51,13 +56,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Import required modules" + "### Prepare the input image and ONNX model file" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/lib/python3/dist-packages/requests/__init__.py:87: RequestsDependencyWarning: urllib3 (2.0.7) or chardet (4.0.0) doesn't match a supported version!\n", + " warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n", + "Exporting ppm image input.ppm\n", + "Using cache found in /home/simengl/.cache/torch/hub/pytorch_vision_v0.6.0\n", + "Exporting ONNX model fcn-resnet101.onnx\n" + ] + } + ], + "source": [ + "!python3 export.py" + ] + }, + { +
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build TensorRT engine from the ONNX model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "!trtexec --onnx=fcn-resnet101.onnx --saveEngine=fcn-resnet101.engine --optShapes=input:1x3x1026x1282" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Import required modules" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -77,6 +126,11 @@ "output_file = \"output.ppm\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -86,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -127,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -174,32 +228,37 @@ " image_height = img.height\n", "\n", " with engine.create_execution_context() as context:\n", - " # Set input shape based on image dimensions for inference\n", - " context.set_binding_shape(engine.get_binding_index(\"input\"), (1, 3, image_height, image_width))\n", " # Allocate host and device buffers\n", - " bindings = []\n", - " for binding in engine:\n", - " binding_idx = engine.get_binding_index(binding)\n", - " size = trt.volume(context.get_binding_shape(binding_idx))\n", - " dtype = trt.nptype(engine.get_binding_dtype(binding))\n", - " if engine.binding_is_input(binding):\n", + " tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]\n", + " for tensor in tensor_names:\n", + " size = trt.volume(context.get_tensor_shape(tensor))\n", + " dtype = trt.nptype(engine.get_tensor_dtype(tensor))\n", + " \n", + 
" if engine.get_tensor_mode(tensor) == trt.TensorIOMode.INPUT:\n", + " context.set_input_shape(tensor, (1, 3, image_height, image_width))\n", " input_buffer = np.ascontiguousarray(input_image)\n", " input_memory = cuda.mem_alloc(input_image.nbytes)\n", - " bindings.append(int(input_memory))\n", + " context.set_tensor_address(tensor, int(input_memory))\n", " else:\n", " output_buffer = cuda.pagelocked_empty(size, dtype)\n", " output_memory = cuda.mem_alloc(output_buffer.nbytes)\n", - " bindings.append(int(output_memory))\n", + " context.set_tensor_address(tensor, int(output_memory))\n", "\n", " stream = cuda.Stream()\n", + " \n", " # Transfer input data to the GPU.\n", " cuda.memcpy_htod_async(input_memory, input_buffer, stream)\n", + " \n", " # Run inference\n", - " context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)\n", + " context.execute_async_v3(stream_handle=stream.handle)\n", + " \n", " # Transfer prediction output from the GPU.\n", " cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)\n", + " \n", " # Synchronize the stream\n", " stream.synchronize()\n", + " output_d64 = np.array(output_buffer, dtype=np.int64)\n", + " np.savetxt('test.out', output_d64.astype(int), fmt='%i', delimiter=' ', newline=' ')\n", "\n", " with postprocess(np.reshape(output_buffer, (image_height, image_width))) as img:\n", " print(\"Writing output image to file {}\".format(output_file))\n", @@ -215,29 +274,27 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -254,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -262,7 +319,14 @@ "output_type": "stream", "text": [ "Running TensorRT inference for FCN-ResNet101\n", - "Reading engine from file fcn-resnet101.engine\n", + "Reading engine from file fcn-resnet101.engine\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[07/31/2024-12:07:11] [TRT] [W] Using an engine plan file across different models of devices is not recommended and is likely to affect performance or even cause errors.\n", "Reading input image from file input.ppm\n", "Writing output image to file output.ppm\n" ] @@ -283,29 +347,27 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAT4AAAD8CAYAAADub8g7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAySklEQVR4nO29eZQk9XXn+7kRkVstvS/0SjfQ0M3e0IBkEAKhBZAG5DcajXTkMZL1hpl5tse23hsLjeeM58077w+98YxHnkU2z5ox9pEsPEhyI40WJBYhWaKhGyGggaaLbqC76X2vJTMjI+78EVFVmVVZlWvlUnk/59SpjF9sNyMjvnF/v9/93Z+oKoZhGL2E024DDMMwWo0Jn2EYPYcJn2EYPYcJn2EYPYcJn2EYPYcJn2EYPUfLhU9E7hSRPSIyJCIPtPr8hmEY0so4PhFxgdeBDwAHgeeAT6rqKy0zwjCMnqfVHt+NwJCq7lPVPPB14N4W22AYRo/jtfh8a4ADRcsHgZuKNxCR+4H748XrW2SXYRjzjxOqurzcilYLX0VU9UHgQQARsfF0hmHUy1szrWh1VfcQsK5oeW1cZhiG0TJaLXzPAZtEZKOIJIFPAI+22AbDMHqcllZ1VbUgIr8F/ABwgf+mqrtbaYNhGEZLw1lqxdr4DMNogF2quq3cChu5YRhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz2HCZxhGz1G38InIOhF5UkReEZHdIvI7cfkSEfmhiOyN/y+Oy0VE/kREhkTkRRG5rllfwjAMoxYa8fgKwP+pqpcD7wJ+U0QuBx4AHlfVTcDj8TLAXcCm+O9+4MsNnNswDKNu6hY+VT2sqs/Hn88DrwJrgHuBh+LNHgI+Gn++F/hLjXgGWCQiq+o9v2EYRr00pY1PRDYAW4EdwEpVPRyvOgKsjD+vAQ4U7XYwLpt6rPtFZKeI7GyGbYZhGFNpWPhEZAD4BvC7qnqueJ2qKqC1HE9VH1TVbaq6rVHbDMMwytGQ8IlIgkj0vqqq34yLj45XYeP/x+LyQ8C6ot3XxmWGYRgtpZFeXQG+Aryqqv+haNWjwH3x5/uA7UXlvx737r4LOFtUJTYMw2gZEtVG69hR5BbgJ8BLQBgX/0uidr6/AdYDbwEfV9VTsVD+Z+BOYBT4jKrO2o4nIvUZZxiGAbtmajKrW/hagQmfYRgNMKPw2cgNwzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+wzB6DhM+owMRSPYhfYvBTbbbGGMe4rXbAMOYwPFw11xN8sp7cNdcg6QXEJ56C3/oxxRef5zwzMF2W2jMEywDs9EZpAZI3/xPSG79OHhpopkKIlQVPXeYsSf+PYU9P6LGifuM3mXGDMzm8RntJzVI30f+X7yL34M47rTVIoIsXE3fXX/IqIYUXn+8DUYa8wlr4zPai7ikf+Uf411ya1nRK9k0vYD0rb+FpBe0yDhjvmLCZ7QVd/02kls/jkh1t6Kz+ELc9TfOsVXGfMeEz2gryavugUSm+h0cl8SWD0GVQmkY5bC7x2gfbhJ3+aUlHRmVEBG81VdbdddoCBM+o21IZhEysLz2/foWIwPL5sAio1cw4TPahjO4HEn1176jm8BZuKb5Bhk9gwmf0TachWvBTdS+ozhIsg7BNIwYEz6jbUhmEVB9+55hNAsTPqNtBMf3Qliofcf
QJzx7qPkGGT2DCZ/RNsJT+9GRk7XvGARo9nzzDTJ6BhM+o23o6BmCI7vr2O8k4fDxObDI6BVM+Iw2ogQn36xtD1UK77wI+ZG5McnoCUz4jPZSyNW8S3hyP2g4B8YYvYIJn9E+Ehm8i99T0y4iQvKa/w0ZvGCOjDJ6gYaFT0RcEfmFiHwnXt4oIjtEZEhEHhaRZFyeipeH4vUbGj230cU4Lqltn8JduaXmXWXwApLX/CoWCmPUSzM8vt8BXi1a/iLwx6p6CXAa+Gxc/lngdFz+x/F2Ro+SuPQOUr9yP+LWnhJSREhe/0nctVvnwDKjF2hI+ERkLfBh4M/jZQHeBzwSb/IQ8NH4873xMvH6O6SW0enGvMFZcSnp2z8HXqruY0h6IZk7/zWycHUTLTN6hUY9vv8I/D4w3tK8FDijquNRqQeB8UGVa4ADAPH6s/H2JYjI/SKyU0R2Nmib0YE4yy+l7+7/B1mwqqasLFMREZylG+m7+99CarCJFhq9QN3CJyIfAY6p6q4m2oOqPqiq22bKlW90L87KzfR99N/hrNzckOiNIyK467eRvvmfQIXszYZRTCNzbtwM3CMidwNpYAHwJWCRiHixV7cWGB9bdAhYBxwUEQ9YCNQRtm90GzKwnOSV95C8/hPIwIqmiN7EscUhufXjBIdfxn/1+007rjG/qdvjU9UvqOpaVd0AfAJ4QlU/BTwJfCze7D5ge/z50XiZeP0T2slTvBmN47i4F95E/z/8M1Lv/ec4gyubKnrjSCJN+vbfw1m5uenHNuYncxHH93ngcyIyRNSG95W4/CvA0rj8c8ADc3Buo1NIDZK+4/P0//0/wVl28ZwIXjEyeAF993wRZ8Vlc3oeY35g8+oazSeRIXPXvyGx5UNVTyLUDFSV8NSbjH7z9whP7mvZeY2OZcZ5dW3khtFcxCF106dJbP5AS0UP4p7eJRvIfOALSP+0gAHDmMCEz2gq7vobSN14H+K0Z656EcG98EbSt/42uMm22GB0PiZ8RvMQl9T1n6xtusi5MEOExBUfwbvkvW21w+hcTPiMpiELVuKuvXbOOzKqwk2Qef/ncZZvarclRgdiwmc0jcSl74vn0Wg/IoIMLCd9yz+DNlW7jc7FhM9oCrJgNakbfr3lHRqzISJ4F92Mu+aadptidBidc5caXU3i0vchgyvabcZ0vDTpm/8pJNLttsToIEz4jIaRhWtI3fBrHeXtjSMiuOuuw9vw7nabYnQQnXenGl2GkLz2Y8iCVe02ZEbETZDYdHu7zTA6CBM+oyFkcCXJK/9eZ/TkzoK75lokvaDdZhgdggmf0RDehTcgA8vabUZFnMXr8C69o91mGB2CCZ9RP06CxOYP0g1zX4jjkrruH7Y9uNroDEz4jLpx116Ld+GNHV/NHcdZdjHuikvbbYbRAZjwGXUiJK/+VfC6KEzETdowNgMw4TPqJdWPu/qqrvH2IB7De/F7INnfblOMNmPCZ9SFM7gSpws6NabiLLuYxKbb2m2G0WZM+Iy6cC+4vCs7CsTxSG79uKWs6nFM+Iy68NZe15EjNarBXXEZ7vJL2m2G0Ua688412ouXwl11RbutqJ9EhsSWD7XbCqONmPAZNSPpBV2d2l1EorG7lrigZzHhM2rGWbgGSS9stxkN4Sxej7P4wnabYbQJEz6jZpzBleAm2m1GYyQyJC57P3RROI7RPEz4jJpx5kHHgIiQ2vZreJfc1m5TjDZgwmfUiOAsWNVVgcszIal+0u/7v3AWr2+3KUaLMeEzaiOey2K+4CxaS/r2z3V/1d2oCRM+Y96gYUA4fBwNC6hqVftEPbzvwrXZ2HoKEz6jNkSQDvWO9NxhRr76GUa3/z569p2qxY9EBu+im+fWOKOjMOEzaiPRh7NwdbutKIuqEo6eprDnR4xu/xcEbz+HaljVvuH5o3NsndFJmPAZNSGOC47bbjPK4ixYiRdPJRkcfpmRb/wuhaGnK4qfZs8RHHlt5g08kAHphnyrRpW
Y8Bk1IQPLOzd42UmQuvG+yQQE+WHGvvuv8Xd/d8Zqr6qS3/U1wuN7Zzxs5uYMC+5bQOaODHRmLd+oERM+oyacxes7NrOJiOCuuRp3xWUTZTp2huzT/wkdOYmqooFfIoKaPYv/8neAqCx5bZLMHRnc1W7k4SUgcUkCZ8AhdXUKd0lnertGbXjtNsDoLtxVV3T2aAcvjXfp7QSHX5oo0vNHCQ69ABqSe/avcNdeS/LqX8VZsoHCmzsIzx4CwFnkkLklg/QJqatSjD42iv+2j6Tj7+tF2wRHgzZ8MaOZmPAZVSPpBSQuvaNi8LLmR8BLR+2BLUZEcJdsiMR53LPTEP/1JwiOvEJ4ch/BO7/Ef2k76dvvJDi8C+I2QM0raHQMUpC5LYO330P6Jr9v+oY04amQ4GwA+SbbnhacpQ7uQhdnmYOOKuFI3D6pEJwICE+G486p0QANCZ+ILAL+HLiS6Of4DWAP8DCwAXgT+LiqnpboafkScDcwCnxaVZ9v5PxGa3FXXYWzaO2s26gquWf+gsTmD7RtYh9n6cYoSWp+dKLMf+0HEBQmlnXsDP6rjyApARcIQBKlHRgyICSvSk4IvYjgrnYZ/NQg4WiIv8cnuyOLZmtUIg9S16bQrKK+4i51cZY4eKs8nAVOZE8ZNKeMfHOEwoFC+Q2Mqmm0je9LwPdVdTNwDfAq8ADwuKpuAh6PlwHuAjbFf/cDX27w3Ea1OJDYlMBd40K9zXMiJLZ8EJwK78pClsK+nxIcH6rzRI3jDKzA6Z+SFj/wmeoqFd4sgA+DnxwktTVF6oZUiXcnItO8WxFBUoK72CV1U4q+D/XNKFQzkbgwQea2DH1399F/bz/pW9KkrojaD8WTifNO+0sJmVszkVgbDVG3xyciC4FbgU8DqGoeyIvIvcBt8WYPAU8BnwfuBf5So5blZ0RkkYisUtXDdVtvlJKMvZYCqB895NIvpG9Ik9qaAiA4FpDdkaVwqICOKVQX5oYMXoB30S0Vq7nh+eMEp96EINfIN2mMRAbJLIbTb8++nUJwKsBZ4pD5QJRGv5YxyCKCuyoSKw2q9/rCkRDNK06mNr9j3OP0Nnj4e/ya9jVKaaSquxE4Dvx3EbkG2AX8DrCySMyOACvjz2uAA0X7H4zLTPiagUDfB/pIbEiggRKeDUGIvIi+Sc/FXeXSf28/OqoEJwKyz2Yp7K9cdUpcdAsy1YsqQ3j2IPhZ2howIA4yWN14Yh1TNFe7CE1QRztfcCQg91yO9HvStSd7EEhclDDha5BG7k4PuA74sqpuBUaYrNYCEHt3NTWAiMj9IrJTRHY2YFvv4YC73MUZiBrHE+sTJNYlcPqdkodLRBBHcAYcvAs9+j/Sj7exwvvP8fAuvb1yp4YqweHdIIKzpH1JPsVxcAYvqGpbd1l0zerFWeqQuj5Ve3BznR0UIoIWrHejURoRvoPAQVXdES8/QiSER0VkFUD8/1i8/hCwrmj/tXFZCar6oKpuU9VtDdjWeyhQoxMgIkhGSN+YnvXBdRavx1tzbWUTzh/Df/nb4CaQ1EBtxjQZb/02qDQZkgPJK5M1t9EVI46QviluSqhW/AScZU7t3h7Ry8V6dRunbuFT1SPAAREZjxa9A3gFeBS4Ly67D9gef34U+HWJeBdw1tr3mkgI/tu1V39EBGeBgyRnfgi9dddXnIRbw4Dczr8iPPUmTv9SnMEVNdvSTNy1W3EWr5t1G2+DR/LyZF0CVIwkhcxtGRKXVTeswxl0SGyofwhIeKrKhlljRhqN4/tt4KsikgT2AZ8hEtO/EZHPAm8BH4+3/S5RKMsQUTjLZxo8t1GMUHeVTVIS3Qnl+iNSgySuumfW/VWVwtCPyf/if0THW7i6olDONZJZhLvmGsJTb5XfwIH0tnTzIlk96Lu9j+GTwwTHZw9wdle5Jb3HtTLbS8qojoZ+dlV9AShXJb2jzLYK/GYj5zNmxl3ukri0yQNJ3SSpG+/DXX3VjF6RqhIceoGxH30R/DG
AjphvV0RwV27Bf+3buMsdCCE8HaK5qJ7oLHYiAWrSKBQRgUHo+3Afw48Mo8Mz10e9C+p/7KLvZcPmGsVGbswHkpC5vf74LklINBRrZNJTkcGVpG/9bRKX3zWjkGmQx9/9XbI//hN05ERd555L3GWr6f9IP4mLPVAIz4RRGE9BcZe5TY+HExHcFS6Z92YY/d5o+VAhiTpEGhFcd6WLpKX2wGljAhO+eUDqihTeeq/+hykB/Xf2U3ingNPvEJwZwN34JZwlm8seUzVEz75DbsdfkH/xW3FwcOfhrV2ASBpxIkF3l7m4y+bWWxIRkpclyT2fIzhcvsrbaFXVWeiQvDxJ7uUchFFThTPg4Aw6OAsdJCOIFw1/E2fyXOFoSHgypHC4QHA06GnhNOHrdjxIXpEsucFrRURKRMFDUBJlkxFo9hzZn/4p/mvfQ8dOQTj94ZG+pZ2RyMBdSTRUZazF5yV6gTA3yQzEETK3Z6IwmjAa4zvRTsvsQdiqUdB6eDYk98scuRdyTR9z3A2Y8HU57nIXd0WzvZgswtdQ/U1gyaSIqU945ls46W/Qf7eLpAcIz4cUDhYoHCgQHAsgjBKCdkI7HywClhJFXrUQoaEQmapO4UldKbJEorHJ7pKoSp7YkGD0+6OE53qrp9iEr5vxiOLHmvwrCqA8jrAP5R7QC4DTCM/irfw7vBWpSa9iVTQOGB/yu/OMPjVaOX6uZfQBl9Jy4StEHlVZBMTtAG+YyHP0Nnj0//1+RraP9FSYjAlfhyIpgWQU8zXRJiTgLo3GhiLgrffw1jXQtjfb+QHYT5SHIpLCifIyA/dJQvKaJP5+H2fZxU23pz4E5SbgSaTJUb/hSIiOKs4SB5zJ6qUGSva5bOT9lrMoGcVNdgoigrvcpe/9fQz/7XDPVHtN+DoFiUTOW+vhbfDw1kR54MST0jBzqW0gfRPMouqhAgLJLX0w0N7g5QlEQK8GBoDzTTusFpTRH41S2F/AXe2S3JScaG7w90WpqmZM/qDUlNCgFYgI3oUe6evTZJ/J9sTIEBO+NiMpwdvokdySnBS7TugYqAMRQQaS0OCoDQ0D9PxRZGAF4jZ6iy4D1gO7GzxOhKriv+nj7/UhgML+QpTkYfzlVKG2qHklPBfiLu6sWDxxhNQNKfJ784Qn5n+Vt3N87l7DiaqGg58ajGLNNk1PKNCNOANLcPqX1L2/qlJ44ycM/+WvkXv2ITRoNOlmAtjWVCemcLDAtA7bkKpTfHUqkhbS188+bnu+YMLXJlJbU/S9vw93uYu43evlTcNdAm6q7t317CGyT/57dOQEuZ9+mfwvHq5+YvByiKC8G6jfptLDScd5a81CREhensRbO/8rgiZ8bSBxSYL0zemo/W6eobll4NQ3dC5KdPC1yfG1QZ7czx4kOPxyY+LHRkoTA9VPeD4kv2ce9wAkIH1Let5Po2nC12KcBQ6Z92UmZ+6aR6iCDFxS575KcGBXNBKkuHz0NGP/81+h5440YFkKuLrh6q4GythTY1Ulbp2RRNSJ1amICN4aD2/1/Pb6OvcXmI8IpG5M4Szq/ra88giSvLS+75YfJvvUlyA/Mm1VeHI/+V8+Ur/XJ0JYuB7Cxm53HVP8t+LheS51jfUVR6LpAToZJ8ryPJ8x4Wsh3nqP1NWpeSp6gGTAra9KGRx5leDoqzMcF/K7t6Pn3qnfNmcL6NL69yeex2S8UyOkrkzI0icdn1ZKRCKPbx6rwzz+ah2GQzS2cl7XIPqJYuZqQ1UpvPUchNOrkM4Ch74P99F/1xgE3wCtb/yrOIvBvbah6q5kikRLmd6zWwVOn9MV94CzKEp2MF8x4WsRzgInitObr94eAKuBwdp3y4/gD/14erkHfR/qI3l5ksSFHu7iHwD7JycKrwlB+RUaidUQkYafGHeV2xVPnfQJ3vouUOg66YKfYH7grffm9Rs0Yh31jM73X3uM8PjeaeXuMnfKkLxhhG9S19ACEWAzsKD2fcdJRJ5Q3Qh4a7v
j5SdOlF5rvsb0mfC1AGeBQ/rddUwl2GUoF9W+T1jAf/3xslXY5JXJkmphdPV+Dhyt08JlwKo6940NaET3UjIHmXTmDvcCd15GH4AJX0tIbErgLJzfl1pxgBU15+ELT75J4eAL08qdpQ7JLeUmAjoF7KizupsAttSx3ySNJDJ1Bp2oja9LcAYc3KXdI9S10D2/QreShOTVjc/k1fn0A7XF8GkY4O/ejjjDpSscIg+5TNOAAMJPgfpi6ZSr0DrrbyLSUEopyUhXdGxMEM/VPB8x4ZtjEhsS8/atWcoKosSf1aPZ/Xhrf8DgPxosmUDHXRllPJn5ZfEy8GbtJooQ5efrq33fJuAscbqqzUxE8C7yusrmajHhm0ucOFHoPL/KUaVzC5CuYSfFyTxJ4qIcziKH5FXJiVXeOq/CkKkxhHpDW5YQtfW1Hkl035hsd/n8bOeb549ke3EWOVEDcZfd7PWgXFNj+95RhO8jcX7B4jmBvZWz93xGa34G1BPQnAGuaEvKuXrnPW4nzkB0D883uu+X6CISGxNNn8KwM1kAXFH95qrAY8DklJQ6GkuRQ5UZis8CT9feySEOyg3UU39TVcLR+nNPdaXn5ET38XzDhG+ucKIsLL3g7cGFwPIatj+H8PiE9KhqlOOO6lOzR50cPyISwFq5glrbI4Fobt4GJuUJTs7NrGtzyXjSgrmePKnVmPDNEe5yF29VN3Xh1Ycq8YiIKr0CVeBF4O3JMh+CE7Eo1BQrtx/4SR2hLcuAq1pf3e3SlO7uUrexwO0OZH59mw4icUkimtJ1vpN3CEcuq6F9L0T4EVKUrjg4HRCcCsZXR8kAqkBQhG9T+7y5DsqHqPn2L4AO169enTK7Ws0kwbtgfr3ETfjmAgFvdXcMTWoEVcU/OACyvoa93gaeLzlG4e3CRFje+JwU1TME/Lg2r08EuAK0trlBNFQ0V7/wdcKoDVUlOB3gv+2jZSaDL4eIlIQbzQdM+OYCr8Exnd1CCMGJdUh6cXXba4jwt8C5ybICpRmNNZrTotrce0KA8AgwPY/frKawkODUNQ1mdq6M9EWTSaW2paIwnTaiqvh7fIa/PszIN0covFX9dXYWd1cMYiXml//aIYjTeBaPbkBHFOm/CZxqvYHTwN+VPD/ByYDgaGmjv7/XJ70tXcM0GfuIenjvqqHKLYT52+DoY7grJz3Mhr10iQKVvVVeNO/xei/KuNziaUGLUVUIwd/jM/rY6ITXOvr9UQY+NlDV6IzxeD4d69KGyimY8M0B0i9dNSazHqJqbgZ3zXure6BVgWcpCWFRpfBOYdros+B4gL/PJ7G5ul7xqL3wkXhSoeq8TxHBXbGN3M9uIuH/BM0qOqIkLksg6RkCjZXyHpJEnp272CV5eZLk5UlItk/oitFQ8d/wyb2QK2lSgKiHOvtclr47+6KX9Sw4fQ7OAodgrPt6psthwjcXCB1fLSh+gOt6QAMITq3Hu2x1lTvkER5Dirs2FQoHyoy5DWHsp2NRSqqBam17A4KvoPwm4mWq2kO8JO7quxn+xmOQD0AhMZSg/+/1l/U2dVineTzeBo/UdSm8VXHaMaczBA8mq7Yj3x+BGeZH8l/3Ca4P8FZWkAIPnIXONO+8W5nfbkmbcPqcmq6sqqKBogWNPjfY7qQaNcLP1lamI8rY42Po+frOFZ4NkfQN4FYb3Po60RjbSRsLBwr4+/zyxz8V4r8xfZ0GUeP81O8lQHhsO9mnv4QG5Y9ZDulbCr5MhJr4+3zye/Nlr5uGWhKS4i536b+rn8QlCZwBp/OmCQ0h90JuRtED0JyS+0WuqnsusWH+BDKb8M0B0lddG58GSuFogexPsgw/PMz5r55n7IkxdKRB8Qtg9PFRhr8xjP+GX/ZYuV/kyO3KkX0mW3XvXjHhGQf3gutqeNBfpOQJDCD78+ysD6X/1nTbgyMBw38zTGF/YZrd4ZkC+V2PUNj/8yptAkn1g1fk3inkX86XTSuv2Unhk7SQuSODDHa
Y2BWhYzoZHzkL/ht+xRfgeMICGezM71orDQmfiPyeiOwWkZdF5K9FJC0iG0Vkh4gMicjDIpKMt03Fy0Px+g1N+QYdSDgcQoWIDC0oo4+Ncv5r58n+PEvhQIHgSEBuZ47hR4YJT4d1iZ+qktuVI787j2Yjry48O92Y8aFXud05/CF/wtus9hwarMFdsblKowKEl0o7NU4FFA7PnloqPBFOb/87GRCeCRl5dHqvZHAygCCPv+cxVKsLiZHMIiRZmq2lcLCA/+Z00ZWMTDRhJK9IRlm1O1T0JqjiJ9VhLfuSmYqzwCG5aX4Ep9YtfCKyBvjnwDZVvZJoUMsngC8Cf6yqlxB143023uWzwOm4/I/j7eYlmtVZhU9V8V/3I8+ijMcTHA3I/ixbc6S/qhIcCcg+m504f3hmepVRtWi2sDyMfHsk8qLerjK8QYHUjZCsdmKh88AbJecv7C/M6u1BNHl3OFJ6Icc9GM3FHSPFZuUj24Oje6DQwKTfAWT/LgtTasxOv4PTHzVjJDbNr+GIuZ25qA1zlt9fREhek+zOMcdTaLSq6wEZEfGIkpwdBt4HPBKvfwj4aPz53niZeP0dMp/unCJ0TCPxm2X92M/GZhVHf8gnPF19IK+qEhwNGPn2yOSA/5jw1BTvMZwybrQQdTKMPDpC8E7lqlE47CJ9t1T14Efn3UuUOXnyfPnXKguT5nT6NSj6GpKZrGaq6oTw6bkj6OgpqqN8T1RwPJjW8SJ9gndhFJ5SKQREAZWa311NRZISNbtUQXAsYPSJ0YovI3e5S/pXakg/1qHULXyqegj4I6JQ/MNEo8V3AWdUdfyOOQisiT+vAQ7E+xbi7adNdCoi94vIThHZWa9t7UZ9nXHO1fGetvDk7KKmOa1YFSw+ZuFgJFzlxDI4FUx/AsuYp6NK/vXyDfsT2xSU3C8cJFnd3BU6cgL/tf+f4jpreDacHKI2686RxzqxqEXX1QNvhVe6bfzdNXee4NjrVXmvkuzDWXDB9BUBZHdmS35HEYl6cDdWMXFUKkH43q2QamOHQNwTWy3+az7Z57IVvb7EpYmunzirkaruYiIvbiPRvIL9wJ2NGqSqD6rqNlXd1uix2oUkBPFmuDFCyO+trhpWzdAtVSU4FjDyrfKiVyvhqXBGN0U1ignzh/qQvsrxcqpK/sXtBIdeKBlSVjhauZpbfIyJB1GZeGEkLkpEUzVObFg0wbeG8XSVVfhbjgte+XarwoFCFPtWhLvSJXNrZlZvV4Fw22bC924lvOVq1JHoe4SRVxoOhzOmt1JfCc4G0cuzwd59cSQKnq6B3M64zXeWczv9TpRNuotpJI7v/cB+VT0OICLfBG4GFomIF3t1a4FD8faHiOYfPBhXjRcCJxs4f8ciGZmxHSQ8GxIcri4WqqqJbeL2qFoi6jUfBeuWY3yUQdn9shq1fYVQVaBiWKDw5jM4idjjHN+lhua3mcTcXeaWDPrXES3xDoODz0N+FFKV2iEFmamtMoiqvN7GyU6M8bl1/bd9CCKPKhwOIYiuj+YVZ0mC4EwGffwtKAziOKsInniD8EQQheLklfSNaVLbUiUCqqrkXsyR/WkWZ4nDwL0DyILS6zzePuoMOjhLnYqBx7Xmg9ScMvbUGN4Fs/TgulGVNzjUvTF9jQjf28C7RKSPKD3GHcBO4EngY8DXgfuA7fH2j8bLP4/XP6FzPVCyTTiDTtn8ZapK/pX8rO1/EySpOFeHquLv92eMhZugQKnzM8sIBHflzBmjg+MBwckAZ4lX1dAwzQ0Tnn4L9YKoo2A8aqSGZzE8HZaKZkyxx6GqFA4VSsQ/PHeY8MxB3JUVep5FcFdupjD0VNnV+dfyJC5K4CxzJq6L/4bPyHdHohdAgsnrO356Ab7+g0mjVac5n8GR6aKhOSX3Qg7NKsE7Af4bPslrkyXtmONtsQDpW9OktqZm/L1KOrFqIDwVjejI3JYpK6wigrfSI1/LG6zDaKSNbwdRJ8XzwEvxsR4EPg9
8TkSGiNrwvhLv8hVgaVz+OeCBBuzuaCQl5R/u/JQB+bPg9DuVU5X7cSxchZs7PBeWpHqSTDxZ9BSkT0hcVL5NSlUJjgcQgrN4XWns20znPfsOOnqacCQsqbY7i6of8B4cD0peFJKWyONYVPRSyEN2x5RecD+Lv+/vKlYXozar2yFRfgKi4EjA+YfP4++ZrP7JgESip9G5Jz6Po3FZqNFfGRMKR6ZX94OjQdTUML5NmTbe/It5NBcFqOdfyM862ZzmtGwQeDXkXsjNmsSg2k6TTqWhirqq/qGqblbVK1X1H6lqTlX3qeqNqnqJqv4DVc3F22bj5Uvi9fua8xW6h+BMUFIdmw1noVMxt2dwLCA4VvmVrgUtCc0QkWhynynH91Z5SP/MN/R4KEkUv1f5xg9PvAFBFLKTf3Wy00SSM7wYyh1jJJwUg3GPNDnZdqWq+G/7Za9DcPAXUEU8n7PsEhJbZm6e1pGo+jce5OsucxueJzk8FxKcLrU5OBmU9vSXC6LOF3drVzhJgfrTaPmQfSY7o7C6i92uHvDa3S2UXUZwNKh6OthKXpFq3OtbTedoVgnOlG7oDDrT2iG9tbNMJRhG3hdeCu/iWyuGskQxha9MLOdeyk14fZKqYX7ZuJ0NJvPCOYucyV7F8ZEWZZ7v4MQQmjtf8RTiJkjf+ls4s1SLw7MhuRejoV2SEZKbGwzkLZSGFE141JUoHjJ3QQXxmeqJ1shMgdxAdN4uVo8uNr27GG+fqZZq5uKtZjgSEKUkmjJ0bercFonNCZJXzjyXreYVHVacRWtxl11UxTkDgmOvT+4/HDXKqyrOQqe6jpvxQ2Un3SBvTZQUAC+6puHpcGK+jmk2Dx8nPPVWVeeQ/qWkb/tdSMyc4CC3K0dhX3Su1NYUzrIGvb6p3n8lkSpQMgpntvZYVaVwpNBQ4lTCqJe33MvV6XO6OgNR91reZWhWZ3xApyGxR1bJq6oyRTtEWThKHgIBJx39/NInZN6biUYlwESW3mKh1KwSjoW4yy6BROUAVh07Q3i6VHTye2PPzKVyNpBiCpOdMZIWkpfHAu3D6BOj0wK2Jwh8gnderOoUIoK3/ka8jTfPuI1mlbGfjqFZRQaF9PWNBfLqaGnIytTQk3BkytDH4pAdh4pB1P5ev+EI6sLhQvlJkrogA9FsmPDNAe7i0htSNeqlqzqlujR/DtbwbFhalZKoaiuDkZCMe3+qSnAoYPSHoyVv+vBMCD64666jqva9MwfQ0dMlZcHRAB3VaB7dGrylYs9WRJA4Li7/ej4a+jabHcPHqz6PuB7Jq+4BZ+bG1eBIwNjTY1CIhaeBjOzBmcnA8onZzIourY6W7xiBKOfjbLWC8FyIv7++jo0SfCY89fmECd8cUC6RZf7VfMXEBZMHoPlv0zB6aCc8p3gUwoLPLCDz3syEmPhDPsOPDkdVqiJ7NavgpnBXX11d+97hlyGcMpY2qxNVtXFvsyrKPXNBnHKpyc+ju3Yr0j9tQFEJ+V/m8d/wZ+69rxIdKxU2Z3Fpu2s4FpaE6KivE16+k3aiTqJyx1XFf9WfMVazVgoHC+0dezcHmPC1gnwcvlAtxTFhTWTqoH5JCk7GmRhlEp4MGf3+KHpeo4c6vjvGh8Q5SzbgLt1Y+USqFA48P708iHqiNdTaRGPKtVBV/H1+2Vi4RpFkpvwQtin25F/JR0HMDQyWCc9MCfPpd6K5LcZPMxJ9z/GXVXA8mKjWy8DMqc/CMyHZ57P1GzaF4FRQ2ps8D+jiDukOZkoNJBwNCc9X/4RIqvahRtUQnozTPJWpyY17e+MPVnEQdngyJD+kpN/za7M2/k8cK3eO4OhrZdeNPjWKv8+PRjtU+SwFJ0I0uxgyCowSns0z9uRYXcG5FSnk0ZHKA4r8vT7+UGNtaDqm5F/Mk741PTEipGSoY9xjndySRD0lv3uy1uBkyvf6a6BRjsU6E8yWIxwO0fOKpnQymDqvXS2GJnxzwNQB3OPtY9UfgGniqaq
EJ0I0p5PhHDUGzo+neXIXudHxzoY4/Q6SkCggtmgMsfpRNUxRci/lcJduI7Hlzuqque+8hJ4/Un6DfJR5pja7If/O7XiXbAb/BNkd2wnPnKlu5/xoFIJSZSIg9bOoX+U8vU147nMv5HAWO1Fa9yPBtI6EwjsF/Dd8nEUO+den/z4lbYKqFN4qRM0qzSTuREpvS0+0a+Zfzc/cqdQFmPDNAcHxoORhK5sdpRJF2493jgx/axgdU5wBB3eVi3+gNgEZHxKV3JSkcKhA9rks7gqX5JYk+d35kjHEwfGoM0Y8wd8Xknr3PeBWEbumIf5Lj0LYRHcsCMj+cDuZzCLyzz1JYc/eqnf1h54m9e5/jCxYWdX2Onq6qti/ZqFZZfR7o5GAlbtHQhh7eizq7CrSs8KBAvndedwV7kRHWP7VfJSLsQl9GlMpvFlg+K3hqIkiEYUndXO7nwnfHOAP+aRvSCP9UYdBbRNkMzEPQnGa89zzuYnG6vBcWPMxowNDbkeO3HO5iSpTYbgwEZtWsumwMvzIcJwI4EISF7+nKq8pPHOQwpvP1G5bBfTsSUYf/q+Qr63tSs8fxR96iuTWj1dlv7PgApwFqwlP7a/X1PqYRUTCM+G0mL8JwXTiYXwStQnOqRhp3MnVvObDtmHCNweEJ0Nyz+dI3xzFeVUzrKyEgCgD81xRpWZOpIC6ZmsVWU7idsLd/xPNnm3EupmpUfQiFP/Fv43CVKponyTVj7t2a+uFrx6UKCtMk3pvewnr1Z0jss9lGf3uKNkd2TnpfWwl3qorqtuwkKOw76dza0wdBKffrqrDAkDEIbn1HyDpBXNsldFOTPjmCh/yu/Nkn85Wl4aqUxEXZ8mGqqu5wYkOzD2RG6bw1rNVB+G6F2wh/YEHaphTxOg2TPiMylTZI1o4sAv80Tk2ph6U3HN/VZPXl7j8bvo+/G9xVm6hq8dmGWUx4TMqEKIjpyp6SxoGFN56tkU21U54Yh+Foaeq9vpEHLxL76D/Ew+SuOqeKEW9MW8w4TNmRxV/7xMQzDwPg6pG1dwDu1psXC0o+Ze2Q+48qtXNWSwiOJmFZD74L/Euek8LbDRahfXqGhXxX/shmhsmccWHES+Ns3ANzqI1E72kOnyM7BN/VMOUju0hOPQiIw//U6R/Ke7KLbhrt+Iu2YD0LWKiOhsWCM8cIji5H8ksxBlYTnD0te7o5TWqRjo564KIdK5xPUksDol0NG535WachavxX/ke4ckO7NSohOMiqUGkb8lkO2bgEw4fAz8H4kQzsPlZujpat3fZNdNsjSZ8hmHMV2YUPmvjMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj5zDhMwyj56gofCLy30TkmIi8XFS2RER+KCJ74/+L43IRkT8RkSEReVFEriva5754+70ict/cfB3DMIzKVOPx/QVw55SyB4DHVXUT8Hi8DHAXsCn+ux/4MkRCCfwhcBNwI/CH42JpGIbRaioKn6o+DUzNKX4v8FD8+SHgo0Xlf6kRzwCLRGQV8CHgh6p6SlVPAz9kupgahmG0hHrn3Fipqofjz0eAlfHnNcCBou0OxmUzlU9DRO4n8hYNwzDmhIYnG1JVbWaKeFV9EHgQLPW8YRhzQ729ukfjKizx/2Nx+SFgXdF2a+OymcoNwzBaTr3C9ygw3jN7H7C9qPzX497ddwFn4yrxD4APisjiuFPjg3GZYRhG61HVWf+AvwYOAz5R29xngaVEvbl7gR8BS+JtBfgvwBvAS8C2ouP8BjAU/32m0nnjfdT+7M/+7K/Ov50zaYtNL2kYxnxlxuklG+7cmGOGgT3tNqJKlgEn2m1EFXSLndA9tpqdzacZtl4404pOF749Myl2pyEiO7vB1m6xE7rHVrOz+cy1rTZW1zCMnsOEzzCMnqPThe/BdhtQA91ia7fYCd1jq9nZfOb
U1o7u1TUMw5gLOt3jMwzDaDomfIZh9BwdK3wicqeI7ImTmj5QeY85tWWdiDwpIq+IyG4R+Z24vOaErC2y1xWRX4jId+LljSKyI7bnYRFJxuWpeHkoXr+hxXYuEpFHROQ1EXlVRN7diddURH4v/t1fFpG/FpF0p1zTbkkUPIOd/y7+7V8UkW+JyKKidV+I7dwjIh8qKm+OLlQzdKzVf4BLNOztIiAJ/BK4vI32rAKuiz8PAq8DlwP/H/BAXP4A8MX4893A94iG8L0L2NFiez8HfA34Trz8N8An4s9/Cvyz+PP/Afxp/PkTwMMttvMh4H+PPyeBRZ12TYnSp+0HMkXX8tOdck2BW4HrgJeLymq6hsASYF/8f3H8eXEL7Pwg4MWfv1hk5+XxM58CNsZa4DZTF1p2k9d4kd4N/KBo+QvAF9ptV5E924EPEI0qWRWXrSIKuAb4M+CTRdtPbNcC29YSjaN+H/Cd+CY/UXSDTVxbokQR744/e/F20iI7F8aCIlPKO+qaMplLckl8jb5DlFi3Y64psGGKoNR0DYFPAn9WVF6y3VzZOWXdrwJfjT+XPO/j17SZutCpVd2qE5e2mrjqshXYQe0JWVvBfwR+Hwjj5aXAGVUtlLFlws54/dl4+1awETgO/Pe4Wv7nItJPh11TVT0E/BHwNlGyjrPALjrzmo4zZ4mC55DfIPJGmcWeptnZqcLXkYjIAPAN4HdV9VzxOo1eQW2NDRKRjwDHVHVXO+2oEo+o6vNlVd0KjDA5dwvQMdd0MdGUChuB1UA/XTRtQidcw0qIyB8ABeCrrTpnpwpfxyUuFZEEkeh9VVW/GRfXmpB1rrkZuEdE3gS+TlTd/RLR3Cfj47KLbZmwM16/EDjZAjshelsfVNUd8fIjRELYadf0/cB+VT2uqj7wTaLr3InXdJyuSRQsIp8GPgJ8KhZpZrGnaXZ2qvA9B2yKe86SRI3Ej7bLGBER4CvAq6r6H4pW1ZqQdU5R1S+o6lpV3UB0zZ5Q1U8BTwIfm8HOcfs/Fm/fEu9AVY8AB0TksrjoDuAVOuyaElVx3yUiffF9MG5nx13TIroiUbCI3EnULHOPqo5Osf8TcQ/5RqJZG5+lmbowl42uDTaE3k3Ue/oG8AdttuUWourCi8AL8d/d1JGQtYU238Zkr+5F8Y0zBPwPIBWXp+PloXj9RS228VpgZ3xd/5aoR7HjrinwfwOvAS8Df0XU29gR15Q2Jgpugp1DRG1248/UnxZt/wexnXuAu4rKm6ILNmTNMIyeo1OruoZhGHOGCZ9hGD2HCZ9hGD2HCZ9hGD2HCZ9hGD2HCZ9hGD2HCZ9hGD3H/wL8+ifg/3OaEgAAAABJRU5ErkJggg==\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -316,7 +378,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -330,7 +392,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/quickstart/common/logging.h b/quickstart/common/logging.h index d891e1686..9b722c5e9 100644 --- a/quickstart/common/logging.h +++ b/quickstart/common/logging.h @@ -242,7 +242,7 @@ class Logger : public nvinfer1::ILogger //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! - void log(Severity severity, const char* msg) override + void log(Severity severity, const char* msg) noexcept override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } diff --git a/quickstart/common/util.h b/quickstart/common/util.h index 554579694..c798a4524 100644 --- a/quickstart/common/util.h +++ b/quickstart/common/util.h @@ -26,21 +26,6 @@ namespace util { -struct InferDeleter -{ - template - void operator()(T* obj) const - { - if (obj) - { - obj->destroy(); - } - } -}; - -template -using UniquePtr = std::unique_ptr; - size_t getMemorySize(const nvinfer1::Dims& dims, const int32_t elem_size); struct PPM diff --git a/requirements.txt b/requirements.txt index bb70e1071..6f4c68798 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ torchvision==0.11.3; python_version<"3.10" torchvision==0.12.0; python_version=="3.10" Pillow numpy -pycuda +pycuda==2024.1 pytest --extra-index-url https://pypi.ngc.nvidia.com onnx-graphsurgeon diff --git a/samples/common/sampleEngines.cpp b/samples/common/sampleEngines.cpp index 2fc29eaaa..f56e335fd 100644 --- a/samples/common/sampleEngines.cpp +++ 
b/samples/common/sampleEngines.cpp @@ -675,8 +675,7 @@ void setPreviewFeatures(IBuilderConfig& config, BuildOptions const& build) config.setPreviewFeature(feat, build.previewFeatures.at(featVal)); } }; - // unused - static_cast(setFlag); + setFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); } } // namespace @@ -1218,7 +1217,7 @@ bool networkToSerializedEngine( { if (!checkSafeEngine(serializedEngine->data(), serializedEngine->size())) { - sample::gLogError << "Consistency validation is not successful." << std::endl; + sample::gLogError << "Consistency validation is not supported." << std::endl; return false; } } @@ -1365,7 +1364,7 @@ bool loadEngineToBuildEnv(std::string const& filepath, bool enableConsistency, B { if (!checkSafeEngine(engineBlob.data(), fsize)) { - sample::gLogError << "Consistency validation is not successful." << std::endl; + sample::gLogError << "Consistency validation is not enabled." << std::endl; return false; } } @@ -1801,25 +1800,6 @@ nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker( bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize) { - if (!hasConsistencyChecker()) - { - sample::gLogError << "Cannot perform consistency check because the checker is not loaded.." << std::endl; - return false; - } - auto checker = std::unique_ptr( - createConsistencyChecker(sample::gLogger.getTRTLogger(), serializedEngine, engineSize)); - if (checker.get() == nullptr) - { - sample::gLogError << "Failed to create consistency checker." << std::endl; - return false; - } - sample::gLogInfo << "Start consistency checking." << std::endl; - if (!checker->validate()) - { - sample::gLogError << "Consistency validation failed." << std::endl; - return false; - } - sample::gLogInfo << "Consistency validation passed." 
<< std::endl; - return true; + return hasConsistencyChecker(); } } // namespace sample diff --git a/samples/common/sampleOptions.cpp b/samples/common/sampleOptions.cpp index 802c02ad4..c4235a498 100644 --- a/samples/common/sampleOptions.cpp +++ b/samples/common/sampleOptions.cpp @@ -905,6 +905,7 @@ std::string previewFeatureToString(PreviewFeature feature) gLogWarning << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." << std::endl; break; } + case PreviewFeature::kALIASED_PLUGIN_IO_10_03: return "kALIASED_PLUGIN_IO_10_03"; } return "Invalid Preview Feature"; // clang-format on @@ -925,8 +926,8 @@ std::ostream& printPreviewFlags(std::ostream& os, BuildOptions const& options) os << previewFeatureToString(feat) << (options.previewFeatures.at(featVal) ? " [ON], " : " [OFF], "); } }; - // unused - static_cast(addFlag); + + addFlag(PreviewFeature::kALIASED_PLUGIN_IO_10_03); return os; } @@ -1510,6 +1511,10 @@ void BuildOptions::parse(Arguments& arguments) << "profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect." << std::endl; } + else if (featureName == "aliasedPluginIO1003") + { + feat = PreviewFeature::kALIASED_PLUGIN_IO_10_03; + } else { throw std::invalid_argument(std::string("Unknown preview feature: ") + featureName); @@ -2546,7 +2551,8 @@ void BuildOptions::help(std::ostream& os) " --preview=features Specify preview feature to be used by adding (+) or removing (-) preview features from the default" "\n" R"( Preview Features: features ::= [","feature])" "\n" " feature ::= (+|-)flag" "\n" - R"( flag ::= "profileSharing0806")" "\n" + R"( flag ::= "aliasedPluginIO1003")" "\n" + R"( |"profileSharing0806")" "\n" " --builderOptimizationLevel Set the builder optimization level. (default is 3)" "\n" " Higher level allows TensorRT to spend more building time for more optimization options." 
"\n" " Valid values include integers from 0 to the maximum optimization level, which is currently 5." "\n" diff --git a/samples/common/sampleOptions.h b/samples/common/sampleOptions.h index 4e59da114..bd5fe880d 100644 --- a/samples/common/sampleOptions.h +++ b/samples/common/sampleOptions.h @@ -399,6 +399,7 @@ class TaskInferenceOptions : public Options static void help(std::ostream& out); }; + Arguments argsToArgumentsMap(int32_t argc, char* argv[]); bool parseHelp(Arguments& arguments); diff --git a/samples/python/detectron2/requirements.txt b/samples/python/detectron2/requirements.txt index 0dd8a25b6..d355b4912 100644 --- a/samples/python/detectron2/requirements.txt +++ b/samples/python/detectron2/requirements.txt @@ -1,5 +1,6 @@ onnx==1.16.0 -onnxruntime==1.15.1 +onnxruntime==1.15.1; python_version <= "3.10" +onnxruntime==1.18.1; python_version >= "3.11" Pillow>=10.0.0 git+https://github.com/facebookresearch/detectron2.git git+https://github.com/NVIDIA/TensorRT#subdirectory=tools/onnx-graphsurgeon diff --git a/samples/python/downloader.py b/samples/python/downloader.py index 70bc6bd5f..c4240b3d7 100755 --- a/samples/python/downloader.py +++ b/samples/python/downloader.py @@ -92,17 +92,27 @@ def _downloadFile(path, url): session = requests.Session() retries = Retry(total=10, backoff_factor=0.5) session.mount("http://", HTTPAdapter(max_retries=retries)) - r = session.get(url, stream=True, timeout=30) - - size = int(r.headers.get("content-length", 0)) - from tqdm import tqdm - - progress_bar = tqdm(total=size, unit="iB", unit_scale=True) - with open(path, "wb") as fd: - for chunk in r.iter_content(chunk_size=1024): - progress_bar.update(len(chunk)) - fd.write(chunk) - progress_bar.close() + session.mount("https://", HTTPAdapter(max_retries=retries)) + try: + r = session.get(url, stream=True, timeout=30) + + if r.status_code == 200: + logger.info("Connecting to %s is successful.", url) + + size = int(r.headers.get("content-length", 0)) + from tqdm import tqdm + + 
progress_bar = tqdm(total=size, unit="iB", unit_scale=True) + with open(path, "wb") as fd: + for chunk in r.iter_content(chunk_size=1024): + progress_bar.update(len(chunk)) + fd.write(chunk) + progress_bar.close() + else: + logger.info("Failed to connect to %s with status code: %s.", url, r.status_code) + + except requests.exceptions.RequestException as e: + logger.debug("Error occurred while requesting connection to %s: %s.", url, e) allGood = True for f in sample_data.files: diff --git a/samples/python/efficientdet/requirements.txt b/samples/python/efficientdet/requirements.txt index 4eef5ebd8..bb83a8daa 100644 --- a/samples/python/efficientdet/requirements.txt +++ b/samples/python/efficientdet/requirements.txt @@ -1,6 +1,8 @@ Pillow>=10.0.0 -onnx==1.14.0 -onnxruntime==1.15.1 +onnx==1.14.0; python_version <= "3.10" +onnx==1.16.1; python_version >= "3.11" +onnxruntime==1.15.1; python_version <= "3.10" +onnxruntime==1.18.1; python_version >= "3.11" tf2onnx==1.8.1 cuda-python==12.2.0; python_version <= "3.10" cuda-python==12.5.0; python_version >= "3.11" diff --git a/samples/python/efficientnet/requirements.txt b/samples/python/efficientnet/requirements.txt index 4dd8fc5f4..73bf53efe 100644 --- a/samples/python/efficientnet/requirements.txt +++ b/samples/python/efficientnet/requirements.txt @@ -1,5 +1,6 @@ Pillow>=10.0.0 -onnx==1.14.0 +onnx==1.14.0; python_version <= "3.10" +onnx==1.16.1; python_version >= "3.11" tensorrt>=7.1.0.0 tf2onnx==1.8.1 cuda-python==12.2.0; python_version <= "3.10" diff --git a/samples/python/non_zero_plugin/requirements.txt b/samples/python/non_zero_plugin/requirements.txt index ae724c401..8f84ea544 100644 --- a/samples/python/non_zero_plugin/requirements.txt +++ b/samples/python/non_zero_plugin/requirements.txt @@ -1,10 +1,12 @@ -cuda-python +cuda-python==12.2.0; python_version <= "3.10" +cuda-python==12.5.0; python_version >= "3.11" cupy-cuda12x torch --extra-index-url https://pypi.ngc.nvidia.com polygraphy colored -numpy==1.23.5; 
platform_system != "Windows" +numpy==1.23.5; (platform_system != "Windows" and python_version <= "3.10") +numpy==1.26.4; (platform_system != "Windows" and python_version >= "3.11") --extra-index-url https://pypi.ngc.nvidia.com onnx-graphsurgeon pywin32; platform_system == "Windows" diff --git a/samples/python/python_plugin/requirements.txt b/samples/python/python_plugin/requirements.txt index 7cd81a175..4bf8cddac 100644 --- a/samples/python/python_plugin/requirements.txt +++ b/samples/python/python_plugin/requirements.txt @@ -1,4 +1,5 @@ -cuda-python +cuda-python==12.2.0; python_version <= "3.10" +cuda-python==12.5.0; python_version >= "3.11" cupy-cuda12x numba triton; platform_system != "Windows" @@ -6,7 +7,8 @@ torch --extra-index-url https://pypi.ngc.nvidia.com polygraphy colored -numpy==1.23.5; platform_system != "Windows" +numpy==1.23.5; (platform_system != "Windows" and python_version <= "3.10") +numpy==1.26.4; (platform_system != "Windows" and python_version >= "3.11") --extra-index-url https://pypi.ngc.nvidia.com onnx-graphsurgeon pywin32; platform_system == "Windows" diff --git a/samples/python/tensorflow_object_detection_api/README.md b/samples/python/tensorflow_object_detection_api/README.md index b25f1ebfa..de0548e44 100644 --- a/samples/python/tensorflow_object_detection_api/README.md +++ b/samples/python/tensorflow_object_detection_api/README.md @@ -31,6 +31,8 @@ cd /workspace/models/research git checkout 66e22c4 protoc object_detection/protos/*.proto --python_out=. cp object_detection/packages/tf2/setup.py ./ +## Pin pyyaml==6.0.1 to avoid v5.4.1 with known CVEs +sed -i '22i\ '"'"'pyyaml==6.0.1'"'"',' setup.py pip --use-deprecated=legacy-resolver install . 
``` diff --git a/samples/python/tensorflow_object_detection_api/requirements.txt b/samples/python/tensorflow_object_detection_api/requirements.txt index b1efdbfb8..eb6d1ce33 100644 --- a/samples/python/tensorflow_object_detection_api/requirements.txt +++ b/samples/python/tensorflow_object_detection_api/requirements.txt @@ -1,5 +1,7 @@ -onnx==1.14.0 -onnxruntime==1.15.1 +onnx==1.14.0; python_version <= "3.10" +onnx==1.16.1; python_version >= "3.11" +onnxruntime==1.15.1; python_version <= "3.10" +onnxruntime==1.18.1; python_version >= "3.11" Pillow>=10.0.0 tf2onnx==1.15.0 pycocotools; platform_system != "Windows" @@ -8,7 +10,7 @@ cuda-python==12.2.0; python_version <= "3.10" cuda-python==12.5.0; python_version >= "3.11" pywin32; platform_system == "Windows" Cython<3.0 -pyyaml==5.3.1 +pyyaml==6.0.1 requests==2.32.2 tqdm==4.66.4 numpy==1.24.4; python_version <= "3.10" diff --git a/samples/sampleDynamicReshape/README.md b/samples/sampleDynamicReshape/README.md index 4c1ced618..4216dbc67 100644 --- a/samples/sampleDynamicReshape/README.md +++ b/samples/sampleDynamicReshape/README.md @@ -202,7 +202,7 @@ The IResizeLayer implements the resize operation on an input tensor. For example: ```bash - ./sample_dynamic_reshape --datadir $TRT_DATADIR/char-rnn --fp16 + ./sample_dynamic_reshape --datadir $TRT_DATADIR/mnist --fp16 ``` 3. Verify that the sample ran successfully. If the sample runs successfully you should see output similar to the following: diff --git a/samples/sampleINT8API/README.md b/samples/sampleINT8API/README.md index 173ea20dd..215796557 100644 --- a/samples/sampleINT8API/README.md +++ b/samples/sampleINT8API/README.md @@ -120,7 +120,7 @@ After the engine has been built, it can be used just like an FP32 engine. For ex 3. Enqueue the inference work and perform actual inference. ``` - context->enqueueV3(input_stream)) + context->enqueueV3(input_stream); ``` 4. Copy data from the device output buffers to the host output buffers. 
@@ -150,6 +150,8 @@ Set the output type of this layer. Setting the output type forces TensorRT to ch ## Preparing sample data +`ResNet50.onnx` is located in the `data/resnet50` directory. + In addition to the model file and input image, you will need per-tensor dynamic range stored in a text file along with the ImageNet label reference file. The following required files are included in the package and are located in the `data/int8_api` directory. @@ -163,21 +165,6 @@ The ResNet-50 per-tensor dynamic ranges file. `airliner.ppm` The image to be inferred. -1. Download the [ONNX ResNet-50 model](https://github.com/onnx/models/tree/master/vision/classification/resnet/model). - ```bash - wget https://download.onnxruntime.ai/onnx/models/resnet50.tar.gz -O $TRT_DATADIR/int8_api/resnet50.tar.gz - ``` - -2. Unpackage the model file. - ```bash - tar zxvf $TRT_DATADIR/int8_api/resnet50.tar.gz -C $TRT_DATADIR/int8_api/ - ``` - -3. Copy `resnet50/model.onnx` to the `data/int8_api/resnet50.onnx` directory. - ```bash - mv $TRT_DATADIR/int8_api/resnet50/model.onnx $TRT_DATADIR/int8_api/resnet50.onnx - ``` - ## Running the sample 1. Compile the sample by following build instructions in [TensorRT README](https://github.com/NVIDIA/TensorRT/). diff --git a/scripts/stubify.sh b/scripts/stubify.sh index 788d46721..7755c8091 100755 --- a/scripts/stubify.sh +++ b/scripts/stubify.sh @@ -44,6 +44,8 @@ OS=$(lsb_release -si)-$(lsb_release -sr | cut -d '.' -f 1-2) if [ "$OS" = "Ubuntu-22.04" ] ; then EXTRA_NM_FLAG="--without-symbol-versions" +elif [ "$OS" = "Ubuntu-24.04" ] ; then + EXTRA_NM_FLAG="--without-symbol-versions" fi # make stub library diff --git a/tools/Polygraphy/CHANGELOG.md b/tools/Polygraphy/CHANGELOG.md index 50cd8e72a..5196870f0 100644 --- a/tools/Polygraphy/CHANGELOG.md +++ b/tools/Polygraphy/CHANGELOG.md @@ -3,6 +3,11 @@ Dates are in YYYY-MM-DD format. +## v0.49.13 (2024-07-15) +### Added +- Added option to emit logs using python `logging` module. 
+ + ## v0.49.12 (2024-05-28) ### Added - Added `runtime_platform` to `CreateConfig` for TensorRT and corresponding `--runtime-platform` command-line option. diff --git a/tools/Polygraphy/examples/cli/inspect/05_inspecting_inference_outputs/README.md b/tools/Polygraphy/examples/cli/inspect/05_inspecting_inference_outputs/README.md index d93c71609..11aad8dba 100644 --- a/tools/Polygraphy/examples/cli/inspect/05_inspecting_inference_outputs/README.md +++ b/tools/Polygraphy/examples/cli/inspect/05_inspecting_inference_outputs/README.md @@ -28,7 +28,7 @@ The `inspect data` subtool can display information about the ---- onnxrt-runner-N0-07/15/21-10:46:07 (1 iterations) ---- - y [dtype=float32, shape=(1, 1, 2, 2)] | Stats: mean=0.35995, std-dev=0.25784, var=0.066482, median=0.35968, min=0.00011437 at (0, 0, 1, 0), max=0.72032 at (0, 0, 0, 1), avg-magnitude=0.35995 + y [dtype=float32, shape=(1, 1, 2, 2)] | Stats: mean=0.35995, std-dev=0.25784, var=0.066482, median=0.35968, min=0.00011437 at (0, 0, 1, 0), max=0.72032 at (0, 0, 0, 1), avg-magnitude=0.35995, p90=0.62933, p95=0.62933, p99=0.71123 [[[[4.17021990e-01 7.20324516e-01] [1.14374816e-04 3.02332580e-01]]]] ``` diff --git a/tools/Polygraphy/examples/cli/inspect/06_inspecting_input_data/README.md b/tools/Polygraphy/examples/cli/inspect/06_inspecting_input_data/README.md index a37ede3cc..011d2220c 100644 --- a/tools/Polygraphy/examples/cli/inspect/06_inspecting_input_data/README.md +++ b/tools/Polygraphy/examples/cli/inspect/06_inspecting_input_data/README.md @@ -25,7 +25,7 @@ by a data loader. 
``` [I] ==== Data (1 iterations) ==== - x [dtype=float32, shape=(1, 1, 2, 2)] | Stats: mean=0.35995, std-dev=0.25784, var=0.066482, median=0.35968, min=0.00011437 at (0, 0, 1, 0), max=0.72032 at (0, 0, 0, 1), avg-magnitude=0.35995 + x [dtype=float32, shape=(1, 1, 2, 2)] | Stats: mean=0.35995, std-dev=0.25784, var=0.066482, median=0.35968, min=0.00011437 at (0, 0, 1, 0), max=0.72032 at (0, 0, 0, 1), avg-magnitude=0.35995, p90=0.62933, p95=0.62933, p99=0.71123 [[[[4.17021990e-01 7.20324516e-01] [1.14374816e-04 3.02332580e-01]]]] ``` diff --git a/tools/Polygraphy/examples/dev/02_extending_polygraphy_run/extension_module/setup.py b/tools/Polygraphy/examples/dev/02_extending_polygraphy_run/extension_module/setup.py index 3150d1a34..043c4f3b9 100644 --- a/tools/Polygraphy/examples/dev/02_extending_polygraphy_run/extension_module/setup.py +++ b/tools/Polygraphy/examples/dev/02_extending_polygraphy_run/extension_module/setup.py @@ -46,6 +46,7 @@ def main(): "polygraphy", # Our included loader needs ONNX-GraphSurgeon to modify the model. "onnx_graphsurgeon", + "numpy<2", ], packages=find_packages(exclude=("tests", "tests.*")), # The format of the entry_points is: diff --git a/tools/Polygraphy/polygraphy/README.md b/tools/Polygraphy/polygraphy/README.md index 3b3d1dc08..cb6a01362 100644 --- a/tools/Polygraphy/polygraphy/README.md +++ b/tools/Polygraphy/polygraphy/README.md @@ -164,6 +164,17 @@ For example: G_LOGGER.module_severity = G_LOGGER.EXTRA_VERBOSE ``` +By default, logs are emitted to the stdout/stderr stream. If you would like to have them stored in a file, set the `log_file` property to your log file: +```py +G_LOGGER.log_file = "your_log_file.log" +``` + +If you would like logs to be emitted through the Python `logging` module instead, set the following flag: +```py +G_LOGGER.use_python_logging_system = True +``` + +After that, you can define your logging configuration using the `logging` module.
## Putting It All Together diff --git a/tools/Polygraphy/polygraphy/__init__.py b/tools/Polygraphy/polygraphy/__init__.py index de070e0df..5d3949527 100644 --- a/tools/Polygraphy/polygraphy/__init__.py +++ b/tools/Polygraphy/polygraphy/__init__.py @@ -1,3 +1,3 @@ import polygraphy.config -__version__ = "0.49.12" +__version__ = "0.49.13" diff --git a/tools/Polygraphy/polygraphy/comparator/util.py b/tools/Polygraphy/polygraphy/comparator/util.py index d8791a9b7..5bb5762ab 100644 --- a/tools/Polygraphy/polygraphy/comparator/util.py +++ b/tools/Polygraphy/polygraphy/comparator/util.py @@ -146,7 +146,13 @@ def str_output_stats(output, runner_name=None): ret += f"{runner_name} | Stats: " try: - ret += f"mean={compute_mean(output):.5g}, std-dev={compute_std(output):.5g}, var={compute_variance(output):.5g}, median={compute_median(output):.5g}, min={compute_min(output):.5g} at {compute_argmin(output)}, max={compute_max(output):.5g} at {compute_argmax(output)}, avg-magnitude={compute_average_magnitude(output):.5g}\n" + ret += f"mean={compute_mean(output):.5g}, std-dev={compute_std(output):.5g}, var={compute_variance(output):.5g}, median={compute_median(output):.5g}, min={compute_min(output):.5g} at {compute_argmin(output)}, max={compute_max(output):.5g} at {compute_argmax(output)}, avg-magnitude={compute_average_magnitude(output):.5g}" + + # np.quantile doesn't work with boolean input, so we don't show quantile error if the output type is boolean + if output.dtype == bool: + ret += "\n" + else: + ret += f", p90={compute_quantile(output, 0.9):.5g}, p95={compute_quantile(output, 0.95):.5g}, p99={compute_quantile(output, 0.99):.5g}\n" except Exception as err: G_LOGGER.verbose(f"Could not generate statistics.\nNote: Error was: {err}") ret += "" diff --git a/tools/Polygraphy/polygraphy/datatype/numpy.py b/tools/Polygraphy/polygraphy/datatype/numpy.py index c668a4071..c86a4b7fc 100644 --- a/tools/Polygraphy/polygraphy/datatype/numpy.py +++ 
b/tools/Polygraphy/polygraphy/datatype/numpy.py @@ -39,7 +39,7 @@ def _get_mapping(): np.uint64: DataType.UINT64, np.uint8: DataType.UINT8, np.bool_: DataType.BOOL, - np.unicode_: DataType.STRING, + np.str_: DataType.STRING, } return {np.dtype(key): val for key, val in DATATYPE_FROM_NUMPY.items()} diff --git a/tools/Polygraphy/polygraphy/logger/logger.py b/tools/Polygraphy/polygraphy/logger/logger.py index 7260ee9ee..9b4a9d219 100644 --- a/tools/Polygraphy/polygraphy/logger/logger.py +++ b/tools/Polygraphy/polygraphy/logger/logger.py @@ -17,6 +17,7 @@ import copy import enum import functools +import logging import os import sys import time @@ -239,6 +240,15 @@ def __init__( This is converted to a ``SeverityTrie`` on assignment. Defaults to G_LOGGER.INFO. """ + self._use_python_logging_system = False + """ + A flag indicating whether to use the Python `logging` module for log emission. + By default, logs are emitted to `stdout` or `stderr`. When this flag is set to `True`, + the logger uses the Python `logging` module instead of `stdout`/`stderr`. + This allows logs to be integrated into a unified logging system which can be helpful + in advanced cases like using multiprocessing or when the user wants one logging system + to manage Polygraphy and other libraries logs. + """ @property def log_file(self): @@ -286,6 +296,17 @@ def severity(self, value): ) self.module_severity = value + @property + def use_python_logging_system(self): + return self._use_python_logging_system + + @use_python_logging_system.setter + def use_python_logging_system(self, value): + self._use_python_logging_system = value + if value: + self.python_logger = logging.getLogger("Polygraphy") + self.severity_level_mapping = { Logger.ULTRA_VERBOSE : 2, Logger.SUPER_VERBOSE: 4, Logger.EXTRA_VERBOSE: 6 } + def module_path(self, path): """ Converts a given path to a path relative to the Polygraphy root module. 
@@ -459,7 +480,12 @@ def should_log(message): self._log_file.write(message + "\n") self._log_file.flush() - print(message, file=sys.stdout if severity < Logger.CRITICAL else sys.stderr) + if self._use_python_logging_system: + # python logging system does not handle non-positive levels (0 is logging.NOTSET), map them to positive values + level = severity if severity > 0 else self.severity_level_mapping[severity] + self.python_logger.log(level=level, msg=message) + else: + print(message, file=sys.stdout if severity < Logger.CRITICAL else sys.stderr) def backtrace(self, depth=0, limit=None, severity=ERROR): limit = ( diff --git a/tools/Polygraphy/tests/conftest.py b/tools/Polygraphy/tests/conftest.py index 610ed04ae..38a4c5884 100644 --- a/tools/Polygraphy/tests/conftest.py +++ b/tools/Polygraphy/tests/conftest.py @@ -18,6 +18,7 @@ import copy import ctypes.util import glob +import logging import os import subprocess as sp import sys @@ -195,3 +196,20 @@ def nvinfer_lean_path(): return path assert False, "Could not find nvinfer_lean!" 
+ + +@pytest.fixture() +def tmp_python_log_file(tmp_path): + # backup original logging configuration + orig_handlers = logging.root.handlers[:] + orig_level = logging.root.level + logging.root.handlers = [] + tmp_log_file = tmp_path / "test.log" + # setup logging to file + logging.basicConfig(filename=tmp_log_file, level=0) + try: + yield tmp_log_file + finally: + # revert back original configuration (setLevel also invalidates cached level checks) + logging.root.handlers = orig_handlers + logging.root.setLevel(orig_level) diff --git a/tools/Polygraphy/tests/logger/test_logger.py b/tools/Polygraphy/tests/logger/test_logger.py index 226740e01..77e83232e 100644 --- a/tools/Polygraphy/tests/logger/test_logger.py +++ b/tools/Polygraphy/tests/logger/test_logger.py @@ -16,9 +16,12 @@ # import inspect +import logging import pytest + from polygraphy import util +from polygraphy.exception.exception import PolygraphyException from polygraphy.logger.logger import Logger, SeverityTrie @@ -72,6 +75,46 @@ def callback(module_severity): assert num_times_called == 3 assert num_times_called == 4 + @pytest.mark.serial + def test_use_python_logging_system(self, tmp_python_log_file): + logger = Logger(severity=Logger.ULTRA_VERBOSE) + logger.use_python_logging_system = True + # add custom Polygraphy levels + logging.addLevelName(2, "ULTRA_VERBOSE") + logging.addLevelName(4, "SUPER_VERBOSE") + logging.addLevelName(6, "EXTRA_VERBOSE") + logging.addLevelName(22, "START") + logging.addLevelName(28, "FINISH") + + # emit logs + logger.ultra_verbose("ultra verbose") + logger.super_verbose("super verbose") + logger.extra_verbose("extra verbose") + logger.verbose("verbose") + logger.info("info") + logger.start("start") + logger.finish("finish") + logger.warning("warning") + logger.error("error") + with pytest.raises(PolygraphyException): + logger.critical("critical") + + # verify logs written in the log file + with tmp_python_log_file.open() as fp: + log_messages = fp.read() + + assert log_messages == """\ +ULTRA_VERBOSE:Polygraphy:[U] ultra
verbose +SUPER_VERBOSE:Polygraphy:[S] super verbose +EXTRA_VERBOSE:Polygraphy:[X] extra verbose +DEBUG:Polygraphy:[V] verbose +INFO:Polygraphy:[I] info +START:Polygraphy:[I] start +FINISH:Polygraphy:[I] finish +WARNING:Polygraphy:[W] warning +ERROR:Polygraphy:[E] error +CRITICAL:Polygraphy:[!] critical +""" class TestSeverityTrie: @pytest.mark.parametrize(