From 17003e43da9858f574e3a4a1d795fcf218862fe3 Mon Sep 17 00:00:00 2001 From: Kevin Chen <45886021+kevinch-nv@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:12:22 -0800 Subject: [PATCH] TensorRT 10.7-GA OSS Release (#4269) Signed-off-by: Kevin Chen --- .gitmodules | 2 +- CHANGELOG.md | 25 +++ README.md | 18 +- VERSION | 2 +- demo/BERT/README.md | 2 +- demo/Diffusion/README.md | 35 ++- demo/Diffusion/demo_txt2img_flux.py | 75 +++++-- demo/Diffusion/diffusion_pipeline.py | 48 +++-- demo/Diffusion/flux_pipeline.py | 62 ++++-- demo/Diffusion/models.py | 55 +++-- demo/Diffusion/requirements.txt | 4 +- .../stable_video_diffusion_pipeline.py | 5 +- demo/Diffusion/utilities.py | 25 ++- demo/Diffusion/utils_modelopt.py | 1 + docker/rockylinux8.Dockerfile | 26 +-- docker/rockylinux9.Dockerfile | 26 +-- docker/ubuntu-20.04.Dockerfile | 26 +-- docker/ubuntu-22.04-aarch64.Dockerfile | 10 +- docker/ubuntu-22.04.Dockerfile | 26 +-- docker/ubuntu-cross-aarch64.Dockerfile | 10 +- include/NvInfer.h | 149 +++++++++++-- include/NvInferImpl.h | 19 ++ include/NvInferPluginBase.h | 83 ------- include/NvInferRuntime.h | 202 +++++++++++++++++- include/NvInferRuntimePlugin.h | 16 +- include/NvInferVersion.h | 4 +- parsers/onnx | 2 +- plugin/CMakeLists.txt | 10 +- .../include/fused_multihead_attention.h | 13 +- .../include/fused_multihead_attention_v2.h | 14 +- ...ad_attention_v2_fp16_64_64_kernel.sm75.cpp | 1 - plugin/bertQKVToContextPlugin/mhaRunner.cu | 6 +- .../qkvToContextInt8InterleavedPlugin.cpp | 7 +- ...kvToContextInt8InterleavedPluginLegacy.cpp | 7 +- .../qkvToContextPlugin.cpp | 28 +-- .../qkvToContextPluginLegacy.cpp | 9 +- plugin/common/bertCommon.h | 3 +- plugin/common/plugin.cpp | 18 +- plugin/common/serialize.hpp | 5 +- .../disentangledKernel.cu | 24 +-- .../embLayerNormPlugin/embLayerNormPlugin.cpp | 3 +- .../embLayerNormPluginLegacy.cpp | 3 +- .../instanceNormFwdImpl.cu | 4 +- plugin/reorgPlugin/reorgPlugin.cpp | 4 +- python/docstrings/infer/pyCoreDoc.h | 63 +++++- python/docstrings/infer/pyGraphDoc.h | 97 ++++++++- python/include/utils.h | 2 +- python/packaging/bindings_wheel/setup.cfg | 2 +- python/packaging/bindings_wheel/setup.py | 16 +- .../tensorrt/plugin/__init__.py | 2 +- .../tensorrt/plugin/_plugin_class.py | 1 - .../bindings_wheel/tensorrt/plugin/_tensor.py | 87 +++++--- python/packaging/frontend_sdist/setup.cfg | 2 +- python/packaging/frontend_sdist/setup.py | 14 +- python/packaging/libs_wheel/setup.cfg | 2 +- python/packaging/libs_wheel/setup.py | 10 +- python/packaging/metapackage/setup.py | 6 +- python/src/infer/pyCore.cpp | 82 +++++++ python/src/infer/pyGraph.cpp | 15 ++ samples/CMakeSamplesTemplate.txt | 6 + samples/common/common.h | 10 +- samples/common/half.h | 10 +- samples/common/safeCommon.h | 33 ++- samples/common/sampleDevice.cpp | 19 +- samples/common/sampleDevice.h | 50 +++-- samples/common/sampleEngines.cpp | 55 ++++- samples/common/sampleEngines.h | 19 ++ samples/common/sampleInference.cpp | 38 ++-- samples/common/sampleOptions.cpp | 40 +++- samples/common/sampleOptions.h | 3 +- samples/common/sampleUtils.cpp | 17 +- samples/common/sampleUtils.h | 4 +- samples/common/streamReader.h | 85 +++++++- .../python/onnx_custom_plugin/CMakeLists.txt | 2 +- samples/python/python_plugin/README.md | 1 + samples/sampleINT8API/README.md | 7 +- samples/sampleINT8API/sampleINT8API.cpp | 7 - samples/sampleNonZeroPlugin/CMakeLists.txt | 2 + .../sampleOnnxMnistCoordConvAC.cpp | 2 - samples/trtexec/trtexec.cpp | 8 + tools/onnx-graphsurgeon/README.md | 5 +- 81 files changed, 1411 insertions(+), 530 deletions(-) diff --git a/.gitmodules b/.gitmodules index 87cf0015..ce3faf18 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,4 +9,4 @@ [submodule "parsers/onnx"] path = parsers/onnx url = https://github.com/onnx/onnx-tensorrt.git - branch = release/10.6-GA + branch = release/10.7-GA diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bce1c8d..e931030b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,30 @@ # TensorRT OSS Release Changelog +## 10.7.0 GA - 2024-12-4 +Key Feature and Updates: + +- Demo Changes + - demoDiffusion + - Enabled low-vram for the Flux pipeline. Users can now run the pipelines on systems with 32GB VRAM. + - Added support for [FLUX.1-schnell](https://huggingface.co/black-forest-labs/FLUX.1-schnell) pipeline. + - Enabled weight streaming mode for Flux pipeline. + +- Plugin Changes + - On Blackwell and later platforms, TensorRT will drop cuDNN support on the following categories of plugins + - User-written `IPluginV2Ext`, `IPluginV2DynamicExt`, and `IPluginV2IOExt` plugins that are dependent on cuDNN handles provided by TensorRT (via the `attachToContext()` API). + - TensorRT standard plugins that use cuDNN, specifically: + - `InstanceNormalization_TRT` (version: 1, 2, and 3) present in `plugin/instanceNormalizationPlugin/`. + - `GroupNormalizationPlugin` (version: 1) present in `plugin/groupNormalizationPlugin/`. + - Note: These normalization plugins are superseded by TensorRT’s native `INormalizationLayer` ([C++](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_normalization_layer.html), [Python](https://docs.nvidia.com/deeplearning/tensorrt/operators/docs/Normalization.html)). TensorRT support for cuDNN-dependent plugins remain unchanged on pre-Blackwell platforms. + +- Parser Changes + - Now prioritizes using plugins over local functions when a corresponding plugin is available in the registry. + - Added dynamic axes support for `Squeeze` and `Unsqueeze` operations. + - Added support for parsing mixed-precision `BatchNormalization` nodes in strongly-typed mode. + +- Addressed Issues + - Fixed [4113](https://github.com/NVIDIA/TensorRT/issues/4113). + ## 10.6.0 GA - 2024-11-05 Key Feature and Updates: - Demo Changes diff --git a/README.md b/README.md index 247f86e2..1935a7b4 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ You can skip the **Build** section to enjoy TensorRT with Python. To build the TensorRT-OSS components, you will first need the following software packages. **TensorRT GA build** -* TensorRT v10.6.0.26 +* TensorRT v10.7.0.23 * Available from direct download links listed below **System Packages** @@ -73,25 +73,25 @@ To build the TensorRT-OSS components, you will first need the following software If using the TensorRT OSS build container, TensorRT libraries are preinstalled under `/usr/lib/x86_64-linux-gnu` and you may skip this step. Else download and extract the TensorRT GA build from [NVIDIA Developer Zone](https://developer.nvidia.com) with the direct links below: - - [TensorRT 10.6.0.26 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz) - - [TensorRT 10.6.0.26 for CUDA 12.6, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz) - - [TensorRT 10.6.0.26 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/zip/TensorRT-10.6.0.26.Windows.win10.cuda-11.8.zip) - - [TensorRT 10.6.0.26 for CUDA 12.6, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/zip/TensorRT-10.6.0.26.Windows.win10.cuda-12.6.zip) + - [TensorRT 10.7.0.23 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz) + - [TensorRT 10.7.0.23 for CUDA 12.6, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz) + - [TensorRT 10.7.0.23 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/zip/TensorRT-10.7.0.23.Windows.win10.cuda-11.8.zip) + - [TensorRT 10.7.0.23 for CUDA 12.6, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/zip/TensorRT-10.7.0.23.Windows.win10.cuda-12.6.zip) **Example: Ubuntu 20.04 on x86-64 with cuda-12.6** ```bash cd ~/Downloads - tar -xvzf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz - export TRT_LIBPATH=`pwd`/TensorRT-10.6.0.26 + tar -xvzf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz + export TRT_LIBPATH=`pwd`/TensorRT-10.7.0.23 ``` **Example: Windows on x86-64 with cuda-12.6** ```powershell - Expand-Archive -Path TensorRT-10.6.0.26.Windows.win10.cuda-12.6.zip - $env:TRT_LIBPATH="$pwd\TensorRT-10.6.0.26\lib" + Expand-Archive -Path TensorRT-10.7.0.23.Windows.win10.cuda-12.6.zip + $env:TRT_LIBPATH="$pwd\TensorRT-10.7.0.23\lib" ``` ## Setting Up The Build Environment diff --git a/VERSION b/VERSION index eafccb08..b3578225 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -10.6.0.26 +10.7.0.23 diff --git a/demo/BERT/README.md b/demo/BERT/README.md index 5abc76fa..18b06a9f 100755 --- a/demo/BERT/README.md +++ b/demo/BERT/README.md @@ -75,7 +75,7 @@ The following software version configuration has been tested: |Software|Version| |--------|-------| |Python|>=3.8| -|TensorRT|10.6.0.26| +|TensorRT|10.7.0.23| |CUDA|12.6| ## Setup diff --git a/demo/Diffusion/README.md b/demo/Diffusion/README.md index 974bad5b..a9aaa8f5 100755 --- a/demo/Diffusion/README.md +++ b/demo/Diffusion/README.md @@ -7,7 +7,7 @@ This demo application ("demoDiffusion") showcases the acceleration of Stable Dif ### Clone the TensorRT OSS repository ```bash -git clone git@github.com:NVIDIA/TensorRT.git -b release/10.5 --single-branch +git clone git@github.com:NVIDIA/TensorRT.git -b release/10.7 --single-branch cd TensorRT ``` @@ -16,7 +16,7 @@ cd TensorRT Install nvidia-docker using [these intructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker). ```bash -docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:24.07-py3 /bin/bash +docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:24.10-py3 /bin/bash ``` NOTE: The demo supports CUDA>=11.8 @@ -43,12 +43,12 @@ pip3 install -r requirements.txt > NOTE: demoDiffusion has been tested on systems with NVIDIA H100, A100, L40, T4, and RTX4090 GPUs, and the following software configuration. ``` -diffusers 0.30.2 +diffusers 0.31.0 onnx 1.15.0 onnx-graphsurgeon 0.5.2 onnxruntime 1.16.3 polygraphy 0.49.9 -tensorrt 10.6.0.26 +tensorrt 10.7.0.23 tokenizers 0.13.3 torch 2.2.0 transformers 4.42.2 @@ -66,6 +66,7 @@ python3 demo_img2img.py --help python3 demo_inpaint.py --help python3 demo_controlnet.py --help python3 demo_txt2img_xl.py --help +python3 demo_txt2img_flux.py --help ``` ### HuggingFace user access token @@ -257,23 +258,43 @@ python3 demo_stable_cascade.py --onnx-opset=16 "Anthropomorphic cat dressed as a ### Generate an image guided by a text prompt using Flux +Run the below command to generate an image with FLUX.1 Dev in FP16. + ```bash python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN ``` -Run the below command to generate an image with FLUX in BF16. +Run the below command to generate an image with FLUX.1 Dev in BF16. ```bash python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --bf16 ``` -Run the below command to generate an image with FLUX in FP8. (FP8 is only supppoted on Hopper.) +Run the below command to generate an image with FLUX.1 Dev in FP8. (FP8 is suppported on Hopper and Ada.) ```bash python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --fp8 ``` -NOTE: Running the Flux pipeline requires 80GB of GPU memory or higher +Run the below command to generate an image with FLUX.1 Schnell in FP16. + +```bash +python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --version="flux.1-schnell" +``` + +Run the below command to generate an image with FLUX.1 Schnell in BF16. + +```bash +python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --version="flux.1-schnell" --bf16 +``` + +Run the below command to generate an image with FLUX.1 Schnell in FP8. (FP8 is suppported on Hopper and Ada.) + +```bash +python3 demo_txt2img_flux.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN --version="flux.1-schnell" --fp8 +``` + +NOTE: Running the FLUX.1 Dev or FLUX.1 Schnell pipeline requires 48GB or 24GB of GPU memory or higher, respectively. ## Configuration options - Noise scheduler can be set using `--scheduler `. Note: not all schedulers are available for every version. diff --git a/demo/Diffusion/demo_txt2img_flux.py b/demo/Diffusion/demo_txt2img_flux.py index 501a6a05..f4e65e94 100644 --- a/demo/Diffusion/demo_txt2img_flux.py +++ b/demo/Diffusion/demo_txt2img_flux.py @@ -20,7 +20,12 @@ from cuda import cudart from flux_pipeline import FluxPipeline -from utilities import PIPELINE_TYPE, add_arguments, process_pipeline_args +from utilities import ( + PIPELINE_TYPE, + add_arguments, + process_pipeline_args, + VALID_OPTIMIZATION_LEVELS, +) def parse_args(): @@ -32,7 +37,7 @@ def parse_args(): "--version", type=str, default="flux.1-dev", - choices=["flux.1-dev"], + choices=("flux.1-dev", "flux.1-schnell"), help="Version of Flux", ) parser.add_argument( @@ -65,20 +70,48 @@ def parse_args(): parser.add_argument( "--max_sequence_length", type=int, - default=512, - help="Maximum sequence length to use with the prompt", + help="Maximum sequence length to use with the prompt. Can be up to 512 for the dev and 256 for the schnell variant.", ) parser.add_argument( - "--bf16", - action='store_true', - help="Run pipeline in BFloat16 precision" + "--bf16", action="store_true", help="Run pipeline in BFloat16 precision" ) parser.add_argument( "--low-vram", + action="store_true", + help="Optimize for low VRAM usage, possibly at the expense of inference performance. Disabled by default.", + ) + parser.add_argument( + "--optimization-level", + type=int, + default=3, + help=f"Set the builder optimization level to build the engine with. A higher level allows TensorRT to spend more building time for more optimization options. Must be one of {VALID_OPTIMIZATION_LEVELS}.", + ) + parser.add_argument( + "--torch-fallback", + default=None, + type=str, + help="Name list of models to be inferenced using torch instead of TRT. For example --torch-fallback t5,transformer. If --torch-inference set, this parameter will be ignored." + ) + + parser.add_argument( + "--ws", action='store_true', - help="Optimize for low VRAM usage, possibly at the expense of inference performance. Disabled by default." + help="Build TensorRT engines with weight streaming enabled." ) + parser.add_argument( + "--t5-ws-percentage", + type=int, + default=None, + help="Set runtime weight streaming budget as the percentage of the size of streamable weights for the T5 model. This argument only takes effect when --ws is set. 0 streams the most weights and 100 or None streams no weights. " + ) + + parser.add_argument( + "--transformer-ws-percentage", + type=int, + default=None, + help="Set runtime weight streaming budget as the percentage of the size of streamable weights for the transformer model. This argument only takes effect when --ws is set. 0 streams the most weights and 100 or None streams no weights." + ) return parser.parse_args() @@ -100,10 +133,24 @@ def process_demo_args(args): if len(prompt2) == 1: prompt2 = prompt2 * batch_size - if args.max_sequence_length is not None and args.max_sequence_length > 512: - raise ValueError( - f"`max_sequence_length` cannot be greater than 512 but is {args.max_sequence_length}" - ) + max_seq_supported_by_model = { + "flux.1-schnell": 256, + "flux.1-dev": 512, + }[args.version] + if args.max_sequence_length is not None: + if args.max_sequence_length > max_seq_supported_by_model: + raise ValueError( + f"For {args.version}, `max_sequence_length` cannot be greater than {max_seq_supported_by_model} but is {args.max_sequence_length}" + ) + else: + args.max_sequence_length = max_seq_supported_by_model + + if args.torch_fallback and not args.torch_inference: + args.torch_fallback = args.torch_fallback.split(",") + + if args.torch_fallback and args.torch_inference: + print(f"[W] All models will run in PyTorch when --torch-inference is set. Parameter --torch-fallback will be ignored.") + args.torch_fallback = None args_run_demo = ( prompt, @@ -131,6 +178,10 @@ def process_demo_args(args): max_sequence_length=args.max_sequence_length, bf16=args.bf16, low_vram=args.low_vram, + torch_fallback=args.torch_fallback, + weight_streaming=args.ws, + t5_weight_streaming_budget_percentage=args.t5_ws_percentage, + transformer_weight_streaming_budget_percentage=args.transformer_ws_percentage, **kwargs_init_pipeline) # Load TensorRT engines and pytorch modules diff --git a/demo/Diffusion/diffusion_pipeline.py b/demo/Diffusion/diffusion_pipeline.py index 37332375..9f2434bf 100644 --- a/demo/Diffusion/diffusion_pipeline.py +++ b/demo/Diffusion/diffusion_pipeline.py @@ -81,7 +81,8 @@ class DiffusionPipeline(ABC): "svd-xt-1.1", "sd3", "cascade", - "flux.1-dev" + "flux.1-dev", + "flux.1-schnell" ) SCHEDULER_DEFAULTS = { "1.4": "PNDM", @@ -95,7 +96,8 @@ class DiffusionPipeline(ABC): "xl-turbo": "EulerA", "svd-xt-1.1": "Euler", "cascade": "DDPMWuerstchen", - "flux.1-dev": "FlowMatchEuler" + "flux.1-dev": "FlowMatchEuler", + "flux.1-schnell": "FlowMatchEuler" } def __init__( @@ -117,6 +119,9 @@ def __init__( framework_model_dir='pytorch_model', return_latents=False, torch_inference='', + weight_streaming=False, + text_encoder_weight_streaming_budget_percentage=None, + denoiser_weight_streaming_budget_percentage=None, ): """ Initializes the Diffusion pipeline. @@ -157,6 +162,12 @@ def __init__( Skip decoding the image and return latents instead. torch_inference (str): Run inference with PyTorch (using specified compilation mode) instead of TensorRT. + weight_streaming (`bool`, defaults to False): + Whether to enable weight streaming during TensorRT engine build. + text_encoder_ws_budget_percentage (`int`, defaults to None): + Weight streaming budget as a percentage of the size of total streamable weights for the text encoder model. + denoiser_weight_streaming_budget_percentage (`int`, defaults to None): + Weight streaming budget as a percentage of the size of total streamable weights for the denoiser model. """ self.denoising_steps = denoising_steps self.max_batch_size = max_batch_size @@ -177,6 +188,10 @@ def __init__( self.pipeline_type = pipeline_type self.return_latents = return_latents + self.weight_streaming = weight_streaming + self.text_encoder_weight_streaming_budget_percentage = text_encoder_weight_streaming_budget_percentage + self.denoiser_weight_streaming_budget_percentage = denoiser_weight_streaming_budget_percentage + if not scheduler: scheduler = 'UniPC' if self.pipeline_type.is_controlnet() else self.SCHEDULER_DEFAULTS.get(version, 'DDIM') print(f"[I] Autoselected scheduler: {scheduler}") @@ -332,9 +347,9 @@ def _prepare_model_configs(self, onnx_dir, engine_dir, enable_refit, int8, fp8, config['use_int8'] = True config['model_suffix'] += f"-int8.l{quantization_level}.bs2.s{self.denoising_steps}.c{calibration_size}.p{quantization_percentile}.a{quantization_alpha}" elif fp8: - assert self.pipeline_type.is_sd_xl() or self.version in ["1.5", "2.1", "2.1-base", "flux.1-dev"], "fp8 quantization only supported for SDXL, SD1.5, SD2.1 and FLUX pipeline" + assert self.pipeline_type.is_sd_xl() or self.version in ["1.5", "2.1", "2.1-base", "flux.1-dev", "flux.1-schnell"], "fp8 quantization only supported for SDXL, SD1.5, SD2.1 and FLUX pipeline" if (self.pipeline_type.is_sd_xl() and model_name == 'unetxl') or \ - (self.version == "flux.1-dev" and model_name == 'transformer') or \ + ((self.version in ("flux.1-dev", "flux.1-schnell")) and model_name == 'transformer') or \ (model_name == 'unet'): config['use_fp8'] = True config['model_suffix'] += f"-fp8.l{quantization_level}.bs2.s{self.denoising_steps}.c{calibration_size}.p{quantization_percentile}.a{quantization_alpha}" @@ -360,8 +375,8 @@ def do_calibrate(pipeline, calibration_prompts, **kwargs): for i_th, prompts in enumerate(calibration_prompts): if i_th >= kwargs["calib_size"]: return - if kwargs["model_id"] == "flux.1-dev": - + if kwargs["model_id"] in ("flux.1-dev", "flux.1-schnell"): + max_seq_len = 512 if kwargs["model_id"] == "flux.1-dev" else 256 height = kwargs.get("height", 1024) width = kwargs.get("width", 1024) pipeline( @@ -371,7 +386,7 @@ def do_calibrate(pipeline, calibration_prompts, **kwargs): height=height, width=width, guidance_scale=3.5, - max_sequence_length=512 + max_sequence_length=max_seq_len ).images else: pipeline( @@ -384,7 +399,7 @@ def do_calibrate(pipeline, calibration_prompts, **kwargs): ).images def forward_loop(model): - if self.version not in ["sd3", "flux.1-dev"]: + if self.version not in ("sd3", "flux.1-dev", "flux.1-schnell"): pipeline.unet = model else: pipeline.transformer = model @@ -408,7 +423,7 @@ def forward_loop(model): self.denoising_steps ) elif model_config['use_fp8']: - if self.version == "flux.1-dev": + if self.version in ("flux.1-dev", "flux.1-schnell"): quant_config = SD_FP8_BF16_DEFAULT_CONFIG elif self.version == "2.1": quant_config = SD_FP8_FP32_DEFAULT_CONFIG @@ -416,14 +431,14 @@ def forward_loop(model): quant_config = SD_FP8_FP16_DEFAULT_CONFIG check_lora(model) - if self.version == "flux.1-dev": + if self.version in ("flux.1-dev", "flux.1-schnell"): set_quant_precision(quant_config, "BFloat16") mtq.quantize(model, quant_config, forward_loop) mto.save(model, model_config['state_dict_path']) def _get_quantized_model(self, obj, model_config, quantization_level, quantization_percentile, quantization_alpha, calibration_size, calib_batch_size, **kwargs): pipeline = obj.get_pipeline() - model = pipeline.unet if self.version not in ["sd3", "flux.1-dev"] else pipeline.transformer + model = pipeline.unet if self.version not in ("sd3", "flux.1-dev", "flux.1-schnell") else pipeline.transformer if model_config['use_fp8'] and quantization_level == 4.0: set_fmha(model) @@ -434,7 +449,7 @@ def _get_quantized_model(self, obj, model_config, quantization_level, quantizati if not os.path.exists(model_config['onnx_path']): quantize_lvl(model, quantization_level) - if self.version in ["flux.1-dev"]: + if self.version in ("flux.1-dev", "flux.1-schnell"): mtq.disable_quantizer(model, filter_func_no_proj_out) else: mtq.disable_quantizer(model, filter_func) @@ -469,6 +484,7 @@ def _build_engine(self, obj, engine, model_config, opt_batch_size, opt_image_hei tf32amp = obj.tf32 bf16amp = False if (model_config['use_fp8'] or getattr(obj, 'build_strongly_typed', False)) else obj.bf16 strongly_typed = True if (model_config['use_fp8'] or getattr(obj, 'build_strongly_typed', False)) else False + weight_streaming = getattr(obj, 'weight_streaming', False) extra_build_args = {'verbose': self.verbose} extra_build_args['builder_optimization_level'] = optimization_level if model_config['use_int8']: @@ -487,6 +503,7 @@ def _build_engine(self, obj, engine, model_config, opt_batch_size, opt_image_hei enable_all_tactics=enable_all_tactics, timing_cache=timing_cache, update_output_names=update_output_names, + weight_streaming=weight_streaming, **extra_build_args) def _refit_engine(self, obj, model_name, model_config): @@ -511,6 +528,9 @@ def _load_torch_models(self): for model_name, obj in self.models.items(): if self.torch_fallback[model_name]: self.torch_models[model_name] = obj.get_model(torch_inference=self.torch_inference) + if self.low_vram: + self.torch_models[model_name] = self.torch_models[model_name].to('cpu') + torch.cuda.empty_cache() def load_engines( self, @@ -619,7 +639,9 @@ def load_engines( # For non low_vram case, the engines will remain in GPU memory from now on. assert self.engine[model_name].engine is None if not self.low_vram: - self.engine[model_name].load() + weight_streaming = getattr(obj, 'weight_streaming', False) + weight_streaming_budget_percentage = getattr(obj, 'weight_streaming_budget_percentage', None) + self.engine[model_name].load(weight_streaming, weight_streaming_budget_percentage) if model_config['do_engine_refit'] and self.lora_loader: # For low_vram, using on-demand load and unload for refit. diff --git a/demo/Diffusion/flux_pipeline.py b/demo/Diffusion/flux_pipeline.py index fb764d68..a9edf381 100644 --- a/demo/Diffusion/flux_pipeline.py +++ b/demo/Diffusion/flux_pipeline.py @@ -63,12 +63,18 @@ def __init__( max_sequence_length=512, bf16=False, low_vram=False, + torch_fallback=None, + weight_streaming=False, + t5_weight_streaming_budget_percentage=None, + transformer_weight_streaming_budget_percentage=None, **kwargs ): """ Initializes the Flux pipeline. Args: + version (`str`, defaults to `flux.1-dev`) + Version of the underlying Flux model. guidance_scale (`float`, defaults to 3.5): Guidance scale is enabled by setting as > 1. Higher guidance scale encourages to generate images that are closely linked to the text prompt, usually at the expense of lower image quality. @@ -76,8 +82,14 @@ def __init__( Maximum sequence length to use with the `prompt`. bf16 (`bool`, defaults to False): Whether to run the pipeline in BFloat16 precision. + weight_streaming (`bool`, defaults to False): + Whether to enable weight streaming during TensorRT engine build. + t5_weight_streaming_budget_percentage (`int`, defaults to None): + Weight streaming budget as a percentage of the size of total streamable weights for the T5 model. + transformer_weight_streaming_budget_percentage (`int`, defaults to None): + Weight streaming budget as a percentage of the size of total streamable weights for the transformer model. """ - super().__init__(version=version, pipeline_type=pipeline_type, **kwargs) + super().__init__(version=version, pipeline_type=pipeline_type, weight_streaming=weight_streaming, text_encoder_weight_streaming_budget_percentage=t5_weight_streaming_budget_percentage, denoiser_weight_streaming_budget_percentage=transformer_weight_streaming_budget_percentage, **kwargs) self.guidance_scale = guidance_scale self.max_sequence_length = max_sequence_length self.bf16=bf16 @@ -86,6 +98,14 @@ def __init__( # Pipeline type self.stages = ["clip", "t5", "transformer", "vae"] + if torch_fallback: + assert type(torch_fallback) is list + for model_name in torch_fallback: + if model_name not in self.stages: + raise ValueError(f'Model "{model_name}" set in --torch-fallback does not exist') + self.config[model_name.replace('-','_')+'_torch_fallback'] = True + print(f'[I] Setting torch_fallback for {model_name} model.') + def _initialize_models(self, framework_model_dir, int8, fp8): # Load text tokenizer(s) self.tokenizer = make_tokenizer( @@ -129,11 +149,14 @@ def _initialize_models(self, framework_model_dir, int8, fp8): # Known accuracy issues with FP16 self.models["t5"] = T5Model( **models_args, - fp16=False, + fp16=self.fp16, tf32=self.tf32, bf16=self.bf16, subfolder="text_encoder_2", text_maxlen=self.max_sequence_length, + build_strongly_typed=True if self.fp16 else False, + weight_streaming=self.weight_streaming, + weight_streaming_budget_percentage=self.text_encoder_weight_streaming_budget_percentage, ) if "transformer" in self.stages: @@ -146,11 +169,14 @@ def _initialize_models(self, framework_model_dir, int8, fp8): tf32=self.tf32, text_maxlen=self.max_sequence_length, build_strongly_typed=True, + weight_streaming=self.weight_streaming, + weight_streaming_budget_percentage=self.denoiser_weight_streaming_budget_percentage, ) if "vae" in self.stages: # Accuracy issues with FP16 - self.models["vae"] = VAEModel(**models_args, fp16=False, tf32=self.tf32, bf16=self.bf16) + # WAR: VAE fallback to FP32 in BF16 pipeline. TRT support will be added in a future release + self.models["vae"] = VAEModel(**models_args, fp16=False, tf32=self.tf32, bf16=False) self.vae_scale_factor = ( 2 ** (len(self.models["vae"].config["block_out_channels"])) @@ -361,6 +387,8 @@ def denoise_latent( def decode_latent(self, latents, decoder="vae"): self.profile_start(decoder, color="red") + cast_to = torch.float16 if self.models[decoder].fp16 else torch.bfloat16 if self.models[decoder].bf16 else torch.float32 + latents = latents.to(dtype=cast_to) if self.torch_inference or self.torch_fallback[decoder]: images = self.torch_models[decoder](latents, return_dict=False)[0] else: @@ -465,19 +493,29 @@ def __enter__(ctx): if not ctx.low_vram: return for model_name in ctx.model_names: - # creating engine object (load from plan file) - self.engine[model_name].load() - # creating context - self.engine[model_name].activate(device_memory=self.shared_device_memory) - # creating input and output buffer - self.engine[model_name].allocate_buffers(shape_dict=self.shape_dicts[model_name], device=self.device) + if not self.torch_fallback[model_name]: + # creating engine object (load from plan file) + self.engine[model_name].load() + # creating context + self.engine[model_name].activate(device_memory=self.shared_device_memory) + # creating input and output buffer + self.engine[model_name].allocate_buffers(shape_dict=self.shape_dicts[model_name], device=self.device) + else: + print(f"[I] Reloading torch model {model_name} from cpu.") + self.torch_models[model_name] = self.torch_models[model_name].to('cuda') + def __exit__(ctx, exc_type, exc_val, exc_tb): if not ctx.low_vram: return for model_name in ctx.model_names: - self.engine[model_name].deallocate_buffers() - self.engine[model_name].deactivate() - self.engine[model_name].unload() + if not self.torch_fallback[model_name]: + self.engine[model_name].deallocate_buffers() + self.engine[model_name].deactivate() + self.engine[model_name].unload() + else: + print(f"[I] Offloading torch model {model_name} to cpu.") + self.torch_models[model_name] = self.torch_models[model_name].to('cpu') + torch.cuda.empty_cache() # CLIP and T5 text encoder(s) diff --git a/demo/Diffusion/models.py b/demo/Diffusion/models.py index 1f9a71ed..81f2a865 100755 --- a/demo/Diffusion/models.py +++ b/demo/Diffusion/models.py @@ -241,11 +241,13 @@ def get_path(version, pipeline, controlnets=None): return "stabilityai/stable-cascade-prior" elif version == 'flux.1-dev': return "black-forest-labs/FLUX.1-dev" + elif version == 'flux.1-schnell': + return "black-forest-labs/FLUX.1-schnell" else: raise ValueError(f"Unsupported version {version} + pipeline {pipeline.name}") def get_clip_embedding_dim(version, pipeline): - if version in ("1.4", "1.5", "dreamshaper-7", "flux.1-dev"): + if version in ("1.4", "1.5", "dreamshaper-7", "flux.1-dev", "flux.1-schnell"): return 768 elif version in ("2.0", "2.0-base", "2.1", "2.1-base"): return 1024 @@ -423,7 +425,9 @@ def export_onnx(model): with torch.inference_mode(): export_onnx(custom_model) else: - with torch.inference_mode(), torch.autocast("cuda"): + # WAR: Enable autocast for BF16 Stable Cascade pipeline + do_autocast = True if self.version == "cascade" and self.bf16 else False + with torch.inference_mode(), torch.autocast("cuda", enabled=do_autocast): export_onnx(self.get_model()) else: print(f"[I] Found cached ONNX model: {onnx_path}") @@ -724,10 +728,16 @@ def __init__(self, bf16=False, subfolder="text_encoder", text_maxlen=512, + build_strongly_typed=False, + weight_streaming=False, + weight_streaming_budget_percentage=None, ): super(T5Model, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, tf32=tf32, bf16=bf16, max_batch_size=max_batch_size, text_maxlen=text_maxlen) self.subfolder = subfolder self.config = AutoConfig.from_pretrained(self.path, subfolder=self.subfolder, token=self.hf_token) + self.build_strongly_typed = build_strongly_typed + self.weight_streaming = weight_streaming + self.weight_streaming_budget_percentage = weight_streaming_budget_percentage def get_model(self, torch_inference=''): model_opts = {'torch_dtype': torch.float16} if self.fp16 else {'torch_dtype': torch.bfloat16} if self.bf16 else {} @@ -1384,11 +1394,12 @@ def __init__(self, verbose, framework_model_dir, fp16 = False, + fp8 = False, max_batch_size = 16, num_frames = 14, do_classifier_free_guidance = True, ): - super(UNetTemporalModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, max_batch_size=max_batch_size, embedding_dim=get_unet_embedding_dim(version, pipeline)) + super(UNetTemporalModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, fp8=fp8, max_batch_size=max_batch_size, embedding_dim=get_unet_embedding_dim(version, pipeline)) self.subfolder = 'unet' self.unet_dim = 4 self.num_frames = num_frames @@ -1459,6 +1470,9 @@ def get_sample_input(self, batch_size, image_height, image_width, static_shape): torch.randn(self.xB*batch_size, 3, dtype=dtype, device=self.device), ) + def optimize(self, onnx_graph): + return super().optimize(onnx_graph, modify_fp8_graph=self.fp8) + class UNetCascadeModel(BaseModel): def __init__(self, @@ -1620,12 +1634,16 @@ def __init__(self, bf16 = False, max_batch_size = 16, text_maxlen = 77, - build_strongly_typed=False + build_strongly_typed=False, + weight_streaming=False, + weight_streaming_budget_percentage=None, ): super(FluxTransformerModel, self).__init__(version, pipeline, device=device, hf_token=hf_token, verbose=verbose, framework_model_dir=framework_model_dir, fp16=fp16, tf32=tf32, int8=int8, fp8=fp8, bf16=bf16, max_batch_size=max_batch_size, text_maxlen=text_maxlen) self.subfolder = 'transformer' self.config = FluxTransformer2DModel.load_config(self.path, subfolder=self.subfolder, token=self.hf_token) self.build_strongly_typed = build_strongly_typed + self.weight_streaming = weight_streaming + self.weight_streaming_budget_percentage = weight_streaming_budget_percentage def get_model(self, torch_inference=''): model_opts = {'torch_dtype': torch.float16} if self.fp16 else {'torch_dtype': torch.bfloat16} if self.bf16 else {} @@ -1653,33 +1671,37 @@ def get_output_names(self): return ['latent'] def get_dynamic_axes(self): - return { + dynamic_axes = { 'hidden_states': {0: 'B', 1: 'latent_dim'}, 'encoder_hidden_states': {0: 'B'}, 'pooled_projections': {0: 'B'}, 'timestep': {0: 'B'}, 'img_ids': {0: 'latent_dim'}, - 'guidance': {0: 'B'}, } + if self.config['guidance_embeds']: + dynamic_axes['guidance'] = {0: 'B'} + return dynamic_axes def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width, min_latent_height, max_latent_height, min_latent_width, max_latent_width = \ self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) - return { + input_profile = { 'hidden_states': [(min_batch, (min_latent_height // 2) * (min_latent_width // 2), self.config['in_channels']), (batch_size, (latent_height // 2) * (latent_width // 2), self.config['in_channels']), (max_batch, (max_latent_height // 2) * (max_latent_width // 2), self.config['in_channels'])], 'encoder_hidden_states': [(min_batch, self.text_maxlen, self.config['joint_attention_dim']), (batch_size, self.text_maxlen, self.config['joint_attention_dim']), (max_batch, self.text_maxlen, self.config['joint_attention_dim'])], 'pooled_projections': [(min_batch, self.config['pooled_projection_dim']), (batch_size, self.config['pooled_projection_dim']), (max_batch, self.config['pooled_projection_dim'])], 'timestep': [(min_batch,), (batch_size,), (max_batch,)], 'img_ids': [((min_latent_height // 2) * (min_latent_width // 2), 3), ((latent_height // 2) * (latent_width // 2), 3), ((max_latent_height // 2) * (max_latent_width // 2), 3)], 'txt_ids': [(self.text_maxlen, 3), (self.text_maxlen, 3), (self.text_maxlen, 3)], - 'guidance': [(min_batch,), (batch_size,), (max_batch,)], } + if self.config['guidance_embeds']: + input_profile['guidance'] = [(min_batch,), (batch_size,), (max_batch,)] + return input_profile def get_shape_dict(self, batch_size, image_height, image_width): latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) - return { + shape_dict = { 'hidden_states': (batch_size, (latent_height // 2) * (latent_width // 2), self.config['in_channels']), 'encoder_hidden_states': (batch_size, self.text_maxlen, self.config['joint_attention_dim']), 'pooled_projections': (batch_size, self.config['pooled_projection_dim']), @@ -1687,8 +1709,11 @@ def get_shape_dict(self, batch_size, image_height, image_width): 'img_ids': ((latent_height // 2) * (latent_width // 2), 3), 'txt_ids': (self.text_maxlen, 3), 'latent': (batch_size, (latent_height // 2) * (latent_width // 2), self.config['in_channels']), - 'guidance': (batch_size,), } + if self.config['guidance_embeds']: + shape_dict['guidance'] = (batch_size,) + return shape_dict + def get_sample_input(self, batch_size, image_height, image_width, static_shape): latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) @@ -1696,17 +1721,19 @@ def get_sample_input(self, batch_size, image_height, image_width, static_shape): assert not (self.fp16 and self.bf16), "fp16 and bf16 cannot be enabled simultaneously" tensor_dtype = torch.bfloat16 if self.bf16 else (torch.float16 if self.fp16 else torch.float32) - return ( + sample_input = ( torch.randn(batch_size, (latent_height // 2) * (latent_width // 2), self.config['in_channels'], dtype=tensor_dtype, device=self.device), torch.randn(batch_size, self.text_maxlen, self.config['joint_attention_dim'], dtype=tensor_dtype, device=self.device), torch.randn(batch_size, self.config['pooled_projection_dim'], dtype=tensor_dtype, device=self.device), torch.tensor([1.]*batch_size, dtype=tensor_dtype, device=self.device), torch.randn((latent_height // 2) * (latent_width // 2), 3, dtype=dtype, device=self.device), torch.randn(self.text_maxlen, 3, dtype=dtype, device=self.device), - { - 'guidance': torch.tensor([1.]*batch_size, dtype=dtype, device=self.device), - } + { } ) + if self.config['guidance_embeds']: + sample_input[-1]['guidance'] = torch.tensor([1.]*batch_size, dtype=dtype, device=self.device) + return sample_input + def optimize(self, onnx_graph): if self.fp8: return super().optimize(onnx_graph, modify_fp8_graph=True, is_fp16_io=False) diff --git a/demo/Diffusion/requirements.txt b/demo/Diffusion/requirements.txt index 2316b878..3124dad1 100755 --- a/demo/Diffusion/requirements.txt +++ b/demo/Diffusion/requirements.txt @@ -1,9 +1,9 @@ +apex==0.9.10dev accelerate colored controlnet_aux==0.0.6 cuda-python -# TODO: Pin Diffusers version after the next release -git+https://github.com/huggingface/diffusers.git # Install from source for the latest changes in main +diffusers==0.31.0 ftfy matplotlib nvtx diff --git a/demo/Diffusion/stable_video_diffusion_pipeline.py b/demo/Diffusion/stable_video_diffusion_pipeline.py index c9af7247..9e753b65 100644 --- a/demo/Diffusion/stable_video_diffusion_pipeline.py +++ b/demo/Diffusion/stable_video_diffusion_pipeline.py @@ -221,7 +221,7 @@ def loadEngines( if 'clip-imgfe' in self.stages: self.models['clip-imgfe'] = CLIPImageProcessorModel(**models_args, subfolder='feature_extractor') if 'unet-temp' in self.stages: - self.models['unet-temp'] = UNetTemporalModel(**models_args, fp16=True, num_frames=self.num_frames, do_classifier_free_guidance=self.do_classifier_free_guidance) + self.models['unet-temp'] = UNetTemporalModel(**models_args, fp16=True, fp8=fp8, num_frames=self.num_frames, do_classifier_free_guidance=self.do_classifier_free_guidance) if 'vae-temp' in self.stages: self.models['vae-temp'] = VAEDecTemporalModel(**models_args, decode_chunk_size=self.decode_chunk_size) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) @@ -312,6 +312,9 @@ def forward_loop(model): else: obj.export_onnx(onnx_path[model_name], onnx_opt_path[model_name], onnx_opset, opt_image_height, opt_image_width) + # Clean model cache + torch.cuda.empty_cache() + # Build TensorRT engines for model_name, obj in self.models.items(): if self.torch_fallback[model_name]: diff --git a/demo/Diffusion/utilities.py b/demo/Diffusion/utilities.py index 93277153..eb077c40 100755 --- a/demo/Diffusion/utilities.py +++ b/demo/Diffusion/utilities.py @@ -277,6 +277,7 @@ def build(self, update_output_names=None, native_instancenorm=True, verbose=False, + weight_streaming=False, **extra_build_args ): print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}") @@ -292,6 +293,16 @@ def build(self, flags = [] if native_instancenorm: flags.append(trt.OnnxParserFlag.NATIVE_INSTANCENORM) + + # Weight streaming requires the engine to have strong typing, therefore builder flags specifying precision, such as int8 and fp16, should not be enabled. + # Please find more details in our developer guide: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#streaming-weights. + if weight_streaming: + strongly_typed = True + fp16 = False + bf16 = False + int8 = False + fp8 = False + network = network_from_onnx_path( onnx_path, flags=flags, @@ -311,13 +322,14 @@ def build(self, refittable=enable_refit, profiles=[p], load_timing_cache=timing_cache, + weight_streaming=weight_streaming, **extra_build_args ), save_timing_cache=timing_cache ) save_engine(engine, path=self.engine_path) - def load(self): + def load(self, weight_streaming=False, weight_streaming_budget_percentage=None): if self.engine is not None: print(f"[W]: Engine {self.engine_path} already loaded, skip reloading") return @@ -327,6 +339,11 @@ def load(self): self.engine_bytes_cpu = bytes_from_path(self.engine_path) print(f"Loading TensorRT engine from bytes: {self.engine_path}") self.engine = engine_from_bytes(self.engine_bytes_cpu) + if weight_streaming: + if weight_streaming_budget_percentage is None: + warnings.warn(f"Weight streaming budget is not set for {self.engine_path}. Weights will not be streamed.") + else: + self.engine.weight_streaming_budget_v2 = int(weight_streaming_budget_percentage / 100 * self.engine.streamable_weights_size) def unload(self): if self.engine is not None: @@ -642,7 +659,7 @@ def append(self, item): def add_arguments(parser): # Stable Diffusion configuration - parser.add_argument('--version', type=str, default="1.5", choices=("1.4", "1.5", "dreamshaper-7", "2.0-base", "2.0", "2.1-base", "2.1", "xl-1.0", "xl-turbo", "svd-xt-1.1", "sd3", "cascade", "flux.1-dev"), help="Version of Stable Diffusion") + parser.add_argument('--version', type=str, default="1.5", choices=("1.4", "1.5", "dreamshaper-7", "2.0-base", "2.0", "2.1-base", "2.1", "xl-1.0", "xl-turbo", "svd-xt-1.1", "sd3", "cascade", "flux.1-dev", "flux.1-schnell"), help="Version of Stable Diffusion") parser.add_argument('prompt', nargs = '*', help="Text prompt(s) to guide image generation") parser.add_argument('--negative-prompt', nargs = '*', default=[''], help="The negative prompt(s) to guide the image generation.") parser.add_argument('--batch-size', type=int, default=1, choices=[1, 2, 4], help="Batch size (repeat prompt)") @@ -710,7 +727,7 @@ def process_pipeline_args(args): if args.int8 and not any(args.version.startswith(prefix) for prefix in ['xl', '1.4', '1.5', '2.1']): raise ValueError(f"int8 quantization is only supported for SDXL, SD1.4, SD1.5 and SD2.1 pipelines.") - if args.fp8 and not any(args.version.startswith(prefix) for prefix in ['xl', '1.4', '1.5', '2.1', 'flux.1-dev']): + if args.fp8 and not any(args.version.startswith(prefix) for prefix in ('xl', '1.4', '1.5', '2.1', 'flux.1-dev', 'flux.1-schnell')): raise ValueError(f"fp8 quantization is only supported for SDXL, SD1.4, SD1.5, SD2.1 and FLUX pipelines.") if args.fp8 and args.int8: @@ -728,7 +745,7 @@ def override_quant_level(level : float, dtype_str : str): print(f"The default quantization level has been set to {level} for {dtype_str}.") if args.fp8: - override_quant_level(3.0 if args.version in ("1.4", "1.5", "flux.1-dev") else 4.0, "FP8") + override_quant_level(3.0 if args.version in ("1.4", "1.5", "flux.1-dev", "flux.1-schnell") else 4.0, "FP8") elif args.int8: override_quant_level(3.0, "INT8") diff --git a/demo/Diffusion/utils_modelopt.py b/demo/Diffusion/utils_modelopt.py index e8b9d789..c86e16fa 100644 --- a/demo/Diffusion/utils_modelopt.py +++ b/demo/Diffusion/utils_modelopt.py @@ -172,6 +172,7 @@ def get_int8_config( "quant_cfg": { "*lm_head*": {"enable": False}, "*output_layer*": {"enable": False}, + "*output_quantizer": {"enable": False}, "default": {"num_bits": 8, "axis": None}, }, "algorithm": {"method": "smoothquant", "alpha": alpha}, diff --git a/docker/rockylinux8.Dockerfile b/docker/rockylinux8.Dockerfile index 70c5a0a6..0bfcf025 100644 --- a/docker/rockylinux8.Dockerfile +++ b/docker/rockylinux8.Dockerfile @@ -25,7 +25,7 @@ ENV NV_CUDNN_VERSION 8.9.6.50-1 ENV NV_CUDNN_PACKAGE libcudnn8-${NV_CUDNN_VERSION}.cuda12.2 ENV NV_CUDNN_PACKAGE_DEV libcudnn8-devel-${NV_CUDNN_VERSION}.cuda12.2 -ENV TRT_VERSION 10.6.0.26 +ENV TRT_VERSION 10.7.0.23 SHELL ["/bin/bash", "-c"] RUN dnf install -y \ @@ -62,15 +62,15 @@ RUN dnf install -y python38 python38-devel &&\ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp38-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp38-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ @@ -84,10 +84,10 @@ RUN pip install jupyter jupyterlab # Install Cmake RUN cd /tmp && \ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ - chmod +x cmake-3.14.4-Linux-x86_64.sh && \ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ - rm ./cmake-3.14.4-Linux-x86_64.sh + wget https://github.com/Kitware/CMake/releases/download/v3.27.9/cmake-3.27.9-Linux-x86_64.sh && \ + chmod +x cmake-3.27.9-Linux-x86_64.sh && \ + ./cmake-3.27.9-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ + rm ./cmake-3.27.9-Linux-x86_64.sh # Download NGC client RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && echo "no-apikey\nascii\n" | ngc-cli/ngc config set diff --git a/docker/rockylinux9.Dockerfile b/docker/rockylinux9.Dockerfile index 70994b92..abfa103c 100644 --- a/docker/rockylinux9.Dockerfile +++ b/docker/rockylinux9.Dockerfile @@ -25,7 +25,7 @@ ENV NV_CUDNN_VERSION 8.9.6.50-1 ENV NV_CUDNN_PACKAGE libcudnn8-${NV_CUDNN_VERSION}.cuda12.2 ENV NV_CUDNN_PACKAGE_DEV libcudnn8-devel-${NV_CUDNN_VERSION}.cuda12.2 -ENV TRT_VERSION 10.6.0.26 +ENV TRT_VERSION 10.7.0.23 SHELL ["/bin/bash", "-c"] RUN dnf install -y \ @@ -67,15 +67,15 @@ RUN dnf -y install \ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp39-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp39-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib64 \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp39-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib64 \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp39-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ @@ -83,10 +83,10 @@ fi # Install Cmake RUN cd /tmp && \ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ - chmod +x cmake-3.14.4-Linux-x86_64.sh && \ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ - rm ./cmake-3.14.4-Linux-x86_64.sh + wget https://github.com/Kitware/CMake/releases/download/v3.27.9/cmake-3.27.9-Linux-x86_64.sh && \ + chmod +x cmake-3.27.9-Linux-x86_64.sh && \ + ./cmake-3.27.9-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ + rm ./cmake-3.27.9-Linux-x86_64.sh # Download NGC client RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && echo "no-apikey\nascii\n" | ngc-cli/ngc config set diff --git a/docker/ubuntu-20.04.Dockerfile b/docker/ubuntu-20.04.Dockerfile index 939eb89d..ddb5ee1e 100644 --- a/docker/ubuntu-20.04.Dockerfile +++ b/docker/ubuntu-20.04.Dockerfile @@ -28,7 +28,7 @@ ENV CUDA_VERSION_MAJOR_MINOR=12.2 ENV NV_CUDNN_PACKAGE "libcudnn8=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" ENV NV_CUDNN_PACKAGE_DEV "libcudnn8-dev=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" -ENV TRT_VERSION 10.6.0.26 +ENV TRT_VERSION 10.7.0.23 SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -84,15 +84,15 @@ RUN apt-get install -y --no-install-recommends \ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp38-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp38-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp38-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ @@ -109,10 +109,10 @@ RUN pip3 install --upgrade numpy # Install Cmake RUN cd /tmp && \ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ - chmod +x cmake-3.14.4-Linux-x86_64.sh && \ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ - rm ./cmake-3.14.4-Linux-x86_64.sh + wget https://github.com/Kitware/CMake/releases/download/v3.27.9/cmake-3.27.9-Linux-x86_64.sh && \ + chmod +x cmake-3.27.9-Linux-x86_64.sh && \ + ./cmake-3.27.9-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ + rm ./cmake-3.27.9-Linux-x86_64.sh # Download NGC client RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && echo "no-apikey\nascii\n" | ngc-cli/ngc config set diff --git a/docker/ubuntu-22.04-aarch64.Dockerfile b/docker/ubuntu-22.04-aarch64.Dockerfile index bd28c2bf..c835c8f7 100644 --- a/docker/ubuntu-22.04-aarch64.Dockerfile +++ b/docker/ubuntu-22.04-aarch64.Dockerfile @@ -20,7 +20,7 @@ ARG CUDA_VERSION=12.6.0 # Multi-arch container support available in non-cudnn containers. FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 -ENV TRT_VERSION 10.6.0.26 +ENV TRT_VERSION 10.7.0.23 SHELL ["/bin/bash", "-c"] # Setup user account @@ -84,10 +84,10 @@ RUN ver="${CUDA_VERSION%.*}" &&\ # Install Cmake RUN cd /tmp && \ - wget https://github.com/Kitware/CMake/releases/download/v3.21.4/cmake-3.21.4-linux-aarch64.sh && \ - chmod +x cmake-3.21.4-linux-aarch64.sh && \ - ./cmake-3.21.4-linux-aarch64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ - rm ./cmake-3.21.4-linux-aarch64.sh + wget https://github.com/Kitware/CMake/releases/download/v3.27.9/cmake-3.27.9-linux-aarch64.sh && \ + chmod +x cmake-3.27.9-linux-aarch64.sh && \ + ./cmake-3.27.9-linux-aarch64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ + rm ./cmake-3.27.9-linux-aarch64.sh # Install PyPI packages RUN pip3 install --upgrade pip diff --git a/docker/ubuntu-22.04.Dockerfile b/docker/ubuntu-22.04.Dockerfile index e72671ba..9059f538 100644 --- a/docker/ubuntu-22.04.Dockerfile +++ b/docker/ubuntu-22.04.Dockerfile @@ -28,7 +28,7 @@ ENV CUDA_VERSION_MAJOR_MINOR=12.2 ENV NV_CUDNN_PACKAGE "libcudnn8=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" ENV NV_CUDNN_PACKAGE_DEV "libcudnn8-dev=$NV_CUDNN_VERSION-1+cuda${CUDA_VERSION_MAJOR_MINOR}" -ENV TRT_VERSION 10.6.0.26 +ENV TRT_VERSION 10.7.0.23 SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -84,15 +84,15 @@ RUN apt-get install -y --no-install-recommends \ # Install TensorRT RUN if [ "${CUDA_VERSION:0:2}" = "11" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-11.8.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp310-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-11.8.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp310-none-linux_x86_64.whl ;\ elif [ "${CUDA_VERSION:0:2}" = "12" ]; then \ - wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && tar -xf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz \ - && cp -a TensorRT-10.6.0.26/lib/*.so* /usr/lib/x86_64-linux-gnu \ - && pip install TensorRT-10.6.0.26/python/tensorrt-10.6.0-cp310-none-linux_x86_64.whl ;\ + wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && tar -xf TensorRT-10.7.0.23.Linux.x86_64-gnu.cuda-12.6.tar.gz \ + && cp -a TensorRT-10.7.0.23/lib/*.so* /usr/lib/x86_64-linux-gnu \ + && pip install TensorRT-10.7.0.23/python/tensorrt-10.7.0-cp310-none-linux_x86_64.whl ;\ else \ echo "Invalid CUDA_VERSION"; \ exit 1; \ @@ -109,10 +109,10 @@ RUN pip3 install --upgrade numpy # Install Cmake RUN cd /tmp && \ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ - chmod +x cmake-3.14.4-Linux-x86_64.sh && \ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ - rm ./cmake-3.14.4-Linux-x86_64.sh + wget https://github.com/Kitware/CMake/releases/download/v3.27.9/cmake-3.27.9-Linux-x86_64.sh && \ + chmod +x cmake-3.27.9-Linux-x86_64.sh && \ + ./cmake-3.27.9-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ + rm ./cmake-3.27.9-Linux-x86_64.sh # Download NGC client RUN cd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && echo "no-apikey\nascii\n" | ngc-cli/ngc config set diff --git a/docker/ubuntu-cross-aarch64.Dockerfile b/docker/ubuntu-cross-aarch64.Dockerfile index 8e3c3845..6a8017b1 100644 --- a/docker/ubuntu-cross-aarch64.Dockerfile +++ b/docker/ubuntu-cross-aarch64.Dockerfile @@ -21,7 +21,7 @@ ARG OS_VERSION=22.04 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${OS_VERSION} LABEL maintainer="NVIDIA CORPORATION" -ENV TRT_VERSION 10.6.0.26 +ENV TRT_VERSION 10.7.0.23 ENV DEBIAN_FRONTEND=noninteractive ARG uid=1000 @@ -59,10 +59,10 @@ RUN pip3 install setuptools>=41.0.0 # Install Cmake RUN cd /tmp && \ - wget https://github.com/Kitware/CMake/releases/download/v3.14.4/cmake-3.14.4-Linux-x86_64.sh && \ - chmod +x cmake-3.14.4-Linux-x86_64.sh && \ - ./cmake-3.14.4-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ - rm ./cmake-3.14.4-Linux-x86_64.sh + wget https://github.com/Kitware/CMake/releases/download/v3.27.9/cmake-3.27.9-Linux-x86_64.sh && \ + chmod +x cmake-3.27.9-Linux-x86_64.sh && \ + ./cmake-3.27.9-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \ + rm ./cmake-3.27.9-Linux-x86_64.sh # Skip installing PyPI packages and NGC client on cross-build container diff --git a/include/NvInfer.h b/include/NvInfer.h index 61d71ecc..c5cc061f 100644 --- a/include/NvInfer.h +++ b/include/NvInfer.h @@ -103,6 +103,8 @@ enum class LayerType : int32_t kREVERSE_SEQUENCE = 44, //!< Reverse sequence layer kNORMALIZATION = 45, //!< Normalization layer kPLUGIN_V3 = 46, //!< PluginV3 layer. + kSQUEEZE = 47, //!< Squeeze Layer. + kUNSQUEEZE = 48, //!< Unsqueeze Layer. }; //! @@ -113,7 +115,7 @@ enum class LayerType : int32_t template <> constexpr inline int32_t EnumMax() noexcept { - return 47; + return 49; } //! @@ -185,6 +187,7 @@ class ITensor : public INoCopy //! //! For a network input, the name is assigned by the application. For tensors which are layer outputs, //! a default name is assigned consisting of the layer name followed by the index of the output in brackets. + //! Each input and output tensor must have a unique name. //! //! This method copies the name string. //! @@ -373,7 +376,7 @@ class ITensor : public INoCopy //! //! \deprecated Deprecated in TensorRT 10.1. Superseded by explicit quantization. //! - bool dynamicRangeIsSet() const noexcept + TRT_DEPRECATED bool dynamicRangeIsSet() const noexcept { return mImpl->dynamicRangeIsSet(); } @@ -5334,11 +5337,12 @@ class IFillLayer : public ILayer //! The \p zeroPt tensor is optional, and if not set, will be assumed to be zero. Its data type must match the //! output data type. \p zeroPt must only contain zero-valued coefficients, because only symmetric quantization is //! supported. -//! The \p scale value must be a scalar for per-tensor quantization, a 1-D tensor for per-channel quantization, or a -//! 2-D tensor for block quantization (supported for DataType::kINT4 only). All \p scale coefficients must have -//! positive values. The size of the 1-D \p scale tensor must match the size of the quantization axis. For block -//! quantization, the shape of \p scale tensor must match the shape of the input, except for one dimension in which -//! blocking occurs. The size of \p zeroPt must match the size of \p scale. +//! The \p scale value must be a scalar for per-tensor quantization, a 1D tensor for per-channel quantization, or the +//! same rank as the input tensor for block quantization (supported for DataType::kINT4 only). All \p scale +//! coefficients must have positive values. The size of the 1D \p scale tensor must match the size of the quantization +//! axis. For block quantization, the shape of \p scale tensor must match the shape of the input, except for one +//! dimension (the last or second to last dimension) in which blocking occurs. +//! The size of \p zeroPt must match the size of \p scale. //! //! The subgraph which terminates with the \p scale tensor must be a build-time constant. The same restrictions apply //! to the \p zeroPt. @@ -5369,8 +5373,8 @@ class IFillLayer : public ILayer //! For each s in S: //! output[k,c,r,s] = clamp(round(\p input[k,c,r,s] / \p scale[k]) + \p zeroPt[k]) //! -//! Block quantization is supported only for 2-D weight inputs of DataType::kINT4. As an example of blocked -//! operation, imagine a 2-D RS weights input, R (dimension 0) as the blocking axis and B as the block size. +//! Block quantization is supported only for weight inputs of DataType::kINT4. As an example of blocked +//! operation, imagine a 2D RS weights input, R (dimension 0) as the blocking axis and B as the block size. //! The scale is a 2D array of coefficients, with dimensions (R//B, S). //! For each r in R: //! For each s in S: @@ -5405,7 +5409,7 @@ class IQuantizeLayer : public ILayer //! //! Set the index of the quantization axis (with reference to the input tensor's dimensions). //! The axis must be a valid axis if the scale tensor has more than one coefficient. - //! The axis value will be ignored if the scale tensor has exactly one coefficient (per-tensor quantization). + //! The axis value is used only for per-axis (per-channel) quantization. //! void setAxis(int32_t axis) noexcept { @@ -5461,11 +5465,12 @@ class IQuantizeLayer : public ILayer //! The \p zeroPt tensor is optional, and if not set, will be assumed to be zero. Its data type must be identical to //! the input's data type. \p zeroPt must only contain zero-valued coefficients, because only symmetric quantization is //! supported. -//! The \p scale value must be either a scalar for per-tensor quantization, a 1-D tensor for per-channel quantization, -//! or a 2-D tensor for block quantization (supported for DataType::kINT4 only). All \p scale coefficients must have -//! positive values. The size of the 1-D \p scale tensor must match the size of the quantization axis. For block -//! quantization, the shape of \p scale tensor must match the shape of the input, except for one dimension in which -//! blocking occurs. The size of \p zeroPt must match the size of \p scale. +//! The \p scale value must be a scalar for per-tensor quantization, a 1D tensor for per-channel quantization, or the +//! same rank as the input tensor for block quantization (supported for DataType::kINT4 only). All \p scale +//! coefficients must have positive values. The size of the 1D \p scale tensor must match the size of the quantization +//! axis. For block quantization, the shape of \p scale tensor must match the shape of the input, except for one +//! dimension (the last or second to last dimension) in which blocking occurs. +//! The size of \p zeroPt must match the size of \p scale. //! //! The subgraph which terminates with the \p scale tensor must be a build-time constant. The same restrictions apply //! to the \p zeroPt. @@ -5498,9 +5503,9 @@ class IQuantizeLayer : public ILayer //! For each s in S: //! output[k,c,r,s] = (\p input[k,c,r,s] - \p zeroPt[k]) * \p scale[k] //! -//! Block dequantization is supported only for 2-D input tensors with DataType::kINT4 that are rooted at an -//! IConstantLayer (i.e. weights). As an example of blocked operation, imagine a 2-D RS weights input with R -//! (dimension 0) as the blocking axis and B as the block size. The scale is a 2-D array of coefficients, with +//! Block dequantization is supported only for input tensors with DataType::kINT4 that are rooted at an +//! IConstantLayer (i.e. weights). As an example of blocked operation, imagine a 2D RS weights input with R +//! (dimension 0) as the blocking axis and B as the block size. The scale is a 2D array of coefficients, with //! dimensions (R//B, S). //! For each r in R: //! For each s in S: @@ -6260,6 +6265,65 @@ class INormalizationLayer : public ILayer virtual ~INormalizationLayer() noexcept = default; }; +//! +//! \class ISqueezeLayer +//! +//! \brief Layer that represents a squeeze operation, removing unit dimensions of the input tensor +//! on a set of axes. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class ISqueezeLayer : public ILayer +{ +public: + //! + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index The index of the input to modify. + //! \param tensor The new input tensor. + //! + //! For a Squeeze layer, the values 0-1 are valid for index. + //! The indices are as follows: + //! + //! - 0: Input data tensor. + //! - 1: The axes to remove. Must resolvable to a constant Int32 or Int64 1D shape tensor. + //! + using ILayer::setInput; + +protected: + apiv::VSqueezeLayer* mImpl; + virtual ~ISqueezeLayer() noexcept = default; +}; + +//! +//! \class IUnsqueezeLayer +//! +//! \brief Layer that represents an unsqueeze operation, which reshapes the input tensor by inserting unit-length dimensions at specified axes of the output. +//! +//! \warning Do not inherit from this class, as doing so will break forward-compatibility of the API and ABI. +//! +class IUnsqueezeLayer : public ILayer +{ +public: + //! + //! \brief Append or replace an input of this layer with a specific tensor + //! + //! \param index The index of the input to modify. + //! \param tensor The new input tensor. + //! + //! For an Unsqueeze layer, the values 0-1 are valid for index. + //! The indices are as follows: + //! + //! - 0: Input data tensor. + //! - 1: The output axes at which unit-length dimensions are inserted. Must resolvable to a constant Int32 or Int64 1D shape tensor. + //! + using ILayer::setInput; + +protected: + apiv::VUnsqueezeLayer* mImpl; + virtual ~IUnsqueezeLayer() noexcept = default; +}; + //! //! \class INetworkDefinition //! @@ -6285,8 +6349,8 @@ class INetworkDefinition : public INoCopy //! //! \brief Add an input tensor to the network. //! - //! The name of the input tensor is used to find the index into the buffer array for an engine built from - //! the network. The volume must be less than 2^31 elements. + //! Each input and output tensor must have a unique name. + //! The volume must be less than 2^31 elements. //! //! For networks with wildcard dimensions, the volume //! is based on the maxima specified by an IOptimizationProfile.Dimensions are normally non-negative integers. The @@ -7636,6 +7700,46 @@ class INetworkDefinition : public INoCopy return mImpl->areWeightsMarkedRefittable(name); } + //! + //! \brief Add a squeeze layer to the network. + //! + //! \param input The input tensor to the layer. + //! \param axes The axes to remove unit dimensions on. + //! + //! \see ISqueezeLayer + //! + //! Axes must be resolvable to a constant Int32 or Int64 1D shape tensor. + //! Values in axes must be unique and in the range of [-r, r-1], where r is the rank of the input tensor. + //! For each axis value, the corresponding dimension in the input tensor must be one. + //! + //! \return The new Squeeze layer, or nullptr if it could not be created. + //! + ISqueezeLayer* addSqueeze(ITensor& input, ITensor& axes) noexcept + { + return mImpl->addSqueeze(input, axes); + } + + //! + //! \brief Add an unsqueeze layer to the network. + //! + //! \param input The input tensor to the layer. + //! \param axes The axes to add unit dimensions. + //! + //! \see IUnsqueezeLauyer + //! + //! Axes must be resolvable to a constant Int32 or Int64 shape tensor. + //! Values in axes must be unique and in the range of [-r_final, r_final-1], where r_final + //! is the sum of rank(input) and len(axes). + //! + //! r_final must be less than Dims::MAX_DIMS. + //! + //! \return The new Unsqueeze layer, or nullptr if it could not be created + //! + IUnsqueezeLayer* addUnsqueeze(ITensor& input, ITensor& axes) noexcept + { + return mImpl->addUnsqueeze(input, axes); + } + protected: apiv::VNetworkDefinition* mImpl; }; @@ -8228,7 +8332,7 @@ enum class QuantizationFlag : int32_t //! Run int8 calibration pass before layer fusion. Only valid for IInt8LegacyCalibrator and //! IInt8EntropyCalibrator. The builder always runs the int8 calibration pass before layer fusion for //! IInt8MinMaxCalibrator and IInt8EntropyCalibrator2. Disabled by default. - kCALIBRATE_BEFORE_FUSION = 0 + kCALIBRATE_BEFORE_FUSION TRT_DEPRECATED_ENUM = 0 }; //! @@ -8345,7 +8449,8 @@ enum class BuilderFlag : int32_t //! Require that no reformats be inserted between a layer and a network I/O tensor //! for which ITensor::setAllowedFormats was called. //! Build fails if a reformat is required for functional correctness. - kDIRECT_IO = 11, + //! \deprecated Deprecated in TensorRT 10.7. Unneeded API. + kDIRECT_IO TRT_DEPRECATED_ENUM = 11, //! Fail if IAlgorithmSelector::selectAlgorithms returns an empty set of algorithms. kREJECT_EMPTY_ALGORITHMS = 12, diff --git a/include/NvInferImpl.h b/include/NvInferImpl.h index 3bb39fa4..7f3239f8 100644 --- a/include/NvInferImpl.h +++ b/include/NvInferImpl.h @@ -120,6 +120,11 @@ namespace v_1_0 class IStreamReader; } // namespace v_1_0 using IStreamReader = v_1_0::IStreamReader; +namespace v_1_0 +{ +class IStreamReaderV2; +} // namespace v_1_0 +using IStreamReaderV2 = v_1_0::IStreamReaderV2; class IPluginV3Layer; class IPoolingLayer; @@ -139,11 +144,13 @@ class IShapeLayer; class IShuffleLayer; class ISliceLayer; class ISoftMaxLayer; +class ISqueezeLayer; class ITensor; class ITimingCache; class ITopKLayer; class ITripLimitLayer; class IUnaryLayer; +class IUnsqueezeLayer; struct Permutation; class Weights; @@ -265,6 +272,8 @@ class VRuntime : public VRoot virtual IRuntime* loadRuntime(char const* path) noexcept = 0; virtual void setEngineHostCodeAllowed(bool allowed) noexcept = 0; virtual bool getEngineHostCodeAllowed() const noexcept = 0; + // Added in TensorRT version 10.7 + virtual nvinfer1::ICudaEngine* deserializeCudaEngineV2(IStreamReaderV2& streamReader) noexcept = 0; }; class VRefitter : public VRoot @@ -986,6 +995,14 @@ class VNormalizationLayer : public VRoot virtual DataType getComputePrecision() const noexcept = 0; }; // class VNormalizationLayer +class VSqueezeLayer : public VRoot +{ +}; + +class VUnsqueezeLayer : public VRoot +{ +}; + class VNetworkDefinition : public VRoot { public: @@ -1073,6 +1090,8 @@ class VNetworkDefinition : public VRoot virtual bool markWeightsRefittable(char const* name) noexcept = 0; virtual bool unmarkWeightsRefittable(char const* name) noexcept = 0; virtual bool areWeightsMarkedRefittable(char const* name) const noexcept = 0; + virtual ISqueezeLayer* addSqueeze(ITensor& input, ITensor& axes) noexcept = 0; + virtual IUnsqueezeLayer* addUnsqueeze(ITensor& input, ITensor& axes) noexcept = 0; }; class VAlgorithmIOInfo : public VRoot diff --git a/include/NvInferPluginBase.h b/include/NvInferPluginBase.h index d337f48a..0ce384dc 100644 --- a/include/NvInferPluginBase.h +++ b/include/NvInferPluginBase.h @@ -236,91 +236,8 @@ class IPluginV3 : public IVersionedInterface //! virtual IPluginV3* clone() noexcept = 0; }; - -class IPluginCreatorV3One : public IPluginCreatorInterface -{ -public: - //! - //! \brief Return version information associated with this interface. Applications must not override this method. - //! - InterfaceInfo getInterfaceInfo() const noexcept override - { - return InterfaceInfo{"PLUGIN CREATOR_V3ONE", 1, 0}; - } - - //! - //! \brief Return a plugin object. Return nullptr in case of error. - //! - //! \param name A NULL-terminated name string of length 1024 or less, including the NULL terminator. - //! \param fc A pointer to a collection of fields needed for constructing the plugin. - //! \param phase The TensorRT phase in which the plugin is being created - //! - //! When the phase is TensorRTPhase::kRUNTIME, the PluginFieldCollection provided for serialization by the plugin's - //! runtime interface will be passed as fc. - //! - //! \note The returned plugin object must be in an initialized state - //! - //! \note If invoked by the user (e.g. with TensorRTPhase::kBUILD, to add to the network defintion with - //! addPluginV3()), it is the user's responsibility to delete the plugin object. If invoked by TensorRT (e.g. during - //! engine deserialization), TensorRT will delete any objects it creates. - //! - virtual IPluginV3* createPlugin( - AsciiChar const* name, PluginFieldCollection const* fc, TensorRTPhase phase) noexcept = 0; - - //! - //! \brief Return a list of fields that need to be passed to createPlugin() when creating a plugin for use in the - //! TensorRT build phase. - //! - //! \see PluginFieldCollection - //! - virtual PluginFieldCollection const* getFieldNames() noexcept = 0; - - //! - //! \brief Return the plugin name. - //! - //! \warning The string returned must be NULL-terminated and have a length of 1024 bytes or less including - //! the NULL terminator. - //! - virtual AsciiChar const* getPluginName() const noexcept = 0; - - //! - //! \brief Return the plugin version. - //! - //! \warning The string returned must be NULL-terminated and have a length of 1024 bytes or less including - //! the NULL terminator. - //! - virtual AsciiChar const* getPluginVersion() const noexcept = 0; - - //! - //! \brief Return the plugin namespace. - //! - //! \warning The string returned must be NULL-terminated and have a length of 1024 bytes or less including - //! the NULL terminator. - //! - virtual AsciiChar const* getPluginNamespace() const noexcept = 0; - - IPluginCreatorV3One() = default; - virtual ~IPluginCreatorV3One() = default; - -protected: - IPluginCreatorV3One(IPluginCreatorV3One const&) = default; - IPluginCreatorV3One(IPluginCreatorV3One&&) = default; - IPluginCreatorV3One& operator=(IPluginCreatorV3One const&) & = default; - IPluginCreatorV3One& operator=(IPluginCreatorV3One&&) & = default; -}; - } // namespace v_1_0 -//! -//! \class IPluginCreatorV3One -//! -//! \brief A plugin creator class capable of producing IPluginV3 objects -//! -//! \see IPluginV3 -//! \see IPluginRegistry -//! -using IPluginCreatorV3One = v_1_0::IPluginCreatorV3One; - //! //! \class IPluginResource //! diff --git a/include/NvInferRuntime.h b/include/NvInferRuntime.h index a9e60719..2b96eabb 100644 --- a/include/NvInferRuntime.h +++ b/include/NvInferRuntime.h @@ -673,6 +673,82 @@ class IStreamReader : public IVersionedInterface //! using IStreamReader = v_1_0::IStreamReader; +//! +//! \enum SeekPosition +//! \brief Controls the seek mode of IStreamReaderV2. +//! +enum class SeekPosition : int32_t +{ + //! From the beginning of the file. + kSET = 0, + + //! From the current position of the file. + kCUR = 1, + + //! From the tail of the file. + kEND = 2, +}; + +namespace v_1_0 +{ +class IStreamReaderV2 : public IVersionedInterface +{ +public: + //! + //! TensorRT never calls the destructor for an IStreamReaderV2 defined by the + //! application. + //! + ~IStreamReaderV2() override = default; + IStreamReaderV2() = default; + + //! + //! \brief Return version information associated with this interface. Applications must not override this method. + //! + InterfaceInfo getInterfaceInfo() const noexcept override + { + return InterfaceInfo{"IStreamReaderV2", 1, 0}; + } + + //! + //! \brief Read the next number of bytes in the stream asynchronously. + //! + //! \param destination The memory to write to, call cudaPointerGetAttributes to get the memory location + //! \param nbBytes The number of bytes to read + //! \param stream The CUDA stream used to do the copy + //! + //! \returns The number of bytes read. Negative values indicate an unrecoverable error. + //! A zero indicates that the end of the stream has been reached. + //! + virtual int64_t read(void* destination, int64_t nbBytes, cudaStream_t stream) noexcept = 0; + + //! + //! \brief Sets the position of the stream to the given offset. + //! + //! \param offset The number of bytes to offset from where. + //! \param where The position from where the offset is added. \see SeekPosition + //! + //! \returns True if the position is updated successfully. + //! + virtual bool seek(int64_t offset, SeekPosition where) noexcept = 0; + +protected: + IStreamReaderV2(IStreamReaderV2 const&) = default; + IStreamReaderV2(IStreamReaderV2&&) = default; + IStreamReaderV2& operator=(IStreamReaderV2 const&) & = default; + IStreamReaderV2& operator=(IStreamReaderV2&&) & = default; +}; +} // namespace v_1_0 + +//! +//! \class IStreamReaderV2 +//! +//! \brief Application-implemented class for reading data in a stream-based manner asynchronously. Intended for use with +//! the GDS API for optimizing load times. +//! +//! \note To ensure compatibility of source code with future versions of TensorRT, use IStreamReaderV2, not +//! v_1_0::IStreamReaderV2 +//! +using IStreamReaderV2 = v_1_0::IStreamReaderV2; //! //! \class IPluginResourceContext @@ -825,7 +901,7 @@ class IPluginV3OneBuild : public IPluginCapability //! \param outputs Pre-allocated array to which the output dimensions must be written //! \param exprBuilder Object for generating new dimension expressions //! - //! \note Any size tensor outputs must be declared to be 0-D. + //! \note Any size tensor outputs must be declared to be 0D. //! //! \note The declaration of shapeInputs as DimsExprs is slightly abusive, because the "dimensions" //! are actually the values of the shape tensor. For example, if the input shape tensor @@ -1853,11 +1929,34 @@ class IRuntime : public INoCopy //! //! \return The engine, or nullptr if it could not be deserialized. //! - ICudaEngine* deserializeCudaEngine(IStreamReader& streamReader) + //! \deprecated Deprecated in TensorRT 10.7. Superseded by deserializeCudaEngine that takes an IStreamReaderV2 + //! instead of IStreamReader. + //! + TRT_DEPRECATED ICudaEngine* deserializeCudaEngine(IStreamReader& streamReader) { return mImpl->deserializeCudaEngine(streamReader); } + //! + //! \brief Deserialize an engine from a stream. IStreamReaderV2 is expected to support reading to both host and + //! device pointers. + //! + //! If an error recorder has been set for the runtime, it will also be passed to the + //! engine. + //! + //! This deserialization path will reduce engine load time when applied with GDS (GPU Direct storage), or when + //! weight streaming is enabled. + //! + //! \param streamReader a read-only stream from which TensorRT will deserialize a previously serialized engine. + //! \param stream The CUDA stream used when performing asynchronous I/O. + //! + //! \return The engine, or nullptr if it could not be deserialized. The pointer may not be valid immediately after + //! the function returns. + //! + ICudaEngine* deserializeCudaEngine(IStreamReaderV2& streamReader) + { + return mImpl->deserializeCudaEngineV2(streamReader); + } //! //! \brief get the logger with which the runtime was created @@ -4344,14 +4443,17 @@ class IExecutionContext : public INoCopy //! Before calling enqueueV3(), each input must have a non-null address and //! each output must have a non-null address or an IOutputAllocator to set it later. //! - //! If the TensorLocation of the tensor is kHOST, the pointer must point to a host buffer of sufficient size. - //! If the TensorLocation of the tensor is kDEVICE, the pointer must point to a device buffer of sufficient size and - //! alignment, or be nullptr if the tensor is an output tensor that will be allocated by IOutputAllocator. + //! If the TensorLocation of the tensor is kHOST: + //! - The pointer must point to a host buffer of sufficient size. + //! - Data representing shape values is not copied until enqueueV3 is invoked. + //! + //! If the TensorLocation of the tensor is kDEVICE: + //! - The pointer must point to a device buffer of sufficient size and alignment, or + //! - Be nullptr if the tensor is an output tensor that will be allocated by IOutputAllocator. //! //! If getTensorShape(name) reports a -1 for any dimension of an output after all - //! input shapes have been set, then to find out - //! the dimensions, use setOutputAllocator() to associate an IOutputAllocator to - //! which the dimensions will be reported when known. + //! input shapes have been set, use setOutputAllocator() to associate an IOutputAllocator + //! to which the dimensions will be reported when known. //! //! Calling both setTensorAddress and setOutputAllocator() for the same output is allowed, //! and can be useful for preallocating memory, and then reallocating if it's not big enough. @@ -5184,6 +5286,79 @@ class IGpuAsyncAllocator : public IGpuAllocator return {"IGpuAllocator", 1, 0}; } }; + +class IPluginCreatorV3One : public IPluginCreatorInterface +{ +public: + //! + //! \brief Return version information associated with this interface. Applications must not override this method. + //! + InterfaceInfo getInterfaceInfo() const noexcept override + { + return InterfaceInfo{"PLUGIN CREATOR_V3ONE", 1, 0}; + } + + //! + //! \brief Return a plugin object. Return nullptr in case of error. + //! + //! \param name A NULL-terminated name string of length 1024 or less, including the NULL terminator. + //! \param fc A pointer to a collection of fields needed for constructing the plugin. + //! \param phase The TensorRT phase in which the plugin is being created + //! + //! When the phase is TensorRTPhase::kRUNTIME, the PluginFieldCollection provided for serialization by the plugin's + //! runtime interface will be passed as fc. + //! + //! \note The returned plugin object must be in an initialized state + //! + //! \note If invoked by the user (e.g. with TensorRTPhase::kBUILD, to add to the network defintion with + //! addPluginV3()), it is the user's responsibility to delete the plugin object. If invoked by TensorRT (e.g. during + //! engine deserialization), TensorRT will delete any objects it creates. + //! + virtual IPluginV3* createPlugin( + AsciiChar const* name, PluginFieldCollection const* fc, TensorRTPhase phase) noexcept = 0; + + //! + //! \brief Return a list of fields that need to be passed to createPlugin() when creating a plugin for use in the + //! TensorRT build phase. + //! + //! \see PluginFieldCollection + //! + virtual PluginFieldCollection const* getFieldNames() noexcept = 0; + + //! + //! \brief Return the plugin name. + //! + //! \warning The string returned must be NULL-terminated and have a length of 1024 bytes or less including + //! the NULL terminator. + //! + virtual AsciiChar const* getPluginName() const noexcept = 0; + + //! + //! \brief Return the plugin version. + //! + //! \warning The string returned must be NULL-terminated and have a length of 1024 bytes or less including + //! the NULL terminator. + //! + virtual AsciiChar const* getPluginVersion() const noexcept = 0; + + //! + //! \brief Return the plugin namespace. + //! + //! \warning The string returned must be NULL-terminated and have a length of 1024 bytes or less including + //! the NULL terminator. + //! + virtual AsciiChar const* getPluginNamespace() const noexcept = 0; + + IPluginCreatorV3One() = default; + virtual ~IPluginCreatorV3One() = default; + +protected: + IPluginCreatorV3One(IPluginCreatorV3One const&) = default; + IPluginCreatorV3One(IPluginCreatorV3One&&) = default; + IPluginCreatorV3One& operator=(IPluginCreatorV3One const&) & = default; + IPluginCreatorV3One& operator=(IPluginCreatorV3One&&) & = default; +}; + } // namespace v_1_0 //! @@ -5200,6 +5375,17 @@ class IGpuAsyncAllocator : public IGpuAllocator //! //! \see IGpuAllocator using IGpuAsyncAllocator = v_1_0::IGpuAsyncAllocator; + +//! +//! \class IPluginCreatorV3One +//! +//! \brief A plugin creator class capable of producing IPluginV3 objects +//! +//! \see IPluginV3 +//! \see IPluginRegistry +//! +using IPluginCreatorV3One = v_1_0::IPluginCreatorV3One; + } // namespace nvinfer1 //! diff --git a/include/NvInferRuntimePlugin.h b/include/NvInferRuntimePlugin.h index dbe5bb49..2f6a3894 100644 --- a/include/NvInferRuntimePlugin.h +++ b/include/NvInferRuntimePlugin.h @@ -127,8 +127,7 @@ enum class PluginCreatorVersion : int32_t //! \see IPluginCreator //! \see IPluginRegistry //! -//! \deprecated Deprecated in TensorRT 8.5. Implement IPluginV2DynamicExt or IPluginV2IOExt depending on your -//! requirement. +//! \deprecated Deprecated in TensorRT 8.5. Implement IPluginV3 instead. //! class TRT_DEPRECATED IPluginV2 { @@ -260,7 +259,7 @@ class TRT_DEPRECATED IPluginV2 //! \param format The format selected for the engine. //! \param maxBatchSize The maximum batch size. Will be a positive integer. //! - //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be + //! The dimensions passed here do not include the outermost batch size (i.e. for 2D image networks, they will be //! 3-dimensional CHW dimensions). //! //! \warning for the format field, the values PluginFormat::kCHW4, PluginFormat::kCHW16, and PluginFormat::kCHW32 @@ -463,8 +462,7 @@ class TRT_DEPRECATED IPluginV2 //! //! \see IPluginV2 //! -//! \deprecated Deprecated in TensorRT 8.5. Implement IPluginV2DynamicExt or IPluginV2IOExt depending on your -//! requirement. +//! \deprecated Deprecated in TensorRT 8.5. Implement IPluginV3 instead. //! class TRT_DEPRECATED IPluginV2Ext : public IPluginV2 { @@ -566,7 +564,7 @@ class TRT_DEPRECATED IPluginV2Ext : public IPluginV2 //! \param floatFormat The format selected for the engine for the floating point inputs/outputs. //! \param maxBatchSize The maximum batch size. Will be a positive integer. //! - //! The dimensions passed here do not include the outermost batch size (i.e. for 2-D image networks, they will be + //! The dimensions passed here do not include the outermost batch size (i.e. for 2D image networks, they will be //! 3-dimensional CHW dimensions). When inputIsBroadcast or outputIsBroadcast is true, the outermost batch size for //! that input or output must be treated as if it is one. //! Index 'i' of inputIsBroadcast is true only if the input is semantically broadcast across the batch and @@ -713,7 +711,7 @@ class TRT_DEPRECATED IPluginV2Ext : public IPluginV2 //! //! \see IPluginV2Ext //! -//! \deprecated Deprecated in TensorRT 10.0. +//! \deprecated Deprecated in TensorRT 10.0. Implement IPluginV3 instead. //! class TRT_DEPRECATED IPluginV2IOExt : public IPluginV2Ext { @@ -966,8 +964,8 @@ class TRT_DEPRECATED IPluginCreator : public IPluginCreatorInterface //! //! \see IPlugin and IPluginFactory //! -//! \deprecated Deprecated in TensorRT 10.0. Please implement IPluginCreatorV3One instead along with IPluginV3 plugins -//! instead. +//! \deprecated Deprecated in TensorRT 10.0. Please implement IPluginCreatorV3One +//! along with IPluginV3 plugins instead. //! using IPluginCreator = v_1_0::IPluginCreator; diff --git a/include/NvInferVersion.h b/include/NvInferVersion.h index d0d78512..00329485 100644 --- a/include/NvInferVersion.h +++ b/include/NvInferVersion.h @@ -24,9 +24,9 @@ #define NV_INFER_VERSION_H #define NV_TENSORRT_MAJOR 10 //!< TensorRT major version. -#define NV_TENSORRT_MINOR 6 //!< TensorRT minor version. +#define NV_TENSORRT_MINOR 7 //!< TensorRT minor version. #define NV_TENSORRT_PATCH 0 //!< TensorRT patch version. -#define NV_TENSORRT_BUILD 26 //!< TensorRT build number. +#define NV_TENSORRT_BUILD 23 //!< TensorRT build number. #define NV_TENSORRT_LWS_MAJOR 0 //!< TensorRT LWS major version. #define NV_TENSORRT_LWS_MINOR 0 //!< TensorRT LWS minor version. diff --git a/parsers/onnx b/parsers/onnx index 4442153a..9c69a24b 160000 --- a/parsers/onnx +++ b/parsers/onnx @@ -1 +1 @@ -Subproject commit 4442153a4483c29e109241eb11752f3e59be62f8 +Subproject commit 9c69a24bc2e20c8a511a4e6b06fd49639ec5300a diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 55825fab..fb0c03ed 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -146,7 +146,9 @@ set_target_properties(${SHARED_TARGET} PROPERTIES DEBUG_POSTFIX ${TRT_DEBUG_POST set_target_properties(${SHARED_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION ${TRT_SOVERSION} ) -set_property(TARGET ${SHARED_TARGET} PROPERTY CUDA_STANDARD 14) +set_property(TARGET ${SHARED_TARGET} PROPERTY CUDA_STANDARD 17) + +target_link_directories(${SHARED_TARGET} PUBLIC ${CUDA_ROOT}/lib) target_link_libraries(${SHARED_TARGET} ${CUDART_LIB} @@ -188,7 +190,7 @@ set_target_properties(${STATIC_TARGET} PROPERTIES DEBUG_POSTFIX ${TRT_DEBUG_POST set_target_properties(${STATIC_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION ${TRT_SOVERSION} ) -set_property(TARGET ${STATIC_TARGET} PROPERTY CUDA_STANDARD 14) +set_property(TARGET ${STATIC_TARGET} PROPERTY CUDA_STANDARD 17) ################################## VFC SHARED LIBRARY ####################################### @@ -225,7 +227,9 @@ set_target_properties(${VFC_SHARED_TARGET} PROPERTIES DEBUG_POSTFIX ${TRT_DEBUG_ set_target_properties(${VFC_SHARED_TARGET} PROPERTIES VERSION ${TRT_VERSION} SOVERSION ${TRT_SOVERSION} ) -set_property(TARGET ${VFC_SHARED_TARGET} PROPERTY CUDA_STANDARD 14) +set_property(TARGET ${VFC_SHARED_TARGET} PROPERTY CUDA_STANDARD 17) + +target_link_directories(${VFC_SHARED_TARGET} PUBLIC ${CUDA_ROOT}/lib) target_link_libraries(${VFC_SHARED_TARGET} ${CUDART_LIB} diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h b/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h index e1b51b9d..0d84188c 100644 --- a/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h +++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention/include/fused_multihead_attention.h @@ -388,6 +388,8 @@ extern unsigned char cubin_fmha_v1_fp16_96_64_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v1_fp16_64_64_sm90_cu_cubin[]; #endif // defined(ENABLE_SM90) + + #if defined(ENABLE_SM75) extern uint32_t fused_multihead_attention_fp16_64_64_kernel_sm75_cu_o_len; extern uint32_t fused_multihead_attention_fp16_96_64_kernel_sm75_cu_o_len; @@ -432,11 +434,13 @@ extern uint32_t cubin_fmha_v1_fp16_96_64_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v1_fp16_64_64_sm90_cu_cubin_len; #endif // defined(ENABLE_SM90) -#if !(defined(ENABLE_SM72) || defined(ENABLE_SM75) || defined(ENABLE_SM80) || defined(ENABLE_SM86) \ - || defined(ENABLE_SM87) || defined(ENABLE_SM89) || defined(ENABLE_SM90)) -// TRT-17573: Remove SM72 support from this file by factoring out the common logic required by the + + +#if !(defined(ENABLE_SM72) || defined(ENABLE_SM75) || defined(ENABLE_SM80) || defined(ENABLE_SM86) \ + || defined(ENABLE_SM87) || defined(ENABLE_SM89) || defined(ENABLE_SM90)) + // V2 headers into a separate header. -#error This file can only be included one of sm 72, 75, 80, 86, 87, 89, or 90 are defined. +#error This file can only be included if one of sm 72, 75, 80, 86, 87, 89 or 90 is defined. #endif static const struct FusedMultiHeadAttentionKernelMetaInfoV1 { @@ -552,6 +556,7 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV1 {DATA_TYPE_FP16, 64, 64, kSM_90, cubin_fmha_v1_fp16_64_64_sm90_cu_cubin, cubin_fmha_v1_fp16_64_64_sm90_cu_cubin_len, "fmha_v1_fp16_64_64_sm90_kernel", 32768, 128}, #endif // defined(ENABLE_SM90) + }; using FusedMultiHeadAttentionXMMAKernel diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h index bdc143b3..bb57561d 100644 --- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h +++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/include/fused_multihead_attention_v2.h @@ -206,6 +206,8 @@ extern unsigned char cubin_fmha_v2_fp16_96_64_sm90_cu_cubin[]; extern unsigned char cubin_fmha_v2_fp16_64_64_sm90_cu_cubin[]; #endif // defined(ENABLE_SM90) + + #if defined(ENABLE_SM72) extern uint32_t fused_multihead_attention_v2_int8_128_64_kernel_cubin_len; extern uint32_t fused_multihead_attention_v2_int8_192_64_kernel_cubin_len; @@ -312,9 +314,13 @@ extern uint32_t cubin_fmha_v2_fp16_96_64_sm90_cu_cubin_len; extern uint32_t cubin_fmha_v2_fp16_64_64_sm90_cu_cubin_len; #endif // defined(ENABLE_SM90) -#if !(defined(ENABLE_SM72) || defined(ENABLE_SM75) || defined(ENABLE_SM80) || defined(ENABLE_SM86) || defined(ENABLE_SM87) || defined(ENABLE_SM89) || defined(ENABLE_SM90)) -#error This file can only be included one of sm 72, 75, 80, 86, 87, 89, or 90 are defined. + + +#if !(defined(ENABLE_SM72) || defined(ENABLE_SM75) || defined(ENABLE_SM80) || defined(ENABLE_SM86) \ + || defined(ENABLE_SM87) || defined(ENABLE_SM89) || defined(ENABLE_SM90)) +#error This file can only be included if one of sm 72, 75, 80, 86, 87, 89 or 90 is defined. #endif + static const struct FusedMultiHeadAttentionKernelMetaInfoV2 { Data_type mDataType; @@ -802,6 +808,8 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2 {DATA_TYPE_FP16, 64, 64, kSM_90, cubin_fmha_v2_fp16_64_64_sm90_cu_cubin, cubin_fmha_v2_fp16_64_64_sm90_cu_cubin_len, "fmha_v2_fp16_64_64_sm90_kernel_nl", 20480, 128, 16, false}, #endif // defined(ENABLE_SM90) + + }; class FusedMultiHeadAttentionXMMAKernelV2 @@ -892,6 +900,7 @@ class FusedMultiHeadAttentionXMMAKernelV2 {kSM_90, bert::DATA_TYPE_INT8, 256, 8}, {kSM_90, bert::DATA_TYPE_INT8, 384, 8}, #endif + }; for (uint32_t i = 0U; i < sizeof(unrollList) / sizeof(unrollList[0]); ++i) { @@ -937,6 +946,7 @@ class FusedMultiHeadAttentionXMMAKernelV2 #if defined(ENABLE_SM90) << "90 " #endif + << "\n"; PLUGIN_VALIDATE(findIter != mFunctions.end(), errMsg.str().c_str()); diff --git a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp index 4642ab1a..aea7d681 100644 --- a/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp +++ b/plugin/bertQKVToContextPlugin/fused_multihead_attention_v2/src/fused_multihead_attention_v2_fp16_64_64_kernel.sm75.cpp @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - namespace nvinfer1 { namespace plugin diff --git a/plugin/bertQKVToContextPlugin/mhaRunner.cu b/plugin/bertQKVToContextPlugin/mhaRunner.cu index 1e10fe6c..116b9756 100644 --- a/plugin/bertQKVToContextPlugin/mhaRunner.cu +++ b/plugin/bertQKVToContextPlugin/mhaRunner.cu @@ -925,7 +925,8 @@ public: , sm(mhaInterface->mSm) , xmmaKernel(getXMMAKernelsV2(DATA_TYPE_FP16, sm)) { - assert((sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_87 || sm == kSM_89 || sm == kSM_90) + assert((sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_87 || sm == kSM_89 || sm == kSM_90 + ) && "Unsupported architecture"); params.clear(); } @@ -1096,7 +1097,8 @@ public: , xmmas_n(0U) , threads_per_cta(1U) { - assert((sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_87 || sm == kSM_89 || sm == kSM_90) + assert((sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_87 || sm == kSM_89 || sm == kSM_90 + ) && "Unsupported architecture"); params.clear(); } diff --git a/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp b/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp index 45d589bd..83a5517d 100644 --- a/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp +++ b/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPlugin.cpp @@ -63,9 +63,10 @@ QKVToContextInterleavedPlugin::QKVToContextInterleavedPlugin(std::string const& mUseExplicitInt8 = static_cast(useExplicitInt8); // variable sequence length is only supported with the fused MHA kernels // we should not override mS! - PLUGIN_VALIDATE((mSM == kSM_AMPERE_100 || mSM == kSM_AMPERE_10X || mSM == kSM_AMPERE_10B || mSM == kSM_TURING - || mSM == kSM_XAVIER || mSM == kSM_ADA_10X || mSM == kSM_HOPPER_100) - && "requesting maxSeqlen not compatible with GPU arch"); + bool isSMSupported = mSM == kSM_AMPERE_100 || mSM == kSM_AMPERE_10X || mSM == kSM_AMPERE_10B || mSM == kSM_TURING + || mSM == kSM_XAVIER || mSM == kSM_ADA_10X || mSM == kSM_HOPPER_100 + ; + PLUGIN_VALIDATE(isSMSupported && "requesting maxSeqlen not compatible with GPU arch"); // the layout changes: SxB will be a combined \sum_i s_i and hdim will be the 2nd dimension instead of the third mXmmaKernel = getXMMAKernelsV2(DATA_TYPE_INT8, mSM); } diff --git a/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPluginLegacy.cpp b/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPluginLegacy.cpp index 748d4b9c..0b7a1b5a 100644 --- a/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPluginLegacy.cpp +++ b/plugin/bertQKVToContextPlugin/qkvToContextInt8InterleavedPluginLegacy.cpp @@ -64,9 +64,10 @@ QKVToContextInterleavedPluginLegacy::QKVToContextInterleavedPluginLegacy(std::st mSM = getSMVersion(); // variable sequence length is only supported with the fused MHA kernels // we should not override mS! - PLUGIN_VALIDATE((mSM == kSM_AMPERE_100 || mSM == kSM_AMPERE_10X || mSM == kSM_AMPERE_10B || mSM == kSM_TURING - || mSM == kSM_XAVIER || mSM == kSM_ADA_10X || mSM == kSM_HOPPER_100) - && "requesting maxSeqlen not compatible with GPU arch"); + bool isSMSupported = mSM == kSM_AMPERE_100 || mSM == kSM_AMPERE_10X || mSM == kSM_AMPERE_10B || mSM == kSM_TURING + || mSM == kSM_XAVIER || mSM == kSM_ADA_10X || mSM == kSM_HOPPER_100 + ; + PLUGIN_VALIDATE(isSMSupported && "requesting maxSeqlen not compatible with GPU arch"); // the layout changes: SxB will be a combined \sum_i s_i and hdim will be the 2nd dimension instead of the third mXmmaKernel = getXMMAKernelsV2(DATA_TYPE_INT8, mSM); } diff --git a/plugin/bertQKVToContextPlugin/qkvToContextPlugin.cpp b/plugin/bertQKVToContextPlugin/qkvToContextPlugin.cpp index 294d020c..d0c49115 100644 --- a/plugin/bertQKVToContextPlugin/qkvToContextPlugin.cpp +++ b/plugin/bertQKVToContextPlugin/qkvToContextPlugin.cpp @@ -333,8 +333,9 @@ int32_t QKVToContextPluginDynamic::onShapeChange( createMHARunner(); - // during build, configurePlugin() should have set mS, mB appropriately - // during inference, the engine should have mS, mB information + // mS and mB that are set by configurePlugin() may be stale + mS = inDesc.dims.d[SDIM]; + mB = inDesc.dims.d[BDIM]; PLUGIN_ASSERT(mS); PLUGIN_ASSERT(mB); if (fusedDispatcher.get() && fusedDispatcher->isValid(mHeadSize, mS)) @@ -624,7 +625,7 @@ IPluginV3* QKVToContextPluginDynamicCreator::createPlugin( int32_t b = -1; int32_t sm = -1; bool hasUnfusedDispatcher = false; - void const* runnerStateBuffer; + void const* runnerStateBuffer = nullptr; float dqProbs = -1; PLUGIN_VALIDATE(fc->fields != nullptr); @@ -784,9 +785,9 @@ QKVToContextVarSeqlenPlugin::QKVToContextVarSeqlenPlugin(std::string const name, { // variable sequence length is only supported with the fused MHA kernels // we should not override mS! - PLUGIN_ASSERT( - (mSM == kSM_90 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_89 || mSM == kSM_80 || mSM == kSM_75) - && (type == DataType::kINT8 || type == DataType::kHALF) + bool isSMSupported = + mSM == kSM_90 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_89 || mSM == kSM_80 || mSM == kSM_75; + PLUGIN_ASSERT(isSMSupported && (type == DataType::kINT8 || type == DataType::kHALF) && "requesting maxSeqlen not compatible with GPU arch"); // the layout changes: SxB will be a combined \sum_i s_i and hdim will be the 2nd dimension instead of the third mHdim = 1; @@ -815,9 +816,9 @@ QKVToContextVarSeqlenPlugin::QKVToContextVarSeqlenPlugin(std::string const name, { // variable sequence length is only supported with the fused MHA kernels // we should not override mS! - PLUGIN_ASSERT( - (mSM == kSM_90 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_89 || mSM == kSM_80 || mSM == kSM_75) - && (type == DataType::kINT8 || type == DataType::kHALF) + bool isSMSupported = + mSM == kSM_90 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_89 || mSM == kSM_80 || mSM == kSM_75; + PLUGIN_ASSERT(isSMSupported && (type == DataType::kINT8 || type == DataType::kHALF) && "requesting maxSeqlen not compatible with GPU arch"); // the layout changes: SxB will be a combined \sum_i s_i and hdim will be the 2nd dimension instead of the third mHdim = 1; @@ -947,13 +948,14 @@ bool QKVToContextVarSeqlenPlugin::supportsFormatCombination( int32_t pos, DynamicPluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept { // we only support variable sequence and int8 IO in fused mha runner, and we only support fused mha runner on - // Turing, Ampere and Hopper + // Turing, Ampere, Hopper. bool const hasV2Kernels - = (mSM == kSM_90 || mSM == kSM_89 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_80 || mSM == kSM_75); + = ( + mSM == kSM_90 || mSM == kSM_89 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_80 || mSM == kSM_75); PLUGIN_ASSERT( - (mType != DataType::kINT8 || hasV2Kernels) && "INT8 IO is only supported on Xavier, Turing, Ampere and Hopper"); + (mType != DataType::kINT8 || hasV2Kernels) && "INT8 IO is only supported on Xavier, Turing, Ampere, Hopper!"); PLUGIN_ASSERT( - (!mUseVarSeqlen || hasV2Kernels) && "Variable sequence is only supported on Xavier, Turing, Ampere and Hopper"); + (!mUseVarSeqlen || hasV2Kernels) && "Variable sequence is only supported on Xavier, Turing, Ampere, Hopper!"); PLUGIN_ASSERT(pos >= 0); PLUGIN_ASSERT(pos < 2 + mHasImask + 2 * mUseVarSeqlen); diff --git a/plugin/bertQKVToContextPlugin/qkvToContextPluginLegacy.cpp b/plugin/bertQKVToContextPlugin/qkvToContextPluginLegacy.cpp index 850e8217..9ebcc2a8 100644 --- a/plugin/bertQKVToContextPlugin/qkvToContextPluginLegacy.cpp +++ b/plugin/bertQKVToContextPlugin/qkvToContextPluginLegacy.cpp @@ -617,9 +617,9 @@ QKVToContextVarSeqlenPluginLegacy::QKVToContextVarSeqlenPluginLegacy(std::string { // variable sequence length is only supported with the fused MHA kernels // we should not override mS! - PLUGIN_ASSERT( - (mSM == kSM_90 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_89 || mSM == kSM_80 || mSM == kSM_75) - && (type == DataType::kINT8 || type == DataType::kHALF) + bool isSMSupported = + mSM == kSM_90 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_89 || mSM == kSM_80 || mSM == kSM_75; + PLUGIN_ASSERT(isSMSupported && (type == DataType::kINT8 || type == DataType::kHALF) && "requesting maxSeqlen not compatible with GPU arch"); // the layout changes: SxB will be a combined \sum_i s_i and hdim will be the 2nd dimension instead of the third mHdim = 1; @@ -728,7 +728,8 @@ bool QKVToContextVarSeqlenPluginLegacy::supportsFormatCombination( // we only support variable sequence and int8 IO in fused mha runner, and we only support fused mha runner on // Turing, Ampere and Hopper bool const hasV2Kernels - = (mSM == kSM_90 || mSM == kSM_89 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_80 || mSM == kSM_75); + = ( + mSM == kSM_90 || mSM == kSM_89 || mSM == kSM_87 || mSM == kSM_86 || mSM == kSM_80 || mSM == kSM_75); PLUGIN_ASSERT( (mType != DataType::kINT8 || hasV2Kernels) && "INT8 IO is only supported on Xavier, Turing, Ampere and Hopper"); PLUGIN_ASSERT( diff --git a/plugin/common/bertCommon.h b/plugin/common/bertCommon.h index a52b380f..0f6e8d00 100644 --- a/plugin/common/bertCommon.h +++ b/plugin/common/bertCommon.h @@ -113,7 +113,8 @@ inline int32_t getMHAMaskPackedSize(int32_t smVersion, nvinfer1::DataType dataTy // this code must match EmbLayerNormPluginDynamic::getOutputDimensions in embLayerNormPlugin.cpp int32_t packedSize = unfusedMaskSize; bool isSmOK = (smVersion == kSM_75 || smVersion == kSM_80 || smVersion == kSM_86 || smVersion == kSM_87 - || smVersion == kSM_89 || smVersion == kSM_90); + || smVersion == kSM_89 || smVersion == kSM_90 + ); bool isPrecisionOK = (dataType == nvinfer1::DataType::kINT8 || dataType == nvinfer1::DataType::kHALF); if (isSmOK && isPrecisionOK) { diff --git a/plugin/common/plugin.cpp b/plugin/common/plugin.cpp index a4b228ff..86f437a3 100644 --- a/plugin/common/plugin.cpp +++ b/plugin/common/plugin.cpp @@ -27,7 +27,7 @@ namespace pluginInternal // when not needed, instead of on process exit. // Objects of this class shall always be declared static / global, and shall never own cudnn/cublas handle // resources. -template +template class PerContextPluginHandleSingletonCreator { public: @@ -35,17 +35,17 @@ class PerContextPluginHandleSingletonCreator // It forces separation of memory for T and memory for control blocks. // So when T is released, but we still have observer weak_ptr in mObservers, the T mem block can be released. // creator itself must not own cudnn/cublas handle resources. Only the object it creates can. - PerContextPluginHandleSingletonCreator(std::function()> creator) + PerContextPluginHandleSingletonCreator(std::function()> creator) : mCreator{std::move(creator)} {}; // \param executionContextIdentifier Unique pointer to identify contexts having overlapping lifetime. - std::shared_ptr operator()(void* executionContextIdentifier) + std::shared_ptr operator()(void* executionContextIdentifier) { std::lock_guard lk{mMutex}; - std::shared_ptr result = mObservers[executionContextIdentifier].lock(); + std::shared_ptr result = mObservers[executionContextIdentifier].lock(); if (result == nullptr) { - auto deleter = [this, executionContextIdentifier](T_* obj) { + auto deleter = [this, executionContextIdentifier](T* obj) { if (obj == nullptr) { return; @@ -53,7 +53,7 @@ class PerContextPluginHandleSingletonCreator delete obj; // Clears observer to avoid growth of mObservers, in case users create/destroy // plugin handle contexts frequently. - std::shared_ptr observedObjHolder; + std::shared_ptr observedObjHolder; // The destructor of observedObjHolder may attempt to acquire a lock on mMutex. // To avoid deadlock, it's critical to release the lock here held by lk first, // before destroying observedObjHolder. Hence observedObjHolder must be declared @@ -71,7 +71,7 @@ class PerContextPluginHandleSingletonCreator } }; // Create the resource and register with an observer. - result = std::shared_ptr{mCreator().release(), std::move(deleter)}; + result = std::shared_ptr{mCreator().release(), std::move(deleter)}; mObservers.at(executionContextIdentifier) = result; } @@ -79,10 +79,10 @@ class PerContextPluginHandleSingletonCreator }; private: - std::function()> mCreator; + std::function()> mCreator; mutable std::mutex mMutex; // cudnn/cublas handle resources are per-context. - std::unordered_map> mObservers; + std::unordered_map> mObservers; }; // class PerContextPluginHandleSingletonCreator std::unique_ptr createPluginCudnnWrapperImpl() diff --git a/plugin/common/serialize.hpp b/plugin/common/serialize.hpp index 8fcef07f..8be010c9 100644 --- a/plugin/common/serialize.hpp +++ b/plugin/common/serialize.hpp @@ -41,8 +41,7 @@ struct Serializer }; template -struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> +struct Serializer || std::is_enum_v || std::is_pod_v>> { static size_t serialized_size(T const&) { @@ -86,7 +85,7 @@ struct Serializer template struct Serializer, - typename std::enable_if::value || std::is_enum::value || std::is_pod::value>::type> + typename std::enable_if_t || std::is_enum_v || std::is_pod_v>> { static size_t serialized_size(std::vector const& value) { diff --git a/plugin/disentangledAttentionPlugin/disentangledKernel.cu b/plugin/disentangledAttentionPlugin/disentangledKernel.cu index 2636fd8f..8a2d0b76 100644 --- a/plugin/disentangledAttentionPlugin/disentangledKernel.cu +++ b/plugin/disentangledAttentionPlugin/disentangledKernel.cu @@ -32,9 +32,9 @@ using namespace nvinfer1; // template specialization for double/float template , double>::value - || std::is_same, float>::value, - TDataType>::type* dummy + typename std::enable_if_t, double> + || std::is_same_v, float>, + TDataType>* dummy = nullptr> __forceinline__ __device__ void compute_attention( TDataType& res, const TDataType& res0, const TDataType& res1, const TDataType& res2, const TDataType& factor) @@ -44,9 +44,9 @@ __forceinline__ __device__ void compute_attention( // template specialization for half template , __half>::value - || std::is_same, half>::value, - TDataType>::type* dummy + typename std::enable_if_t, __half> + || std::is_same_v, half>, + TDataType>* dummy = nullptr> __forceinline__ __device__ void compute_attention( TDataType& res, const TDataType& res0, const TDataType& res1, const TDataType& res2, const TDataType& factor) @@ -62,9 +62,9 @@ __forceinline__ __device__ void compute_attention( // template specialization for int8 template , int8_t>::value - || std::is_same, uint8_t>::value, - TDataType>::type* dummy + typename std::enable_if_t, int8_t> + || std::is_same_v, uint8_t>, + TDataType>* dummy = nullptr> __forceinline__ __device__ void compute_attention( TDataType& res, const TDataType& res0, const TDataType& res1, const TDataType& res2, const TDataType& factor) @@ -222,12 +222,12 @@ __global__ void GatherAddGatherTransposeAddMul_fused(TDataType const* data0, TDa #if __cplusplus >= 201703L // C++ 17 has more convenient `if constexpr` for conditional implementation at compile time; before C++ 17, // switch to template specialization - if constexpr (std::is_same::value || std::is_same::value) + if constexpr (std::is_same_v || std::is_same_v) { // double, float32 res = (res0 + res1 + T[threadIdx.x][ty + threadIdx.y]) * factor; } - else if constexpr (std::is_same::value || std::is_same::value) + else if constexpr (std::is_same_v || std::is_same_v) { // fp16 #if __CUDA_ARCH__ >= 530 @@ -240,7 +240,7 @@ __global__ void GatherAddGatherTransposeAddMul_fused(TDataType const* data0, TDa * __half2float(factor)); #endif } - else if constexpr (std::is_same::value || std::is_same::value) + else if constexpr (std::is_same_v || std::is_same_v) { // int8_t res = (res0 + res1 + T[threadIdx.x][ty + threadIdx.y]) * factor; diff --git a/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp b/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp index c682d48f..6b9b0422 100644 --- a/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp +++ b/plugin/embLayerNormPlugin/embLayerNormPlugin.cpp @@ -421,7 +421,8 @@ int32_t EmbLayerNormPluginDynamic::getOutputShapes(DimsExprs const* inputs, int3 // this code must match getMHAMaskPackedSize in bertCommon.h bool const isSmOK - = (mSM == kSM_75 || mSM == kSM_80 || mSM == kSM_86 || mSM == kSM_87 || mSM == kSM_89 || mSM == kSM_90); + = (mSM == kSM_75 || mSM == kSM_80 || mSM == kSM_86 || mSM == kSM_87 || mSM == kSM_89 || mSM == kSM_90 + ); bool const isPrecisionOK = (mMhaType == nvinfer1::DataType::kHALF || mMhaType == nvinfer1::DataType::kINT8); if (mUseFullMask || (isSmOK && isPrecisionOK)) { diff --git a/plugin/embLayerNormPlugin/embLayerNormPluginLegacy.cpp b/plugin/embLayerNormPlugin/embLayerNormPluginLegacy.cpp index 6c037189..3048b3f7 100644 --- a/plugin/embLayerNormPlugin/embLayerNormPluginLegacy.cpp +++ b/plugin/embLayerNormPlugin/embLayerNormPluginLegacy.cpp @@ -167,7 +167,8 @@ DimsExprs EmbLayerNormPluginDynamicLegacy::getOutputDimensions( // this code must match getMHAMaskPackedSize in bertCommon.h bool const isSmOK - = (mSM == kSM_75 || mSM == kSM_80 || mSM == kSM_86 || mSM == kSM_87 || mSM == kSM_89 || mSM == kSM_90); + = (mSM == kSM_75 || mSM == kSM_80 || mSM == kSM_86 || mSM == kSM_87 || mSM == kSM_89 || mSM == kSM_90 + ); bool const isPrecisionOK = (mMhaType == nvinfer1::DataType::kHALF || mMhaType == nvinfer1::DataType::kINT8); if (mUseFullMask || (isSmOK && isPrecisionOK)) { diff --git a/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu b/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu index 3bf35f6b..31cca913 100644 --- a/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu +++ b/plugin/instanceNormalizationPlugin/instanceNormFwdImpl.cu @@ -66,8 +66,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA, DESIRED_OCCUPANCY) void instanceNo // else: // return M2 / (n - 1) - bool const IS_INPUT_INT8 = std::is_same::value; - bool const IS_OUTPUT_INT8 = std::is_same::value; + bool const IS_INPUT_INT8 = std::is_same_v; + bool const IS_OUTPUT_INT8 = std::is_same_v; // The number of pixels loaded in a single LDG. int32_t const PIXELS_PER_LDG = THREADS_PER_CTA / THREADS_PER_PIXEL; diff --git a/plugin/reorgPlugin/reorgPlugin.cpp b/plugin/reorgPlugin/reorgPlugin.cpp index 227c59d9..5fb41775 100644 --- a/plugin/reorgPlugin/reorgPlugin.cpp +++ b/plugin/reorgPlugin/reorgPlugin.cpp @@ -340,11 +340,11 @@ char const* ReorgPluginCreator::getPluginName() const noexcept template char const* ReorgPluginCreator::getPluginVersion() const noexcept { - if (std::is_same::value) + if (std::is_same_v) { return kREORG_PLUGIN_STATIC_VERSION; } - else if (std::is_same::value) + else if (std::is_same_v) { return kREORG_PLUGIN_DYNAMIC_VERSION; } diff --git a/python/docstrings/infer/pyCoreDoc.h b/python/docstrings/infer/pyCoreDoc.h index ceb4f363..4437405b 100644 --- a/python/docstrings/infer/pyCoreDoc.h +++ b/python/docstrings/infer/pyCoreDoc.h @@ -599,6 +599,7 @@ constexpr char const* set_all_tensors_debug_state = R"trtdoc( :arg flag: True if turning on debug state of tensor. False if turning off. )trtdoc"; + } // namespace IExecutionContextDoc namespace IDebugListenerDoc @@ -714,6 +715,7 @@ constexpr char const* descr = R"trtdoc( :ivar weight_streaming_budget_v2: Set and get the current weight streaming budget for inference. The budget may be set any non-negative value. A value of 0 streams the most weights. Values equal to streamable_weights_size (default) or larger will disable weight streaming. :ivar weight_streaming_scratch_memory_size: The amount of scratch memory required by a TensorRT ExecutionContext to perform inference. This value may change based on the current weight streaming budget. Please use the V2 memory APIs, engine.device_memory_size_v2 and ExecutionContext.set_device_memory() to provide memory which includes the current weight streaming scratch memory. Not specifying these APIs or using the V1 APIs will not include this memory, so TensorRT will resort to allocating itself. )trtdoc" + ; // Documentation bug with parameters on these three functions because they are overloaded. @@ -962,6 +964,53 @@ constexpr char const* read = R"trtdoc( )trtdoc"; } // namespace StreamReaderDoc +namespace StreamReaderV2Doc +{ +constexpr char const* descr = R"trtdoc( + Application-implemented class for asynchronously reading data from a stream. Implementation does not need to be + asynchronous or use the provided cuda stream. Python users are unlikely to see performance gains over IStreamReader + or deserialization from a glob. + + To implement a custom stream reader, ensure that you explicitly instantiate the base class in :func:`__init__` : + :: + class MyStreamReader(trt.IStreamReaderV2): + def __init__(self): + trt.IStreamReaderV2.__init__(self) + + def read(self, num_bytes: int, stream: int) -> bytes: + ... # Your implementation here + + def seek(self, offset: int, where: SeekPosition) -> bool: + ... # Your implementation here +)trtdoc"; + +constexpr char const* read = R"trtdoc( + A callback implemented by the application to read a particular chunk of memory. + + :arg num_bytes: The number of bytes required. + :arg stream: A handle to the cudaStream your implementation can use for reading. + + :returns: A buffer containing the bytes read. +)trtdoc"; + +constexpr char const* seek = R"trtdoc( + A callback implemented by the application to set the stream location. + + :arg offset: The offset within the stream to seek to. + :arg where: A `SeekPosition` enum specifying where the offset is relative to. + + :returns: A buffer containing the bytes read. +)trtdoc"; +} // namespace StreamReaderV2Doc + +namespace SeekPositionDoc +{ +constexpr char const* descr + = R"trtdoc(Specifies what the offset is relative to when calling `seek` on an `IStreamReaderV2`.)trtdoc"; +constexpr char const* SET = R"trtdoc(Offsets forward from the start of the stream.)trtdoc"; +constexpr char const* CUR = R"trtdoc(Offsets forward from the current position within the stream.)trtdoc"; +constexpr char const* END = R"trtdoc(Offsets backward from the end of the stream.)trtdoc"; +} // namespace SeekPositionDoc namespace BuilderFlagDoc { @@ -988,7 +1037,8 @@ constexpr char const* OBEY_PRECISION_CONSTRAINTS constexpr char const* PREFER_PRECISION_CONSTRAINTS = R"trtdoc(Prefer that layers execute in specified precisions. Fall back (with warning) to another precision if build would otherwise fail.)trtdoc"; constexpr char const* DIRECT_IO - = R"trtdoc(Require that no reformats be inserted between a layer and a network I/O tensor for which ITensor.allowed_formats was set. Build fails if a reformat is required for functional correctness.)trtdoc"; + = R"trtdoc(Require that no reformats be inserted between a layer and a network I/O tensor for which ``ITensor.allowed_formats`` was set. Build fails if a reformat is required for functional correctness. + [DEPRECATED] Deprecated in TensorRT 10.7.))trtdoc"; constexpr char const* REJECT_EMPTY_ALGORITHMS = R"trtdoc(Fail if IAlgorithmSelector.select_algorithms returns an empty set of algorithms.)trtdoc"; constexpr char const* VERSION_COMPATIBLE @@ -1532,6 +1582,7 @@ constexpr char const* get_preview_feature = R"trtdoc( :returns: true if the feature is enabled, false otherwise )trtdoc"; + } // namespace IBuilderConfigDoc namespace SerializationFlagDoc @@ -1685,6 +1736,16 @@ constexpr char const* deserialize_cuda_engine_reader = R"trtdoc( :returns: The :class:`ICudaEngine`, or None if it could not be deserialized. )trtdoc"; +constexpr char const* deserialize_cuda_engine_reader_v2 = R"trtdoc( + Deserialize an :class:`ICudaEngine` from a stream reader v2. + + :arg stream_reader: The :class:`PyStreamReaderV2` that will read the serialized :class:`ICudaEngine`. This + enables deserialization from a file directly, with possible benefits to performance. + + :returns: The :class:`ICudaEngine`, or None if it could not be deserialized. +)trtdoc"; + + constexpr char const* get_plugin_registry = R"trtdoc( Get the local plugin registry that can be used by the runtime. diff --git a/python/docstrings/infer/pyGraphDoc.h b/python/docstrings/infer/pyGraphDoc.h index 32a23068..e8c657ed 100644 --- a/python/docstrings/infer/pyGraphDoc.h +++ b/python/docstrings/infer/pyGraphDoc.h @@ -72,6 +72,8 @@ constexpr char const* NON_ZERO = R"trtdoc(NonZero layer)trtdoc"; constexpr char const* REVERSE_SEQUENCE = R"trtdoc(ReverseSequence layer)trtdoc"; constexpr char const* NORMALIZATION = R"trtdoc(Normalization layer)trtdoc"; constexpr const char* PLUGIN_V3 = R"trtdoc(PluginV3 layer)trtdoc"; +constexpr const char* SQUEEZE = R"trtdoc(Squeeze layer)trtdoc"; +constexpr const char* UNSQUEEZE = R"trtdoc(Unsqueeze layer)trtdoc"; } // namespace LayerTypeDoc namespace TensorFormatDoc @@ -184,7 +186,7 @@ namespace ITensorDoc constexpr const char* descr = R"trtdoc( A tensor in an :class:`INetworkDefinition` . - :ivar name: :class:`str` The tensor name. For a network input, the name is assigned by the application. For tensors which are layer outputs, a default name is assigned consisting of the layer name followed by the index of the output in brackets. + :ivar name: :class:`str` The tensor name. For a network input, the name is assigned by the application. For tensors which are layer outputs, a default name is assigned consisting of the layer name followed by the index of the output in brackets. Each network input and output tensor must have a unique name. :ivar shape: :class:`Dims` The shape of a tensor. For a network input the shape is assigned by the application. For a network output it is computed based on the layer parameters and the inputs to the layer. If a tensor size or a parameter is modified in the network, the shape of all dependent tensors will be recomputed. This call is only legal for network input tensors, since the shape of layer output tensors are inferred based on layer inputs and parameters. @@ -199,8 +201,10 @@ constexpr const char* descr = R"trtdoc( :ivar is_shape: :class:`bool` Whether the tensor is a shape tensor. :ivar allowed_formats: :class:`int32` The allowed set of TensorFormat candidates. This should be an integer consisting of one or more :class:`TensorFormat` s, combined via bitwise OR after bit shifting. For example, ``1 << int(TensorFormat.CHW4) | 1 << int(TensorFormat.CHW32)``. )trtdoc" + ; + constexpr const char* set_dynamic_range = R"trtdoc( [DEPRECATED] Deprecated in TensorRT 10.1. Superseded by explicit quantization. Set dynamic range for the tensor. @@ -1821,6 +1825,71 @@ constexpr const char* descr = R"trtdoc( )trtdoc"; } // namespace INormalizationLayerDoc +namespace ISqueezeLayerDoc +{ +constexpr const char* descr = R"trtdoc( + A Squeeze layer in an :class:`INetworkDefinition` . + + This layer represents a squeeze operation, removing unit dimensions of the input tensor on a set of axes. + + Axes must be resolvable to a constant Int32 or Int64 1D shape tensor. + Values in axes must be unique and in the range of [-r, r-1], where r is the rank of the input tensor. + For each axis value, the corresponding dimension in the input tensor must be one. + +)trtdoc"; + +constexpr const char* set_input = R"trtdoc( + Sets the input tensor for the given index. The index must be 0 or 1 for a Squeeze layer. + + The indices are as follows: + + ===== ================================================================================== + Index Description + ===== ================================================================================== + 0 Input data tensor. + 1 The axes to remove. Must be resolvable to a constant Int32 or Int64 1D shape tensor. + ===== ================================================================================== + + :arg index: The index of the input tensor. + :arg tensor: The input tensor. +)trtdoc"; + +} // namespace ISqueezeLayerDoc + +namespace IUnsqueezeLayerDoc +{ +constexpr const char* descr = R"trtdoc( + An Unsqueeze layer in an :class:`INetworkDefinition` . + + This layer represents an unsqueeze operation, which reshapes the input tensor by inserting unit-length dimensions at specified axes of the output. + + Axes must be resolvable to a constant Int32 or Int64 shape tensor. + Values in axes must be unique and in the range of [-r_final, r_final-1], where r_final is the sum of rank(input) and len(axes). + + r_final must be less than Dims.MAX_DIMS. + +)trtdoc"; + +constexpr const char* set_input = R"trtdoc( + Sets the input tensor for the given index. The index must be 0 or 1 for an Unsqueeze layer. + + The indices are as follows: + + ===== ================================================================================== + Index Description + ===== ================================================================================== + 0 Input data tensor. + 1 The axes to add. Must be resolvable to a constant Int32 or Int64 1D shape tensor. + ===== ================================================================================== + + :arg index: The index of the input tensor. + :arg tensor: The input tensor. +)trtdoc"; + +} // namespace IUnsqueezeLayerDoc + + + namespace INetworkDefinitionDoc { constexpr const char* descr = R"trtdoc( @@ -1846,9 +1915,9 @@ constexpr const char* get_flag = R"trtdoc( constexpr const char* add_input = R"trtdoc( Adds an input to the network. - :arg name: The name of the tensor. + :arg name: The name of the tensor. Each input and output tensor must have a unique name. :arg dtype: The data type of the tensor. Currently, tensorrt.int8 is not supported for inputs. - :arg shape: The dimensions of the tensor. The total volume must be less than 2^30 elements. + :arg shape: The dimensions of the tensor. The total volume must be less than 2^31 elements. :returns: The newly added Tensor. )trtdoc"; @@ -2505,6 +2574,28 @@ constexpr char const* add_normalization = R"trtdoc( :returns: the new Normalization layer, or :class:`None` if it could not be created. )trtdoc"; +constexpr char const* add_squeeze = R"trtdoc( + Adds a Squeeze layer to the network. + See :class:`ISqueezeLayer` for more information. + + :arg input: The input tensor to the layer. + :arg axes: The tensor containing axes to remove. Must be resolvable to a constant Int32 or Int64 1D shape tensor. + + :returns: the new Squeeze layer, or :class:`None` if it could not be created. +)trtdoc"; + +constexpr char const* add_unsqueeze = R"trtdoc( + Adds an Unsqueeze layer to the network. + See :class:`IUnsqueezeLayer` for more information. + + :arg input: The input tensor to the layer. + :arg axes: The tensor containing axes to add. Must be resolvable to a constant Int32 or Int64 1D shape tensor. + + :returns: the new Unsqueeze layer, or :class:`None` if it could not be created. +)trtdoc"; + + + } // namespace INetworkDefinitionDoc } // namespace tensorrt diff --git a/python/include/utils.h b/python/include/utils.h index 2f0d5bdc..c9144022 100644 --- a/python/include/utils.h +++ b/python/include/utils.h @@ -109,7 +109,7 @@ constexpr auto deprecate(RetVal (*func)(Args...), const char* useInstead) -> Dep template struct DeprecatedMemberFunc { - using Func = typename std::conditional::type; + using Func = typename std::conditional_t; RetVal operator()(Cls& self, Args... args) const { diff --git a/python/packaging/bindings_wheel/setup.cfg b/python/packaging/bindings_wheel/setup.cfg index b6f5905e..e4e7908d 100644 --- a/python/packaging/bindings_wheel/setup.cfg +++ b/python/packaging/bindings_wheel/setup.cfg @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/python/packaging/bindings_wheel/setup.py b/python/packaging/bindings_wheel/setup.py index 19184de7..918b6e93 100644 --- a/python/packaging/bindings_wheel/setup.py +++ b/python/packaging/bindings_wheel/setup.py @@ -19,20 +19,20 @@ from setuptools import setup -tensorrt_module = "##TENSORRT_MODULE##" -package_name = "##TENSORRT_MODULE##" +distribution_package_name = "##TENSORRT_MODULE##" +import_package_name = "##TENSORRT_MODULE##" # This file expects the following to be passed from the environment when using standalone wheels: # - STANDALONE: Whether we are building a standalone wheel IS_STANDALONE = os.environ.get("STANDALONE") == "1" if IS_STANDALONE: - tensorrt_module += "-cu##CUDA_MAJOR##_bindings" - package_name += "_bindings" + distribution_package_name += "_cu##CUDA_MAJOR##_bindings" + import_package_name += "_bindings" -plugin_subpackage_name = f"{package_name}.plugin" +plugin_import_package_name = f"{import_package_name}.plugin" setup( - name=tensorrt_module, + name=distribution_package_name, version="##TENSORRT_PYTHON_VERSION##", description="A high performance deep learning inference library", long_description="A high performance deep learning inference library", @@ -43,9 +43,9 @@ "Intended Audience :: Developers", "Programming Language :: Python :: 3", ], - packages=[package_name, plugin_subpackage_name], + packages=[import_package_name, plugin_import_package_name], extras_require={"numpy": "numpy"}, - package_data={package_name: ["*.so*", "*.pyd", "*.pdb", "*.dll*"]}, + package_data={import_package_name: ["*.so*", "*.pyd", "*.pdb", "*.dll*"]}, include_package_data=True, zip_safe=True, keywords="nvidia tensorrt deeplearning inference", diff --git a/python/packaging/bindings_wheel/tensorrt/plugin/__init__.py b/python/packaging/bindings_wheel/tensorrt/plugin/__init__.py index c6d55190..e72aeee5 100644 --- a/python/packaging/bindings_wheel/tensorrt/plugin/__init__.py +++ b/python/packaging/bindings_wheel/tensorrt/plugin/__init__.py @@ -18,7 +18,7 @@ import tensorrt as trt logger = trt.Logger() -logger.log(trt.Logger.WARNING, "Functionality provided through tensorrt.plugin module is experimental in TensorRT 10.6.") +logger.log(trt.Logger.WARNING, "Functionality provided through tensorrt.plugin module is experimental.") # export.public_api() will expose things here. To make sure that happens, we just need to # import all the submodules so that the decorator is actually executed (__discover_modules() below). diff --git a/python/packaging/bindings_wheel/tensorrt/plugin/_plugin_class.py b/python/packaging/bindings_wheel/tensorrt/plugin/_plugin_class.py index c0910421..9e681e2f 100644 --- a/python/packaging/bindings_wheel/tensorrt/plugin/_plugin_class.py +++ b/python/packaging/bindings_wheel/tensorrt/plugin/_plugin_class.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import tensorrt as trt from typing import Tuple diff --git a/python/packaging/bindings_wheel/tensorrt/plugin/_tensor.py b/python/packaging/bindings_wheel/tensorrt/plugin/_tensor.py index 5e9e711b..26a2c81e 100644 --- a/python/packaging/bindings_wheel/tensorrt/plugin/_tensor.py +++ b/python/packaging/bindings_wheel/tensorrt/plugin/_tensor.py @@ -130,7 +130,7 @@ def constant_value(self) -> int: "Not accessible for non-constant shape expressions. Check is_constant to determine accessibility." ) return self._expr.get_constant_value() - + # Evaluate the underlying trt.IDimensionExpr, if so done lazily @property def _expr(self): @@ -142,7 +142,7 @@ class SizeTensorShapeExpr(ShapeExpr): Extends :class:`ShapeExpr` A shape expression that represent a size tensor - + """ def __init__(self, size_tensor_desc: "SizeTensorDesc"): """ @@ -155,7 +155,7 @@ def __init__(self, size_tensor_desc: "SizeTensorDesc"): def _op(self, op: trt.DimensionOperation, other: Union[int, "ShapeExpr"]): raise ValueError("It is not permitted to perform binary operations on size tensor expressions") # TRT limitation - + @property def is_constant(self): if self._is_dummy: @@ -163,7 +163,7 @@ def is_constant(self): "Not accessible for fake 'ShapeExpr's. Check is_fake to determine accessibility." ) return False - + @property def _expr(self): if self._dim_expr is not None: @@ -171,7 +171,7 @@ def _expr(self): self._dim_expr = super()._exprBuilder.declare_size_tensor(self._size_tensor_desc.index, self._size_tensor_desc.opt._expr, self._size_tensor_desc.upper_bound._expr) return self._dim_expr - + def __repr__(self): return f"ShapeExpr[is_size_tensor = True, id={id(self)}]" @@ -237,16 +237,21 @@ class Shape: Numerical representation of a tensor shape """ def __init__( - self, tensor_desc: Union[int, trt.DynamicPluginTensorDesc, trt.PluginTensorDesc] + self, tensor_desc: Union[Tuple[int], trt.DynamicPluginTensorDesc, trt.PluginTensorDesc] ): - self._desc = tensor_desc self._is_dynamic = None # set lazily if isinstance(tensor_desc, trt.DynamicPluginTensorDesc): self._length = len(tensor_desc.desc.dims) self._shapes = tensor_desc.desc.dims + self._desc = tensor_desc elif isinstance(tensor_desc, trt.PluginTensorDesc): self._length = len(tensor_desc.dims) self._shapes = tensor_desc.dims + elif isinstance(tensor_desc, tuple): + self._shapes = trt.Dims(tensor_desc) + self._length = len(self._shapes) + else: + raise ValueError("Unsupported type used for constructing trt.plugin.Shape! tensor_desc must be a Tuple[int], trt.DynamicPluginTensorDesc, or trt.PluginTensorDesc") def numel(self) -> int: """ @@ -293,6 +298,10 @@ def opt(self) -> Tuple[int]: """ if not self.is_dynamic: raise ValueError("opt property is only accessible if is_dynamic is true") + if not hasattr(self, "_desc"): + raise AttributeError( + "Shape object has at least one dynamic dimension, but no information is available on 'opt' property." + ) return tuple(self._desc.opt) @property @@ -302,6 +311,10 @@ def min(self) -> Tuple[int]: """ if not self.is_dynamic: raise ValueError("min property is only accessible if is_dynamic is true") + if not hasattr(self, "_desc"): + raise AttributeError( + "Shape object has at least one dynamic dimension, but no information is available on 'min' property." + ) return tuple(self._desc.min) @property @@ -311,12 +324,16 @@ def max(self) -> Tuple[int]: """ if not self.is_dynamic: raise ValueError("max property is only accessible if is_dynamic is true") + if not hasattr(self, "_desc"): + raise AttributeError( + "Shape object has at least one dynamic dimension, but no information is available on 'max' property." + ) return tuple(self._desc.max) def __setitem__(self, index, val): if index >= self._length: raise IndexError("Index out of range") - self._shapes.desc[index] = val + self._shapes[index] = val # Descriptor for a tensor @@ -344,7 +361,7 @@ def __init__(self, shape_expr: ShapeExprs = None, dtype: trt.DataType = None, fo .. code-block:: python :linenos: :caption: Creates a TensorDesc from shape expression of another TensorDesc - + tensor = trt.from_shape_expr(other.shape_expr, dtype=trt.float32) """ @@ -363,7 +380,7 @@ def __init__(self, shape_expr: ShapeExprs = None, dtype: trt.DataType = None, fo def numel(self) -> int: """ Returns: - Returns an int with the number of elements of the tensor. + Returns an int with the number of elements of the tensor. .. warning:: Should only be called when TensorDesc.has_shape is true. If a symbolic expression for the number of elements is required, query TensorDesc.shape_expr.numel(). @@ -373,14 +390,14 @@ def numel(self) -> int: "TensorDesc has no shape information available at this stage. Inspect TensorDesc.has_shape to determine availability." ) return int(np.prod(self.shape)) - + @property def ndim(self) -> int: """ Number of dimensions """ return len(self._shape_expr) - + @property def is_size_tensor(self): return False @@ -389,12 +406,12 @@ def is_size_tensor(self): def like(self) -> "TensorDesc": """ Returns: - Returns a TensorDesc which has identical properties to this tensor, and is mutable. + Returns a TensorDesc which has identical properties to this tensor, and is mutable. .. code-block:: python :linenos: :caption: Communicate that output tensor has identical properties to the input tensor - + @tensorrt.plugin.register("my::plugin") def _(inp: tensorrt.plugin.TensorDesc) -> tensorrt.plugin.TensorDesc: return inp.like() @@ -414,7 +431,7 @@ def aliased(self) -> "TensorDesc": .. code-block:: python :linenos: :caption: Communicate that output tensor has identical properties to the input tensor - + @tensorrt.plugin.register("my::plugin") def _(inp: tensorrt.plugin.TensorDesc) -> tensorrt.plugin.TensorDesc: return inp.aliased() @@ -438,7 +455,7 @@ def _validate_has_shape(self) -> None: raise ValueError( "TensorDesc has no shape information available at this stage. Inspect TensorDesc.has_shape to determine availability." ) - + def _validate_not_immutable(self): if hasattr(self, "_immutable") and self._immutable: raise ValueError("Cannot modify immutable TensorDesc") @@ -446,21 +463,21 @@ def _validate_not_immutable(self): @property def shape_expr(self) -> ShapeExprs: """ - Symbolic expressions for the tensor shape. + Symbolic expressions for the tensor shape. """ return self._shape_expr @property def dtype(self) -> trt.DataType: """ - Data type of the tensor. + Data type of the tensor. """ return self._dtype - + @property def shape(self) -> Shape: """ - The (concrete) shape of the tensor. + The (concrete) shape of the tensor. .. warning:: Only accessible when TensorDesc.has_shape is true. @@ -471,7 +488,7 @@ def shape(self) -> Shape: @property def format(self) -> trt.TensorFormat: """ - The format of the tensor. + The format of the tensor. .. warning:: Only accessible when TensorDesc.has_shape is true. @@ -482,15 +499,15 @@ def format(self) -> trt.TensorFormat: @property def scale(self) -> float: """ - Scale for INT8 data type. + Scale for INT8 data type. .. warning:: Only accessible when TensorDesc.has_shape is true. """ self._validate_has_shape() return self._scale - - + + @shape_expr.setter def shape_expr(self, value): self._shape_expr = value @@ -498,7 +515,7 @@ def shape_expr(self, value): @dtype.setter def dtype(self, value): self._dtype = value - + @shape.setter def shape(self, value): self._validate_not_immutable() @@ -571,11 +588,11 @@ def __init__(self, opt: ShapeExpr, upper_bound: ShapeExpr): self._upper_bound = upper_bound self._index = None self._expr = SizeTensorShapeExpr(self) - + @property def is_size_tensor(self): return True - + @property def opt(self) -> ShapeExpr: """ @@ -596,7 +613,7 @@ def index(self) -> int: Output index at which this size tensor resides """ return self._index - + def _set_index(self, idx: int): self._index = idx @@ -637,21 +654,21 @@ def ndim(self) -> int: Number of dimensions """ return len(self._shape) - + @property def data_ptr(self) -> int: """ Pointer to the data buffer of this tensor """ return self._data_ptr - + @property def dtype(self) -> trt.DataType: """ - Data type of the tensor. + Data type of the tensor. """ return self._dtype - + @property def shape(self) -> Shape: """ @@ -672,7 +689,7 @@ def scale(self) -> float: Scale for INT8 data type. """ return self._scale - + @property def strides(self) -> Tuple[int]: """ @@ -683,11 +700,11 @@ def strides(self) -> Tuple[int]: @data_ptr.setter def data_ptr(self, value): self._data_ptr = value - + @dtype.setter def dtype(self, value): self._dtype = value - + @shape.setter def shape(self, value): self._shape = value diff --git a/python/packaging/frontend_sdist/setup.cfg b/python/packaging/frontend_sdist/setup.cfg index 5b78c91c..031e5396 100644 --- a/python/packaging/frontend_sdist/setup.cfg +++ b/python/packaging/frontend_sdist/setup.cfg @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/python/packaging/frontend_sdist/setup.py b/python/packaging/frontend_sdist/setup.py index 512d3ce1..80f8e05b 100644 --- a/python/packaging/frontend_sdist/setup.py +++ b/python/packaging/frontend_sdist/setup.py @@ -24,12 +24,12 @@ from setuptools import setup from setuptools.command.install import install -tensorrt_module = "##TENSORRT_MODULE##-cu##CUDA_MAJOR##" -tensorrt_package = "##TENSORRT_MODULE##" +distribution_package_name = "##TENSORRT_MODULE##_cu##CUDA_MAJOR##" +import_package_name = "##TENSORRT_MODULE##" tensorrt_version = "##TENSORRT_PYTHON_VERSION##" tensorrt_submodules = [ - "{}_libs=={}".format(tensorrt_module, tensorrt_version), - "{}_bindings=={}".format(tensorrt_module, tensorrt_version), + "{}_libs=={}".format(distribution_package_name, tensorrt_version), + "{}_bindings=={}".format(distribution_package_name, tensorrt_version), ] nvidia_pip_index_url = os.environ.get("NVIDIA_PIP_INDEX_URL", "https://pypi.nvidia.com") disable_internal_pip = os.environ.get("NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP", False) @@ -129,7 +129,7 @@ def parent_command_line(): setup( - name=tensorrt_module, + name=distribution_package_name, version=tensorrt_version, description="A high performance deep learning inference library", long_description=""" @@ -159,13 +159,13 @@ def parent_command_line(): "Intended Audience :: Developers", "Programming Language :: Python :: 3", ], - packages=[tensorrt_package], + packages=[import_package_name], install_requires=install_requires, setup_requires=["wheel", "pip"], python_requires=">=3.6", # ref https://pypi.nvidia.com/tensorrt-bindings/ cmdclass=cmdclass, extras_require={"numpy": "numpy"}, - package_data={tensorrt_package: ["*.so*", "*.pyd", "*.pdb", "*.dll*"]}, + package_data={import_package_name: ["*.so*", "*.pyd", "*.pdb", "*.dll*"]}, include_package_data=True, zip_safe=True, keywords="nvidia tensorrt deeplearning inference", diff --git a/python/packaging/libs_wheel/setup.cfg b/python/packaging/libs_wheel/setup.cfg index 5b78c91c..031e5396 100644 --- a/python/packaging/libs_wheel/setup.cfg +++ b/python/packaging/libs_wheel/setup.cfg @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/python/packaging/libs_wheel/setup.py b/python/packaging/libs_wheel/setup.py index 2d20f09f..054c565b 100644 --- a/python/packaging/libs_wheel/setup.py +++ b/python/packaging/libs_wheel/setup.py @@ -18,8 +18,8 @@ from setuptools import setup -module_name = "##TENSORRT_MODULE##-cu##CUDA_MAJOR##_libs" -package_name = "##TENSORRT_MODULE##_libs" +distribution_package_name = "##TENSORRT_MODULE##_cu##CUDA_MAJOR##_libs" +import_package_name = "##TENSORRT_MODULE##_libs" def get_requirements(): @@ -28,7 +28,7 @@ def get_requirements(): setup( - name=module_name, + name=distribution_package_name, version="##TENSORRT_PYTHON_VERSION##", description="TensorRT Libraries", long_description="TensorRT Libraries", @@ -39,9 +39,9 @@ def get_requirements(): "Intended Audience :: Developers", "Programming Language :: Python :: 3", ], - packages=[package_name], + packages=[import_package_name], install_requires=get_requirements(), - package_data={package_name: ["*.so*", "*.pyd", "*.pdb", "*.dll*"]}, + package_data={import_package_name: ["*.so*", "*.pyd", "*.pdb", "*.dll*"]}, include_package_data=True, zip_safe=True, keywords="nvidia tensorrt deeplearning inference", diff --git a/python/packaging/metapackage/setup.py b/python/packaging/metapackage/setup.py index 11a972d6..1dd43a2f 100644 --- a/python/packaging/metapackage/setup.py +++ b/python/packaging/metapackage/setup.py @@ -18,11 +18,11 @@ from setuptools import setup -module_name = "##TENSORRT_MODULE##" +distribution_package_name = "##TENSORRT_MODULE##" setup( - name=module_name, + name=distribution_package_name, version="##TENSORRT_PYTHON_VERSION##", description="TensorRT Metapackage", long_description="TensorRT Metapackage", @@ -35,7 +35,7 @@ ], packages=[], install_requires=[ - "##TENSORRT_MODULE##-cu##CUDA_MAJOR##==##TENSORRT_PYTHON_VERSION##" + "##TENSORRT_MODULE##_cu##CUDA_MAJOR##==##TENSORRT_PYTHON_VERSION##" ], include_package_data=True, zip_safe=True, diff --git a/python/src/infer/pyCore.cpp b/python/src/infer/pyCore.cpp index ae913bac..0b78ad69 100644 --- a/python/src/infer/pyCore.cpp +++ b/python/src/infer/pyCore.cpp @@ -18,6 +18,7 @@ // This contains the core elements of the API, i.e. builder, logger, engine, runtime, context. #include "ForwardDeclarations.h" #include "utils.h" + #include #include #include @@ -165,6 +166,10 @@ static const auto runtime_deserialize_cuda_engine = [](IRuntime& self, py::buffe return self.deserializeCudaEngine(info.ptr, info.size * info.itemsize); }; +static const auto reader_v2_read = [](IStreamReaderV2& self, void* destination, int64_t nbBytes, size_t stream) { + return self.read(destination, nbBytes, reinterpret_cast(stream)); +}; + // For ICudaEngine @@ -351,6 +356,7 @@ void serialization_config_set_flags(ISerializationConfig& self, uint32_t flags) } } + // For IDebugListener, this function is intended to be override by client. // The bindings here will never be called and is for documentation purpose only. void docProcessDebugTensor(IDebugListener& self, void const* addr, TensorLocation location, DataType type, @@ -630,6 +636,66 @@ class PyStreamReader : public IStreamReader } }; +class PyStreamReaderV2 : public IStreamReaderV2 +{ +public: + int64_t read(void* destination, int64_t nbBytes, cudaStream_t stream) noexcept override + { + try + { + py::gil_scoped_acquire gil{}; + py::function pyFunc = utils::getOverride(static_cast(this), "read"); + + if (!pyFunc) + { + return 0; + } + + intptr_t cudaStreamPtr = reinterpret_cast(stream); + + py::buffer data = pyFunc(nbBytes, cudaStreamPtr); // user implements this + py::buffer_info info = data.request(); + int64_t bytesRead = info.size * info.itemsize; + std::memcpy(destination, info.ptr, std::min(bytesRead, nbBytes)); + return bytesRead; + } + catch (std::exception const& e) + { + std::cerr << "[ERROR] Exception caught in read(): " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "[ERROR] Exception caught in read()" << std::endl; + } + return 0; + } + + bool seek(int64_t offset, SeekPosition where) noexcept override + { + try + { + py::gil_scoped_acquire gil{}; + py::function pyFunc = utils::getOverride(static_cast(this), "seek"); + + if (!pyFunc) + { + return false; + } + + py::bool_ ret = pyFunc(offset, where); + return ret; + } + catch (std::exception const& e) + { + std::cerr << "[ERROR] Exception caught in seek(): " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "[ERROR] Exception caught in seek()" << std::endl; + } + return false; + } +}; class PyDebugListener : public IDebugListener { @@ -1100,6 +1166,7 @@ void bindCore(py::module& m) .def("get_debug_state", &IExecutionContext::getDebugState, "name"_a, IExecutionContextDoc::get_debug_state) .def("set_all_tensors_debug_state", &IExecutionContext::setAllTensorsDebugState, "flag"_a, IExecutionContextDoc::set_all_tensors_debug_state) + ; py::enum_(m, "ExecutionContextAllocationStrategy", py::arithmetic{}, @@ -1297,6 +1364,7 @@ void bindCore(py::module& m) // End weight streaming APIs .def("is_debug_tensor", &ICudaEngine::isDebugTensor, "name"_a, ICudaEngineDoc::is_debug_tensor) + .def("__del__", &utils::doNothingDel); py::enum_(m, "AllocatorFlag", py::arithmetic{}, AllocatorFlagDoc::descr, py::module_local()) @@ -1338,6 +1406,15 @@ void bindCore(py::module& m) .def(py::init<>()) .def("read", &IStreamReader::read, "destination"_a, "size"_a, StreamReaderDoc::read); + py::enum_(m, "SeekPosition", py::arithmetic{}, SeekPositionDoc::descr, py::module_local()) + .value("SET", SeekPosition::kSET, SeekPositionDoc::SET) + .value("CUR", SeekPosition::kCUR, SeekPositionDoc::CUR) + .value("END", SeekPosition::kEND, SeekPositionDoc::END); + + py::class_(m, "IStreamReaderV2", StreamReaderV2Doc::descr, py::module_local()) + .def(py::init<>()) + .def("read", lambdas::reader_v2_read, "destination"_a, "num_bytes"_a, "stream"_a, StreamReaderV2Doc::seek) + .def("seek", &IStreamReaderV2::seek, "offset"_a, "where"_a, StreamReaderV2Doc::read); py::enum_(m, "BuilderFlag", py::arithmetic{}, BuilderFlagDoc::descr, py::module_local()) .value("FP16", BuilderFlag::kFP16, BuilderFlagDoc::FP16) @@ -1503,6 +1580,7 @@ void bindCore(py::module& m) .def_property("max_aux_streams", &IBuilderConfig::getMaxAuxStreams, &IBuilderConfig::setMaxAuxStreams) .def_property("progress_monitor", &IBuilderConfig::getProgressMonitor, py::cpp_function(&IBuilderConfig::setProgressMonitor, py::keep_alive<1, 2>{})) + .def("__del__", &utils::doNothingDel); py::enum_(m, "NetworkDefinitionCreationFlag", py::arithmetic{}, @@ -1557,6 +1635,10 @@ void bindCore(py::module& m) .def("deserialize_cuda_engine", py::overload_cast(&IRuntime::deserializeCudaEngine), "stream_reader"_a, RuntimeDoc::deserialize_cuda_engine_reader, py::call_guard{}, py::keep_alive<0, 1>{}) + .def("deserialize_cuda_engine", py::overload_cast(&IRuntime::deserializeCudaEngine), + "stream_reader_v2"_a, RuntimeDoc::deserialize_cuda_engine_reader_v2, + py::call_guard{}, py::keep_alive<0, 1>{}) + .def_property("DLA_core", &IRuntime::getDLACore, &IRuntime::setDLACore) .def_property_readonly("num_DLA_cores", &IRuntime::getNbDLACores) .def_property("gpu_allocator", nullptr, py::cpp_function(&IRuntime::setGpuAllocator, py::keep_alive<1, 2>{})) diff --git a/python/src/infer/pyGraph.cpp b/python/src/infer/pyGraph.cpp index b4fc21c6..2fde9dd7 100644 --- a/python/src/infer/pyGraph.cpp +++ b/python/src/infer/pyGraph.cpp @@ -25,6 +25,7 @@ #include "NvInferSerialize.h" #endif + #include "infer/pyGraphDoc.h" // clang-format off @@ -255,6 +256,7 @@ namespace tensorrt }; + } /* lambdas */ void bindGraph(py::module& m) @@ -308,6 +310,8 @@ namespace tensorrt .value("REVERSE_SEQUENCE", LayerType::kREVERSE_SEQUENCE, LayerTypeDoc::REVERSE_SEQUENCE) .value("NORMALIZATION", LayerType::kNORMALIZATION, LayerTypeDoc::NORMALIZATION) .value("PLUGIN_V3", LayerType::kPLUGIN_V3, LayerTypeDoc::PLUGIN_V3) + .value("SQUEEZE", LayerType::kSQUEEZE, LayerTypeDoc::SQUEEZE) + .value("UNSQUEEZE", LayerType::kUNSQUEEZE, LayerTypeDoc::UNSQUEEZE) ; // LayerType @@ -346,6 +350,7 @@ namespace tensorrt .def("reset_dynamic_range", utils::deprecateMember(&ITensor::resetDynamicRange, "Deprecated in TensorRT 10.1. Superseded by explicit quantization."), ITensorDoc::reset_dynamic_range) .def("set_dimension_name", &ITensor::setDimensionName, "index"_a, "name"_a, ITensorDoc::set_dimension_name) .def("get_dimension_name", &ITensor::getDimensionName, "index"_a, ITensorDoc::get_dimension_name) + ; py::class_>(m, "ILayer", ILayerDoc::descr, py::module_local()) @@ -828,6 +833,13 @@ namespace tensorrt .def_property("num_groups", &INormalizationLayer::getNbGroups, &INormalizationLayer::setNbGroups) .def_property("compute_precision", &INormalizationLayer::getComputePrecision, &INormalizationLayer::setComputePrecision) ; + py::class_>(m, "ISqueezeLayer", ISqueezeLayerDoc::descr, py::module_local()) + .def("set_input", &ISqueezeLayer::setInput, "index"_a, "tensor"_a, ISqueezeLayerDoc::set_input) + ; + py::class_>(m, "IUnsqueezeLayer", IUnsqueezeLayerDoc::descr, py::module_local()) + .def("set_input", &IUnsqueezeLayer::setInput, "index"_a, "tensor"_a, IUnsqueezeLayerDoc::set_input) + ; + // Weights must be kept alive for the duration of the network. py::keep_alive is critical here! @@ -968,6 +980,9 @@ namespace tensorrt .def("mark_debug", &INetworkDefinition::markDebug, "tensor"_a, INetworkDefinitionDoc::mark_debug) .def("unmark_debug", &INetworkDefinition::unmarkDebug, "tensor"_a, INetworkDefinitionDoc::unmark_debug) .def("is_debug_tensor", &INetworkDefinition::isDebugTensor, "tensor"_a, INetworkDefinitionDoc::is_debug_tensor) + .def("add_squeeze", &INetworkDefinition::addSqueeze, "input"_a, "axes"_a, INetworkDefinitionDoc::add_squeeze, py::return_value_policy::reference_internal) + .def("add_unsqueeze", &INetworkDefinition::addUnsqueeze, "input"_a, "axes"_a, INetworkDefinitionDoc::add_unsqueeze, py::return_value_policy::reference_internal) + #if ENABLE_INETWORK_SERIALIZE // Serialization .def("serialize", lambdas::network_serialize, INetworkDefinitionDoc::serialize) diff --git a/samples/CMakeSamplesTemplate.txt b/samples/CMakeSamplesTemplate.txt index 285e3f99..87fbbe0f 100644 --- a/samples/CMakeSamplesTemplate.txt +++ b/samples/CMakeSamplesTemplate.txt @@ -22,6 +22,7 @@ endif() set_ifndef(PLUGINS_NEEDED OFF) set_ifndef(SAMPLE_PARSERS "none") +set_ifndef(CUDA_LIBS_REQUIRED False) set(TARGET_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -128,6 +129,11 @@ set_target_properties(${TARGET_NAME} RUNTIME_OUTPUT_DIRECTORY "${TRT_OUT_DIR}" ) +# Add CUDA dependencies for samples that require them. +if(${CUDA_LIBS_REQUIRED}) + target_link_directories(${TARGET_NAME} PUBLIC ${CUDA_ROOT}/lib) +endif() + add_dependencies(samples ${TARGET_NAME}) ################################### INSTALLATION ######################################## diff --git a/samples/common/common.h b/samples/common/common.h index d5c3711c..8212d940 100644 --- a/samples/common/common.h +++ b/samples/common/common.h @@ -96,7 +96,7 @@ template OBJ_GUARD(T) makeObjGuard(T_* t) { - CHECK(!(std::is_base_of::value || std::is_same::value)); + static_assert(std::is_base_of_v || std::is_same_v); auto deleter = [](T* t) { delete t; }; return std::unique_ptr{static_cast(t), deleter}; } @@ -207,7 +207,7 @@ using nvinfer1::utils::buildTimingCacheFromFile; using nvinfer1::utils::saveTimingCacheFile; using nvinfer1::utils::updateTimingCacheFile; // Swaps endianness of an integral type. -template ::value, int>::type = 0> +template , int> = 0> inline T swapEndianness(const T& value) { uint8_t bytes[sizeof(T)]; @@ -914,12 +914,6 @@ inline bool isDataTypeSupported(nvinfer1::DataType dataType) return false; } - if ((dataType == nvinfer1::DataType::kINT8 && !builder->platformHasFastInt8()) - || (dataType == nvinfer1::DataType::kHALF && !builder->platformHasFastFp16())) - { - return false; - } - return true; } } // namespace samplesCommon diff --git a/samples/common/half.h b/samples/common/half.h index b997e7db..05f0815e 100644 --- a/samples/common/half.h +++ b/samples/common/half.h @@ -873,7 +873,7 @@ template uint16 int2half_impl(T value) { #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_integral::value, "int to half conversion only supports builtin integer types"); + static_assert(std::is_integral_v, "int to half conversion only supports builtin integer types"); #endif if (S) value = -value; @@ -1249,7 +1249,7 @@ template T half2int_impl(uint16 value) { #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_integral::value, "half to int conversion only supports builtin integer types"); + static_assert(std::is_integral_v, "half to int conversion only supports builtin integer types"); #endif uint32_t e = value & 0x7FFF; if (e >= 0x7C00) @@ -2617,7 +2617,7 @@ template struct half_caster { #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); + static_assert(std::is_arithmetic_v, "half_cast from non-arithmetic type unsupported"); #endif static half cast(U arg) @@ -2639,7 +2639,7 @@ template struct half_caster { #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); + static_assert(std::is_arithmetic_v, "half_cast to non-arithmetic type unsupported"); #endif static T cast(half arg) @@ -2661,7 +2661,7 @@ template struct half_caster { #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); + static_assert(std::is_arithmetic_v, "half_cast to non-arithmetic type unsupported"); #endif static T cast(expr arg) diff --git a/samples/common/safeCommon.h b/samples/common/safeCommon.h index 9bffdb2d..376aacaf 100644 --- a/samples/common/safeCommon.h +++ b/samples/common/safeCommon.h @@ -45,18 +45,21 @@ using namespace nvinfer1; -#undef CHECK -#define CHECK(status) \ +#undef CHECK_WITH_STREAM +#define CHECK_WITH_STREAM(status, stream) \ do \ { \ - auto ret = (status); \ - if (ret != 0) \ + if ((status) != cudaSuccess) \ { \ - std::cerr << "Cuda failure: " << ret << std::endl; \ + stream << "Cuda failure at " << __FILE__ << ":" << __LINE__ << ": " << cudaGetErrorString(status) \ + << std::endl; \ exit(EXIT_FAILURE); \ } \ } while (0) +#undef CHECK +#define CHECK(status) CHECK_WITH_STREAM(status, std::cerr) + #undef SAFE_ASSERT #define SAFE_ASSERT(condition) \ do \ @@ -68,7 +71,7 @@ using namespace nvinfer1; } \ } while (0) -#define LWE_CALL(api_call, recorder) \ +#define SAFE_API_CALL(api_call, recorder) \ do \ { \ const ErrorCode ret = (api_call); \ @@ -77,7 +80,7 @@ using namespace nvinfer1; std::cerr << "LWE Error: [" << #api_call << "]: " << toString(ret); \ throw ret; \ } \ - std::cout << "LWE:[" << #api_call << "]: PASSED"; \ + std::cout << "SAFE API:[" << #api_call << "]: PASSED" << std::endl; \ } while (0) #define CUDA_CALL(cuda_api_call, recorder) \ @@ -86,10 +89,10 @@ using namespace nvinfer1; cudaError_t error = (cuda_api_call); \ if (error != cudaSuccess) \ { \ - std::cerr << "CUDA Error: [" << #cuda_api_call << "]: " << cudaGetErrorString(error); \ + std::cerr << "CUDA Error: [" << #cuda_api_call << "]: " << cudaGetErrorString(error) << std::endl; \ throw ErrorCode::kFAILED_EXECUTION; \ } \ - std::cout << "CUDA:[" << #cuda_api_call << "]: PASSED"; \ + std::cout << "CUDA:[" << #cuda_api_call << "]: PASSED" << std::endl; \ } while (0) inline std::string toString(ErrorCode ec) @@ -231,8 +234,8 @@ inline int64_t volume(nvinfer1::Dims const& d) template inline T1 roundUp(T1 m, T2 n) { - static_assert(std::is_integral::value && std::is_integral::value, "arguments must be integers"); - static_assert(std::is_signed::value == std::is_signed::value, "mixed signedness not allowed"); + static_assert(std::is_integral_v && std::is_integral_v, "arguments must be integers"); + static_assert(std::is_signed_v == std::is_signed_v, "mixed signedness not allowed"); static_assert(sizeof(T1) >= sizeof(T2), "first type must be as least as wide as second type"); return ((m + n - 1) / n) * n; } @@ -247,20 +250,15 @@ inline int64_t volume(nvinfer1::Dims dims, int32_t vecDim, int32_t comps, int32_ return samplesCommon::volume(dims) * std::max(batch, 1); } +#if !TRT_WINML inline int32_t getSMVersion() { -#if 0 - // Use default value for 4090 - int32_t major{8}; - int32_t minor{9}; -#else int32_t major{}; int32_t minor{}; int32_t deviceIndex{}; CHECK(cudaGetDevice(&deviceIndex)); CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); -#endif return ((major << 8) | minor); } @@ -269,6 +267,7 @@ inline bool isSMSafe() const int32_t smVersion = getSMVersion(); return smVersion == 0x0705 || smVersion == 0x0800 || smVersion == 0x0806 || smVersion == 0x0807; } +#endif inline int32_t calculateSoftmax(float* const prob, int32_t const numDigits) { diff --git a/samples/common/sampleDevice.cpp b/samples/common/sampleDevice.cpp index e9ad78dd..cfbab380 100644 --- a/samples/common/sampleDevice.cpp +++ b/samples/common/sampleDevice.cpp @@ -22,15 +22,6 @@ namespace sample { -void cudaCheck(cudaError_t ret, std::ostream& err) -{ - if (ret != cudaSuccess) - { - err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; - exit(EXIT_FAILURE); - } -} - #if !TRT_WINML // Construct GPU UUID string in the same format as nvidia-smi does. std::string getUuidString(cudaUUID_t uuid) @@ -59,7 +50,7 @@ void setCudaDevice(int32_t device, std::ostream& os) // Get the number of visible GPUs. int32_t nbDevices{-1}; - cudaCheck(cudaGetDeviceCount(&nbDevices)); + CHECK(cudaGetDeviceCount(&nbDevices)); if (nbDevices <= 0) { @@ -73,7 +64,7 @@ void setCudaDevice(int32_t device, std::ostream& os) for (int32_t deviceIdx = 0; deviceIdx < nbDevices; ++deviceIdx) { cudaDeviceProp tempProperties; - cudaCheck(cudaGetDeviceProperties(&tempProperties, deviceIdx)); + CHECK(cudaGetDeviceProperties(&tempProperties, deviceIdx)); // clang-format off os << " Device " << deviceIdx << ": \"" << tempProperties.name << "\" UUID: " @@ -95,7 +86,7 @@ void setCudaDevice(int32_t device, std::ostream& os) } // Set to the corresponding GPU. - cudaCheck(cudaSetDevice(device)); + CHECK(cudaSetDevice(device)); // clang-format off os << "Selected Device: " << properties.name << std::endl; @@ -118,14 +109,14 @@ void setCudaDevice(int32_t device, std::ostream& os) int32_t getCudaDriverVersion() { int32_t version{-1}; - cudaCheck(cudaDriverGetVersion(&version)); + CHECK(cudaDriverGetVersion(&version)); return version; } int32_t getCudaRuntimeVersion() { int32_t version{-1}; - cudaCheck(cudaRuntimeGetVersion(&version)); + CHECK(cudaRuntimeGetVersion(&version)); return version; } #endif diff --git a/samples/common/sampleDevice.h b/samples/common/sampleDevice.h index ef6a00a2..43844918 100644 --- a/samples/common/sampleDevice.h +++ b/samples/common/sampleDevice.h @@ -24,14 +24,12 @@ #include #include +#include "safeCommon.h" #include "sampleUtils.h" namespace sample { -//! Check if the CUDA return status shows any error. If so, exit the program immediately. -void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr); - class TrtCudaEvent; namespace @@ -53,7 +51,7 @@ class TrtCudaStream public: TrtCudaStream() { - cudaCheck(cudaStreamCreate(&mStream)); + CHECK(cudaStreamCreate(&mStream)); } TrtCudaStream(const TrtCudaStream&) = delete; @@ -66,7 +64,7 @@ class TrtCudaStream ~TrtCudaStream() { - cudaCheck(cudaStreamDestroy(mStream)); + CHECK(cudaStreamDestroy(mStream)); } cudaStream_t get() const @@ -76,14 +74,14 @@ class TrtCudaStream void synchronize() { - cudaCheck(cudaStreamSynchronize(mStream)); + CHECK(cudaStreamSynchronize(mStream)); } void wait(TrtCudaEvent& event); void sleep(float* ms) { - cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms)); + CHECK(cudaLaunchHostFunc(mStream, cudaSleep, ms)); } private: @@ -100,7 +98,7 @@ class TrtCudaEvent explicit TrtCudaEvent(bool blocking = true) { const uint32_t flags = blocking ? cudaEventBlockingSync : cudaEventDefault; - cudaCheck(cudaEventCreateWithFlags(&mEvent, flags)); + CHECK(cudaEventCreateWithFlags(&mEvent, flags)); } TrtCudaEvent(const TrtCudaEvent&) = delete; @@ -113,7 +111,7 @@ class TrtCudaEvent ~TrtCudaEvent() { - cudaCheck(cudaEventDestroy(mEvent)); + CHECK(cudaEventDestroy(mEvent)); } cudaEvent_t get() const @@ -123,19 +121,19 @@ class TrtCudaEvent void record(const TrtCudaStream& stream) { - cudaCheck(cudaEventRecord(mEvent, stream.get())); + CHECK(cudaEventRecord(mEvent, stream.get())); } void synchronize() { - cudaCheck(cudaEventSynchronize(mEvent)); + CHECK(cudaEventSynchronize(mEvent)); } // Returns time elapsed time in milliseconds float operator-(const TrtCudaEvent& e) const { float time{0}; - cudaCheck(cudaEventElapsedTime(&time, e.get(), get())); + CHECK(cudaEventElapsedTime(&time, e.get(), get())); return time; } @@ -145,7 +143,7 @@ class TrtCudaEvent inline void TrtCudaStream::wait(TrtCudaEvent& event) { - cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0)); + CHECK(cudaStreamWaitEvent(mStream, event.get(), 0)); } //! @@ -175,7 +173,7 @@ class TrtCudaGraph void beginCapture(TrtCudaStream& stream) { - cudaCheck(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); + CHECK(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal)); } bool launch(TrtCudaStream& stream) @@ -185,9 +183,9 @@ class TrtCudaGraph void endCapture(TrtCudaStream& stream) { - cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph)); - cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); - cudaCheck(cudaGraphDestroy(mGraph)); + CHECK(cudaStreamEndCapture(stream.get(), &mGraph)); + CHECK(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); + CHECK(cudaGraphDestroy(mGraph)); } void endCaptureOnError(TrtCudaStream& stream) @@ -204,9 +202,9 @@ class TrtCudaGraph } else { - assert(ret == cudaSuccess); + CHECK(ret); assert(mGraph != nullptr); - cudaCheck(cudaGraphDestroy(mGraph)); + CHECK(cudaGraphDestroy(mGraph)); mGraph = nullptr; } // Clean up any CUDA error. @@ -298,7 +296,7 @@ struct DeviceAllocator { void operator()(void** ptr, size_t size) { - cudaCheck(cudaMalloc(ptr, size)); + CHECK(cudaMalloc(ptr, size)); } }; @@ -306,7 +304,7 @@ struct DeviceDeallocator { void operator()(void* ptr) { - cudaCheck(cudaFree(ptr)); + CHECK(cudaFree(ptr)); } }; @@ -314,7 +312,7 @@ struct ManagedAllocator { void operator()(void** ptr, size_t size) { - cudaCheck(cudaMallocManaged(ptr, size)); + CHECK(cudaMallocManaged(ptr, size)); } }; @@ -322,7 +320,7 @@ struct HostAllocator { void operator()(void** ptr, size_t size) { - cudaCheck(cudaMallocHost(ptr, size)); + CHECK(cudaMallocHost(ptr, size)); } }; @@ -330,7 +328,7 @@ struct HostDeallocator { void operator()(void* ptr) { - cudaCheck(cudaFreeHost(ptr)); + CHECK(cudaFreeHost(ptr)); } }; @@ -415,12 +413,12 @@ class DiscreteMirroredBuffer : public IMirroredBuffer void hostToDevice(TrtCudaStream& stream) override { - cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); + CHECK(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); } void deviceToHost(TrtCudaStream& stream) override { - cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); + CHECK(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); } size_t getSize() const override diff --git a/samples/common/sampleEngines.cpp b/samples/common/sampleEngines.cpp index 5dddceeb..eff17e1a 100644 --- a/samples/common/sampleEngines.cpp +++ b/samples/common/sampleEngines.cpp @@ -80,9 +80,8 @@ nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() if (mEngine == nullptr) { - SMP_RETVAL_IF_FALSE(getFileReader().isOpen() || !getBlob().empty(), "Engine is empty. Nothing to deserialize!", - nullptr, sample::gLogError); - + SMP_RETVAL_IF_FALSE(getAsyncFileReader().isOpen() || getFileReader().isOpen() || !getBlob().empty(), + "Engine is empty. Nothing to deserialize!", nullptr, sample::gLogError); using time_point = std::chrono::time_point; using duration = std::chrono::duration; time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; @@ -130,6 +129,10 @@ nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() { mEngine.reset(mRuntime->deserializeCudaEngine(getFileReader())); } + else if (getAsyncFileReader().isOpen()) + { + mEngine.reset(mRuntime->deserializeCudaEngine(getAsyncFileReader())); + } else { auto const& engineBlob = getBlob(); @@ -268,7 +271,7 @@ class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { for (auto& elem : mInputDeviceBuffers) { - cudaCheck(cudaFree(elem.second), mErr); + CHECK_WITH_STREAM(cudaFree(elem.second), mErr); } } @@ -316,8 +319,9 @@ RndInt8Calibrator::RndInt8Calibrator(int32_t batches, std::vector& elem std::generate_n(rnd_data.begin(), elemCount[i], gen); void* data; - cudaCheck(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); - cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); + CHECK_WITH_STREAM(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); + CHECK_WITH_STREAM( + cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); } @@ -1177,9 +1181,11 @@ bool networkToSerializedEngine( } // CUDA stream used for profiling by the builder. +#if !TRT_WINML auto profileStream = samplesCommon::makeCudaStream(); SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); config->setProfileStream(*profileStream); +#endif auto const tBegin = std::chrono::high_resolution_clock::now(); std::unique_ptr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; @@ -1310,6 +1316,13 @@ bool loadStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment return true; } +bool loadAsyncStreamingEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) +{ + auto& asyncReader = env.engine.getAsyncFileReader(); + SMP_RETVAL_IF_FALSE(asyncReader.open(filepath), "", false, err << "Error opening engine file: " << filepath); + return true; +} + bool loadEngineToBuildEnv(std::string const& filepath, BuildEnvironment& env, std::ostream& err) { @@ -1340,10 +1353,16 @@ bool printPlanVersion(BuildEnvironment& env, std::ostream& err) auto blob = data.data(); auto& reader = env.engine.getFileReader(); + auto& asyncReader = env.engine.getAsyncFileReader(); if (reader.isOpen()) { SMP_RETVAL_IF_FALSE(reader.read(data.data(), kPLAN_SIZE) == kPLAN_SIZE, "Failed to read plan file", false, err); } + else if (asyncReader.isOpen()) + { + SMP_RETVAL_IF_FALSE(asyncReader.read(data.data(), kPLAN_SIZE, cudaStream_t{}) == kPLAN_SIZE, + "Failed to read plan file", false, err); + } else { SMP_RETVAL_IF_FALSE(env.engine.getBlob().data != nullptr, "Plan file is empty", false, err); @@ -1428,7 +1447,14 @@ bool getEngineBuildEnv( } else { - createEngineSuccess = loadStreamingEngineToBuildEnv(build.engine, env, err); + if (build.asyncFileReader) + { + createEngineSuccess = loadAsyncStreamingEngineToBuildEnv(build.engine, env, err); + } + else + { + createEngineSuccess = loadStreamingEngineToBuildEnv(build.engine, env, err); + } } } else @@ -1455,7 +1481,16 @@ bool getEngineBuildEnv( if (!build.safe) { env.engine.releaseBlob(); - SMP_RETVAL_IF_FALSE(loadStreamingEngineToBuildEnv(build.engine, env, err), "Reading engine file failed.", false, err); + if (build.asyncFileReader) + { + SMP_RETVAL_IF_FALSE(loadAsyncStreamingEngineToBuildEnv(build.engine, env, err), + "Reading engine file via async stream reader failed.", false, err); + } + else + { + SMP_RETVAL_IF_FALSE(loadStreamingEngineToBuildEnv(build.engine, env, err), + "Reading engine file via stream reader failed.", false, err); + } } } @@ -1546,9 +1581,11 @@ std::vector> getAllRefitWeightsForLayer(const IL case LayerType::kSHUFFLE: case LayerType::kSLICE: case LayerType::kSOFTMAX: + case LayerType::kSQUEEZE: case LayerType::kTOPK: case LayerType::kTRIP_LIMIT: - case LayerType::kUNARY: return {}; + case LayerType::kUNARY: + case LayerType::kUNSQUEEZE: return {}; } return {}; } diff --git a/samples/common/sampleEngines.h b/samples/common/sampleEngines.h index d1d88319..ce300d37 100644 --- a/samples/common/sampleEngines.h +++ b/samples/common/sampleEngines.h @@ -80,6 +80,10 @@ class LazilyDeserializedEngine , mTempfileControls(tempfileControls) , mLeanDLLPath(leanDLLPath) { + // Only one of these is relevant for any given trtexec call. + // Enabled using --asyncFileReader flag. + mAsyncFileReader = std::make_unique(); + // Enabled using --load flag. mFileReader = std::make_unique(); } @@ -110,6 +114,8 @@ class LazilyDeserializedEngine { ASSERT((!mFileReader || !mFileReader->isOpen()) && "Attempting to access the glob when there is an open file reader!"); + ASSERT((!mAsyncFileReader || !mAsyncFileReader->isOpen()) + && "Attempting to access the glob when there is an open async file reader!"); if (!mEngineBlob.empty()) { return EngineBlob{const_cast(static_cast(mEngineBlob.data())), mEngineBlob.size()}; @@ -159,6 +165,17 @@ class LazilyDeserializedEngine return *mFileReader; } + //! + //! \brief Get the file stream reader used for deserialization + //! + //! when IStreamReader is eventually deprecated. + //! + samplesCommon::AsyncStreamReader& getAsyncFileReader() + { + ASSERT(mAsyncFileReader); + return *mAsyncFileReader; + } + //! //! \brief Get if safe mode is enabled. @@ -179,6 +196,8 @@ class LazilyDeserializedEngine int32_t mDLACore{-1}; std::vector mEngineBlob; std::unique_ptr mFileReader; + std::unique_ptr mAsyncFileReader; + // Directly use the host memory of a serialized engine instead of duplicating the engine in CPU memory. std::unique_ptr mEngineBlobHostMemory; diff --git a/samples/common/sampleInference.cpp b/samples/common/sampleInference.cpp index 77a99c1d..bb0f6673 100644 --- a/samples/common/sampleInference.cpp +++ b/samples/common/sampleInference.cpp @@ -237,10 +237,10 @@ bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inferenc int32_t const isIntegrated{}; #else int32_t device{}; - cudaCheck(cudaGetDevice(&device)); + CHECK(cudaGetDevice(&device)); cudaDeviceProp properties; - cudaCheck(cudaGetDeviceProperties(&properties, device)); + CHECK(cudaGetDeviceProperties(&properties, device)); int32_t const isIntegrated{properties.integrated}; #endif // Use managed memory on integrated devices when transfers are skipped @@ -531,7 +531,7 @@ TaskInferenceEnvironment::TaskInferenceEnvironment( std::unique_ptr tmp(new InferenceEnvironment(bEnv)); iEnv = std::move(tmp); - cudaCheck(cudaSetDevice(device)); + CHECK(cudaSetDevice(device)); SystemOptions system{}; system.device = device; system.DLACore = DLACore; @@ -625,7 +625,7 @@ class EnqueueExplicit : private Enqueue bool isStreamCapturing(TrtCudaStream& stream) const { cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone}; - cudaCheck(cudaStreamIsCapturing(stream.get(), &status)); + CHECK(cudaStreamIsCapturing(stream.get(), &status)); return status != cudaStreamCaptureStatusNone; } @@ -916,7 +916,7 @@ class Iteration { mGraph.endCaptureOnError(stream); // Ensure any CUDA error has been cleaned up. - cudaCheck(cudaGetLastError()); + CHECK(cudaGetLastError()); sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under " "CUDA graph capture mode." << std::endl; @@ -1018,7 +1018,7 @@ void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment& durationMs = inference.duration * 1000.F + warmupMs; } - cudaCheck(cudaSetDevice(device)); + CHECK(cudaSetDevice(device)); std::vector> iStreams; @@ -1080,7 +1080,7 @@ bool runInference( InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) { SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError); - cudaCheck(cudaProfilerStart()); + CHECK(cudaProfilerStart()); trace.resize(0); @@ -1106,7 +1106,7 @@ bool runInference( th.join(); } - cudaCheck(cudaProfilerStop()); + CHECK(cudaProfilerStop()); auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; std::sort(trace.begin(), trace.end(), cmpTrace); @@ -1116,7 +1116,7 @@ bool runInference( bool runMultiTasksInference(std::vector>& tEnvList) { - cudaCheck(cudaProfilerStart()); + CHECK(cudaProfilerStart()); cudaSetDeviceFlags(cudaDeviceScheduleSpin); SyncStruct sync; @@ -1137,7 +1137,7 @@ bool runMultiTasksInference(std::vectorgetPluginRegistry().loadLibrary(pluginPath.c_str()); } #endif - auto& reader = iEnv.engine.getFileReader(); - ASSERT(reader.isOpen()); - reader.reset(); - engine.reset(rt->deserializeCudaEngine(reader)); + auto& asyncReader = iEnv.engine.getAsyncFileReader(); + ASSERT(reader.isOpen() || asyncReader.isOpen()); + if (asyncReader.isOpen()) + { + asyncReader.reset(); + engine.reset(rt->deserializeCudaEngine(asyncReader)); + } + else + { + reader.reset(); + engine.reset(rt->deserializeCudaEngine(reader)); + } deserializeOK = (engine != nullptr); deserializeOK = (engine != nullptr); diff --git a/samples/common/sampleOptions.cpp b/samples/common/sampleOptions.cpp index 283091f1..803f9b35 100644 --- a/samples/common/sampleOptions.cpp +++ b/samples/common/sampleOptions.cpp @@ -357,8 +357,8 @@ bool getOption(Arguments& arguments, const std::string& option, T& value) //! Check if input option exists in input arguments. //! If it does: set its value, erase the argument and return true. //! If it does not: return false. -template -bool getAndDelOption(Arguments& arguments, const std::string& option, T_& value) +template +bool getAndDelOption(Arguments& arguments, const std::string& option, T& value) { bool found = getOption(arguments, option, value); if (found) @@ -373,13 +373,13 @@ bool getAndDelOption(Arguments& arguments, const std::string& option, T_& value) //! Check if input option exists in input arguments. //! If it does: set its value and position, erase the argument and return true. //! If it does not: return false. -template -bool getAndDelOptionWithPosition(Arguments& arguments, std::string const& option, T_& value, int32_t& pos) +template +bool getAndDelOptionWithPosition(Arguments& arguments, std::string const& option, T& value, int32_t& pos) { auto const match = arguments.find(option); if (match != arguments.end()) { - value = stringToValue(match->second.first); + value = stringToValue(match->second.first); pos = match->second.second; arguments.erase(match); return true; @@ -391,8 +391,8 @@ bool getAndDelOptionWithPosition(Arguments& arguments, std::string const& option //! Check if input option exists in input arguments behind the position spcecified by pos. //! If it does: set its value, erase the argument and return true. //! If it does not: return false. -template -bool getAndDelOptionBehind(Arguments& arguments, std::string const& option, int32_t pos, T_& value) +template +bool getAndDelOptionBehind(Arguments& arguments, std::string const& option, int32_t pos, T& value) { auto const match = arguments.equal_range(option); if (match.first == match.second) @@ -403,7 +403,7 @@ bool getAndDelOptionBehind(Arguments& arguments, std::string const& option, int3 { if (i->second.second - pos == 1) { - value = stringToValue(i->second.first); + value = stringToValue(i->second.first); arguments.erase(i); return true; } @@ -1185,7 +1185,9 @@ void BuildOptions::parse(Arguments& arguments) fp16 = true; // BF16 only supported on Ampere+ +#if !TRT_WINML if (samplesCommon::getSMVersion() >= 0x0800) +#endif { bf16 = true; } @@ -1248,6 +1250,19 @@ void BuildOptions::parse(Arguments& arguments) disableAndLog(int4, "int4", "kINT4"); } + // Print a message to tell users that --noTF32 can be added to improve accuracy with performance cost. +#if !TRT_WINML + if (samplesCommon::getSMVersion() >= 0x0800) +#endif + { + if (!(stronglyTyped || fp16 || bf16 || int8 || fp8 || int4)) + { + sample::gLogInfo << "TF32 is enabled by default. Add --noTF32 flag to further improve accuracy with some " + << "performance cost." + << std::endl; + } + } + if (fp8 && int8) { throw std::invalid_argument("Invalid usage, fp8 and int8 aren't allowed to be enabled together."); @@ -1257,7 +1272,10 @@ void BuildOptions::parse(Arguments& arguments) getAndDelOption(arguments, "--allowGPUFallback", allowGPUFallback); getAndDelOption(arguments, "--restricted", restricted); getAndDelOption(arguments, "--skipInference", skipInference); - getAndDelOption(arguments, "--directIO", directIO); + if (getAndDelOption(arguments, "--directIO", directIO)) + { + sample::gLogWarning << "--directIO flag has been deprecated" << std::endl; + } std::string precisionConstraintsString; getAndDelOption(arguments, "--precisionConstraints", precisionConstraintsString); @@ -1348,6 +1366,7 @@ void BuildOptions::parse(Arguments& arguments) { load = true; } + getAndDelOption(arguments, "--asyncFileReader", asyncFileReader); getAndDelOption(arguments, "--getPlanVersionOnly", getPlanVersionOnly); if (getAndDelOption(arguments, "--saveEngine", engine)) @@ -2496,7 +2515,7 @@ void BuildOptions::help(std::ostream& os) " --int4 Enable int4 precision, in addition to fp32 (default = disabled)" "\n" " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" " --stronglyTyped Create a strongly typed network. (default = disabled)" "\n" - " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --directIO [Deprecated] Avoid reformatting at network boundaries. (default = disabled)" "\n" " --precisionConstraints=spec Control precision constraint setting. (default = none)" "\n" R"( Precision Constraints: spec ::= "none" | "obey" | "prefer")" "\n" " none = no constraints" "\n" @@ -2535,6 +2554,7 @@ void BuildOptions::help(std::ostream& os) " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" " --saveEngine= Save the serialized engine" "\n" " --loadEngine= Load a serialized engine" "\n" + " --asyncFileReader= Load a serialized engine using async stream reader" "\n" " --getPlanVersionOnly Print TensorRT version when loaded plan was created. Works without deserialization of the plan." "\n" " Use together with --loadEngine. Supported only for engines created with 8.6 and forward." "\n" " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" diff --git a/samples/common/sampleOptions.h b/samples/common/sampleOptions.h index 83e11fc4..1bf9bae2 100644 --- a/samples/common/sampleOptions.h +++ b/samples/common/sampleOptions.h @@ -18,6 +18,7 @@ #ifndef TRT_SAMPLE_OPTIONS_H #define TRT_SAMPLE_OPTIONS_H + #include #include #include @@ -232,6 +233,7 @@ class BuildOptions : public Options bool skipInference{false}; bool save{false}; bool load{false}; + bool asyncFileReader{false}; bool refittable{false}; bool stripWeights{false}; bool versionCompatible{false}; @@ -400,7 +402,6 @@ class TaskInferenceOptions : public Options static void help(std::ostream& out); }; - Arguments argsToArgumentsMap(int32_t argc, char* argv[]); bool parseHelp(Arguments& arguments); diff --git a/samples/common/sampleUtils.cpp b/samples/common/sampleUtils.cpp index 689e5857..365bfb95 100644 --- a/samples/common/sampleUtils.cpp +++ b/samples/common/sampleUtils.cpp @@ -18,6 +18,7 @@ #include "sampleUtils.h" #include "bfloat16.h" #include "half.h" +#include using namespace nvinfer1; @@ -539,23 +540,23 @@ void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t con template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); -template ::value, bool>::type> -void fillBuffer(void* buffer, int64_t volume, T min, T max) +template , bool>> +void fillBuffer(void* buffer, int64_t volume, TType min, TType max) { - T* typedBuffer = static_cast(buffer); + TType* typedBuffer = static_cast(buffer); std::default_random_engine engine; std::uniform_int_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; std::generate(typedBuffer, typedBuffer + volume, generator); } -template ::value, int32_t>::type> -void fillBuffer(void* buffer, int64_t volume, T min, T max) +template , int32_t>> +void fillBuffer(void* buffer, int64_t volume, TType min, TType max) { - T* typedBuffer = static_cast(buffer); + TType* typedBuffer = static_cast(buffer); std::default_random_engine engine; std::uniform_real_distribution distribution(min, max); - auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; std::generate(typedBuffer, typedBuffer + volume, generator); } diff --git a/samples/common/sampleUtils.h b/samples/common/sampleUtils.h index 5d191219..c0c8ecd5 100644 --- a/samples/common/sampleUtils.h +++ b/samples/common/sampleUtils.h @@ -62,10 +62,10 @@ using samplesCommon::volume; nvinfer1::Dims toDims(std::vector const& vec); -template ::value, bool>::type = true> +template , bool> = true> void fillBuffer(void* buffer, int64_t volume, T min, T max); -template ::value, int32_t>::type = 0> +template , int32_t> = 0> void fillBuffer(void* buffer, int64_t volume, T min, T max); template diff --git a/samples/common/streamReader.h b/samples/common/streamReader.h index 8d7f78ff..cd17a2d7 100644 --- a/samples/common/streamReader.h +++ b/samples/common/streamReader.h @@ -18,9 +18,10 @@ #ifndef STREAM_READER_H #define STREAM_READER_H + #include "NvInferRuntime.h" +#include #include "sampleUtils.h" -#include namespace samplesCommon { @@ -73,6 +74,88 @@ class FileStreamReader final : public nvinfer1::IStreamReader std::ifstream mFile; }; +//! Implements the TensorRT IStreamReaderV2 interface to allow deserializing an engine directly from the plan file. +//! Supports seeking to a position within the file, and reading directly to device pointers. +//! This implementation is not optimized, and will not provide performance improvements over the existing reader. +class AsyncStreamReader final : public nvinfer1::IStreamReaderV2 +{ +public: + bool open(std::string const& filepath) + { + mFile.open(filepath, std::ios::binary); + return mFile.is_open(); + } + + void close() + { + if (mFile.is_open()) + { + mFile.close(); + } + } + + ~AsyncStreamReader() final + { + close(); + } + + bool seek(int64_t offset, nvinfer1::SeekPosition where) noexcept final + { + switch (where) + { + case (nvinfer1::SeekPosition::kSET): mFile.seekg(offset, std::ios_base::beg); break; + case (nvinfer1::SeekPosition::kCUR): mFile.seekg(offset, std::ios_base::cur); break; + case (nvinfer1::SeekPosition::kEND): mFile.seekg(offset, std::ios_base::end); break; + } + return mFile.good(); + } + + int64_t read(void* destination, int64_t nbBytes, cudaStream_t stream) noexcept final + { + if (!mFile.good()) + { + return -1; + } + + cudaPointerAttributes attributes; + ASSERT(cudaPointerGetAttributes(&attributes, destination) == cudaSuccess); + + // from CUDA 11 onward, host pointers are return cudaMemoryTypeUnregistered + if (attributes.type == cudaMemoryTypeHost || attributes.type == cudaMemoryTypeUnregistered) + { + mFile.read(static_cast(destination), nbBytes); + return mFile.gcount(); + } + else if (attributes.type == cudaMemoryTypeDevice) + { + // Set up a temp buffer to read into if reading into device memory. + std::unique_ptr tmpBuf{new char[nbBytes]}; + mFile.read(tmpBuf.get(), nbBytes); + // cudaMemcpyAsync into device storage. + ASSERT(cudaMemcpyAsync(destination, tmpBuf.get(), nbBytes, cudaMemcpyHostToDevice, stream) == cudaSuccess); + // No race between the copying and freeing of tmpBuf, because cudaMemcpyAsync will + // return once the pageable buffer has been copied to the staging memory for DMA transfer + // to device memory. + return mFile.gcount(); + } + return -1; + } + + void reset() + { + ASSERT(mFile.good()); + mFile.seekg(0); + } + + bool isOpen() const + { + return mFile.is_open(); + } + +private: + std::ifstream mFile; +}; + } // namespace samplesCommon diff --git a/samples/python/onnx_custom_plugin/CMakeLists.txt b/samples/python/onnx_custom_plugin/CMakeLists.txt index f00bcd31..43a34102 100644 --- a/samples/python/onnx_custom_plugin/CMakeLists.txt +++ b/samples/python/onnx_custom_plugin/CMakeLists.txt @@ -76,7 +76,7 @@ add_library(customHardmaxPlugin MODULE ) # Use C++11 -target_compile_features(customHardmaxPlugin PUBLIC cxx_std_11) +target_compile_features(customHardmaxPlugin PUBLIC cxx_std_17) # Link TensorRT's nvinfer lib target_link_libraries(customHardmaxPlugin PRIVATE ${NVINFER_LIB}) diff --git a/samples/python/python_plugin/README.md b/samples/python/python_plugin/README.md index 49585b22..fc625dda 100644 --- a/samples/python/python_plugin/README.md +++ b/samples/python/python_plugin/README.md @@ -139,6 +139,7 @@ In this example, - The engine / ONNX model cannot be run from outside Python (e.g. with `trtexec`) - This functionality is possible to implement but comes at the cost of embedding the Python interpreter to the TRT runtime / the binary loading the engine - (For `IPluginV2DynamicExt` only) No bindings yet for `attachToContext()` and `detachFromContext()` which are not pure virtual. + - `circ_pad_plugin_torch.py` may work on aarch64 platforms but is unsupported. # FAQ diff --git a/samples/sampleINT8API/README.md b/samples/sampleINT8API/README.md index 5fda204f..4c2264a8 100644 --- a/samples/sampleINT8API/README.md +++ b/samples/sampleINT8API/README.md @@ -43,10 +43,7 @@ Specifically, this sample performs the following steps: ### Configuring the builder to use INT8 without the INT8 calibrator -1. Ensure that INT8 inference is supported on the platform: - `if (!builder->platformHasFastInt8()) return false;` - -2. Enable INT8 mode by setting the builder flag: +1. Enable INT8 mode by setting the builder flag: `builder->setFlag(BuilderFlag::kINT8);` You can choose not to provide the INT8 calibrator. @@ -54,7 +51,7 @@ Specifically, this sample performs the following steps: If you want to provide the calibrator, manual dynamic range will override calibration generate dynamic range/scale. See sampleINT8 on how to setup INT8 calibrator. -3. Optionally and for debugging purposes, the following flag configures the builder to choose type conforming layer implementation, if one exists. +2. Optionally and for debugging purposes, the following flag configures the builder to choose type conforming layer implementation, if one exists. `builder->setStrictTypeConstraints(true);` diff --git a/samples/sampleINT8API/sampleINT8API.cpp b/samples/sampleINT8API/sampleINT8API.cpp index 97742a7f..7649dbe6 100644 --- a/samples/sampleINT8API/sampleINT8API.cpp +++ b/samples/sampleINT8API/sampleINT8API.cpp @@ -511,13 +511,6 @@ sample::Logger::TestResult SampleINT8API::build() return sample::Logger::TestResult::kFAILED; } - if (!builder->platformHasFastInt8()) - { - sample::gLogError << "Platform does not support INT8 inference. sampleINT8API can only run in INT8 Mode." - << std::endl; - return sample::Logger::TestResult::kWAIVED; - } - auto network = SampleUniquePtr(builder->createNetworkV2(0)); if (!network) { diff --git a/samples/sampleNonZeroPlugin/CMakeLists.txt b/samples/sampleNonZeroPlugin/CMakeLists.txt index 590c5005..474d8a69 100644 --- a/samples/sampleNonZeroPlugin/CMakeLists.txt +++ b/samples/sampleNonZeroPlugin/CMakeLists.txt @@ -21,4 +21,6 @@ SET(SAMPLE_SOURCES set(SAMPLE_PARSERS "onnx") +set(CUDA_LIBS_REQUIRED True) + include(../CMakeSamplesTemplate.txt) diff --git a/samples/sampleOnnxMnistCoordConvAC/sampleOnnxMnistCoordConvAC.cpp b/samples/sampleOnnxMnistCoordConvAC/sampleOnnxMnistCoordConvAC.cpp index 35d76e96..8cc97b2d 100644 --- a/samples/sampleOnnxMnistCoordConvAC/sampleOnnxMnistCoordConvAC.cpp +++ b/samples/sampleOnnxMnistCoordConvAC/sampleOnnxMnistCoordConvAC.cpp @@ -113,9 +113,7 @@ class SampleOnnxMnistCoordConvAC //! bool SampleOnnxMnistCoordConvAC::build() { -#if !TRT_WINML initLibNvInferPlugins(&sample::gLogger, ""); -#endif auto builder = SampleUniquePtr(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger())); if (!builder) { diff --git a/samples/trtexec/trtexec.cpp b/samples/trtexec/trtexec.cpp index 96b1b8e1..73f780d3 100644 --- a/samples/trtexec/trtexec.cpp +++ b/samples/trtexec/trtexec.cpp @@ -333,6 +333,14 @@ int main(int argc, char** argv) return sample::gLogger.reportPass(sampleTest); } +#if TRT_WINML + if (options.build.skipInference) + { + sample::gLogInfo << "Skipped inference phase since --skipInference is added." << std::endl; + return sample::gLogger.reportPass(sampleTest); + } +#endif + #if !TRT_WINML // dynamicPlugins may have been updated by getEngineBuildEnv above bEnv->engine.setDynamicPlugins(options.system.dynamicPlugins); diff --git a/tools/onnx-graphsurgeon/README.md b/tools/onnx-graphsurgeon/README.md index e8d08b12..ddb88026 100644 --- a/tools/onnx-graphsurgeon/README.md +++ b/tools/onnx-graphsurgeon/README.md @@ -19,7 +19,10 @@ ## Introduction -ONNX GraphSurgeon is a tool that allows you to easily generate new ONNX graphs, or modify existing ones. +ONNX GraphSurgeon is a Python library that allows you to create and modify ONNX models. + +If you prefer a GUI, try [Nsight DL Designer](https://developer.nvidia.com/nsight-dl-designer), +which integrates ONNX GraphSurgeon and provides various extra features. ## Installation