From e5578dea7eafd1dce39252625b6fe58a2cb85345 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 24 Dec 2024 14:46:16 +0900 Subject: [PATCH 01/17] refactor(autoware_tensorrt_common): multi-TensorRT compatibility & tensorrt_common as unified lib for all perception components Signed-off-by: Amadeusz Szymko --- .../pointpainting_trt.hpp | 5 +- .../src/pointpainting_fusion/node.cpp | 8 +- .../pointpainting_trt.cpp | 4 +- .../src/detector.cpp | 23 +- .../autoware_lidar_centerpoint/CMakeLists.txt | 2 - .../lidar_centerpoint/centerpoint_trt.hpp | 31 +- .../lidar_centerpoint/network/network_trt.hpp | 53 - .../network/tensorrt_wrapper.hpp | 67 -- .../lib/centerpoint_trt.cpp | 100 +- .../lib/network/network_trt.cpp | 85 -- .../lib/network/tensorrt_wrapper.cpp | 169 --- .../autoware_lidar_centerpoint/src/node.cpp | 4 +- .../autoware_lidar_transfusion/CMakeLists.txt | 1 - .../lidar_transfusion/network/network_trt.hpp | 87 -- .../lidar_transfusion/transfusion_config.hpp | 78 +- .../lidar_transfusion/transfusion_trt.hpp | 31 +- .../lib/network/network_trt.cpp | 357 ------ .../lib/transfusion_trt.cpp | 104 +- .../src/lidar_transfusion_node.cpp | 6 +- .../autoware_shape_estimation/CMakeLists.txt | 3 + .../tensorrt_shape_estimator.hpp | 6 +- .../lib/tensorrt_shape_estimator.cpp | 49 +- .../src/shape_estimation_node.cpp | 3 +- .../CMakeLists.txt | 3 + .../tensorrt_classifier.hpp | 21 +- .../src/tensorrt_classifier.cpp | 95 +- .../autoware_tensorrt_common/CMakeLists.txt | 9 +- perception/autoware_tensorrt_common/README.md | 70 +- .../tensorrt_common/conv_profiler.hpp | 107 ++ .../autoware/tensorrt_common/logger.hpp | 40 +- .../autoware/tensorrt_common/profiler.hpp | 104 ++ .../tensorrt_common/simple_profiler.hpp | 73 -- .../tensorrt_common/tensorrt_common.hpp | 457 +++++--- .../tensorrt_common/tensorrt_conv_calib.hpp | 240 ++++ .../autoware/tensorrt_common/utils.hpp | 407 +++++++ .../autoware_tensorrt_common/src/profiler.cpp | 107 ++ .../src/simple_profiler.cpp | 138 --- .../src/tensorrt_common.cpp | 1008 +++++++++-------- .../autoware_tensorrt_yolox/CMakeLists.txt | 3 + .../tensorrt_yolox/tensorrt_yolox.hpp | 46 +- .../src/tensorrt_yolox.cpp | 184 +-- .../src/tensorrt_yolox_node.cpp | 15 +- .../src/yolox_single_image_inference_node.cpp | 4 +- .../CMakeLists.txt | 1 - .../src/classifier/cnn_classifier.cpp | 7 +- .../CMakeLists.txt | 1 - .../src/traffic_light_fine_detector_node.cpp | 15 +- 47 files changed, 2372 insertions(+), 2059 deletions(-) delete mode 100644 perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/network_trt.hpp delete mode 100644 perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp delete mode 100644 perception/autoware_lidar_centerpoint/lib/network/network_trt.cpp delete mode 100644 perception/autoware_lidar_centerpoint/lib/network/tensorrt_wrapper.cpp delete mode 100644 perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/network/network_trt.hpp delete mode 100644 perception/autoware_lidar_transfusion/lib/network/network_trt.cpp create mode 100644 perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp create mode 100644 perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp delete mode 100644 perception/autoware_tensorrt_common/include/autoware/tensorrt_common/simple_profiler.hpp create mode 100644 perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp create mode 100644 perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp create mode 100644 perception/autoware_tensorrt_common/src/profiler.cpp delete mode 100644 perception/autoware_tensorrt_common/src/simple_profiler.cpp diff --git a/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp b/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp index a89ee3d02cd8b..a1d12bcb56384 100644 --- a/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp +++ b/perception/autoware_image_projection_based_fusion/include/autoware/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -29,8 +30,8 @@ class PointPaintingTRT : public autoware::lidar_centerpoint::CenterPointTRT using autoware::lidar_centerpoint::CenterPointTRT::CenterPointTRT; explicit PointPaintingTRT( - const autoware::lidar_centerpoint::NetworkParam & encoder_param, - const autoware::lidar_centerpoint::NetworkParam & head_param, + const autoware::tensorrt_common::TrtCommonConfig & encoder_param, + const autoware::tensorrt_common::TrtCommonConfig & head_param, const autoware::lidar_centerpoint::DensificationParam & densification_param, const autoware::lidar_centerpoint::CenterPointConfig & config); diff --git a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp index 2837bac458541..125358347e76f 100644 --- a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp +++ b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/node.cpp @@ -172,10 +172,10 @@ PointPaintingFusionNode::PointPaintingFusionNode(const rclcpp::NodeOptions & opt iou_bev_nms_.setParameters(p); } - autoware::lidar_centerpoint::NetworkParam encoder_param( - encoder_onnx_path, encoder_engine_path, trt_precision); - autoware::lidar_centerpoint::NetworkParam head_param( - head_onnx_path, head_engine_path, trt_precision); + autoware::tensorrt_common::TrtCommonConfig encoder_param( + encoder_onnx_path, trt_precision, encoder_engine_path); + autoware::tensorrt_common::TrtCommonConfig head_param( + head_onnx_path, trt_precision, head_engine_path); autoware::lidar_centerpoint::DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); autoware::lidar_centerpoint::CenterPointConfig config( diff --git a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp index cc3eb17cc32ab..1fe860e68a967 100644 --- a/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp +++ b/perception/autoware_image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp @@ -27,8 +27,8 @@ namespace autoware::image_projection_based_fusion { PointPaintingTRT::PointPaintingTRT( - const autoware::lidar_centerpoint::NetworkParam & encoder_param, - const autoware::lidar_centerpoint::NetworkParam & head_param, + const autoware::tensorrt_common::TrtCommonConfig & encoder_param, + const autoware::tensorrt_common::TrtCommonConfig & head_param, const autoware::lidar_centerpoint::DensificationParam & densification_param, const autoware::lidar_centerpoint::CenterPointConfig & config) : autoware::lidar_centerpoint::CenterPointTRT( diff --git a/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp b/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp index 60aa09c8e16d7..2e3a7a2243399 100644 --- a/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp +++ b/perception/autoware_lidar_apollo_instance_segmentation/src/detector.cpp @@ -18,7 +18,6 @@ #include -#include #include #include @@ -51,12 +50,9 @@ LidarApolloInstanceSegmentation::LidarApolloInstanceSegmentation(rclcpp::Node * const auto precision = node_->declare_parameter("precision", "fp32"); trt_common_ = std::make_unique( - onnx_file, precision, nullptr, autoware::tensorrt_common::BatchConfig{1, 1, 1}, 1 << 30); - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - RCLCPP_ERROR_STREAM(node_->get_logger(), "failed to create tensorrt engine file."); - return; + tensorrt_common::TrtCommonConfig(onnx_file, precision)); + if (!trt_common_->setup()) { + throw std::runtime_error("Failed to setup TensorRT"); } if (node_->declare_parameter("build_only", false)) { @@ -65,11 +61,11 @@ LidarApolloInstanceSegmentation::LidarApolloInstanceSegmentation(rclcpp::Node * } // GPU memory allocation - const auto input_dims = trt_common_->getBindingDimensions(0); + const auto input_dims = trt_common_->getTensorShape(0); const auto input_size = std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies()); input_d_ = autoware::cuda_utils::make_unique(input_size); - const auto output_dims = trt_common_->getBindingDimensions(1); + const auto output_dims = trt_common_->getTensorShape(1); output_size_ = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); output_d_ = autoware::cuda_utils::make_unique(output_size_); @@ -127,10 +123,6 @@ bool LidarApolloInstanceSegmentation::detectDynamicObjects( const sensor_msgs::msg::PointCloud2 & input, tier4_perception_msgs::msg::DetectedObjectsWithFeature & output) { - if (!trt_common_->isInitialized()) { - return false; - } - // move up pointcloud z_offset in z axis sensor_msgs::msg::PointCloud2 transformed_cloud; transformCloud(input, transformed_cloud, z_offset_); @@ -169,7 +161,10 @@ bool LidarApolloInstanceSegmentation::detectDynamicObjects( std::vector buffers = {input_d_.get(), output_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); CHECK_CUDA_ERROR(cudaMemcpyAsync( output_h_.get(), output_d_.get(), sizeof(float) * output_size_, cudaMemcpyDeviceToHost, diff --git a/perception/autoware_lidar_centerpoint/CMakeLists.txt b/perception/autoware_lidar_centerpoint/CMakeLists.txt index 88244b3153349..322907c67c6ae 100644 --- a/perception/autoware_lidar_centerpoint/CMakeLists.txt +++ b/perception/autoware_lidar_centerpoint/CMakeLists.txt @@ -84,8 +84,6 @@ if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL) lib/detection_class_remapper.cpp lib/utils.cpp lib/ros_utils.cpp - lib/network/network_trt.cpp - lib/network/tensorrt_wrapper.cpp lib/postprocess/non_maximum_suppression.cpp lib/preprocess/pointcloud_densification.cpp lib/preprocess/voxel_generator.cpp diff --git a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp index ce5ec3ea0abfe..bdb824770d1de 100644 --- a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp +++ b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/centerpoint_trt.hpp @@ -16,12 +16,13 @@ #define AUTOWARE__LIDAR_CENTERPOINT__CENTERPOINT_TRT_HPP_ #include "autoware/lidar_centerpoint/cuda_utils.hpp" -#include "autoware/lidar_centerpoint/network/network_trt.hpp" #include "autoware/lidar_centerpoint/postprocess/postprocess_kernel.hpp" #include "autoware/lidar_centerpoint/preprocess/voxel_generator.hpp" #include "pcl/point_cloud.h" #include "pcl/point_types.h" +#include + #include "sensor_msgs/msg/point_cloud2.hpp" #include @@ -31,31 +32,14 @@ namespace autoware::lidar_centerpoint { -class NetworkParam -{ -public: - NetworkParam(std::string onnx_path, std::string engine_path, std::string trt_precision) - : onnx_path_(std::move(onnx_path)), - engine_path_(std::move(engine_path)), - trt_precision_(std::move(trt_precision)) - { - } - - std::string onnx_path() const { return onnx_path_; } - std::string engine_path() const { return engine_path_; } - std::string trt_precision() const { return trt_precision_; } - -private: - std::string onnx_path_; - std::string engine_path_; - std::string trt_precision_; -}; +using autoware::tensorrt_common::ProfileDims; +using autoware::tensorrt_common::TrtCommonConfig; class CenterPointTRT { public: explicit CenterPointTRT( - const NetworkParam & encoder_param, const NetworkParam & head_param, + const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param, const DensificationParam & densification_param, const CenterPointConfig & config); virtual ~CenterPointTRT(); @@ -66,6 +50,7 @@ class CenterPointTRT protected: void initPtr(); + void initTrt(const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param); virtual bool preprocess( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer); @@ -75,8 +60,8 @@ class CenterPointTRT void postProcess(std::vector & det_boxes3d); std::unique_ptr vg_ptr_{nullptr}; - std::unique_ptr encoder_trt_ptr_{nullptr}; - std::unique_ptr head_trt_ptr_{nullptr}; + std::unique_ptr encoder_trt_ptr_{nullptr}; + std::unique_ptr head_trt_ptr_{nullptr}; std::unique_ptr post_proc_ptr_{nullptr}; cudaStream_t stream_{nullptr}; diff --git a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/network_trt.hpp b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/network_trt.hpp deleted file mode 100644 index 17778d77ed798..0000000000000 --- a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/network_trt.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef AUTOWARE__LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ -#define AUTOWARE__LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ - -#include "autoware/lidar_centerpoint/centerpoint_config.hpp" -#include "autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp" - -#include - -namespace autoware::lidar_centerpoint -{ -class VoxelEncoderTRT : public TensorRTWrapper -{ -public: - using TensorRTWrapper::TensorRTWrapper; - -protected: - bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) override; -}; - -class HeadTRT : public TensorRTWrapper -{ -public: - using TensorRTWrapper::TensorRTWrapper; - - HeadTRT(const std::vector & out_channel_sizes, const CenterPointConfig & config); - -protected: - bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) override; - - std::vector out_channel_sizes_; -}; - -} // namespace autoware::lidar_centerpoint - -#endif // AUTOWARE__LIDAR_CENTERPOINT__NETWORK__NETWORK_TRT_HPP_ diff --git a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp b/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp deleted file mode 100644 index d56ccc699add3..0000000000000 --- a/perception/autoware_lidar_centerpoint/include/autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef AUTOWARE__LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ -#define AUTOWARE__LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ - -#include "NvInfer.h" -#include "autoware/lidar_centerpoint/centerpoint_config.hpp" -#include "autoware/tensorrt_common/tensorrt_common.hpp" - -#include -#include -#include - -namespace autoware::lidar_centerpoint -{ - -class TensorRTWrapper -{ -public: - explicit TensorRTWrapper(const CenterPointConfig & config); - - virtual ~TensorRTWrapper(); - - bool init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision); - - autoware::tensorrt_common::TrtUniquePtr context_{nullptr}; - -protected: - virtual bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) = 0; - - CenterPointConfig config_; - autoware::tensorrt_common::Logger logger_; - -private: - bool parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - std::size_t workspace_size = (1ULL << 30)); - - bool saveEngine(const std::string & engine_path); - - bool loadEngine(const std::string & engine_path); - - bool createContext(); - - autoware::tensorrt_common::TrtUniquePtr runtime_{nullptr}; - autoware::tensorrt_common::TrtUniquePtr plan_{nullptr}; - autoware::tensorrt_common::TrtUniquePtr engine_{nullptr}; -}; - -} // namespace autoware::lidar_centerpoint - -#endif // AUTOWARE__LIDAR_CENTERPOINT__NETWORK__TENSORRT_WRAPPER_HPP_ diff --git a/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp index 6ee2b3733bdb0..0d5c646229d44 100644 --- a/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp +++ b/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp @@ -27,39 +27,22 @@ #include #include #include +#include +#include #include namespace autoware::lidar_centerpoint { CenterPointTRT::CenterPointTRT( - const NetworkParam & encoder_param, const NetworkParam & head_param, + const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param, const DensificationParam & densification_param, const CenterPointConfig & config) : config_(config) { vg_ptr_ = std::make_unique(densification_param, config_); post_proc_ptr_ = std::make_unique(config_); - // encoder - encoder_trt_ptr_ = std::make_unique(config_); - encoder_trt_ptr_->init( - encoder_param.onnx_path(), encoder_param.engine_path(), encoder_param.trt_precision()); - encoder_trt_ptr_->context_->setBindingDimensions( - 0, - nvinfer1::Dims3( - config_.max_voxel_size_, config_.max_point_in_voxel_size_, config_.encoder_in_feature_size_)); - - // head - std::vector out_channel_sizes = { - config_.class_size_, config_.head_out_offset_size_, config_.head_out_z_size_, - config_.head_out_dim_size_, config_.head_out_rot_size_, config_.head_out_vel_size_}; - head_trt_ptr_ = std::make_unique(out_channel_sizes, config_); - head_trt_ptr_->init(head_param.onnx_path(), head_param.engine_path(), head_param.trt_precision()); - head_trt_ptr_->context_->setBindingDimensions( - 0, nvinfer1::Dims4( - config_.batch_size_, config_.encoder_out_feature_size_, config_.grid_size_y_, - config_.grid_size_x_)); - initPtr(); + initTrt(encoder_param, head_param); cudaStreamCreate(&stream_); } @@ -125,6 +108,66 @@ void CenterPointTRT::initPtr() cudaMemcpyHostToDevice, stream_)); } +void CenterPointTRT::initTrt( + const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param) +{ + /// encoder input profile + auto enc_in_dims = nvinfer1::Dims{ + 3, + {static_cast(config_.max_voxel_size_), + static_cast(config_.max_point_in_voxel_size_), + static_cast(config_.encoder_in_feature_size_)}}; + std::vector encoder_profile_dims{ + tensorrt_common::ProfileDims(0, enc_in_dims, enc_in_dims, enc_in_dims)}; + auto encoder_profile_dims_ptr = + std::make_unique>(encoder_profile_dims); + + // head input profile + auto head_in_dims = nvinfer1::Dims{ + 4, + {static_cast(config_.batch_size_), + static_cast(config_.encoder_out_feature_size_), + static_cast(config_.grid_size_y_), static_cast(config_.grid_size_x_)}}; + std::vector head_profile_dims{ + tensorrt_common::ProfileDims(0, head_in_dims, head_in_dims, head_in_dims)}; + std::unordered_map out_channel_map = { + {1, static_cast(config_.class_size_)}, + {2, static_cast(config_.head_out_offset_size_)}, + {3, static_cast(config_.head_out_z_size_)}, + {4, static_cast(config_.head_out_dim_size_)}, + {5, static_cast(config_.head_out_rot_size_)}, + {6, static_cast(config_.head_out_vel_size_)}}; + for (const auto & [tensor_name, channel_size] : out_channel_map) { + auto dims = nvinfer1::Dims{ + 4, + {static_cast(config_.batch_size_), channel_size, + static_cast(config_.down_grid_size_y_), + static_cast(config_.down_grid_size_x_)}}; + head_profile_dims.emplace_back(tensor_name, dims, dims, dims); + } + auto head_profile_dims_ptr = + std::make_unique>(head_profile_dims); + + // initialize trt wrappers + encoder_trt_ptr_ = std::make_unique(encoder_param); + + head_trt_ptr_ = std::make_unique(head_param); + + // setup trt engines + if ( + !encoder_trt_ptr_->setup(std::move(encoder_profile_dims_ptr)) || + !head_trt_ptr_->setup(std::move(head_profile_dims_ptr))) { + throw std::runtime_error("Failed to setup TRT engine."); + } + + // set input shapes + if ( + !encoder_trt_ptr_->setInputShape(0, enc_in_dims) || + !head_trt_ptr_->setInputShape(0, head_in_dims)) { + throw std::runtime_error("Failed to set input shape."); + } +} + bool CenterPointTRT::detect( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer, std::vector & det_boxes3d) @@ -206,13 +249,10 @@ bool CenterPointTRT::preprocess( void CenterPointTRT::inference() { - if (!encoder_trt_ptr_->context_ || !head_trt_ptr_->context_) { - throw std::runtime_error("Failed to create tensorrt context."); - } - // pillar encoder network - std::vector encoder_buffers{encoder_in_features_d_.get(), pillar_features_d_.get()}; - encoder_trt_ptr_->context_->enqueueV2(encoder_buffers.data(), stream_, nullptr); + std::vector encoder_tensors = {encoder_in_features_d_.get(), pillar_features_d_.get()}; + encoder_trt_ptr_->setTensorsAddresses(encoder_tensors); + encoder_trt_ptr_->enqueueV3(stream_); // scatter CHECK_CUDA_ERROR(scatterFeatures_launch( @@ -221,11 +261,13 @@ void CenterPointTRT::inference() spatial_features_d_.get(), stream_)); // head network - std::vector head_buffers = {spatial_features_d_.get(), head_out_heatmap_d_.get(), + std::vector head_tensors = {spatial_features_d_.get(), head_out_heatmap_d_.get(), head_out_offset_d_.get(), head_out_z_d_.get(), head_out_dim_d_.get(), head_out_rot_d_.get(), head_out_vel_d_.get()}; - head_trt_ptr_->context_->enqueueV2(head_buffers.data(), stream_, nullptr); + head_trt_ptr_->setTensorsAddresses(head_tensors); + + head_trt_ptr_->enqueueV3(stream_); } void CenterPointTRT::postProcess(std::vector & det_boxes3d) diff --git a/perception/autoware_lidar_centerpoint/lib/network/network_trt.cpp b/perception/autoware_lidar_centerpoint/lib/network/network_trt.cpp deleted file mode 100644 index 7767c0b5cbb02..0000000000000 --- a/perception/autoware_lidar_centerpoint/lib/network/network_trt.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "autoware/lidar_centerpoint/network/network_trt.hpp" - -#include -#include - -namespace autoware::lidar_centerpoint -{ -bool VoxelEncoderTRT::setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) -{ - auto profile = builder.createOptimizationProfile(); - auto in_name = network.getInput(0)->getName(); - auto in_dims = nvinfer1::Dims3( - config_.max_voxel_size_, config_.max_point_in_voxel_size_, config_.encoder_in_feature_size_); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMIN, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kOPT, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMAX, in_dims); - - auto out_name = network.getOutput(0)->getName(); - auto out_dims = nvinfer1::Dims2(config_.max_voxel_size_, config_.encoder_out_feature_size_); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMIN, out_dims); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kOPT, out_dims); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMAX, out_dims); - config.addOptimizationProfile(profile); - - return true; -} - -HeadTRT::HeadTRT( - const std::vector & out_channel_sizes, const CenterPointConfig & config) -: TensorRTWrapper(config), out_channel_sizes_(out_channel_sizes) -{ -} - -bool HeadTRT::setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) -{ - auto profile = builder.createOptimizationProfile(); - auto in_name = network.getInput(0)->getName(); - auto in_dims = nvinfer1::Dims4( - config_.batch_size_, config_.encoder_out_feature_size_, config_.grid_size_y_, - config_.grid_size_x_); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMIN, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kOPT, in_dims); - profile->setDimensions(in_name, nvinfer1::OptProfileSelector::kMAX, in_dims); - - for (std::size_t ci = 0; ci < out_channel_sizes_.size(); ci++) { - auto out_name = network.getOutput(ci)->getName(); - - if ( - out_name == std::string("heatmap") && - network.getOutput(ci)->getDimensions().d[1] != static_cast(out_channel_sizes_[ci])) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Expected and actual number of classes do not match" << std::endl; - return false; - } - auto out_dims = nvinfer1::Dims4( - config_.batch_size_, out_channel_sizes_[ci], config_.down_grid_size_y_, - config_.down_grid_size_x_); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMIN, out_dims); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kOPT, out_dims); - profile->setDimensions(out_name, nvinfer1::OptProfileSelector::kMAX, out_dims); - } - config.addOptimizationProfile(profile); - - return true; -} - -} // namespace autoware::lidar_centerpoint diff --git a/perception/autoware_lidar_centerpoint/lib/network/tensorrt_wrapper.cpp b/perception/autoware_lidar_centerpoint/lib/network/tensorrt_wrapper.cpp deleted file mode 100644 index 6d4c2e053b34f..0000000000000 --- a/perception/autoware_lidar_centerpoint/lib/network/tensorrt_wrapper.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2021 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "autoware/lidar_centerpoint/network/tensorrt_wrapper.hpp" - -#include "NvOnnxParser.h" - -#include -#include -#include - -namespace autoware::lidar_centerpoint -{ -TensorRTWrapper::TensorRTWrapper(const CenterPointConfig & config) : config_(config) -{ -} - -TensorRTWrapper::~TensorRTWrapper() -{ - context_.reset(); - runtime_.reset(); - plan_.reset(); - engine_.reset(); -} - -bool TensorRTWrapper::init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision) -{ - runtime_ = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferRuntime(logger_)); - if (!runtime_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create runtime" << std::endl; - return false; - } - - bool success; - std::ifstream engine_file(engine_path); - if (engine_file.is_open()) { - success = loadEngine(engine_path); - } else { - auto log_thread = logger_.log_throttle( - nvinfer1::ILogger::Severity::kINFO, - "Applying optimizations and building TRT CUDA engine. Please wait a minutes...", 5); - success = parseONNX(onnx_path, engine_path, precision); - logger_.stop_throttle(log_thread); - } - success &= createContext(); - - return success; -} - -bool TensorRTWrapper::createContext() -{ - if (!engine_) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create context: Engine was not created" << std::endl; - return false; - } - - context_ = autoware::tensorrt_common::TrtUniquePtr( - engine_->createExecutionContext()); - if (!context_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create context" << std::endl; - return false; - } - - return true; -} - -bool TensorRTWrapper::parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - const std::size_t workspace_size) -{ - auto builder = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferBuilder(logger_)); - if (!builder) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create builder" << std::endl; - return false; - } - - auto config = autoware::tensorrt_common::TrtUniquePtr( - builder->createBuilderConfig()); - if (!config) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create config" << std::endl; - return false; - } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size); -#else - config->setMaxWorkspaceSize(workspace_size); -#endif - if (precision == "fp16") { - if (builder->platformHasFastFp16()) { - autoware::tensorrt_common::LOG_INFO(logger_) << "Using TensorRT FP16 Inference" << std::endl; - config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else { - autoware::tensorrt_common::LOG_INFO(logger_) - << "TensorRT FP16 Inference isn't supported in this environment" << std::endl; - } - } - - const auto flag = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - auto network = autoware::tensorrt_common::TrtUniquePtr( - builder->createNetworkV2(flag)); - if (!network) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create network" << std::endl; - return false; - } - - auto parser = autoware::tensorrt_common::TrtUniquePtr( - nvonnxparser::createParser(*network, logger_)); - parser->parseFromFile(onnx_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR)); - - if (!setProfile(*builder, *network, *config)) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to set profile" << std::endl; - return false; - } - - plan_ = autoware::tensorrt_common::TrtUniquePtr( - builder->buildSerializedNetwork(*network, *config)); - if (!plan_) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create serialized network" << std::endl; - return false; - } - engine_ = autoware::tensorrt_common::TrtUniquePtr( - runtime_->deserializeCudaEngine(plan_->data(), plan_->size())); - if (!engine_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create engine" << std::endl; - return false; - } - - return saveEngine(engine_path); -} - -bool TensorRTWrapper::saveEngine(const std::string & engine_path) -{ - autoware::tensorrt_common::LOG_INFO(logger_) << "Writing to " << engine_path << std::endl; - std::ofstream file(engine_path, std::ios::out | std::ios::binary); - file.write(reinterpret_cast(plan_->data()), plan_->size()); - return true; -} - -bool TensorRTWrapper::loadEngine(const std::string & engine_path) -{ - std::ifstream engine_file(engine_path); - std::stringstream engine_buffer; - engine_buffer << engine_file.rdbuf(); - std::string engine_str = engine_buffer.str(); - engine_ = - autoware::tensorrt_common::TrtUniquePtr(runtime_->deserializeCudaEngine( - reinterpret_cast(engine_str.data()), engine_str.size())); - autoware::tensorrt_common::LOG_INFO(logger_) << "Loaded engine from " << engine_path << std::endl; - return true; -} - -} // namespace autoware::lidar_centerpoint diff --git a/perception/autoware_lidar_centerpoint/src/node.cpp b/perception/autoware_lidar_centerpoint/src/node.cpp index 59acceeac54d6..83390e47ce371 100644 --- a/perception/autoware_lidar_centerpoint/src/node.cpp +++ b/perception/autoware_lidar_centerpoint/src/node.cpp @@ -86,8 +86,8 @@ LidarCenterPointNode::LidarCenterPointNode(const rclcpp::NodeOptions & node_opti iou_bev_nms_.setParameters(p); } - NetworkParam encoder_param(encoder_onnx_path, encoder_engine_path, trt_precision); - NetworkParam head_param(head_onnx_path, head_engine_path, trt_precision); + TrtCommonConfig encoder_param(encoder_onnx_path, trt_precision, encoder_engine_path); + TrtCommonConfig head_param(head_onnx_path, trt_precision, head_engine_path); DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); diff --git a/perception/autoware_lidar_transfusion/CMakeLists.txt b/perception/autoware_lidar_transfusion/CMakeLists.txt index c3a56f883fbe5..fdb978c64946f 100644 --- a/perception/autoware_lidar_transfusion/CMakeLists.txt +++ b/perception/autoware_lidar_transfusion/CMakeLists.txt @@ -84,7 +84,6 @@ if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL) ament_auto_add_library(${PROJECT_NAME}_lib SHARED lib/detection_class_remapper.cpp - lib/network/network_trt.cpp lib/postprocess/non_maximum_suppression.cpp lib/preprocess/voxel_generator.cpp lib/preprocess/pointcloud_densification.cpp diff --git a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/network/network_trt.hpp b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/network/network_trt.hpp deleted file mode 100644 index 71cd08acdb8c8..0000000000000 --- a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/network/network_trt.hpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2024 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_ -#define AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_ - -#include "autoware/lidar_transfusion/transfusion_config.hpp" -#include "autoware/lidar_transfusion/utils.hpp" - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace autoware::lidar_transfusion -{ - -struct ProfileDimension -{ - nvinfer1::Dims min; - nvinfer1::Dims opt; - nvinfer1::Dims max; - - bool operator!=(const ProfileDimension & rhs) const - { - return min.nbDims != rhs.min.nbDims || opt.nbDims != rhs.opt.nbDims || - max.nbDims != rhs.max.nbDims || !std::equal(min.d, min.d + min.nbDims, rhs.min.d) || - !std::equal(opt.d, opt.d + opt.nbDims, rhs.opt.d) || - !std::equal(max.d, max.d + max.nbDims, rhs.max.d); - } -}; - -class NetworkTRT -{ -public: - explicit NetworkTRT(const TransfusionConfig & config); - ~NetworkTRT(); - - bool init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision); - const char * getTensorName(NetworkIO name); - - autoware::tensorrt_common::TrtUniquePtr engine{nullptr}; - autoware::tensorrt_common::TrtUniquePtr context{nullptr}; - -private: - bool parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - std::size_t workspace_size = (1ULL << 30)); - bool saveEngine(const std::string & engine_path); - bool loadEngine(const std::string & engine_path); - bool createContext(); - bool setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config); - bool validateNetworkIO(); - nvinfer1::Dims validateTensorShape(NetworkIO name, const std::vector shape); - - autoware::tensorrt_common::TrtUniquePtr runtime_{nullptr}; - autoware::tensorrt_common::TrtUniquePtr plan_{nullptr}; - autoware::tensorrt_common::Logger logger_; - TransfusionConfig config_; - std::vector tensors_names_; - - std::array in_profile_dims_; -}; - -} // namespace autoware::lidar_transfusion - -#endif // AUTOWARE__LIDAR_TRANSFUSION__NETWORK__NETWORK_TRT_HPP_ diff --git a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp index 363ee17a0d6e0..e60ee04150b5c 100644 --- a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp +++ b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_config.hpp @@ -35,21 +35,21 @@ class TransfusionConfig if (voxels_num.size() == 3) { max_voxels_ = voxels_num[2]; - voxels_num_[0] = voxels_num[0]; - voxels_num_[1] = voxels_num[1]; - voxels_num_[2] = voxels_num[2]; + voxels_num_[0] = static_cast(voxels_num[0]); + voxels_num_[1] = static_cast(voxels_num[1]); + voxels_num_[2] = static_cast(voxels_num[2]); - min_voxel_size_ = voxels_num[0]; - opt_voxel_size_ = voxels_num[1]; - max_voxel_size_ = voxels_num[2]; + min_voxel_size_ = static_cast(voxels_num[0]); + opt_voxel_size_ = static_cast(voxels_num[1]); + max_voxel_size_ = static_cast(voxels_num[2]); - min_points_size_ = voxels_num[0]; - opt_points_size_ = voxels_num[1]; - max_points_size_ = voxels_num[2]; + min_points_size_ = static_cast(voxels_num[0]); + opt_points_size_ = static_cast(voxels_num[1]); + max_points_size_ = static_cast(voxels_num[2]); - min_coors_size_ = voxels_num[0]; - opt_coors_size_ = voxels_num[1]; - max_coors_size_ = voxels_num[2]; + min_coors_size_ = static_cast(voxels_num[0]); + opt_coors_size_ = static_cast(voxels_num[1]); + max_coors_size_ = static_cast(voxels_num[2]); } if (point_cloud_range.size() == 6) { min_x_range_ = static_cast(point_cloud_range[0]); @@ -91,7 +91,7 @@ class TransfusionConfig std::size_t cloud_capacity_{}; ///// KERNEL PARAMETERS ///// const std::size_t threads_for_voxel_{256}; // threads number for a block - const std::size_t points_per_voxel_{20}; + const int32_t points_per_voxel_{20}; const std::size_t warp_size_{32}; // one warp(32 threads) for one pillar const std::size_t pillars_per_block_{64}; // one thread deals with one pillar // and a block has pillars_per_block threads @@ -99,9 +99,9 @@ class TransfusionConfig std::size_t max_voxels_{60000}; ///// NETWORK PARAMETERS ///// - const std::size_t batch_size_{1}; - const std::size_t num_classes_{5}; - const std::size_t num_point_feature_size_{5}; // x, y, z, intensity, lag + const int32_t batch_size_{1}; + const int32_t num_classes_{5}; + const int32_t num_point_feature_size_{5}; // x, y, z, intensity, lag // the dimension of the input cloud float min_x_range_{-76.8}; float max_x_range_{76.8}; @@ -114,9 +114,9 @@ class TransfusionConfig float voxel_y_size_{0.3}; float voxel_z_size_{8.0}; const std::size_t out_size_factor_{4}; - const std::size_t max_num_points_per_pillar_{points_per_voxel_}; - const std::size_t num_point_values_{4}; - std::size_t num_proposals_{200}; + const int32_t max_num_points_per_pillar_{points_per_voxel_}; + const int32_t num_point_values_{4}; + int32_t num_proposals_{200}; // the number of feature maps for pillar scatter const std::size_t num_feature_scatter_{pillar_feature_size_}; // the score threshold for classification @@ -126,7 +126,7 @@ class TransfusionConfig std::size_t max_num_pillars_{max_voxels_}; const std::size_t pillar_points_bev_{max_num_points_per_pillar_ * max_num_pillars_}; // the detected boxes result decode by (x, y, z, w, l, h, yaw) - const std::size_t num_box_values_{8}; + const int32_t num_box_values_{8}; // the input size of the 2D backbone network std::size_t grid_x_size_{512}; std::size_t grid_y_size_{512}; @@ -136,33 +136,33 @@ class TransfusionConfig std::size_t feature_y_size_{grid_y_size_ / out_size_factor_}; ///// RUNTIME DIMENSIONS ///// - std::vector voxels_num_{5000, 30000, 60000}; + std::vector voxels_num_{5000, 30000, 60000}; // voxels - std::size_t min_voxel_size_{voxels_num_[0]}; - std::size_t opt_voxel_size_{voxels_num_[1]}; - std::size_t max_voxel_size_{voxels_num_[2]}; + int32_t min_voxel_size_{voxels_num_[0]}; + int32_t opt_voxel_size_{voxels_num_[1]}; + int32_t max_voxel_size_{voxels_num_[2]}; - std::size_t min_point_in_voxel_size_{points_per_voxel_}; - std::size_t opt_point_in_voxel_size_{points_per_voxel_}; - std::size_t max_point_in_voxel_size_{points_per_voxel_}; + int32_t min_point_in_voxel_size_{points_per_voxel_}; + int32_t opt_point_in_voxel_size_{points_per_voxel_}; + int32_t max_point_in_voxel_size_{points_per_voxel_}; - std::size_t min_network_feature_size_{num_point_feature_size_}; - std::size_t opt_network_feature_size_{num_point_feature_size_}; - std::size_t max_network_feature_size_{num_point_feature_size_}; + int32_t min_network_feature_size_{num_point_feature_size_}; + int32_t opt_network_feature_size_{num_point_feature_size_}; + int32_t max_network_feature_size_{num_point_feature_size_}; // num_points - std::size_t min_points_size_{voxels_num_[0]}; - std::size_t opt_points_size_{voxels_num_[1]}; - std::size_t max_points_size_{voxels_num_[2]}; + int32_t min_points_size_{voxels_num_[0]}; + int32_t opt_points_size_{voxels_num_[1]}; + int32_t max_points_size_{voxels_num_[2]}; // coors - std::size_t min_coors_size_{voxels_num_[0]}; - std::size_t opt_coors_size_{voxels_num_[1]}; - std::size_t max_coors_size_{voxels_num_[2]}; + int32_t min_coors_size_{voxels_num_[0]}; + int32_t opt_coors_size_{voxels_num_[1]}; + int32_t max_coors_size_{voxels_num_[2]}; - std::size_t min_coors_dim_size_{num_point_values_}; - std::size_t opt_coors_dim_size_{num_point_values_}; - std::size_t max_coors_dim_size_{num_point_values_}; + int32_t min_coors_dim_size_{num_point_values_}; + int32_t opt_coors_dim_size_{num_point_values_}; + int32_t max_coors_dim_size_{num_point_values_}; }; } // namespace autoware::lidar_transfusion diff --git a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp index 06dd65b4b805f..00bb5e726706c 100644 --- a/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp +++ b/perception/autoware_lidar_transfusion/include/autoware/lidar_transfusion/transfusion_trt.hpp @@ -16,7 +16,6 @@ #define AUTOWARE__LIDAR_TRANSFUSION__TRANSFUSION_TRT_HPP_ #include "autoware/lidar_transfusion/cuda_utils.hpp" -#include "autoware/lidar_transfusion/network/network_trt.hpp" #include "autoware/lidar_transfusion/postprocess/postprocess_kernel.hpp" #include "autoware/lidar_transfusion/preprocess/pointcloud_densification.hpp" #include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp" @@ -24,6 +23,7 @@ #include "autoware/lidar_transfusion/utils.hpp" #include "autoware/lidar_transfusion/visibility_control.hpp" +#include #include #include @@ -34,38 +34,17 @@ #include #include #include -#include #include namespace autoware::lidar_transfusion { -class NetworkParam -{ -public: - NetworkParam(std::string onnx_path, std::string engine_path, std::string trt_precision) - : onnx_path_(std::move(onnx_path)), - engine_path_(std::move(engine_path)), - trt_precision_(std::move(trt_precision)) - { - } - - std::string onnx_path() const { return onnx_path_; } - std::string engine_path() const { return engine_path_; } - std::string trt_precision() const { return trt_precision_; } - -private: - std::string onnx_path_; - std::string engine_path_; - std::string trt_precision_; -}; - class LIDAR_TRANSFUSION_PUBLIC TransfusionTRT { public: explicit TransfusionTRT( - const NetworkParam & network_param, const DensificationParam & densification_param, - const TransfusionConfig & config); + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, const TransfusionConfig config); virtual ~TransfusionTRT(); bool detect( @@ -73,6 +52,8 @@ class LIDAR_TRANSFUSION_PUBLIC TransfusionTRT std::vector & det_boxes3d, std::unordered_map & proc_timing); protected: + void initTrt(const tensorrt_common::TrtCommonConfig & trt_config); + void initPtr(); bool preprocess(const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer); @@ -81,7 +62,7 @@ class LIDAR_TRANSFUSION_PUBLIC TransfusionTRT bool postprocess(std::vector & det_boxes3d); - std::unique_ptr network_trt_ptr_{nullptr}; + std::unique_ptr network_trt_ptr_{nullptr}; std::unique_ptr vg_ptr_{nullptr}; std::unique_ptr> stop_watch_ptr_{ nullptr}; diff --git a/perception/autoware_lidar_transfusion/lib/network/network_trt.cpp b/perception/autoware_lidar_transfusion/lib/network/network_trt.cpp deleted file mode 100644 index 8e3cb8a55ec7e..0000000000000 --- a/perception/autoware_lidar_transfusion/lib/network/network_trt.cpp +++ /dev/null @@ -1,357 +0,0 @@ -// Copyright 2024 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "autoware/lidar_transfusion/network/network_trt.hpp" - -#include - -#include -#include -#include -#include -#include - -namespace autoware::lidar_transfusion -{ - -inline NetworkIO nameToNetworkIO(const char * name) -{ - static const std::unordered_map name_to_enum = { - {"voxels", NetworkIO::voxels}, {"num_points", NetworkIO::num_points}, - {"coors", NetworkIO::coors}, {"cls_score0", NetworkIO::cls_score}, - {"bbox_pred0", NetworkIO::bbox_pred}, {"dir_cls_pred0", NetworkIO::dir_pred}}; - - auto it = name_to_enum.find(name); - if (it != name_to_enum.end()) { - return it->second; - } - throw std::runtime_error("Invalid input name: " + std::string(name)); -} - -std::ostream & operator<<(std::ostream & os, const ProfileDimension & profile) -{ - std::string delim = ""; - os << "min->["; - for (int i = 0; i < profile.min.nbDims; ++i) { - os << delim << profile.min.d[i]; - delim = ", "; - } - os << "], opt->["; - delim = ""; - for (int i = 0; i < profile.opt.nbDims; ++i) { - os << delim << profile.opt.d[i]; - delim = ", "; - } - os << "], max->["; - delim = ""; - for (int i = 0; i < profile.max.nbDims; ++i) { - os << delim << profile.max.d[i]; - delim = ", "; - } - os << "]"; - return os; -} - -NetworkTRT::NetworkTRT(const TransfusionConfig & config) : config_(config) -{ - ProfileDimension voxels_dims = { - nvinfer1::Dims3( - config_.min_voxel_size_, config_.min_point_in_voxel_size_, config_.min_network_feature_size_), - nvinfer1::Dims3( - config_.opt_voxel_size_, config_.opt_point_in_voxel_size_, config_.opt_network_feature_size_), - nvinfer1::Dims3( - config_.max_voxel_size_, config_.max_point_in_voxel_size_, - config_.max_network_feature_size_)}; - ProfileDimension num_points_dims = { - nvinfer1::Dims{1, {static_cast(config_.min_points_size_)}}, - nvinfer1::Dims{1, {static_cast(config_.opt_points_size_)}}, - nvinfer1::Dims{1, {static_cast(config_.max_points_size_)}}}; - ProfileDimension coors_dims = { - nvinfer1::Dims2(config_.min_coors_size_, config_.min_coors_dim_size_), - nvinfer1::Dims2(config_.opt_coors_size_, config_.opt_coors_dim_size_), - nvinfer1::Dims2(config_.max_coors_size_, config_.max_coors_dim_size_)}; - in_profile_dims_ = {voxels_dims, num_points_dims, coors_dims}; -} - -NetworkTRT::~NetworkTRT() -{ - context.reset(); - runtime_.reset(); - plan_.reset(); - engine.reset(); -} - -bool NetworkTRT::init( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision) -{ - runtime_ = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferRuntime(logger_)); - if (!runtime_) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create runtime" << std::endl; - return false; - } - - bool success; - std::ifstream engine_file(engine_path); - if (engine_file.is_open()) { - success = loadEngine(engine_path); - } else { - auto log_thread = logger_.log_throttle( - nvinfer1::ILogger::Severity::kINFO, - "Applying optimizations and building TRT CUDA engine. Please wait a minutes...", 5); - success = parseONNX(onnx_path, engine_path, precision); - logger_.stop_throttle(log_thread); - } - success &= createContext(); - - return success; -} - -bool NetworkTRT::setProfile( - nvinfer1::IBuilder & builder, nvinfer1::INetworkDefinition & network, - nvinfer1::IBuilderConfig & config) -{ - auto profile = builder.createOptimizationProfile(); - - auto voxels_name = network.getInput(NetworkIO::voxels)->getName(); - auto num_points_name = network.getInput(NetworkIO::num_points)->getName(); - auto coors_name = network.getInput(NetworkIO::coors)->getName(); - - profile->setDimensions( - voxels_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::voxels].min); - profile->setDimensions( - voxels_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::voxels].opt); - profile->setDimensions( - voxels_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::voxels].max); - - profile->setDimensions( - num_points_name, nvinfer1::OptProfileSelector::kMIN, - in_profile_dims_[NetworkIO::num_points].min); - profile->setDimensions( - num_points_name, nvinfer1::OptProfileSelector::kOPT, - in_profile_dims_[NetworkIO::num_points].opt); - profile->setDimensions( - num_points_name, nvinfer1::OptProfileSelector::kMAX, - in_profile_dims_[NetworkIO::num_points].max); - - profile->setDimensions( - coors_name, nvinfer1::OptProfileSelector::kMIN, in_profile_dims_[NetworkIO::coors].min); - profile->setDimensions( - coors_name, nvinfer1::OptProfileSelector::kOPT, in_profile_dims_[NetworkIO::coors].opt); - profile->setDimensions( - coors_name, nvinfer1::OptProfileSelector::kMAX, in_profile_dims_[NetworkIO::coors].max); - - config.addOptimizationProfile(profile); - return true; -} - -bool NetworkTRT::createContext() -{ - if (!engine) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create context: Engine was not created" << std::endl; - return false; - } - - context = autoware::tensorrt_common::TrtUniquePtr( - engine->createExecutionContext()); - if (!context) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create context" << std::endl; - return false; - } - - return true; -} - -bool NetworkTRT::parseONNX( - const std::string & onnx_path, const std::string & engine_path, const std::string & precision, - const size_t workspace_size) -{ - auto builder = autoware::tensorrt_common::TrtUniquePtr( - nvinfer1::createInferBuilder(logger_)); - if (!builder) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create builder" << std::endl; - return false; - } - - auto config = autoware::tensorrt_common::TrtUniquePtr( - builder->createBuilderConfig()); - if (!config) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create config" << std::endl; - return false; - } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size); -#else - config->setMaxWorkspaceSize(workspace_size); -#endif - if (precision == "fp16") { - if (builder->platformHasFastFp16()) { - autoware::tensorrt_common::LOG_INFO(logger_) << "Using TensorRT FP16 Inference" << std::endl; - config->setFlag(nvinfer1::BuilderFlag::kFP16); - } else { - autoware::tensorrt_common::LOG_INFO(logger_) - << "TensorRT FP16 Inference isn't supported in this environment" << std::endl; - } - } - - const auto flag = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - auto network = autoware::tensorrt_common::TrtUniquePtr( - builder->createNetworkV2(flag)); - if (!network) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create network" << std::endl; - return false; - } - - auto parser = autoware::tensorrt_common::TrtUniquePtr( - nvonnxparser::createParser(*network, logger_)); - parser->parseFromFile(onnx_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR)); - - if (!setProfile(*builder, *network, *config)) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to set profile" << std::endl; - return false; - } - - plan_ = autoware::tensorrt_common::TrtUniquePtr( - builder->buildSerializedNetwork(*network, *config)); - if (!plan_) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Failed to create serialized network" << std::endl; - return false; - } - engine = autoware::tensorrt_common::TrtUniquePtr( - runtime_->deserializeCudaEngine(plan_->data(), plan_->size())); - if (!engine) { - autoware::tensorrt_common::LOG_ERROR(logger_) << "Failed to create engine" << std::endl; - return false; - } - - return saveEngine(engine_path); -} - -bool NetworkTRT::saveEngine(const std::string & engine_path) -{ - autoware::tensorrt_common::LOG_INFO(logger_) << "Writing to " << engine_path << std::endl; - std::ofstream file(engine_path, std::ios::out | std::ios::binary); - file.write(reinterpret_cast(plan_->data()), plan_->size()); - return validateNetworkIO(); -} - -bool NetworkTRT::loadEngine(const std::string & engine_path) -{ - std::ifstream engine_file(engine_path); - std::stringstream engine_buffer; - engine_buffer << engine_file.rdbuf(); - std::string engine_str = engine_buffer.str(); - engine = - autoware::tensorrt_common::TrtUniquePtr(runtime_->deserializeCudaEngine( - reinterpret_cast(engine_str.data()), engine_str.size())); - autoware::tensorrt_common::LOG_INFO(logger_) << "Loaded engine from " << engine_path << std::endl; - return validateNetworkIO(); -} - -bool NetworkTRT::validateNetworkIO() -{ - // Whether the number of IO match the expected size - if (engine->getNbIOTensors() != NetworkIO::ENUM_SIZE) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid network IO. Expected size: " << NetworkIO::ENUM_SIZE - << ". Actual size: " << engine->getNbIOTensors() << "." << std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - - // Initialize tensors_names_ with null values - tensors_names_.resize(NetworkIO::ENUM_SIZE, nullptr); - - // Loop over the tensor names and place them in the correct positions - for (int i = 0; i < NetworkIO::ENUM_SIZE; ++i) { - const char * name = engine->getIOTensorName(i); - tensors_names_[nameToNetworkIO(name)] = name; - } - - // Log the network IO - std::string tensors = std::accumulate( - tensors_names_.begin(), tensors_names_.end(), std::string(), - [](const std::string & a, const std::string & b) -> std::string { return a + b + " "; }); - autoware::tensorrt_common::LOG_INFO(logger_) << "Network IO: " << tensors << std::endl; - - // Whether the current engine input profile match the config input profile - for (int i = 0; i <= NetworkIO::coors; ++i) { - ProfileDimension engine_dims{ - engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMIN), - engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kOPT), - engine->getProfileShape(tensors_names_[i], 0, nvinfer1::OptProfileSelector::kMAX)}; - - autoware::tensorrt_common::LOG_INFO(logger_) - << "Profile for " << tensors_names_[i] << ": " << engine_dims << std::endl; - - if (engine_dims != in_profile_dims_[i]) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid network input dimension. Config: " << in_profile_dims_[i] - << ". Please change the input profile or delete the engine file and build engine again." - << std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - } - - // Whether the IO tensor shapes match the network config, -1 for dynamic size - validateTensorShape( - NetworkIO::voxels, {-1, static_cast(config_.points_per_voxel_), - static_cast(config_.num_point_feature_size_)}); - validateTensorShape(NetworkIO::num_points, {-1}); - validateTensorShape(NetworkIO::coors, {-1, static_cast(config_.num_point_values_)}); - auto cls_score = validateTensorShape( - NetworkIO::cls_score, - {static_cast(config_.batch_size_), static_cast(config_.num_classes_), - static_cast(config_.num_proposals_)}); - autoware::tensorrt_common::LOG_INFO(logger_) - << "Network num classes: " << cls_score.d[1] << std::endl; - validateTensorShape( - NetworkIO::dir_pred, - {static_cast(config_.batch_size_), 2, static_cast(config_.num_proposals_)}); // x, y - validateTensorShape( - NetworkIO::bbox_pred, - {static_cast(config_.batch_size_), static_cast(config_.num_box_values_), - static_cast(config_.num_proposals_)}); - - return true; -} - -const char * NetworkTRT::getTensorName(NetworkIO name) -{ - return tensors_names_.at(name); -} - -nvinfer1::Dims NetworkTRT::validateTensorShape(NetworkIO name, const std::vector shape) -{ - auto tensor_shape = engine->getTensorShape(tensors_names_[name]); - if (tensor_shape.nbDims != static_cast(shape.size())) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid tensor shape for " << tensors_names_[name] << ". Expected size: " << shape.size() - << ". Actual size: " << tensor_shape.nbDims << "." << std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - for (int i = 0; i < tensor_shape.nbDims; ++i) { - if (tensor_shape.d[i] != static_cast(shape[i])) { - autoware::tensorrt_common::LOG_ERROR(logger_) - << "Invalid tensor shape for " << tensors_names_[name] << ". Expected: " << shape[i] - << ". Actual: " << tensor_shape.d[i] << "." << std::endl; - throw std::runtime_error("Failed to initialize TRT network."); - } - } - return tensor_shape; -} - -} // namespace autoware::lidar_transfusion diff --git a/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp b/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp index 4e100f2e794d5..339b338244d77 100644 --- a/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp +++ b/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp @@ -17,6 +17,7 @@ #include "autoware/lidar_transfusion/preprocess/preprocess_kernel.hpp" #include "autoware/lidar_transfusion/transfusion_config.hpp" +#include #include #include @@ -27,25 +28,23 @@ #include #include #include +#include #include namespace autoware::lidar_transfusion { TransfusionTRT::TransfusionTRT( - const NetworkParam & network_param, const DensificationParam & densification_param, - const TransfusionConfig & config) -: config_(config) + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, TransfusionConfig config) +: config_(std::move(config)) { - network_trt_ptr_ = std::make_unique(config_); - - network_trt_ptr_->init( - network_param.onnx_path(), network_param.engine_path(), network_param.trt_precision()); vg_ptr_ = std::make_unique(densification_param, config_, stream_); stop_watch_ptr_ = std::make_unique>(); stop_watch_ptr_->tic("processing/inner"); initPtr(); + initTrt(trt_config); CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); } @@ -99,6 +98,51 @@ void TransfusionTRT::initPtr() post_ptr_ = std::make_unique(config_, stream_); } +void TransfusionTRT::initTrt(const tensorrt_common::TrtCommonConfig & trt_config) +{ + std::vector network_io{ + autoware::tensorrt_common::NetworkIO( + "voxels", {3, {-1, config_.points_per_voxel_, config_.num_point_feature_size_}}), + autoware::tensorrt_common::NetworkIO("num_points", {1, {-1}}), + autoware::tensorrt_common::NetworkIO("coors", {2, {-1, config_.num_point_values_}}), + autoware::tensorrt_common::NetworkIO( + "cls_score0", {3, {config_.batch_size_, config_.num_classes_, config_.num_proposals_}}), + autoware::tensorrt_common::NetworkIO( + "bbox_pred0", {3, {config_.batch_size_, config_.num_box_values_, config_.num_proposals_}}), + autoware::tensorrt_common::NetworkIO( + "dir_cls_pred0", {3, {config_.batch_size_, 2, config_.num_proposals_}})}; + + std::vector profile_dims{ + autoware::tensorrt_common::ProfileDims( + "voxels", + {3, + {config_.min_voxel_size_, config_.min_point_in_voxel_size_, + config_.min_network_feature_size_}}, + {3, + {config_.opt_voxel_size_, config_.opt_point_in_voxel_size_, + config_.opt_network_feature_size_}}, + {3, + {config_.max_voxel_size_, config_.max_point_in_voxel_size_, + config_.max_network_feature_size_}}), + autoware::tensorrt_common::ProfileDims( + "num_points", {1, {static_cast(config_.min_points_size_)}}, + {1, {static_cast(config_.opt_points_size_)}}, + {1, {static_cast(config_.max_points_size_)}}), + autoware::tensorrt_common::ProfileDims( + "coors", {2, {config_.min_coors_size_, config_.min_coors_dim_size_}}, + {2, {config_.opt_coors_size_, config_.opt_coors_dim_size_}}, + {2, {config_.max_coors_size_, config_.max_coors_dim_size_}})}; + + auto network_io_ptr = + std::make_unique>(network_io); + auto profile_dims_ptr = + std::make_unique>(profile_dims); + + network_trt_ptr_ = std::make_unique(trt_config); + if (!network_trt_ptr_->setup(std::move(profile_dims_ptr), std::move(network_io_ptr))) + throw std::runtime_error("Failed to setup TRT engine."); +} + bool TransfusionTRT::detect( const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer, std::vector & det_boxes3d, std::unordered_map & proc_timing) @@ -166,8 +210,9 @@ bool TransfusionTRT::preprocess( CHECK_CUDA_ERROR(cudaMemcpyAsync( ¶ms_input, params_input_d_.get(), sizeof(unsigned int), cudaMemcpyDeviceToHost, stream_)); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + auto params_input_i32 = static_cast(params_input); - if (params_input < config_.min_voxel_size_) { + if (params_input_i32 < config_.min_voxel_size_) { RCLCPP_WARN_STREAM( rclcpp::get_logger("lidar_transfusion"), "Too few voxels (" << params_input << ") for actual optimization profile (" @@ -185,39 +230,34 @@ bool TransfusionTRT::preprocess( params_input = config_.max_voxels_; } + RCLCPP_DEBUG_STREAM( rclcpp::get_logger("lidar_transfusion"), "Generated input voxels: " << params_input); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::voxels), voxel_features_d_.get()); - network_trt_ptr_->context->setInputShape( - network_trt_ptr_->getTensorName(NetworkIO::voxels), + bool success = true; + + // Inputs + success &= network_trt_ptr_->setTensor( + "voxels", voxel_features_d_.get(), nvinfer1::Dims3{ - static_cast(params_input), static_cast(config_.max_num_points_per_pillar_), + static_cast(params_input), config_.max_num_points_per_pillar_, static_cast(config_.num_point_feature_size_)}); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::num_points), voxel_num_d_.get()); - network_trt_ptr_->context->setInputShape( - network_trt_ptr_->getTensorName(NetworkIO::num_points), - nvinfer1::Dims{1, {static_cast(params_input)}}); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::coors), voxel_idxs_d_.get()); - network_trt_ptr_->context->setInputShape( - network_trt_ptr_->getTensorName(NetworkIO::coors), - nvinfer1::Dims2{ - static_cast(params_input), static_cast(config_.num_point_values_)}); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::cls_score), cls_output_d_.get()); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::bbox_pred), box_output_d_.get()); - network_trt_ptr_->context->setTensorAddress( - network_trt_ptr_->getTensorName(NetworkIO::dir_pred), dir_cls_output_d_.get()); - return true; + success &= network_trt_ptr_->setTensor( + "num_points", voxel_num_d_.get(), nvinfer1::Dims{1, {params_input_i32}}); + success &= network_trt_ptr_->setTensor( + "coors", voxel_idxs_d_.get(), nvinfer1::Dims2{params_input_i32, config_.num_point_values_}); + + // Outputs + success &= network_trt_ptr_->setTensor("cls_score0", cls_output_d_.get()); + success &= network_trt_ptr_->setTensor("bbox_pred0", box_output_d_.get()); + success &= network_trt_ptr_->setTensor("dir_cls_pred0", dir_cls_output_d_.get()); + + return success; } bool TransfusionTRT::inference() { - auto status = network_trt_ptr_->context->enqueueV3(stream_); + auto status = network_trt_ptr_->enqueueV3(stream_); CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); if (!status) { diff --git a/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp b/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp index 8bb70a821621f..563abd97698e0 100644 --- a/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp +++ b/perception/autoware_lidar_transfusion/src/lidar_transfusion_node.cpp @@ -75,7 +75,6 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options) const float score_threshold = static_cast(this->declare_parameter("score_threshold", descriptor)); - NetworkParam network_param(onnx_path, engine_path, trt_precision); DensificationParam densification_param( densification_world_frame_id, densification_num_past_frames); TransfusionConfig config( @@ -91,7 +90,9 @@ LidarTransfusionNode::LidarTransfusionNode(const rclcpp::NodeOptions & options) detection_class_remapper_.setParameters( allow_remapping_by_area_matrix, min_area_matrix, max_area_matrix); - detector_ptr_ = std::make_unique(network_param, densification_param, config); + auto trt_config = tensorrt_common::TrtCommonConfig(onnx_path, trt_precision, engine_path); + + detector_ptr_ = std::make_unique(trt_config, densification_param, config); cloud_sub_ = this->create_subscription( "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1), @@ -155,7 +156,6 @@ void LidarTransfusionNode::cloudCallback(const sensor_msgs::msg::PointCloud2::Co objects_pub_->publish(output_msg); published_time_pub_->publish_if_subscribed(objects_pub_, output_msg.header.stamp); } - // add processing time for debug if (debug_publisher_ptr_ && stop_watch_ptr_) { const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic", true); diff --git a/perception/autoware_shape_estimation/CMakeLists.txt b/perception/autoware_shape_estimation/CMakeLists.txt index d0eb2aa5535a8..d7313111e6ade 100644 --- a/perception/autoware_shape_estimation/CMakeLists.txt +++ b/perception/autoware_shape_estimation/CMakeLists.txt @@ -4,6 +4,9 @@ project(autoware_shape_estimation) find_package(autoware_cmake REQUIRED) autoware_package() +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization +add_compile_options(-Wno-deprecated-declarations) + find_package(PCL REQUIRED COMPONENTS common) find_package(pcl_conversions REQUIRED) diff --git a/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp b/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp index d39dce65d224d..4328246ff395f 100644 --- a/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp +++ b/perception/autoware_shape_estimation/include/autoware/shape_estimation/tensorrt_shape_estimator.hpp @@ -48,11 +48,7 @@ class TrtShapeEstimator { public: TrtShapeEstimator( - const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config, - const size_t max_workspace_size = (1 << 30), - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig("MinMax", -1, false, false, false, 0.0)); + const std::string & model_path, const std::string & precision, const int batch_size); ~TrtShapeEstimator() = default; diff --git a/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp b/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp index b4b1f18832da7..fa3c3cb09bad3 100644 --- a/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp +++ b/perception/autoware_shape_estimation/lib/tensorrt_shape_estimator.cpp @@ -29,47 +29,41 @@ namespace autoware::shape_estimation { TrtShapeEstimator::TrtShapeEstimator( - const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size, - const autoware::tensorrt_common::BuildConfig build_config) + const std::string & model_path, const std::string & precision, const int batch_size) { trt_common_ = std::make_unique( - model_path, precision, nullptr, batch_config, max_workspace_size, build_config); + tensorrt_common::TrtCommonConfig(model_path, precision)); + batch_size_ = batch_size; - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - std::cerr << "Failed to initialize TensorRT" << std::endl; - return; + if (!trt_common_->setup()) { + throw std::runtime_error("Failed to setup TensorRT"); } - const auto pc_input_dims = trt_common_->getBindingDimensions(0); + const auto pc_input_dims = trt_common_->getTensorShape(0); const auto pc_input_size = std::accumulate( pc_input_dims.d + 1, pc_input_dims.d + pc_input_dims.nbDims, 1, std::multiplies()); - input_pc_d_ = autoware::cuda_utils::make_unique(pc_input_size * batch_config[2]); - batch_size_ = batch_config[2]; - const auto one_hot_input_dims = trt_common_->getBindingDimensions(1); + input_pc_d_ = autoware::cuda_utils::make_unique(pc_input_size * batch_size_); + const auto one_hot_input_dims = trt_common_->getTensorShape(1); const auto one_hot_input_size = std::accumulate( one_hot_input_dims.d + 1, one_hot_input_dims.d + one_hot_input_dims.nbDims, 1, std::multiplies()); - input_one_hot_d_ = - autoware::cuda_utils::make_unique(one_hot_input_size * batch_config[2]); + input_one_hot_d_ = autoware::cuda_utils::make_unique(one_hot_input_size * batch_size_); - const auto stage1_center_out_dims = trt_common_->getBindingDimensions(2); + const auto stage1_center_out_dims = trt_common_->getTensorShape(2); out_s1center_elem_num_ = std::accumulate( stage1_center_out_dims.d + 1, stage1_center_out_dims.d + stage1_center_out_dims.nbDims, 1, std::multiplies()); - out_s1center_elem_num_ = out_s1center_elem_num_ * batch_config[2]; - out_s1center_elem_num_per_batch_ = static_cast(out_s1center_elem_num_ / batch_config[2]); + out_s1center_elem_num_ = out_s1center_elem_num_ * batch_size_; + out_s1center_elem_num_per_batch_ = static_cast(out_s1center_elem_num_ / batch_size_); out_s1center_prob_d_ = autoware::cuda_utils::make_unique(out_s1center_elem_num_); out_s1center_prob_h_ = autoware::cuda_utils::make_unique_host(out_s1center_elem_num_, cudaHostAllocPortable); - const auto pred_out_dims = trt_common_->getBindingDimensions(3); + const auto pred_out_dims = trt_common_->getTensorShape(3); out_pred_elem_num_ = std::accumulate( pred_out_dims.d + 1, pred_out_dims.d + pred_out_dims.nbDims, 1, std::multiplies()); - out_pred_elem_num_ = out_pred_elem_num_ * batch_config[2]; - out_pred_elem_num_per_batch_ = static_cast(out_pred_elem_num_ / batch_config[2]); + out_pred_elem_num_ = out_pred_elem_num_ * batch_size_; + out_pred_elem_num_per_batch_ = static_cast(out_pred_elem_num_ / batch_size_); out_pred_prob_d_ = autoware::cuda_utils::make_unique(out_pred_elem_num_); out_pred_prob_h_ = autoware::cuda_utils::make_unique_host(out_pred_elem_num_, cudaHostAllocPortable); @@ -83,10 +77,6 @@ TrtShapeEstimator::TrtShapeEstimator( bool TrtShapeEstimator::inference( const DetectedObjectsWithFeature & input, DetectedObjectsWithFeature & output) { - if (!trt_common_->isInitialized()) { - return false; - } - bool result = false; for (size_t i = 0; i < input.feature_objects.size(); i += batch_size_) { @@ -111,13 +101,13 @@ bool TrtShapeEstimator::inference( void TrtShapeEstimator::preprocess(const DetectedObjectsWithFeature & input) { - auto input_dims_pc = trt_common_->getBindingDimensions(0); + auto input_dims_pc = trt_common_->getTensorShape(0); int batch_size = static_cast(input.feature_objects.size()); const auto input_chan = static_cast(input_dims_pc.d[1]); const auto input_pc_size = static_cast(input_dims_pc.d[2]); - auto input_dims_one_hot = trt_common_->getBindingDimensions(1); + auto input_dims_one_hot = trt_common_->getTensorShape(1); const auto input_one_hot_size = static_cast(input_dims_one_hot.d[1]); int volume_pc = batch_size * input_chan * input_pc_size; @@ -229,7 +219,10 @@ bool TrtShapeEstimator::feed_forward_and_decode( int batch_size = static_cast(input.feature_objects.size()); std::vector buffers = { input_pc_d_.get(), input_one_hot_d_.get(), out_s1center_prob_d_.get(), out_pred_prob_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); CHECK_CUDA_ERROR(cudaMemcpyAsync( out_s1center_prob_h_.get(), out_s1center_prob_d_.get(), out_s1center_elem_num_ * sizeof(float), diff --git a/perception/autoware_shape_estimation/src/shape_estimation_node.cpp b/perception/autoware_shape_estimation/src/shape_estimation_node.cpp index 67402f68d63ef..c5450dc14adb0 100644 --- a/perception/autoware_shape_estimation/src/shape_estimation_node.cpp +++ b/perception/autoware_shape_estimation/src/shape_estimation_node.cpp @@ -63,9 +63,8 @@ ShapeEstimationNode::ShapeEstimationNode(const rclcpp::NodeOptions & node_option min_points_ = declare_parameter("model_params.minimum_points"); std::string precision = declare_parameter("model_params.precision"); int batch_size = declare_parameter("model_params.batch_size"); - autoware::tensorrt_common::BatchConfig batch_config{batch_size, batch_size, batch_size}; tensorrt_shape_estimator_ = - std::make_unique(model_path, precision, batch_config); + std::make_unique(model_path, precision, batch_size); if (this->declare_parameter("model_params.build_only", false)) { RCLCPP_INFO(this->get_logger(), "TensorRT engine is built and shutdown node."); rclcpp::shutdown(); diff --git a/perception/autoware_tensorrt_classifier/CMakeLists.txt b/perception/autoware_tensorrt_classifier/CMakeLists.txt index 88747a1e9240c..59ce7b18afcc3 100644 --- a/perception/autoware_tensorrt_classifier/CMakeLists.txt +++ b/perception/autoware_tensorrt_classifier/CMakeLists.txt @@ -6,6 +6,9 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O3 -Wno-write-strings -fopen find_package(autoware_cmake REQUIRED) autoware_package() +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization +add_compile_options(-Wno-deprecated-declarations) + find_package(CUDA) find_package(CUDNN) find_package(TENSORRT) diff --git a/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp b/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp index ee16343b956d1..4f7a5865c72ae 100644 --- a/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp +++ b/perception/autoware_tensorrt_classifier/include/autoware/tensorrt_classifier/tensorrt_classifier.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -44,22 +45,18 @@ class TrtClassifier * @param[in] mode_path ONNX model_path * @param[in] precision precision for inference * @param[in] calibration_images path for calibration files (only require for quantization) - * @param[in] batch_config configuration for batched execution - * @param[in] max_workspace_size maximum workspace for building TensorRT engine * @param[in] mean mean for preprocess * @param[in] std std for preprocess - * @param[in] buildConfig configuration including precision, calibration method, dla, remaining - * fp16 for first layer, remaining fp16 for last layer and profiler for builder + * @param[in] calib_config calibration configuration * @param[in] cuda whether use cuda gpu for preprocessing */ TrtClassifier( const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config = {1, 1, 1}, const std::vector & mean = {123.675, 116.28, 103.53}, const std::vector & std = {58.395, 57.12, 57.375}, - const size_t max_workspace_size = (1 << 30), const std::string & calibration_images = "", - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig("MinMax", -1, false, false, false, 0.0), + const std::string & calibration_images = "", + const tensorrt_common::CalibrationConfig & calibration_config = + tensorrt_common::CalibrationConfig("MinMax", false, false, 0.0), const bool cuda = false); /** * @brief Deconstruct TrtClassifier @@ -84,6 +81,12 @@ class TrtClassifier */ void initPreprocessBuffer(int width, int height); + /** + * @brief get batch size + * @return batch size + */ + [[nodiscard]] int getBatchSize() const; + private: /** * @brief run preprocess including resizing, letterbox, BGR2RGB, NHWC2NCHW and toFloat on CPU @@ -102,7 +105,7 @@ class TrtClassifier const std::vector & images, std::vector & results, std::vector & probabilities); - std::unique_ptr trt_common_; + std::unique_ptr trt_common_; std::vector input_h_; CudaUniquePtr input_d_; diff --git a/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp b/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp index b24e4fe56e8b8..798e7e3bf926c 100644 --- a/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp +++ b/perception/autoware_tensorrt_classifier/src/tensorrt_classifier.cpp @@ -100,11 +100,9 @@ std::vector loadImageList(const std::string & filename, const std:: namespace autoware::tensorrt_classifier { TrtClassifier::TrtClassifier( - const std::string & model_path, const std::string & precision, - const autoware::tensorrt_common::BatchConfig & batch_config, const std::vector & mean, - const std::vector & std, const size_t max_workspace_size, - const std::string & calibration_image_list_path, - autoware::tensorrt_common::BuildConfig build_config, const bool cuda) + const std::string & model_path, const std::string & precision, const std::vector & mean, + const std::vector & std, const std::string & calibration_image_list_path, + const tensorrt_common::CalibrationConfig & calib_config, const bool cuda) { src_width_ = -1; src_height_ = -1; @@ -114,22 +112,36 @@ TrtClassifier::TrtClassifier( for (size_t i = 0; i < inv_std_.size(); i++) { inv_std_[i] = 1.0 / inv_std_[i]; } - batch_size_ = batch_config[2]; + trt_common_ = std::make_unique( + tensorrt_common::TrtCommonConfig(model_path, precision)); + + const auto network_input_dims = trt_common_->getTensorShape(0); + batch_size_ = network_input_dims.d[0]; + const auto input_channel = network_input_dims.d[1]; + const auto input_height = network_input_dims.d[2]; + const auto input_width = network_input_dims.d[3]; + + std::vector profile_dims{ + autoware::tensorrt_common::ProfileDims( + 0, {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, input_width}})}; + auto profile_dims_ptr = + std::make_unique>(profile_dims); + if (precision == "int8") { - int max_batch_size = batch_config[2]; - nvinfer1::Dims input_dims = autoware::tensorrt_common::get_input_dims(model_path); std::vector calibration_images; if (calibration_image_list_path != "") { calibration_images = loadImageList(calibration_image_list_path, ""); } autoware::tensorrt_classifier::ImageStream stream( - max_batch_size, input_dims, calibration_images); + batch_size_, network_input_dims, calibration_images); fs::path calibration_table{model_path}; std::string ext = ""; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { ext = "EntropyV2-"; } else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { ext = "Legacy-"; } else { ext = "MinMax-"; @@ -141,11 +153,11 @@ TrtClassifier::TrtClassifier( histogram_table.replace_extension(ext); std::unique_ptr calibrator; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { calibrator.reset(new autoware::tensorrt_classifier::Int8EntropyCalibrator( stream, calibration_table, mean_, std_)); } else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { double quantile = 0.999999; double cutoff = 0.999999; calibrator.reset(new autoware::tensorrt_classifier::Int8LegacyCalibrator( @@ -154,29 +166,27 @@ TrtClassifier::TrtClassifier( calibrator.reset(new autoware::tensorrt_classifier::Int8MinMaxCalibrator( stream, calibration_table, mean_, std_)); } - trt_common_ = std::make_unique( - model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config); + if (!trt_common_->setupWithCalibrator( + std::move(calibrator), calib_config, std::move((profile_dims_ptr)))) { + throw std::runtime_error("Failed to setup TensorRT engine with calibrator"); + } } else { - trt_common_ = std::make_unique( - model_path, precision, nullptr, batch_config, max_workspace_size, build_config); - } - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - return; + if (!trt_common_->setup(std::move(profile_dims_ptr))) { + throw std::runtime_error("Failed to setup TensorRT engine"); + } } // GPU memory allocation - const auto input_dims = trt_common_->getBindingDimensions(0); + const auto input_dims = trt_common_->getTensorShape(0); const auto input_size = std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies()); - const auto output_dims = trt_common_->getBindingDimensions(1); - input_d_ = autoware::cuda_utils::make_unique(batch_config[2] * input_size); + const auto output_dims = trt_common_->getTensorShape(1); + input_d_ = autoware::cuda_utils::make_unique(batch_size_ * input_size); out_elem_num_ = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); - out_elem_num_ = out_elem_num_ * batch_config[2]; - out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_config[2]); + out_elem_num_ = out_elem_num_ * batch_size_; + out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_size_); out_prob_d_ = autoware::cuda_utils::make_unique(out_elem_num_); out_prob_h_ = autoware::cuda_utils::make_unique_host(out_elem_num_, cudaHostAllocPortable); @@ -218,7 +228,7 @@ void TrtClassifier::initPreprocessBuffer(int width, int height) src_width_ = width; src_height_ = height; if (m_cuda) { - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); bool const hasRuntimeDim = std::any_of( input_dims.d, input_dims.d + input_dims.nbDims, [](int32_t input_dim) { return input_dim == -1; }); @@ -226,7 +236,9 @@ void TrtClassifier::initPreprocessBuffer(int width, int height) input_dims.d[0] = batch_size_; } if (!h_img_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } if (!h_img_) { CHECK_CUDA_ERROR(cudaMallocHost( @@ -239,10 +251,15 @@ void TrtClassifier::initPreprocessBuffer(int width, int height) } } +int TrtClassifier::getBatchSize() const +{ + return batch_size_; +} + void TrtClassifier::preprocessGpu(const std::vector & images) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; for (const auto & image : images) { @@ -266,7 +283,9 @@ void TrtClassifier::preprocessGpu(const std::vector & images) src_height_ = height; } if (!h_img_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -299,9 +318,11 @@ void TrtClassifier::preprocessGpu(const std::vector & images) void TrtClassifier::preprocess_opt(const std::vector & images) { int batch_size = static_cast(images.size()); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_chan = static_cast(input_dims.d[1]); const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -348,9 +369,6 @@ bool TrtClassifier::doInference( const std::vector & images, std::vector & results, std::vector & probabilities) { - if (!trt_common_->isInitialized()) { - return false; - } preprocess_opt(images); return feedforwardAndDecode(images, results, probabilities); @@ -363,7 +381,10 @@ bool TrtClassifier::feedforwardAndDecode( results.clear(); probabilities.clear(); std::vector buffers = {input_d_.get(), out_prob_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); int batch_size = static_cast(images.size()); diff --git a/perception/autoware_tensorrt_common/CMakeLists.txt b/perception/autoware_tensorrt_common/CMakeLists.txt index 3105523a509e3..d7d8026866124 100644 --- a/perception/autoware_tensorrt_common/CMakeLists.txt +++ b/perception/autoware_tensorrt_common/CMakeLists.txt @@ -16,9 +16,14 @@ if(NOT (CUDAToolkit_FOUND AND CUDNN_FOUND AND TENSORRT_FOUND)) return() endif() +if(TENSORRT_VERSION VERSION_LESS 8.5) + message(WARNING "Unsupported version TensorRT ${TENSORRT_VERSION} detected. This package requires TensorRT 8.5 or later.") + return() +endif() + add_library(${PROJECT_NAME} SHARED src/tensorrt_common.cpp - src/simple_profiler.cpp + src/profiler.cpp ) target_link_libraries(${PROJECT_NAME} @@ -42,7 +47,7 @@ set_target_properties(${PROJECT_NAME} CXX_EXTENSIONS NO ) -# TODO(autoware_tensorrt_common): Remove -Wno-deprecated-declarations once upgrading to TensorRT 8.5 is complete +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -Wpedantic -Werror -Wno-deprecated-declarations ) diff --git a/perception/autoware_tensorrt_common/README.md b/perception/autoware_tensorrt_common/README.md index c828be58c1c1c..cf19b187337b2 100644 --- a/perception/autoware_tensorrt_common/README.md +++ b/perception/autoware_tensorrt_common/README.md @@ -1,6 +1,70 @@ # autoware_tensorrt_common -## Purpose +This package provides a high-level API to work with TensorRT. This library simplifies the process of loading, building, and executing TensorRT inference engines using ONNX models. It also includes utilities for profiling and managing TensorRT execution contexts, making it easier to integrate TensorRT-based packages in Autoware. -This package contains a library of common functions related to TensorRT. -This package may include functions for handling TensorRT engine and calibration algorithm used for quantization +## Usage + +Here is example usage of the library. For full API documentation, please refer to the doxygen documentation (see header [file](include/autoware/tensorrt_common/tensorrt_common.hpp)). + +```c++ +#include + +#include +#include +#include + +using autoware::tensorrt_common::TrtCommon; +using autoware::tensorrt_common::TrtCommonConfig; +using autoware::tensorrt_common::TensorInfo; +using autoware::tensorrt_common::NetworkIO; +using autoware::tensorrt_common::ProfileDims; + +std::unique_ptr trt_common_; +``` + +### Create a tensorrt common instance and setup engine + +- With minimal configuration. + +```c++ +trt_common_ = std::make_unique(TrtCommonConfig("/path/to/onnx/model.onnx")); +trt_common_->setup(); +``` + +- With full configuration. + +```c++ +trt_common_ = std::make_unique(TrtCommonConfig("/path/to/onnx/model.onnx", "fp16", "/path/to/engine/model.engine", (1ULL << 30U), -1, false)); + +std::vector network_io{ + NetworkIO("sample_input", {3, {-1, 64, 512}}), NetworkIO("sample_output", {1, {50}})}; +std::vector profile_dims{ + ProfileDims("sample_input", {3, {1, 64, 512}}, {3, {3, 64, 512}}, {3, {9, 64, 512}})}; + +auto network_io_ptr = std::make_unique>(network_io); +auto profile_dims_ptr = std::make_unique>(profile_dims); + +trt_common_->setup(std::move(profile_dims_ptr), std::move(network_io_ptr)); +``` + +By defining network IO names and dimensions, an extra shapes validation will be performed after building / loading engine. This is useful to ensure the model is compatible with current code for preprocessing as it might consists of operations dependent on tensor shapes. + +Profile dimension is used to specify the min, opt, and max dimensions for dynamic shapes. + +Network IO or / and profile dimensions can be omitted if not needed. + +### Setting input and output tensors + +```c++ +bool success = true; +success &= trt_common_->setTensor("sample_input", sample_input_d_.get(), nvinfer1::Dims{3, {var_size, 64, 512}}); +success &= trt_common_->setTensor("sample_output", sample_output_d_.get()); +return success; +``` + +### Execute inference + +```c++ +auto success = trt_common_->enqueueV3(stream_); +return success; +``` diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp new file mode 100644 index 0000000000000..4e567ba0b0e18 --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -0,0 +1,107 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_COMMON__CONV_PROFILER_HPP_ +#define AUTOWARE__TENSORRT_COMMON__CONV_PROFILER_HPP_ + +#include + +#include + +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ +/** + * @struct LayerInfo + * @brief Information of a layer. + */ +struct LayerInfo +{ + //! @brief Input channel. + int in_c; + //! @brief Output channel. + int out_c; + //! @brief Width. + int w; + //! @brief Height. + int h; + //! @brief Kernel size. + int k; + //! @brief Stride. + int stride; + //! @brief Number of groups. + int groups; + //! @brief Layer type. + nvinfer1::LayerType type; +}; + +/** + * @class ConvProfiler + * @brief Collect per-layer profile information, assuming times are reported in the same order. + */ +class ConvProfiler : public tensorrt_common::Profiler +{ +public: + /** + * @brief Construct Profiler for convolutional layers. + * + * @param[in] src_profilers Source profilers to merge. + */ + explicit ConvProfiler( + const std::vector & src_profilers = std::vector()) + : Profiler(src_profilers){}; + + /** + * @brief Set per-layer profile information for model. + * + * @param[in] layer Layer to set profile information. + */ + void setProfDict(nvinfer1::ILayer * layer) noexcept final + { + std::string name = layer->getName(); + layer_dict_[name]; + layer_dict_[name].type = layer->getType(); + if (layer->getType() == nvinfer1::LayerType::kCONVOLUTION) { + auto conv = dynamic_cast(layer); + nvinfer1::ITensor * in = layer->getInput(0); + nvinfer1::Dims dim_in = in->getDimensions(); + nvinfer1::ITensor * out = layer->getOutput(0); + nvinfer1::Dims dim_out = out->getDimensions(); + nvinfer1::Dims k_dims = conv->getKernelSizeNd(); + nvinfer1::Dims s_dims = conv->getStrideNd(); + int groups = conv->getNbGroups(); + int stride = s_dims.d[0]; + int kernel = k_dims.d[0]; + layer_dict_[name].in_c = dim_in.d[1]; + layer_dict_[name].out_c = dim_out.d[1]; + layer_dict_[name].w = dim_in.d[3]; + layer_dict_[name].h = dim_in.d[2]; + layer_dict_[name].k = kernel; + layer_dict_[name].stride = stride; + layer_dict_[name].groups = groups; + } + } + +private: + //! @brief Per-layer information. + std::map layer_dict_; +}; +} // namespace tensorrt_common +} // namespace autoware +#endif // AUTOWARE__TENSORRT_COMMON__CONV_PROFILER_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp index 5aa897983af72..b9b9048cd8204 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/logger.hpp @@ -17,18 +17,19 @@ #ifndef AUTOWARE__TENSORRT_COMMON__LOGGER_HPP_ #define AUTOWARE__TENSORRT_COMMON__LOGGER_HPP_ -#include "NvInferRuntimeCommon.h" +#include #include #include +#include #include #include -#include #include #include #include #include #include +#include namespace autoware { @@ -46,7 +47,7 @@ class LogStreamConsumerBuffer : public std::stringbuf LogStreamConsumerBuffer(LogStreamConsumerBuffer && other) : mOutput(other.mOutput) {} - ~LogStreamConsumerBuffer() + ~LogStreamConsumerBuffer() override { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output // sequence std::streambuf::pptr() gives a pointer to the current position of the output @@ -60,7 +61,7 @@ class LogStreamConsumerBuffer : public std::stringbuf // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream - virtual int sync() + int sync() override { putOutput(); return 0; @@ -252,6 +253,37 @@ class Logger : public nvinfer1::ILogger // NOLINT } } + void log(Severity severity, const char * msg, ...) const noexcept + { + if (mVerbose) { + va_list args; + va_start(args, msg); + + // Buffer size + va_list args_copy; + va_copy(args_copy, args); + int required_size = std::vsnprintf(nullptr, 0, msg, args_copy); + va_end(args_copy); + + // Formatting error + if (required_size < 0) { + log(Severity::kINTERNAL_ERROR, "Error formatting log message"); + va_end(args); + return; + } + + // Format msg + std::vector buffer(required_size + 1); + std::vsnprintf(buffer.data(), buffer.size(), msg, args); + + va_end(args); // End variadic argument processing + + // Send the formatted message to LogStreamConsumer + LogStreamConsumer(mReportableSeverity, severity) + << "[TRT] " << std::string(buffer.data()) << std::endl; + } + } + /** * @brief Logging with throttle. * diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp new file mode 100644 index 0000000000000..d82eab8c5f89b --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp @@ -0,0 +1,104 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_COMMON__PROFILER_HPP_ +#define AUTOWARE__TENSORRT_COMMON__PROFILER_HPP_ + +#include + +#include +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ + +/** + * @class Profiler + * @brief Collect per-layer profile information, assuming times are reported in the same order. + */ +class Profiler : public nvinfer1::IProfiler +{ +public: + /** + * @struct Record + * @brief Record of layer profile information. + */ + struct Record + { + float time{0}; + int count{0}; + float min_time{-1.0}; + int index; + }; + + /** + * @brief Construct Profiler. + * + * @param[in] src_profilers Source profilers to merge. + */ + Profiler(const std::vector & src_profilers = std::vector()); + + /** + * @brief Report layer time. + * + * @param[in] layerName Layer name. + * @param[in] ms Time in milliseconds. + */ + void reportLayerTime(const char * layerName, float ms) noexcept final; + + /** + * @brief Get printable representation of Profiler. + * + * @return String representation for current state of Profiler. + */ + [[nodiscard]] std::string toString() const; + + /** + * @brief Set per-layer profile information for model. + * + * @param[in] layer Layer to set profile information. + */ + virtual void setProfDict([[maybe_unused]] nvinfer1::ILayer * layer) noexcept {}; + + /** + * @brief Get printable per-layer profile information for model. + * + * @param[in] format Format for layer information. + * @return Layer information. + */ + std::string getLayerInformation(nvinfer1::LayerInformationFormat format); + + /** + * @brief Output Profiler to ostream. + * + * @param[out] out Output stream. + * @param[in] value Profiler to output. + * @return Output stream. + */ + friend std::ostream & operator<<(std::ostream & out, const Profiler & value); + +private: + //!< @brief Profile information for layers. + std::map profile_; + + //!< @brief Index for layer. + int index_; +}; +} // namespace tensorrt_common +} // namespace autoware +#endif // AUTOWARE__TENSORRT_COMMON__PROFILER_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/simple_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/simple_profiler.hpp deleted file mode 100644 index a8d504fb2861a..0000000000000 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/simple_profiler.hpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef AUTOWARE__TENSORRT_COMMON__SIMPLE_PROFILER_HPP_ -#define AUTOWARE__TENSORRT_COMMON__SIMPLE_PROFILER_HPP_ - -#include - -#include -#include -#include -#include - -namespace autoware -{ -namespace tensorrt_common -{ -struct LayerInfo -{ - int in_c; - int out_c; - int w; - int h; - int k; - int stride; - int groups; - nvinfer1::LayerType type; -}; - -/** - * @class Profiler - * @brief Collect per-layer profile information, assuming times are reported in the same order - */ -class SimpleProfiler : public nvinfer1::IProfiler -{ -public: - struct Record - { - float time{0}; - int count{0}; - float min_time{-1.0}; - int index; - }; - SimpleProfiler( - std::string name, - const std::vector & src_profilers = std::vector()); - - void reportLayerTime(const char * layerName, float ms) noexcept override; - - void setProfDict(nvinfer1::ILayer * layer) noexcept; - - friend std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value); - -private: - std::string m_name; - std::map m_profile; - int m_index; - std::map m_layer_dict; -}; -} // namespace tensorrt_common -} // namespace autoware -#endif // AUTOWARE__TENSORRT_COMMON__SIMPLE_PROFILER_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp index 2b3b3f02f315f..6393a34c36d7a 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp @@ -15,226 +15,379 @@ #ifndef AUTOWARE__TENSORRT_COMMON__TENSORRT_COMMON_HPP_ #define AUTOWARE__TENSORRT_COMMON__TENSORRT_COMMON_HPP_ -#ifndef YOLOX_STANDALONE -#include - -#include -#endif +#include "autoware/tensorrt_common/logger.hpp" +#include "autoware/tensorrt_common/profiler.hpp" +#include "autoware/tensorrt_common/utils.hpp" #include #include -#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC_MAJOR__))) -#include -namespace fs = ::std::filesystem; -#else -#include -namespace fs = ::std::experimental::filesystem; -#endif - -#include -#include - #include -#include #include +#include +#include #include namespace autoware { namespace tensorrt_common { -/** - * @struct BuildConfig - * @brief Configuration to provide fine control regarding TensorRT builder - */ -struct BuildConfig -{ - // type for calibration - std::string calib_type_str; - - // DLA core ID that the process uses - int dla_core_id; - - // flag for partial quantization in first layer - bool quantize_first_layer; // For partial quantization - - // flag for partial quantization in last layer - bool quantize_last_layer; // For partial quantization - - // flag for per-layer profiler using IProfiler - bool profile_per_layer; - - // clip value for implicit quantization - double clip_value; // For implicit quantization - - // Supported calibration type - const std::array valid_calib_type = {"Entropy", "Legacy", "Percentile", "MinMax"}; - - BuildConfig() - : calib_type_str("MinMax"), - dla_core_id(-1), - quantize_first_layer(false), - quantize_last_layer(false), - profile_per_layer(false), - clip_value(0.0) - { - } - - explicit BuildConfig( - const std::string & calib_type_str, const int dla_core_id = -1, - const bool quantize_first_layer = false, const bool quantize_last_layer = false, - const bool profile_per_layer = false, const double clip_value = 0.0) - : calib_type_str(calib_type_str), - dla_core_id(dla_core_id), - quantize_first_layer(quantize_first_layer), - quantize_last_layer(quantize_last_layer), - profile_per_layer(profile_per_layer), - clip_value(clip_value) - { -#ifndef YOLOX_STANDALONE - if ( - std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) == - valid_calib_type.end()) { - std::stringstream message; - message << "Invalid calibration type was specified: " << calib_type_str << std::endl - << "Valid value is one of: [Entropy, (Legacy | Percentile), MinMax]" << std::endl - << "Default calibration type will be used: MinMax" << std::endl; - std::cerr << message.str(); - } -#endif - } -}; - -nvinfer1::Dims get_input_dims(const std::string & onnx_file_path); - -const std::array valid_precisions = {"fp32", "fp16", "int8"}; -bool is_valid_precision_string(const std::string & precision); - template struct InferDeleter // NOLINT { - void operator()(T * obj) const - { - if (obj) { -#if TENSORRT_VERSION_MAJOR >= 8 - delete obj; -#else - obj->destroy(); -#endif - } - } + void operator()(T * obj) const { delete obj; } }; template using TrtUniquePtr = std::unique_ptr>; -using BatchConfig = std::array; +using NetworkIOPtr = std::unique_ptr>; +using ProfileDimsPtr = std::unique_ptr>; +using TensorsVec = std::vector>; +using TensorsMap = std::unordered_map>; /** * @class TrtCommon - * @brief TensorRT common library + * @brief TensorRT common library. */ class TrtCommon // NOLINT { public: /** * @brief Construct TrtCommon. - * @param[in] mode_path ONNX model_path - * @param[in] precision precision for inference - * @param[in] calibrator pointer for any type of INT8 calibrator - * @param[in] batch_config configuration for batched execution - * @param[in] max_workspace_size maximum workspace for building TensorRT engine - * @param[in] buildConfig configuration including precision, calibration method, dla, remaining - * fp16 for first layer, remaining fp16 for last layer and profiler for builder - * @param[in] plugin_paths path for custom plugin + * + * @param[in] trt_config Base configuration with ONNX model path as minimum required. + * parameter. + * @param[in] profiler Per-layer profiler. + * @param[in] plugin_paths Paths for TensorRT plugins. */ TrtCommon( - const std::string & model_path, const std::string & precision, - std::unique_ptr calibrator = nullptr, - const BatchConfig & batch_config = {1, 1, 1}, const size_t max_workspace_size = (16 << 20), - const BuildConfig & buildConfig = BuildConfig(), + const TrtCommonConfig & trt_config, + const std::shared_ptr & profiler = std::make_shared(), const std::vector & plugin_paths = {}); - /** * @brief Deconstruct TrtCommon */ ~TrtCommon(); /** - * @brief Load TensorRT engine - * @param[in] engine_file_path path for a engine file - * @return flag for whether loading are succeeded or failed + * @brief Setup for TensorRT execution including building and loading engine. + * + * @param[in] profile_dims Optimization profile of tensors for dynamic shapes. + * @param[in] network_io Network input/output tensors information. + * @return Whether setup is successful. */ - bool loadEngine(const std::string & engine_file_path); + [[nodiscard]] virtual bool setup( + ProfileDimsPtr profile_dims = nullptr, NetworkIOPtr network_io = nullptr); /** - * @brief Output layer information including GFLOPs and parameters - * @param[in] onnx_file_path path for a onnx file - * @warning This function is based on darknet log. + * @brief Get TensorRT engine precision. + * + * @return string representation of TensorRT engine precision. */ - void printNetworkInfo(const std::string & onnx_file_path); + [[nodiscard]] std::string getPrecision() const; /** - * @brief build TensorRT engine from ONNX - * @param[in] onnx_file_path path for a onnx file - * @param[in] output_engine_file_path path for a engine file + * @brief Get tensor name by index from TensorRT engine with fallback from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor name. */ - bool buildEngineFromOnnx( - const std::string & onnx_file_path, const std::string & output_engine_file_path); + [[nodiscard]] const char * getIOTensorName(const int32_t index) const; /** - * @brief setup for TensorRT execution including building and loading engine + * @brief Get number of IO tensors from TensorRT engine with fallback from TensorRT network. + * + * @return Number of IO tensors. */ - void setup(); + [[nodiscard]] int32_t getNbIOTensors() const; -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 - void setupBindings(std::vector & bindings); -#endif + /** + * @brief Get tensor shape by index from TensorRT engine with fallback from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getTensorShape(const int32_t index) const; - bool isInitialized(); + /** + * @brief Get tensor shape by name from TensorRT engine. + * + * @param[in] tensor_name Tensor name. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getTensorShape(const char * tensor_name) const; - nvinfer1::Dims getBindingDimensions(const int32_t index) const; - int32_t getNbBindings(); - bool setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const; -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 - bool enqueueV3(cudaStream_t stream); -#endif + /** + * @brief Get input tensor shape by index from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getInputDims(const int32_t index) const; + + /** + * @brief Get output tensor shape by index from TensorRT network. + * + * @param[in] index Tensor index. + * @return Tensor shape. + */ + [[nodiscard]] nvinfer1::Dims getOutputDims(const int32_t index) const; + + /** + * @brief Set tensor address by index via TensorRT context. + * + * @param[in] index Tensor index. + * @param[in] data Tensor pointer. + * @return Whether setting tensor address is successful. + */ + bool setTensorAddress(const int32_t index, void * data); + + /** + * @brief Set tensor address by name via TensorRT context. + * + * @param[in] tensor_name Tensor name. + * @param[in] data Tensor pointer. + * @return Whether setting tensor address is successful. + */ + bool setTensorAddress(const char * tensor_name, void * data); + + /** + * @brief Set tensors addresses by indices via TensorRT context. + * + * @param[in] tensors Tensors pointers. + * @return Whether setting tensors addresses is successful. + */ + bool setTensorsAddresses(std::vector & tensors); + + /** + * @brief Set tensors addresses by names via TensorRT context. + * + * @param[in] tensors Tensors pointers. + * @return Whether setting tensors addresses is successful. + */ + bool setTensorsAddresses(std::unordered_map & tensors); + + /** + * @brief Set input shape by index via TensorRT context. + * + * @param[in] index Tensor index. + * @param[in] dimensions Tensor dimensions. + * @return Whether setting input shape is successful. + */ + bool setInputShape(const int32_t index, const nvinfer1::Dims & dimensions); + + /** + * @brief Set input shape by name via TensorRT context. + * + * @param[in] tensor_name Tensor name. + * @param[in] dimensions Tensor dimensions. + * @return Whether setting input shape is successful. + */ + bool setInputShape(const char * tensor_name, const nvinfer1::Dims & dimensions); + + /** + * @brief Set inputs shapes by indices via TensorRT context. + * + * @param[in] dimensions Vector of tensor dimensions with corresponding indices. + * @return Whether setting input shapes is successful. + */ + bool setInputsShapes(const std::vector & dimensions); + + /** + * @brief Set inputs shapes by names via TensorRT context. + * + * @param[in] dimensions Map of tensor dimensions with corresponding names as keys. + * @return Whether setting input shapes is successful. + */ + bool setInputsShapes(const std::unordered_map & dimensions); + + /** + * @brief Set tensor (address and shape) by index via TensorRT context. + * + * @param[in] index Tensor index. + * @param[in] data Tensor pointer. + * @param[in] dimensions Tensor dimensions. + * @return Whether setting tensor is successful. + */ + bool setTensor(const int32_t index, void * data, nvinfer1::Dims dimensions = {}); + + /** + * @brief Set tensor (address and shape) by name via TensorRT context. + * + * @param[in] tensor_name Tensor name. + * @param[in] data Tensor pointer. + * @param[in] dimensions Tensor dimensions. + * @return Whether setting tensor is successful. + */ + bool setTensor(const char * tensor_name, void * data, nvinfer1::Dims dimensions = {}); + + /** + * @brief Set tensors (addresses and shapes) by indices via TensorRT context. + * + * @param[in] tensors Vector of tensor pointers and dimensions with corresponding indices. + * @return Whether setting tensors is successful. + */ + bool setTensors(TensorsVec & tensors); + + /** + * @brief Set tensors (addresses and shapes) by names via TensorRT context. + * + * @param[in] tensors Map of tensor pointers and dimensions with corresponding names as keys. + * @return Whether setting tensors is successful. + */ + bool setTensors(TensorsMap & tensors); + + /** + * @brief Get per-layer profiler for model. + * + * @return Per-layer profiler. + */ + [[nodiscard]] std::shared_ptr getModelProfiler(); + + /** + * @brief Get per-layer profiler for host. + * + * @return Per-layer profiler. + */ + [[nodiscard]] std::shared_ptr getHostProfiler(); + + /** + * @brief Get TensorRT common configuration. + * + * @return TensorRT common configuration. + */ + [[nodiscard]] std::shared_ptr getTrtCommonConfig(); + + /** + * @brief Get TensorRT builder configuration. + * + * @return TensorRT builder configuration. + */ + [[nodiscard]] std::shared_ptr getBuilderConfig(); + + /** + * @brief Get TensorRT network definition. + * + * @return TensorRT network definition. + */ + [[nodiscard]] std::shared_ptr getNetwork(); + + /** + * @brief Get TensorRT logger. + * + * @return TensorRT logger. + */ + [[nodiscard]] std::shared_ptr getLogger(); + + /** + * @brief Execute inference via TensorRT context. + * + * @param[in] bindings input/output tensor addresses. + * @param[in] stream CUDA stream. + * @param[in] input_consumed Signal whether input buffers can be refilled with new data + * @return Whether inference is successful. + */ #if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 bool enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed); #endif /** - * @brief output per-layer information + * @brief Execute inference via TensorRT context. + * + * @param[in] stream CUDA stream. + * @return Whether inference is successful. */ - void printProfiling(void); + bool enqueueV3(cudaStream_t stream); -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 /** - * @brief get per-layer information for trt-engine-profiler + * @brief Print per-layer information. */ - std::string getLayerInformation(nvinfer1::LayerInformationFormat format); -#endif + void printProfiling() const; private: - Logger logger_; - fs::path model_file_path_; + /** + * @brief Initialize TensorRT common. + * + * @return Whether initialization is successful. + */ + [[nodiscard]] bool initialize(); + + /** + * @brief Get per-layer information for trt-engine-profiler. + * + * @param[in] format Format for layer information. + * @return Layer information. + */ + std::string getLayerInformation(nvinfer1::LayerInformationFormat format); + + /** + + * @brief Build TensorRT engine from ONNX. + * + * @return Whether building engine is successful. + */ + bool buildEngineFromOnnx(); + + /** + * @brief Load TensorRT engine. + * + * @return Whether loading engine is successful. + */ + bool loadEngine(); + + /** + * @brief Validate network input/output names and dimensions. + * + * @return Whether network input/output is valid. + */ + bool validateNetworkIO(); + + /** + * @brief Validate optimization profile. + * + * @return Whether optimization profile is valid. + */ + bool validateProfileDims(); + + //! @brief TensorRT runtime. TrtUniquePtr runtime_; + + //! @brief TensorRT engine. TrtUniquePtr engine_; + + //! @brief TensorRT execution context. TrtUniquePtr context_; - std::unique_ptr calibrator_; - std::string precision_; - BatchConfig batch_config_; - size_t max_workspace_size_; - bool is_initialized_{false}; + //! @brief TensorRT builder. + TrtUniquePtr builder_; + + //! @brief TensorRT engine parser. + TrtUniquePtr parser_; + + //! @brief TensorRT builder configuration. + std::shared_ptr builder_config_; + + //! @brief TensorRT network definition. + std::shared_ptr network_; + + //! @brief TrtCommon library base configuration. + std::shared_ptr trt_config_; + + //! @brief TensorRT logger. + std::shared_ptr logger_; + + //! @brief Per-layer profiler for host. + std::shared_ptr host_profiler_; + + //! @brief Per-layer profiler for model. + std::shared_ptr model_profiler_; - // profiler for per-layer - SimpleProfiler model_profiler_; - // profiler for whole model - SimpleProfiler host_profiler_; + //! @brief Model optimization profile. + ProfileDimsPtr profile_dims_; - std::unique_ptr build_config_; + //! @brief Model network input/output tensors information. + NetworkIOPtr network_io_; }; } // namespace tensorrt_common diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp new file mode 100644 index 0000000000000..2b87bb08dc22a --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp @@ -0,0 +1,240 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_COMMON__TENSORRT_CONV_CALIB_HPP_ +#define AUTOWARE__TENSORRT_COMMON__TENSORRT_CONV_CALIB_HPP_ + +#include +#include +#include + +#include + +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ +template +bool contain(const std::string & s, const T & v) +{ + return s.find(v) != std::string::npos; +} + +using autoware::tensorrt_common::CalibrationConfig; +using autoware::tensorrt_common::NetworkIOPtr; +using autoware::tensorrt_common::ProfileDimsPtr; +using autoware::tensorrt_common::Profiler; +using autoware::tensorrt_common::TrtCommonConfig; + +/** + * @class TrtConvCalib + * @brief TensorRT common library with calibration. + */ +class TrtConvCalib : public tensorrt_common::TrtCommon +{ +public: + /** + * @brief Construct TrtCommonCalib. + * + * @param[in] trt_config base trt common configuration, ONNX model path is mandatory + * @param[in] profiler per-layer profiler + * @param[in] plugin_paths paths for TensorRT plugins + */ + explicit TrtConvCalib( + const TrtCommonConfig & trt_config, + const std::shared_ptr & profiler = std::make_shared(), + const std::vector & plugin_paths = {}) + : TrtCommon(trt_config, profiler, plugin_paths) + { + } + + /** + * @brief Setup for TensorRT execution including calibration, building and loading engine. + * + * @param[in] calibrator Pointer for any type of INT8 calibrator. + * @param[in] calib_config Calibration configuration. + * @param[in] profile_dims Optimization profile of input tensors for dynamic shapes. + * @param[in] network_io Network input/output tensors information. + * @return Whether setup is successful. + */ + [[nodiscard]] bool setupWithCalibrator( + std::unique_ptr calibrator, const CalibrationConfig & calib_config, + ProfileDimsPtr profile_dims = nullptr, NetworkIOPtr network_io = nullptr) + { + calibrator_ = std::move(calibrator); + calib_config_ = std::make_unique(calib_config); + + auto builder_config = getBuilderConfig(); + builder_config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + builder_config->setInt8Calibrator(calibrator_.get()); + + // Model specific quantization + auto logger = getLogger(); + auto quantization_log = quantization(); + logger->log(nvinfer1::ILogger::Severity::kINFO, quantization_log.c_str()); + + return setup(std::move(profile_dims), std::move(network_io)); + } + +private: + /** + * @brief Implicit quantization for TensorRT. + * + * @return Output log for TensorRT logger. + */ + std::string quantization() + { + auto network = getNetwork(); + auto trt_config = getTrtCommonConfig(); + auto model_profiler = getModelProfiler(); + + const int num = network->getNbLayers(); + bool first = calib_config_->quantize_first_layer; + bool last = calib_config_->quantize_last_layer; + std::stringstream ss; + + // Partial Quantization + if (getPrecision() == "int8") { + auto builder_config = getBuilderConfig(); + builder_config->setFlag(nvinfer1::BuilderFlag::kFP16); + network->getInput(0)->setDynamicRange(0, 255.0); + for (int i = 0; i < num; i++) { + nvinfer1::ILayer * layer = network->getLayer(i); + auto layer_type = layer->getType(); + std::string name = layer->getName(); + nvinfer1::ITensor * out = layer->getOutput(0); + if (calib_config_->clip_value > 0.0) { + ss << "Set max value for outputs: " << calib_config_->clip_value << " " << name + << std::endl; + out->setDynamicRange(0.0, calib_config_->clip_value); + } + + if (layer_type == nvinfer1::LayerType::kCONVOLUTION) { + if (first) { + layer->setPrecision(nvinfer1::DataType::kHALF); + ss << "Set kHALF in " << name << std::endl; + first = false; + } + if (last) { + // cspell: ignore preds + if ( + contain(name, "reg_preds") || contain(name, "cls_preds") || + contain(name, "obj_preds")) { + layer->setPrecision(nvinfer1::DataType::kHALF); + ss << "Set kHALF in " << name << std::endl; + } + for (int j = num - 1; j >= 0; j--) { + nvinfer1::ILayer * inner_layer = network->getLayer(j); + auto inner_layer_type = inner_layer->getType(); + std::string inner_name = inner_layer->getName(); + if (inner_layer_type == nvinfer1::LayerType::kCONVOLUTION) { + inner_layer->setPrecision(nvinfer1::DataType::kHALF); + ss << "Set kHALF in " << inner_name << std::endl; + break; + } + if (inner_layer_type == nvinfer1::LayerType::kMATRIX_MULTIPLY) { + inner_layer->setPrecision(nvinfer1::DataType::kHALF); + ss << "Set kHALF in " << inner_name << std::endl; + break; + } + } + } + } + } + builder_config->setFlag(nvinfer1::BuilderFlag::kINT8); + } + + // Print layer information + float total_gflops = 0.0; + int total_params = 0; + + for (int i = 0; i < num; i++) { + nvinfer1::ILayer * layer = network->getLayer(i); + auto layer_type = layer->getType(); + if (trt_config->profile_per_layer) { + model_profiler->setProfDict(layer); + } + if (layer_type == nvinfer1::LayerType::kCONSTANT) { + continue; + } + nvinfer1::ITensor * in = layer->getInput(0); + nvinfer1::Dims dim_in = in->getDimensions(); + nvinfer1::ITensor * out = layer->getOutput(0); + nvinfer1::Dims dim_out = out->getDimensions(); + + if (layer_type == nvinfer1::LayerType::kCONVOLUTION) { + auto conv = dynamic_cast(layer); + nvinfer1::Dims k_dims = conv->getKernelSizeNd(); + nvinfer1::Dims s_dims = conv->getStrideNd(); + int groups = conv->getNbGroups(); + int stride = s_dims.d[0]; + int num_weights = (dim_in.d[1] / groups) * dim_out.d[1] * k_dims.d[0] * k_dims.d[1]; + float gflops = (2.0 * num_weights) * (static_cast(dim_in.d[3]) / stride * + static_cast(dim_in.d[2]) / stride / 1e9); + total_gflops += gflops; + total_params += num_weights; + ss << "L" << i << " [conv " << k_dims.d[0] << "x" << k_dims.d[1] << " (" << groups << ") " + << ") " << "/" << s_dims.d[0] << "] " << dim_in.d[3] << "x" << dim_in.d[2] << "x" + << " -> " << dim_out.d[3] << "x" << dim_out.d[2] << "x" << dim_out.d[1]; + ss << " weights:" << num_weights; + ss << " GFLOPs:" << gflops; + ss << std::endl; + } else if (layer_type == nvinfer1::LayerType::kPOOLING) { + auto pool = dynamic_cast(layer); + auto p_type = pool->getPoolingType(); + nvinfer1::Dims dim_stride = pool->getStrideNd(); + nvinfer1::Dims dim_window = pool->getWindowSizeNd(); + + ss << "L" << i << " ["; + if (p_type == nvinfer1::PoolingType::kMAX) { + ss << "max "; + } else if (p_type == nvinfer1::PoolingType::kAVERAGE) { + ss << "avg "; + } else if (p_type == nvinfer1::PoolingType::kMAX_AVERAGE_BLEND) { + ss << "max avg blend "; + } + float gflops = static_cast(dim_in.d[1]) * + (static_cast(dim_window.d[0]) / static_cast(dim_stride.d[0])) * + (static_cast(dim_window.d[1]) / static_cast(dim_stride.d[1])) * + static_cast(dim_in.d[2]) * static_cast(dim_in.d[3]) / 1e9; + total_gflops += gflops; + ss << "pool " << dim_window.d[0] << "x" << dim_window.d[1] << "]"; + ss << " GFLOPs:" << gflops; + ss << std::endl; + } else if (layer_type == nvinfer1::LayerType::kRESIZE) { + ss << "L" << i << " [resize]" << std::endl; + } + } + ss << "Total " << total_gflops << " GFLOPs" << std::endl; + ss << "Total " << total_params / 1000.0 / 1000.0 << " M params" << std::endl; + + return ss.str(); + }; + +private: + //!< Calibration configuration + std::unique_ptr calib_config_; + + //!< Calibrator used for implicit quantization + std::unique_ptr calibrator_; +}; + +} // namespace tensorrt_common +} // namespace autoware + +#endif // AUTOWARE__TENSORRT_COMMON__TENSORRT_CONV_CALIB_HPP_ diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp new file mode 100644 index 0000000000000..f75821b898fa3 --- /dev/null +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp @@ -0,0 +1,407 @@ +// Copyright 2024 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ +#define AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ + +#include +#include + +#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC_MAJOR__))) +#include +namespace fs = ::std::filesystem; +#else +#include +namespace fs = ::std::experimental::filesystem; +#endif + +#include +#include +#include +#include +#include +#include + +namespace autoware +{ +namespace tensorrt_common +{ +constexpr std::array valid_precisions = {"fp32", "fp16", "int8"}; + +/** + * @struct TrtCommonConfig + * @brief Configuration to provide fine control regarding TensorRT builder + */ +struct TrtCommonConfig +{ + /** @brief Construct TrtCommonConfig, ONNX model path is mandatory. + * + * @param[in] onnx_path ONNX model path + * @param[in] precision precision for inference + * @param[in] engine_path TensorRT engine path + * @param[in] max_workspace_size maximum workspace size for TensorRT + * @param[in] dla_core_id DLA core ID + * @param[in] profile_per_layer flag for per-layer profiler using IProfiler + */ + explicit TrtCommonConfig( + const std::string onnx_path, const std::string precision = "fp16", + const std::string engine_path = "", const size_t max_workspace_size = (1ULL << 30U), + const int32_t dla_core_id = -1, const bool profile_per_layer = false) + : onnx_path(onnx_path), + precision(precision), + engine_path(engine_path), + max_workspace_size(max_workspace_size), + dla_core_id(dla_core_id), + profile_per_layer(profile_per_layer) + { + validatePrecision(); + + if (engine_path.empty()) { + this->engine_path = onnx_path; + this->engine_path.replace_extension(".engine"); + } + } + + /** + * @brief Validate configured TensorRT engine precision. + */ + void validatePrecision() const + { + if ( + std::find(valid_precisions.begin(), valid_precisions.end(), precision) == + valid_precisions.end()) { + std::stringstream message; + message << "Invalid precision was specified: " << precision << std::endl + << "Valid string is one of: [" << valid_precisions[0]; + for (size_t i = 1; i < valid_precisions.size(); ++i) { + message << ", " << valid_precisions[i]; + } + message << "] (case sensitive)" << std::endl; + + throw std::runtime_error(message.str()); + } + } + + //!< @brief ONNX model path. + const fs::path onnx_path; + + //!< @brief Precision for inference. + const std::string precision; + + //!< @brief TensorRT engine path. + fs::path engine_path; + + //!< @brief TensorRT max workspace size. + const size_t max_workspace_size; + + //!< @brief DLA core ID that the process uses. + const int32_t dla_core_id; + + //!< @brief Flag for per-layer profiler using IProfiler. + const bool profile_per_layer; +}; + +/** + * @brief Represents a tensor with its name or index. + */ +struct TensorInfo +{ + /** + * @brief Construct TensorInfo with tensor name. + * + * @param[in] name Tensor name. + */ + explicit TensorInfo(std::string name) : tensor_name(std::move(name)), tensor_index(-1) {} + + /** + * @brief Construct TensorInfo with tensor index. + * + * @param[in] index Tensor index. + */ + explicit TensorInfo(int32_t index) : tensor_index(index) {} + + /** + * @brief Check if dimensions are equal. + * + * @param lhs Left-hand side tensor dimensions. + * @param rhs Right-hand side tensor dimensions. + * @return Whether dimensions are equal. + */ + static bool dimsEqual(const nvinfer1::Dims & lhs, const nvinfer1::Dims & rhs) + { + if (lhs.nbDims != rhs.nbDims) return false; + return std::equal(lhs.d, lhs.d + lhs.nbDims, rhs.d); // NOLINT + } + + /** + * @brief Get printable representation of tensor dimensions. + * + * @param[in] dims Tensor dimensions. + * @return String representation of tensor dimensions. + */ + static std::string dimsRepr(const nvinfer1::Dims & dims) + { + if (dims.nbDims == 0) return "[]"; + std::string repr = "[" + std::to_string(dims.d[0]); + for (int i = 1; i < dims.nbDims; ++i) { + repr += ", " + std::to_string(dims.d[i]); + } + repr += "]"; + return repr; + } + + //!< @brief Tensor name. + std::string tensor_name; + + //!< @brief Tensor index. + int32_t tensor_index; +}; + +/** + * @brief Represents a network input/output tensor with its dimensions. + * + * Example usage: + * @code + * nvinfer1::Dims tensor_dims = {3, {1, 2, 3}}; + * NetworkIO input("input_tensor", tensor_dims); + * NetworkIO output("output_tensor", tensor_dims); + * bool is_equal = input == output; // false + * @endcode + */ +struct NetworkIO : public TensorInfo +{ + /** + * @brief Construct NetworkIO with tensor name and dimensions. + * + * @param[in] name Tensor name. + * @param[in] tensor_dims Tensor dimensions. + */ + NetworkIO(std::string name, const nvinfer1::Dims & tensor_dims) + : TensorInfo(std::move(name)), dims(tensor_dims) + { + } + + /** + * @brief Construct NetworkIO with tensor index and dimensions. + * + * @param[in] index Tensor index. + * @param[in] tensor_dims Tensor dimensions. + */ + NetworkIO(int32_t index, const nvinfer1::Dims & tensor_dims) + : TensorInfo(index), dims(tensor_dims) + { + } + + /** + * @brief Get printable representation of NetworkIO. + * + * @return String representation of NetworkIO. + */ + [[nodiscard]] std::string toString() const + { + std::stringstream ss; + ss << tensor_name << " {" << TensorInfo::dimsRepr(dims) << "}"; + return ss.str(); + } + + /** + * @brief Check if NetworkIO is equal to another NetworkIO. + * + * @param rhs Right-hand side NetworkIO. + * @return Whether NetworkIO is equal to another NetworkIO. + */ + bool operator==(const NetworkIO & rhs) const + { + if (tensor_name != rhs.tensor_name) return false; + return dimsEqual(dims, rhs.dims); + } + + /** + * @brief Check if NetworkIO is not equal to another NetworkIO. + * + * @param rhs Right-hand side NetworkIO. + * @return Whether NetworkIO is not equal to another NetworkIO. + */ + bool operator!=(const NetworkIO & rhs) const { return !(*this == rhs); } + + /** + * @brief Output NetworkIO to ostream. + * + * @param os Output stream. + * @param io NetworkIO. + * @return Output stream. + */ + friend std::ostream & operator<<(std::ostream & os, const NetworkIO & io) + { + os << io.toString(); + return os; + } + + //!< @brief Tensor dimensions. + nvinfer1::Dims dims; +}; + +/** + * @brief Represents a tensor optimization profile with minimum, optimal, and maximum dimensions. + * + * Example usage: + * @code + * nvinfer1::Dims min = {3, {1, 2, 3}}; + * nvinfer1::Dims opt = {3, {1, 3, 4}}; + * nvinfer1::Dims max = {3, {1, 4, 5}}; + * ProfileDims entry_1("tensor_name", min, opt, max); + * ProfileDims entry_2("tensor_name", {3, {1, 2, 3}}, {3, {1, 3, 4}}, {3, {1, 4, 5}}); + * bool is_equal = entry_1 == entry_2; // true + * @endcode + */ +struct ProfileDims : public TensorInfo +{ + /** + * @brief Construct ProfileDims with tensor name and dimensions. + * + * @param[in] name Tensor name. + * @param[in] min Minimum dimensions for optimization profile. + * @param[in] opt Optimal dimensions for optimization profile. + * @param[in] max Maximum dimensions for optimization profile. + */ + ProfileDims( + std::string name, const nvinfer1::Dims & min, const nvinfer1::Dims & opt, + const nvinfer1::Dims & max) + : TensorInfo(std::move(name)), min_dims(min), opt_dims(opt), max_dims(max) + { + } + + /** + * @brief Construct ProfileDims with tensor index and dimensions. + * + * @param[in] index Tensor index. + * @param[in] min Minimum dimensions for optimization profile. + * @param[in] opt Optimal dimensions for optimization profile. + * @param[in] max Maximum dimensions for optimization profile. + */ + ProfileDims( + int32_t index, const nvinfer1::Dims & min, const nvinfer1::Dims & opt, + const nvinfer1::Dims & max) + : TensorInfo(index), min_dims(min), opt_dims(opt), max_dims(max) + { + } + + /** + * @brief Get printable representation of ProfileDims. + * + * @return String representation of ProfileDims. + */ + [[nodiscard]] std::string toString() const + { + std::ostringstream oss; + oss << tensor_name << " {min " << TensorInfo::dimsRepr(min_dims) << ", opt " + << TensorInfo::dimsRepr(opt_dims) << ", max " << TensorInfo::dimsRepr(max_dims) << "}"; + return oss.str(); + } + + /** + * @brief Check if ProfileDims is equal to another ProfileDims. + * + * @param rhs Right-hand side ProfileDims. + * @return Whether ProfileDims is equal to another ProfileDims. + */ + bool operator==(const ProfileDims & rhs) const + { + if (tensor_name != rhs.tensor_name) return false; + return dimsEqual(min_dims, rhs.min_dims) && dimsEqual(opt_dims, rhs.opt_dims) && + dimsEqual(max_dims, rhs.max_dims); + } + + /** + * @brief Check if ProfileDims is not equal to another ProfileDims. + * + * @param rhs Right-hand side ProfileDims. + * @return Whether ProfileDims is not equal to another ProfileDims. + */ + bool operator!=(const ProfileDims & rhs) const { return !(*this == rhs); } + + /** + * @brief Output ProfileDims to ostream. + * + * @param os Output stream. + * @param profile ProfileDims. + * @return Output stream. + */ + friend std::ostream & operator<<(std::ostream & os, const ProfileDims & profile) + { + os << profile.toString(); + return os; + } + + //!< @brief Minimum dimensions for optimization profile. + nvinfer1::Dims min_dims; + + //!< @brief Optimal dimensions for optimization profile. + nvinfer1::Dims opt_dims; + + //!< @brief Maximum dimensions for optimization profile. + nvinfer1::Dims max_dims; +}; + +//!< @brief Valid calibration types for TensorRT. +constexpr std::array valid_calib_type = { + "Entropy", "Legacy", "Percentile", "MinMax"}; + +/** + * @brief Configuration for implicit calibration. + */ +struct CalibrationConfig +{ + /** + * @brief Construct CalibrationConfig with its parameters. + * + * @param[in] calib_type_str Calibration type. + * @param[in] quantize_first_layer Flag for partial quantization in first layer. + * @param[in] quantize_last_layer Flag for partial quantization in last layer. + * @param[in] clip_value Clip value for implicit quantization. + */ + explicit CalibrationConfig( + const std::string & calib_type_str = "MinMax", const bool quantize_first_layer = false, + const bool quantize_last_layer = false, const double clip_value = 0.0) + : calib_type_str(calib_type_str), + quantize_first_layer(quantize_first_layer), + quantize_last_layer(quantize_last_layer), + clip_value(clip_value) + { + if ( + std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) == + valid_calib_type.end()) { + throw std::runtime_error( + "Invalid calibration type was specified: " + std::string(calib_type_str) + + "\nValid value is one of: [Entropy, (Legacy | Percentile), MinMax]"); + } + } + + //!< @brief type of calibration + const std::string calib_type_str; + + //!< @brief flag for partial quantization in first layer + const bool quantize_first_layer; + + //!< @brief flag for partial quantization in last layer + const bool quantize_last_layer; + + //!< @brief clip value for implicit quantization + const double clip_value; +}; + +} // namespace tensorrt_common +} // namespace autoware + +#endif // AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ \ No newline at end of file diff --git a/perception/autoware_tensorrt_common/src/profiler.cpp b/perception/autoware_tensorrt_common/src/profiler.cpp new file mode 100644 index 0000000000000..bf630abb5d83f --- /dev/null +++ b/perception/autoware_tensorrt_common/src/profiler.cpp @@ -0,0 +1,107 @@ +// Copyright 2023 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +namespace autoware +{ +namespace tensorrt_common +{ + +Profiler::Profiler(const std::vector & src_profilers) +{ + index_ = 0; + for (const auto & src_profiler : src_profilers) { + for (const auto & rec : src_profiler.profile_) { + auto it = profile_.find(rec.first); + if (it == profile_.end()) { + profile_.insert(rec); + } else { + it->second.time += rec.second.time; + it->second.count += rec.second.count; + } + } + } +} + +void Profiler::reportLayerTime(const char * layerName, float ms) noexcept +{ + profile_[layerName].count++; + profile_[layerName].time += ms; + if (profile_[layerName].min_time == -1.0) { + profile_[layerName].min_time = ms; + profile_[layerName].index = index_; + index_++; + } else if (profile_[layerName].min_time > ms) { + profile_[layerName].min_time = ms; + } +} + +std::string Profiler::toString() const +{ + std::ostringstream out; + float total_time = 0.0; + std::string layer_name = "Operation"; + + int max_layer_name_length = static_cast(layer_name.size()); + for (const auto & elem : profile_) { + total_time += elem.second.time; + max_layer_name_length = std::max(max_layer_name_length, static_cast(elem.first.size())); + } + + auto old_settings = out.flags(); + auto old_precision = out.precision(); + // Output header + { + out << "index, " << std::setw(12); + out << std::setw(max_layer_name_length) << layer_name << " "; + out << std::setw(12) << "Runtime" << "%," << " "; + out << std::setw(12) << "Invocations" << " , "; + out << std::setw(12) << "Runtime[ms]" << " , "; + out << std::setw(12) << "Avg Runtime[ms]" << " ,"; + out << std::setw(12) << "Min Runtime[ms]" << std::endl; + } + int index = index_; + for (int i = 0; i < index; i++) { + for (const auto & elem : profile_) { + if (elem.second.index == i) { + out << i << ", "; + out << std::setw(max_layer_name_length) << elem.first << ","; + out << std::setw(12) << std::fixed << std::setprecision(1) + << (elem.second.time * 100.0F / total_time) << "%" << ","; + out << std::setw(12) << elem.second.count << ","; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << ", "; + out << std::setw(12) << std::fixed << std::setprecision(2) + << elem.second.time / elem.second.count << ", "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.min_time + << std::endl; + } + } + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== total runtime = " << total_time << " ms ==========" << std::endl; + + return out.str(); +} + +std::ostream & operator<<(std::ostream & out, const Profiler & value) +{ + out << value.toString(); + return out; +} +} // namespace tensorrt_common +} // namespace autoware diff --git a/perception/autoware_tensorrt_common/src/simple_profiler.cpp b/perception/autoware_tensorrt_common/src/simple_profiler.cpp deleted file mode 100644 index 2bcc1c4f9da06..0000000000000 --- a/perception/autoware_tensorrt_common/src/simple_profiler.cpp +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2023 TIER IV, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include -#include -#include -#include - -namespace autoware -{ -namespace tensorrt_common -{ - -SimpleProfiler::SimpleProfiler(std::string name, const std::vector & src_profilers) -: m_name(name) -{ - m_index = 0; - for (const auto & src_profiler : src_profilers) { - for (const auto & rec : src_profiler.m_profile) { - auto it = m_profile.find(rec.first); - if (it == m_profile.end()) { - m_profile.insert(rec); - } else { - it->second.time += rec.second.time; - it->second.count += rec.second.count; - } - } - } -} - -void SimpleProfiler::reportLayerTime(const char * layerName, float ms) noexcept -{ - m_profile[layerName].count++; - m_profile[layerName].time += ms; - if (m_profile[layerName].min_time == -1.0) { - m_profile[layerName].min_time = ms; - m_profile[layerName].index = m_index; - m_index++; - } else if (m_profile[layerName].min_time > ms) { - m_profile[layerName].min_time = ms; - } -} - -void SimpleProfiler::setProfDict(nvinfer1::ILayer * layer) noexcept -{ - std::string name = layer->getName(); - m_layer_dict[name]; - m_layer_dict[name].type = layer->getType(); - if (layer->getType() == nvinfer1::LayerType::kCONVOLUTION) { - nvinfer1::IConvolutionLayer * conv = (nvinfer1::IConvolutionLayer *)layer; - nvinfer1::ITensor * in = layer->getInput(0); - nvinfer1::Dims dim_in = in->getDimensions(); - nvinfer1::ITensor * out = layer->getOutput(0); - nvinfer1::Dims dim_out = out->getDimensions(); - nvinfer1::Dims k_dims = conv->getKernelSizeNd(); - nvinfer1::Dims s_dims = conv->getStrideNd(); - int groups = conv->getNbGroups(); - int stride = s_dims.d[0]; - int kernel = k_dims.d[0]; - m_layer_dict[name].in_c = dim_in.d[1]; - m_layer_dict[name].out_c = dim_out.d[1]; - m_layer_dict[name].w = dim_in.d[3]; - m_layer_dict[name].h = dim_in.d[2]; - m_layer_dict[name].k = kernel; - ; - m_layer_dict[name].stride = stride; - m_layer_dict[name].groups = groups; - } -} - -std::ostream & operator<<(std::ostream & out, const SimpleProfiler & value) -{ - out << "========== " << value.m_name << " profile ==========" << std::endl; - float totalTime = 0; - std::string layerNameStr = "Operation"; - - int maxLayerNameLength = static_cast(layerNameStr.size()); - for (const auto & elem : value.m_profile) { - totalTime += elem.second.time; - maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); - } - - auto old_settings = out.flags(); - auto old_precision = out.precision(); - // Output header - { - out << "index, " << std::setw(12); - out << std::setw(maxLayerNameLength) << layerNameStr << " "; - out << std::setw(12) << "Runtime" - << "%," - << " "; - out << std::setw(12) << "Invocations" - << " , "; - out << std::setw(12) << "Runtime[ms]" - << " , "; - out << std::setw(12) << "Avg Runtime[ms]" - << " ,"; - out << std::setw(12) << "Min Runtime[ms]" << std::endl; - } - int index = value.m_index; - for (int i = 0; i < index; i++) { - for (const auto & elem : value.m_profile) { - if (elem.second.index == i) { - out << i << ", "; - out << std::setw(maxLayerNameLength) << elem.first << ","; - out << std::setw(12) << std::fixed << std::setprecision(1) - << (elem.second.time * 100.0F / totalTime) << "%" - << ","; - out << std::setw(12) << elem.second.count << ","; - out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << ", "; - out << std::setw(12) << std::fixed << std::setprecision(2) - << elem.second.time / elem.second.count << ", "; - out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.min_time - << std::endl; - } - } - } - out.flags(old_settings); - out.precision(old_precision); - out << "========== " << value.m_name << " total runtime = " << totalTime - << " ms ==========" << std::endl; - return out; -} -} // namespace tensorrt_common -} // namespace autoware diff --git a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp index 990433ee277a0..601c6f108bd3b 100644 --- a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp +++ b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp @@ -12,102 +12,38 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include "autoware/tensorrt_common/tensorrt_common.hpp" +#include "autoware/tensorrt_common/logger.hpp" +#include "autoware/tensorrt_common/utils.hpp" + +#include #include +#include #include +#include #include -#include #include #include #include +#include #include #include -namespace -{ -template -bool contain(const std::string & s, const T & v) -{ - return s.find(v) != std::string::npos; -} -} // anonymous namespace - namespace autoware { namespace tensorrt_common { -nvinfer1::Dims get_input_dims(const std::string & onnx_file_path) -{ - Logger logger_; - auto builder = TrtUniquePtr(nvinfer1::createInferBuilder(logger_)); - if (!builder) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); - } - - const auto explicitBatch = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - - auto network = - TrtUniquePtr(builder->createNetworkV2(explicitBatch)); - if (!network) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); - } - - auto config = TrtUniquePtr(builder->createBuilderConfig()); - if (!config) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); - } - - auto parser = TrtUniquePtr(nvonnxparser::createParser(*network, logger_)); - if (!parser->parseFromFile( - onnx_file_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR))) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to parse onnx file"); - } - - const auto input = network->getInput(0); - return input->getDimensions(); -} - -bool is_valid_precision_string(const std::string & precision) -{ - if ( - std::find(valid_precisions.begin(), valid_precisions.end(), precision) == - valid_precisions.end()) { - std::stringstream message; - message << "Invalid precision was specified: " << precision << std::endl - << "Valid string is one of: ["; - for (const auto & s : valid_precisions) { - message << s << ", "; - } - message << "] (case sensitive)" << std::endl; - std::cerr << message.str(); - return false; - } else { - return true; - } -} TrtCommon::TrtCommon( - const std::string & model_path, const std::string & precision, - std::unique_ptr calibrator, const BatchConfig & batch_config, - const size_t max_workspace_size, const BuildConfig & build_config, + const TrtCommonConfig & trt_config, const std::shared_ptr & profiler, const std::vector & plugin_paths) -: model_file_path_(model_path), - calibrator_(std::move(calibrator)), - precision_(precision), - batch_config_(batch_config), - max_workspace_size_(max_workspace_size), - model_profiler_("Model"), - host_profiler_("Host") +: trt_config_(std::make_shared(trt_config)), + host_profiler_(profiler), + model_profiler_(profiler) { - // Check given precision is valid one - if (!is_valid_precision_string(precision)) { - return; - } - build_config_ = std::make_unique(build_config); - + logger_ = std::make_shared(); for (const auto & plugin_path : plugin_paths) { int32_t flags{RTLD_LAZY}; // cspell: ignore asan @@ -120,523 +56,615 @@ TrtCommon::TrtCommon( #endif // ENABLE_ASAN void * handle = dlopen(plugin_path.c_str(), flags); if (!handle) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Could not load plugin library"); + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Could not load plugin library"); + } else { + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Loaded plugin library: %s", plugin_path.c_str()); } } - runtime_ = TrtUniquePtr(nvinfer1::createInferRuntime(logger_)); - if (build_config_->dla_core_id != -1) { - runtime_->setDLACore(build_config_->dla_core_id); + runtime_ = TrtUniquePtr(nvinfer1::createInferRuntime(*logger_)); + if (trt_config_->dla_core_id != -1) { + runtime_->setDLACore(trt_config_->dla_core_id); } - initLibNvInferPlugins(&logger_, ""); -} + initLibNvInferPlugins(&*logger_, ""); -TrtCommon::~TrtCommon() -{ + if (!initialize()) { + throw std::runtime_error("Failed to initialize TensorRT"); + } } -void TrtCommon::setup() +TrtCommon::~TrtCommon() = default; + +bool TrtCommon::setup(ProfileDimsPtr profile_dims, NetworkIOPtr network_io) { - if (!fs::exists(model_file_path_)) { - is_initialized_ = false; - return; - } - // cppcheck-suppress unreadVariable - std::string engine_path = model_file_path_; - if (model_file_path_.extension() == ".engine") { - std::cout << "Load ... " << model_file_path_ << std::endl; - loadEngine(model_file_path_); - } else if (model_file_path_.extension() == ".onnx") { - fs::path cache_engine_path{model_file_path_}; - std::string ext; - std::string calib_name = ""; - if (precision_ == "int8") { - if (build_config_->calib_type_str == "Entropy") { - calib_name = "EntropyV2-"; - } else if ( - build_config_->calib_type_str == "Legacy" || - build_config_->calib_type_str == "Percentile") { - calib_name = "Legacy-"; - } else { - calib_name = "MinMax-"; + profile_dims_ = std::move(profile_dims); + network_io_ = std::move(network_io); + + // Set input profile + if (profile_dims_ && !profile_dims_->empty()) { + auto profile = builder_->createOptimizationProfile(); + for (auto & profile_dim : *profile_dims_) { + if (profile_dim.tensor_name.empty()) { + profile_dim.tensor_name = getIOTensorName(profile_dim.tensor_index); } + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Setting optimization profile for tensor: %s", + profile_dim.toString().c_str()); + profile->setDimensions( + profile_dim.tensor_name.c_str(), nvinfer1::OptProfileSelector::kMIN, profile_dim.min_dims); + profile->setDimensions( + profile_dim.tensor_name.c_str(), nvinfer1::OptProfileSelector::kOPT, profile_dim.opt_dims); + profile->setDimensions( + profile_dim.tensor_name.c_str(), nvinfer1::OptProfileSelector::kMAX, profile_dim.max_dims); } - if (build_config_->dla_core_id != -1) { - ext = "DLA" + std::to_string(build_config_->dla_core_id) + "-" + calib_name + precision_; - if (build_config_->quantize_first_layer) { - ext += "-firstFP16"; - } - if (build_config_->quantize_last_layer) { - ext += "-lastFP16"; - } - ext += "-batch" + std::to_string(batch_config_[0]) + ".engine"; - } else { - ext = calib_name + precision_; - if (build_config_->quantize_first_layer) { - ext += "-firstFP16"; - } - if (build_config_->quantize_last_layer) { - ext += "-lastFP16"; + builder_config_->addOptimizationProfile(profile); + } + + auto build_engine_with_log = [this]() -> bool { + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Starting to build engine"); + auto log_thread = logger_->log_throttle( + nvinfer1::ILogger::Severity::kINFO, + "Applying optimizations and building TensorRT CUDA engine. Please wait for a few minutes...", + 5); + auto success = buildEngineFromOnnx(); + logger_->stop_throttle(log_thread); + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Engine build completed"); + return success; + }; + + // Load engine file if it exists + if (fs::exists(trt_config_->engine_path)) { + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Loading engine"); + if (!loadEngine()) { + return false; + } + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Network validation"); + // Validate engine tensor shapes and optimization profile + if (!validateNetworkIO() || !validateProfileDims()) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Network validation failed for loaded engine from file. Rebuilding engine"); + // Rebuild engine if the tensor shapes or optimization profile mismatch + if (!build_engine_with_log()) { + return false; } - ext += "-batch" + std::to_string(batch_config_[0]) + ".engine"; } - cache_engine_path.replace_extension(ext); + } else { + // Build engine if engine has not been cached + if (!build_engine_with_log()) { + return false; + } + } - // Output Network Information - printNetworkInfo(model_file_path_); + // Validate engine nevertheless is loaded or rebuilt + if (!validateNetworkIO() || !validateProfileDims()) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Final network validation failed. Possibly input / output of currently " + "deployed model has been changed. Check your configuration file with the current model."); + return false; + } - if (fs::exists(cache_engine_path)) { - std::cout << "Loading... " << cache_engine_path << std::endl; - loadEngine(cache_engine_path); - } else { - std::cout << "Building... " << cache_engine_path << std::endl; - logger_.log(nvinfer1::ILogger::Severity::kINFO, "Start build engine"); - auto log_thread = logger_.log_throttle( - nvinfer1::ILogger::Severity::kINFO, - "Applying optimizations and building TRT CUDA engine. Please wait for a few minutes...", 5); - buildEngineFromOnnx(model_file_path_, cache_engine_path); - logger_.stop_throttle(log_thread); - logger_.log(nvinfer1::ILogger::Severity::kINFO, "End build engine"); + logger_->log(nvinfer1::ILogger::Severity::kINFO, "Engine setup completed"); + return true; +} + +std::string TrtCommon::getPrecision() const +{ + return trt_config_->precision; +} + +const char * TrtCommon::getIOTensorName(const int32_t index) const +{ + if (!engine_) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Engine is not initialized. Retrieving data from network"); + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return nullptr; } - // cppcheck-suppress unreadVariable - engine_path = cache_engine_path; - } else { - is_initialized_ = false; - return; + auto num_inputs = network_->getNbInputs(); + auto num_outputs = network_->getNbOutputs(); + if (index < 0 || index >= num_inputs + num_outputs) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Invalid index for I/O tensor: %d. Total I/O tensors: %d", index, num_inputs + num_outputs); + return nullptr; + } + if (index < num_inputs) { + return network_->getInput(index)->getName(); + } + return network_->getOutput(index - num_inputs)->getName(); } - context_ = TrtUniquePtr(engine_->createExecutionContext()); - if (!context_) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context"); - is_initialized_ = false; - return; + return engine_->getIOTensorName(index); +} + +int32_t TrtCommon::getNbIOTensors() const +{ + if (!engine_) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Engine is not initialized. Retrieving data from network"); + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return 0; + } + return network_->getNbInputs() + network_->getNbOutputs(); + } + return engine_->getNbIOTensors(); +} + +nvinfer1::Dims TrtCommon::getTensorShape(const int32_t index) const +{ + if (!engine_) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Engine is not initialized. Retrieving data from network"); + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return nvinfer1::Dims{}; + } + auto num_inputs = network_->getNbInputs(); + auto num_outputs = network_->getNbOutputs(); + if (index < 0 || index >= num_inputs + num_outputs) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Invalid index for I/O tensor: %d. Total I/O tensors: %d", index, num_inputs + num_outputs); + return nvinfer1::Dims{}; + } + if (index < num_inputs) { + return network_->getInput(index)->getDimensions(); + } + return network_->getOutput(index - num_inputs)->getDimensions(); } + auto const & name = getIOTensorName(index); + return getTensorShape(name); +} - if (build_config_->profile_per_layer) { - context_->setProfiler(&model_profiler_); +nvinfer1::Dims TrtCommon::getTensorShape(const char * tensor_name) const +{ + if (!engine_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Engine is not initialized"); + return nvinfer1::Dims{}; } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - // Write profiles for trt-engine-explorer - // See: https://github.com/NVIDIA/TensorRT/tree/main/tools/experimental/trt-engine-explorer - std::string j_ext = ".json"; - fs::path json_path{engine_path}; - json_path.replace_extension(j_ext); - std::string ret = getLayerInformation(nvinfer1::LayerInformationFormat::kJSON); - std::ofstream os(json_path, std::ofstream::trunc); - os << ret << std::flush; -#endif + return engine_->getTensorShape(tensor_name); +} - is_initialized_ = true; +nvinfer1::Dims TrtCommon::getInputDims(const int32_t index) const +{ + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return {}; + } + const auto input = network_->getInput(index); + return input->getDimensions(); } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 -void TrtCommon::setupBindings(std::vector & bindings) +nvinfer1::Dims TrtCommon::getOutputDims(const int32_t index) const { - for (int32_t i = 0, e = engine_->getNbIOTensors(); i < e; i++) { - auto const name = engine_->getIOTensorName(i); - context_->setTensorAddress(name, bindings.at(i)); + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Network is not initialized"); + return {}; } + const auto output = network_->getOutput(index); + return output->getDimensions(); } -#endif -bool TrtCommon::loadEngine(const std::string & engine_file_path) +bool TrtCommon::setTensorAddress(const int32_t index, void * data) { - std::ifstream engine_file(engine_file_path); - std::stringstream engine_buffer; - engine_buffer << engine_file.rdbuf(); - std::string engine_str = engine_buffer.str(); - engine_ = TrtUniquePtr(runtime_->deserializeCudaEngine( - reinterpret_cast(engine_str.data()), engine_str.size())); - return true; + auto const & name = getIOTensorName(index); + return setTensorAddress(name, data); } -void TrtCommon::printNetworkInfo(const std::string & onnx_file_path) +bool TrtCommon::setTensorAddress(const char * tensor_name, void * data) { - auto builder = TrtUniquePtr(nvinfer1::createInferBuilder(logger_)); - if (!builder) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); - return; + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); + return false; + } + auto success = context_->setTensorAddress(tensor_name, data); + if (!success) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Failed to set tensor address for tensor: ", tensor_name); } + return success; +} - const auto explicitBatch = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +bool TrtCommon::setTensorsAddresses(std::vector & tensors) +{ + bool success = true; + for (std::size_t i = 0, e = tensors.size(); i < e; i++) { + auto const name = getIOTensorName(i); + success &= setTensorAddress(name, tensors.at(i)); + } + return success; +} - auto network = - TrtUniquePtr(builder->createNetworkV2(explicitBatch)); - if (!network) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); - return; +bool TrtCommon::setTensorsAddresses(std::unordered_map & tensors) +{ + bool success = true; + for (auto const & tensor : tensors) { + success &= setTensorAddress(tensor.first, tensor.second); } + return success; +} + +bool TrtCommon::setInputShape(const int32_t index, const nvinfer1::Dims & dimensions) +{ + auto const & name = getIOTensorName(index); + return setInputShape(name, dimensions); +} - auto config = TrtUniquePtr(builder->createBuilderConfig()); - if (!config) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); - return; +bool TrtCommon::setInputShape(const char * tensor_name, const nvinfer1::Dims & dimensions) +{ + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); + return false; + } + auto success = context_->setInputShape(tensor_name, dimensions); + if (!success) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, "Failed to set input shape for tensor: ", tensor_name); } + return success; +} - if (precision_ == "fp16" || precision_ == "int8") { - config->setFlag(nvinfer1::BuilderFlag::kFP16); +bool TrtCommon::setInputsShapes(const std::vector & dimensions) +{ + bool success = true; + for (std::size_t i = 0, e = dimensions.size(); i < e; i++) { + success &= setInputShape(i, dimensions.at(i)); } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); -#else - config->setMaxWorkspaceSize(max_workspace_size_); -#endif + return success; +} - auto parser = TrtUniquePtr(nvonnxparser::createParser(*network, logger_)); - if (!parser->parseFromFile( - onnx_file_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR))) { - return; - } - int num = network->getNbLayers(); - float total_gflops = 0.0; - int total_params = 0; - for (int i = 0; i < num; i++) { - nvinfer1::ILayer * layer = network->getLayer(i); - auto layer_type = layer->getType(); - if (build_config_->profile_per_layer) { - model_profiler_.setProfDict(layer); - } - if (layer_type == nvinfer1::LayerType::kCONSTANT) { - continue; - } - nvinfer1::ITensor * in = layer->getInput(0); - nvinfer1::Dims dim_in = in->getDimensions(); - nvinfer1::ITensor * out = layer->getOutput(0); - nvinfer1::Dims dim_out = out->getDimensions(); - - if (layer_type == nvinfer1::LayerType::kCONVOLUTION) { - nvinfer1::IConvolutionLayer * conv = (nvinfer1::IConvolutionLayer *)layer; - nvinfer1::Dims k_dims = conv->getKernelSizeNd(); - nvinfer1::Dims s_dims = conv->getStrideNd(); - int groups = conv->getNbGroups(); - int stride = s_dims.d[0]; - int num_weights = (dim_in.d[1] / groups) * dim_out.d[1] * k_dims.d[0] * k_dims.d[1]; - float gflops = (2.0 * num_weights) * (static_cast(dim_in.d[3]) / stride * - static_cast(dim_in.d[2]) / stride / 1e9); - total_gflops += gflops; - total_params += num_weights; - std::cout << "L" << i << " [conv " << k_dims.d[0] << "x" << k_dims.d[1] << " (" << groups - << ") " - << "/" << s_dims.d[0] << "] " << dim_in.d[3] << "x" << dim_in.d[2] << "x" - << dim_in.d[1] << " -> " << dim_out.d[3] << "x" << dim_out.d[2] << "x" - << dim_out.d[1]; - std::cout << " weights:" << num_weights; - std::cout << " GFLOPs:" << gflops; - std::cout << std::endl; - } else if (layer_type == nvinfer1::LayerType::kPOOLING) { - nvinfer1::IPoolingLayer * pool = (nvinfer1::IPoolingLayer *)layer; - auto p_type = pool->getPoolingType(); - nvinfer1::Dims dim_stride = pool->getStrideNd(); - nvinfer1::Dims dim_window = pool->getWindowSizeNd(); - - std::cout << "L" << i << " ["; - if (p_type == nvinfer1::PoolingType::kMAX) { - std::cout << "max "; - } else if (p_type == nvinfer1::PoolingType::kAVERAGE) { - std::cout << "avg "; - } else if (p_type == nvinfer1::PoolingType::kMAX_AVERAGE_BLEND) { - std::cout << "max avg blend "; - } - float gflops = static_cast(dim_in.d[1]) * - (static_cast(dim_window.d[0]) / static_cast(dim_stride.d[0])) * - (static_cast(dim_window.d[1]) / static_cast(dim_stride.d[1])) * - static_cast(dim_in.d[2]) * static_cast(dim_in.d[3]) / 1e9; - total_gflops += gflops; - std::cout << "pool " << dim_window.d[0] << "x" << dim_window.d[1] << "]"; - std::cout << " GFLOPs:" << gflops; - std::cout << std::endl; - } else if (layer_type == nvinfer1::LayerType::kRESIZE) { - std::cout << "L" << i << " [resize]" << std::endl; - } +bool TrtCommon::setInputsShapes(const std::unordered_map & dimensions) +{ + bool success = true; + for (auto const & dim : dimensions) { + success &= setInputShape(dim.first, dim.second); } - std::cout << "Total " << total_gflops << " GFLOPs" << std::endl; - std::cout << "Total " << total_params / 1000.0 / 1000.0 << " M params" << std::endl; - return; + return success; } -bool TrtCommon::buildEngineFromOnnx( - const std::string & onnx_file_path, const std::string & output_engine_file_path) +bool TrtCommon::setTensor(const int32_t index, void * data, nvinfer1::Dims dimensions) { - auto builder = TrtUniquePtr(nvinfer1::createInferBuilder(logger_)); - if (!builder) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); - return false; + auto success = setTensorAddress(index, data); + if (dimensions.nbDims > 0) { + success &= setInputShape(index, dimensions); } + return success; +} - const auto explicitBatch = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +bool TrtCommon::setTensor(const char * tensor_name, void * data, nvinfer1::Dims dimensions) +{ + auto success = setTensorAddress(tensor_name, data); + if (dimensions.nbDims > 0) { + success &= setInputShape(tensor_name, dimensions); + } + return success; +} - auto network = - TrtUniquePtr(builder->createNetworkV2(explicitBatch)); - if (!network) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); - return false; +bool TrtCommon::setTensors(TensorsVec & tensors) +{ + bool success = true; + for (std::size_t i = 0, e = tensors.size(); i < e; i++) { + success &= setTensor(i, tensors.at(i).first, tensors.at(i).second); } + return success; +} - auto config = TrtUniquePtr(builder->createBuilderConfig()); - if (!config) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); - return false; +bool TrtCommon::setTensors(TensorsMap & tensors) +{ + bool success = true; + for (auto const & tensor : tensors) { + success &= setTensor(tensor.first, tensor.second.first, tensor.second.second); } + return success; +} - int num_available_dla = builder->getNbDLACores(); - if (build_config_->dla_core_id != -1) { - if (num_available_dla > 0) { - std::cout << "###" << num_available_dla << " DLAs are supported! ###" << std::endl; - } else { - std::cout << "###Warning : " - << "No DLA is supported! ###" << std::endl; - } - config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - config->setDLACore(build_config_->dla_core_id); -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); -#else - config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); -#endif - config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); +std::shared_ptr TrtCommon::getModelProfiler() +{ + return model_profiler_; +} + +std::shared_ptr TrtCommon::getHostProfiler() +{ + return host_profiler_; +} + +std::shared_ptr TrtCommon::getTrtCommonConfig() +{ + return trt_config_; +} + +std::shared_ptr TrtCommon::getBuilderConfig() +{ + return builder_config_; +} + +std::shared_ptr TrtCommon::getNetwork() +{ + return network_; +} + +std::shared_ptr TrtCommon::getLogger() +{ + return logger_; +} + +#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 +bool TrtCommon::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed) +{ + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); + return false; } - if (precision_ == "fp16" || precision_ == "int8") { - config->setFlag(nvinfer1::BuilderFlag::kFP16); + if (trt_config_->profile_per_layer) { + auto inference_start = std::chrono::high_resolution_clock::now(); + auto success = context_->enqueueV2(bindings, stream, input_consumed); + auto inference_end = std::chrono::high_resolution_clock::now(); + host_profiler_->reportLayerTime( + "inference_host", + std::chrono::duration(inference_end - inference_start).count()); + return success; } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400 - config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); -#else - config->setMaxWorkspaceSize(max_workspace_size_); + return context_->enqueueV2(bindings, stream, input_consumed); +} #endif - auto parser = TrtUniquePtr(nvonnxparser::createParser(*network, logger_)); - if (!parser->parseFromFile( - onnx_file_path.c_str(), static_cast(nvinfer1::ILogger::Severity::kERROR))) { - std::cout << "Failed to parse onnx file" << std::endl; +bool TrtCommon::enqueueV3(cudaStream_t stream) +{ + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); return false; } + if (trt_config_->profile_per_layer) { + auto inference_start = std::chrono::high_resolution_clock::now(); + auto success = context_->enqueueV3(stream); + auto inference_end = std::chrono::high_resolution_clock::now(); + host_profiler_->reportLayerTime( + "inference_host", + std::chrono::duration(inference_end - inference_start).count()); + return success; + } + return context_->enqueueV3(stream); +} - const int num = network->getNbLayers(); - bool first = build_config_->quantize_first_layer; - bool last = build_config_->quantize_last_layer; - // Partial Quantization - if (precision_ == "int8") { - network->getInput(0)->setDynamicRange(0, 255.0); - for (int i = 0; i < num; i++) { - nvinfer1::ILayer * layer = network->getLayer(i); - auto layer_type = layer->getType(); - std::string name = layer->getName(); - nvinfer1::ITensor * out = layer->getOutput(0); - if (build_config_->clip_value > 0.0) { - std::cout << "Set max value for outputs : " << build_config_->clip_value << " " << name - << std::endl; - out->setDynamicRange(0.0, build_config_->clip_value); - } +void TrtCommon::printProfiling() const +{ + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Host Profiling\n", host_profiler_->toString().c_str()); + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Model Profiling\n", model_profiler_->toString().c_str()); +} - if (layer_type == nvinfer1::LayerType::kCONVOLUTION) { - if (first) { - layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << name << std::endl; - first = false; - } - if (last) { - // cspell: ignore preds - if ( - contain(name, "reg_preds") || contain(name, "cls_preds") || - contain(name, "obj_preds")) { - layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << name << std::endl; - } - for (int j = num - 1; j >= 0; j--) { - nvinfer1::ILayer * inner_layer = network->getLayer(j); - auto inner_layer_type = inner_layer->getType(); - std::string inner_name = inner_layer->getName(); - if (inner_layer_type == nvinfer1::LayerType::kCONVOLUTION) { - inner_layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << inner_name << std::endl; - break; - } - if (inner_layer_type == nvinfer1::LayerType::kMATRIX_MULTIPLY) { - inner_layer->setPrecision(nvinfer1::DataType::kHALF); - std::cout << "Set kHALF in " << inner_name << std::endl; - break; - } - } - } - } - } +std::string TrtCommon::getLayerInformation(nvinfer1::LayerInformationFormat format) +{ + if (!context_ || !engine_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context or engine are not initialized"); + return {}; } + auto inspector = std::unique_ptr(engine_->createEngineInspector()); + inspector->setExecutionContext(&(*context_)); + std::string result = inspector->getEngineInformation(format); + return result; +} - const auto input = network->getInput(0); - const auto input_dims = input->getDimensions(); - const auto input_channel = input_dims.d[1]; - const auto input_height = input_dims.d[2]; - const auto input_width = input_dims.d[3]; - const auto input_batch = input_dims.d[0]; - - if (input_batch > 1) { - batch_config_[0] = input_batch; +bool TrtCommon::initialize() +{ + if (!fs::exists(trt_config_->onnx_path) || trt_config_->onnx_path.extension() != ".onnx") { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Invalid ONNX file path or extension"); + return false; } - if (batch_config_.at(0) > 1 && (batch_config_.at(0) == batch_config_.at(2))) { - // Attention : below API is deprecated in TRT8.4 - builder->setMaxBatchSize(batch_config_.at(2)); - } else { - if (build_config_->profile_per_layer) { - auto profile = builder->createOptimizationProfile(); - profile->setDimensions( - network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMIN, - nvinfer1::Dims4{batch_config_.at(0), input_channel, input_height, input_width}); - profile->setDimensions( - network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kOPT, - nvinfer1::Dims4{batch_config_.at(1), input_channel, input_height, input_width}); - profile->setDimensions( - network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMAX, - nvinfer1::Dims4{batch_config_.at(2), input_channel, input_height, input_width}); - config->addOptimizationProfile(profile); - } + builder_ = TrtUniquePtr(nvinfer1::createInferBuilder(*logger_)); + if (!builder_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder"); + return false; } - if (precision_ == "int8" && calibrator_) { - config->setFlag(nvinfer1::BuilderFlag::kINT8); -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - config->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + +#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 + const auto explicit_batch = + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network_ = TrtUniquePtr(builder_->createNetworkV2(explicit_batch)); #else - config->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); + network_ = TrtUniquePtr(builder_->createNetworkV2(0)); #endif - // QAT requires no calibrator. - // assert((calibrator != nullptr) && "Invalid calibrator for INT8 precision"); - config->setInt8Calibrator(calibrator_.get()); + + if (!network_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network"); + return false; } - if (build_config_->profile_per_layer) { -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 - config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED); -#else - config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kVERBOSE); -#endif + + builder_config_ = TrtUniquePtr(builder_->createBuilderConfig()); + if (!builder_config_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config"); + return false; } -#if TENSORRT_VERSION_MAJOR >= 8 - auto plan = - TrtUniquePtr(builder->buildSerializedNetwork(*network, *config)); + auto num_available_dla = builder_->getNbDLACores(); + if (trt_config_->dla_core_id != -1) { + logger_->log( + nvinfer1::ILogger::Severity::kINFO, "Number of DLAs supported: %d", num_available_dla); + builder_config_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + builder_config_->setDLACore(trt_config_->dla_core_id); + builder_config_->setFlag(nvinfer1::BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + builder_config_->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); + } + if (trt_config_->precision == "fp16") { + builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); + } else if (trt_config_->precision == "int8") { + builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); + } + builder_config_->setMemoryPoolLimit( + nvinfer1::MemoryPoolType::kWORKSPACE, trt_config_->max_workspace_size); + + parser_ = TrtUniquePtr(nvonnxparser::createParser(*network_, *logger_)); + if (!parser_->parseFromFile( + trt_config_->onnx_path.c_str(), + static_cast(nvinfer1::ILogger::Severity::kERROR))) { + return false; + } + + if (trt_config_->profile_per_layer) { + builder_config_->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED); + } + + return true; +} + +bool TrtCommon::buildEngineFromOnnx() +{ + // Build engine + auto plan = TrtUniquePtr( + builder_->buildSerializedNetwork(*network_, *builder_config_)); if (!plan) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create host memory"); + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create host memory"); return false; } engine_ = TrtUniquePtr( runtime_->deserializeCudaEngine(plan->data(), plan->size())); -#else - engine_ = TrtUniquePtr(builder->buildEngineWithConfig(*network, *config)); -#endif if (!engine_) { - logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine"); + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine"); return false; } - // save engine -#if TENSORRT_VERSION_MAJOR < 8 - auto data = TrtUniquePtr(engine_->serialize()); -#endif + // Save engine std::ofstream file; - file.open(output_engine_file_path, std::ios::binary | std::ios::out); + file.open(trt_config_->engine_path, std::ios::binary | std::ios::out); if (!file.is_open()) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to open engine file"); return false; } -#if TENSORRT_VERSION_MAJOR < 8 - file.write(reinterpret_cast(data->data()), data->size()); -#else - file.write(reinterpret_cast(plan->data()), plan->size()); -#endif - + file.write(reinterpret_cast(plan->data()), plan->size()); // NOLINT file.close(); - return true; -} - -bool TrtCommon::isInitialized() -{ - return is_initialized_; -} - -nvinfer1::Dims TrtCommon::getBindingDimensions(const int32_t index) const -{ -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + (NV_TENSOR_PATCH * 10) >= 8500 - auto const & name = engine_->getIOTensorName(index); - auto dims = context_->getTensorShape(name); - bool const has_runtime_dim = - std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + context_ = TrtUniquePtr(engine_->createExecutionContext()); + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context"); + return false; + } - if (has_runtime_dim) { - return dims; - } else { - return context_->getBindingDimensions(index); + if (trt_config_->profile_per_layer) { + context_->setProfiler(&*model_profiler_); } -#else - return context_->getBindingDimensions(index); -#endif -} -int32_t TrtCommon::getNbBindings() -{ - return engine_->getNbBindings(); -} + fs::path json_path = trt_config_->engine_path.replace_extension(".json"); + auto ret = getLayerInformation(nvinfer1::LayerInformationFormat::kJSON); + std::ofstream os(json_path, std::ofstream::trunc); + os << ret << std::flush; + os.close(); -bool TrtCommon::setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const -{ - return context_->setBindingDimensions(index, dimensions); + return true; } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8500 -bool TrtCommon::enqueueV3(cudaStream_t stream) +bool TrtCommon::loadEngine() { - if (build_config_->profile_per_layer) { - auto inference_start = std::chrono::high_resolution_clock::now(); - - bool ret = context_->enqueueV3(stream); + std::ifstream engine_file(trt_config_->engine_path); + std::stringstream engine_buffer; + engine_buffer << engine_file.rdbuf(); + std::string engine_str = engine_buffer.str(); - auto inference_end = std::chrono::high_resolution_clock::now(); - host_profiler_.reportLayerTime( - "inference", - std::chrono::duration(inference_end - inference_start).count()); - return ret; + engine_ = TrtUniquePtr(runtime_->deserializeCudaEngine( + reinterpret_cast( // NOLINT + engine_str.data()), + engine_str.size())); + if (!engine_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine"); + return false; } - return context_->enqueueV3(stream); -} -#endif -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 -bool TrtCommon::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed) -{ - if (build_config_->profile_per_layer) { - auto inference_start = std::chrono::high_resolution_clock::now(); - bool ret = context_->enqueueV2(bindings, stream, input_consumed); + context_ = TrtUniquePtr(engine_->createExecutionContext()); + if (!context_) { + logger_->log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context"); + return false; + } - auto inference_end = std::chrono::high_resolution_clock::now(); - host_profiler_.reportLayerTime( - "inference", - std::chrono::duration(inference_end - inference_start).count()); - return ret; - } else { - return context_->enqueueV2(bindings, stream, input_consumed); + if (trt_config_->profile_per_layer) { + context_->setProfiler(&*model_profiler_); } + + return true; } -#endif -void TrtCommon::printProfiling() +bool TrtCommon::validateProfileDims() { - std::cout << host_profiler_; - std::cout << std::endl; - std::cout << model_profiler_; + auto success = true; + if (!profile_dims_ || profile_dims_->empty()) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Input profile is empty, skipping validation. If network has dynamic shapes, it might lead " + "to undefined behavior."); + return success; + } + if (engine_->getNbOptimizationProfiles() != 1) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Number of optimization profiles in the engine (%d) is not equal to 1. Selecting the first " + "cached profile.", + engine_->getNbOptimizationProfiles()); + } + + for (const auto & profile_dim : *profile_dims_) { + nvinfer1::Dims min_dims = engine_->getProfileShape( + profile_dim.tensor_name.c_str(), 0, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims opt_dims = engine_->getProfileShape( + profile_dim.tensor_name.c_str(), 0, nvinfer1::OptProfileSelector::kOPT); + nvinfer1::Dims max_dims = engine_->getProfileShape( + profile_dim.tensor_name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX); + ProfileDims profile_from_engine{profile_dim.tensor_name, min_dims, opt_dims, max_dims}; + if (profile_dim != profile_from_engine) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Invalid profile. Current configuration: %s. Cached engine: %s", + profile_dim.toString().c_str(), profile_from_engine.toString().c_str()); + success = false; + } + } + return success; } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200 -std::string TrtCommon::getLayerInformation(nvinfer1::LayerInformationFormat format) +bool TrtCommon::validateNetworkIO() { - auto runtime = std::unique_ptr(nvinfer1::createInferRuntime(logger_)); - auto inspector = std::unique_ptr(engine_->createEngineInspector()); - if (context_ != nullptr) { - inspector->setExecutionContext(&(*context_)); + auto success = true; + if (!network_io_ || network_io_->empty()) { + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Network IO is empty, skipping validation. It might lead to undefined behavior"); + return success; + } + + if (network_io_->size() != static_cast(getNbIOTensors())) { + std::string tensor_names = "[" + std::string(getIOTensorName(0)); + for (int32_t i = 1; i < getNbIOTensors(); ++i) { + tensor_names += ", " + std::string(getIOTensorName(i)); + } + tensor_names += "]"; + logger_->log( + nvinfer1::ILogger::Severity::kWARNING, + "Number of tensors in the engine (%d) does not match number of tensors in the config (%d). " + "Tensors in the built engine: %s", + getNbIOTensors(), network_io_->size(), tensor_names.c_str()); + success = false; + } + for (const auto & io : *network_io_) { + NetworkIO tensor_from_engine{io.tensor_name, getTensorShape(io.tensor_name.c_str())}; + if (io != tensor_from_engine) { + logger_->log( + nvinfer1::ILogger::Severity::kERROR, + "Invalid tensor. Current configuration: %s. Cached engine: %s", io.toString().c_str(), + tensor_from_engine.toString().c_str()); + success = false; + } } - std::string result = inspector->getEngineInformation(format); - return result; + + return success; } -#endif } // namespace tensorrt_common } // namespace autoware diff --git a/perception/autoware_tensorrt_yolox/CMakeLists.txt b/perception/autoware_tensorrt_yolox/CMakeLists.txt index 17b31fc7e8df1..cc0d793151781 100644 --- a/perception/autoware_tensorrt_yolox/CMakeLists.txt +++ b/perception/autoware_tensorrt_yolox/CMakeLists.txt @@ -10,6 +10,9 @@ endif() find_package(autoware_cmake REQUIRED) autoware_package() +# TODO(amadeuszsz): Remove -Wno-deprecated-declarations once removing implicit quantization +add_compile_options(-Wno-deprecated-declarations) + find_package(OpenCV REQUIRED) option(CUDA_VERBOSE "Verbose output of CUDA modules" OFF) diff --git a/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp b/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp index 483adfbdf2757..7d49df145b72e 100644 --- a/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp +++ b/perception/autoware_tensorrt_yolox/include/autoware/tensorrt_yolox/tensorrt_yolox.hpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -44,6 +46,13 @@ struct Object using ObjectArray = std::vector; using ObjectArrays = std::vector; +using autoware::tensorrt_common::CalibrationConfig; +using autoware::tensorrt_common::NetworkIOPtr; +using autoware::tensorrt_common::ProfileDimsPtr; +using autoware::tensorrt_common::Profiler; +using autoware::tensorrt_common::TrtCommon; +using autoware::tensorrt_common::TrtCommonConfig; +using autoware::tensorrt_common::TrtConvCalib; struct GridAndStride { @@ -70,31 +79,26 @@ class TrtYoloX public: /** * @brief Construct TrtYoloX. - * @param[in] mode_path ONNX model_path - * @param[in] precision precision for inference + * @param[in] trt_config base trt common configuration * @param[in] num_class classifier-ed num * @param[in] score_threshold threshold for detection * @param[in] nms_threshold threshold for NMS - * @param[in] build_config configuration including precision, calibration method, DLA, remaining - * fp16 for first layer, remaining fp16 for last layer and profiler for builder * @param[in] use_gpu_preprocess whether use cuda gpu for preprocessing - * @param[in] calibration_image_list_file path for calibration files (only require for + * @param[in] gpu_id GPU id for inference + * @param[in] calibration_image_list_path path for calibration files (only require for * quantization) * @param[in] norm_factor scaling factor for preprocess * @param[in] cache_dir unused variable - * @param[in] batch_config configuration for batched execution - * @param[in] max_workspace_size maximum workspace for building TensorRT engine + * @param[in] color_map_path path for colormap for masks + * @param[in] calib_config calibration configuration */ TrtYoloX( - const std::string & model_path, const std::string & precision, const int num_class = 8, - const float score_threshold = 0.3, const float nms_threshold = 0.7, - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig(), - const bool use_gpu_preprocess = false, const uint8_t gpu_id = 0, - std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0, - [[maybe_unused]] const std::string & cache_dir = "", - const autoware::tensorrt_common::BatchConfig & batch_config = {1, 1, 1}, - const size_t max_workspace_size = (1 << 30), const std::string & color_map_path = ""); + TrtCommonConfig & trt_config, const int num_class = 8, const float score_threshold = 0.3, + const float nms_threshold = 0.7, const bool use_gpu_preprocess = false, + const uint8_t gpu_id = 0, std::string calibration_image_list_path = std::string(), + const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "", + const std::string & color_map_path = "", + const CalibrationConfig & calib_config = CalibrationConfig()); /** * @brief Deconstruct TrtYoloX */ @@ -168,6 +172,12 @@ class TrtYoloX cv::Mat & colorized_mask); inline std::vector getColorMap() { return sematic_color_map_; } + /** + * @brief get batch size + * @return batch size + */ + [[nodiscard]] int getBatchSize() const; + private: /** * @brief run preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU @@ -266,7 +276,7 @@ class TrtYoloX */ cv::Mat getMaskImageGpu(float * d_prob, nvinfer1::Dims dims, int out_w, int out_h, int b); - std::unique_ptr trt_common_; + std::unique_ptr trt_common_; std::vector input_h_; CudaUniquePtr input_d_; @@ -288,7 +298,7 @@ class TrtYoloX int num_class_; float score_threshold_; float nms_threshold_; - int batch_size_; + int32_t batch_size_; CudaUniquePtrHost out_prob_h_; // flag whether preprocess are performed on GPU diff --git a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp index 06540f2b7cd19..62ca8ded21101 100644 --- a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp +++ b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox.cpp @@ -155,13 +155,11 @@ std::vector get_seg_colormap(const std::stri namespace autoware::tensorrt_yolox { TrtYoloX::TrtYoloX( - const std::string & model_path, const std::string & precision, const int num_class, - const float score_threshold, const float nms_threshold, - autoware::tensorrt_common::BuildConfig build_config, const bool use_gpu_preprocess, - const uint8_t gpu_id, std::string calibration_image_list_path, const double norm_factor, - [[maybe_unused]] const std::string & cache_dir, - const autoware::tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size, - const std::string & color_map_path) + TrtCommonConfig & trt_config, const int num_class, const float score_threshold, + const float nms_threshold, const bool use_gpu_preprocess, const uint8_t gpu_id, + std::string calibration_image_list_path, const double norm_factor, + [[maybe_unused]] const std::string & cache_dir, const std::string & color_map_path, + const CalibrationConfig & calib_config) : gpu_id_(gpu_id), is_gpu_initialized_(false) { if (!setCudaDeviceId(gpu_id_)) { @@ -172,13 +170,29 @@ TrtYoloX::TrtYoloX( src_width_ = -1; src_height_ = -1; norm_factor_ = norm_factor; - batch_size_ = batch_config[2]; multitask_ = 0; sematic_color_map_ = get_seg_colormap(color_map_path); stream_ = makeCudaStream(); - if (precision == "int8") { - if (build_config.clip_value <= 0.0) { + trt_common_ = std::make_unique(trt_config); + + const auto network_input_dims = trt_common_->getTensorShape(0); + batch_size_ = network_input_dims.d[0]; + const auto input_channel = network_input_dims.d[1]; + const auto input_height = network_input_dims.d[2]; + const auto input_width = network_input_dims.d[3]; + + auto profile_dims_ptr = std::make_unique>(); + + std::vector profile_dims{ + autoware::tensorrt_common::ProfileDims( + 0, {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, input_width}}, + {4, {batch_size_, input_channel, input_height, input_width}})}; + *profile_dims_ptr = profile_dims; + + if (trt_config.precision == "int8") { + if (calib_config.clip_value <= 0.0) { if (calibration_image_list_path.empty()) { throw std::runtime_error( "calibration_image_list_path should be passed to generate int8 engine " @@ -189,19 +203,17 @@ TrtYoloX::TrtYoloX( calibration_image_list_path = ""; } - int max_batch_size = batch_size_; - nvinfer1::Dims input_dims = autoware::tensorrt_common::get_input_dims(model_path); std::vector calibration_images; if (calibration_image_list_path != "") { calibration_images = loadImageList(calibration_image_list_path, ""); } - tensorrt_yolox::ImageStream stream(max_batch_size, input_dims, calibration_images); - fs::path calibration_table{model_path}; + tensorrt_yolox::ImageStream stream(batch_size_, network_input_dims, calibration_images); + fs::path calibration_table{trt_config.onnx_path}; std::string ext = ""; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { ext = "EntropyV2-"; } else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { ext = "Legacy-"; } else { ext = "MinMax-"; @@ -209,17 +221,17 @@ TrtYoloX::TrtYoloX( ext += "calibration.table"; calibration_table.replace_extension(ext); - fs::path histogram_table{model_path}; + fs::path histogram_table{trt_config.onnx_path}; ext = "histogram.table"; histogram_table.replace_extension(ext); std::unique_ptr calibrator; - if (build_config.calib_type_str == "Entropy") { + if (calib_config.calib_type_str == "Entropy") { calibrator.reset( new tensorrt_yolox::Int8EntropyCalibrator(stream, calibration_table, norm_factor_)); } else if ( - build_config.calib_type_str == "Legacy" || build_config.calib_type_str == "Percentile") { + calib_config.calib_type_str == "Legacy" || calib_config.calib_type_str == "Percentile") { const double quantile = 0.999999; const double cutoff = 0.999999; calibrator.reset(new tensorrt_yolox::Int8LegacyCalibrator( @@ -228,22 +240,20 @@ TrtYoloX::TrtYoloX( calibrator.reset( new tensorrt_yolox::Int8MinMaxCalibrator(stream, calibration_table, norm_factor_)); } - trt_common_ = std::make_unique( - model_path, precision, std::move(calibrator), batch_config, max_workspace_size, build_config); + if (!trt_common_->setupWithCalibrator( + std::move(calibrator), calib_config, std::move((profile_dims_ptr)))) { + throw std::runtime_error("Failed to setup TensorRT engine with calibrator"); + } } else { - trt_common_ = std::make_unique( - model_path, precision, nullptr, batch_config, max_workspace_size, build_config); - } - trt_common_->setup(); - - if (!trt_common_->isInitialized()) { - return; + if (!trt_common_->setup(std::move(profile_dims_ptr))) { + throw std::runtime_error("Failed to setup TensorRT engine"); + } } // Judge whether decoding output is required // Plain models require decoding, while models with EfficientNMS_TRT module don't. // If getNbBindings == 5, the model contains EfficientNMS_TRT - switch (trt_common_->getNbBindings()) { + switch (trt_common_->getNbIOTensors()) { case 2: // Specified model is plain one. // Bindings are: [input, output] @@ -273,16 +283,16 @@ TrtYoloX::TrtYoloX( */ } // GPU memory allocation - const auto input_dims = trt_common_->getBindingDimensions(0); + const auto input_dims = trt_common_->getTensorShape(0); const auto input_size = std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies()); if (needs_output_decode_) { - const auto output_dims = trt_common_->getBindingDimensions(1); - input_d_ = autoware::cuda_utils::make_unique(batch_config[2] * input_size); + const auto output_dims = trt_common_->getTensorShape(1); + input_d_ = autoware::cuda_utils::make_unique(batch_size_ * input_size); out_elem_num_ = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); - out_elem_num_ = out_elem_num_ * batch_config[2]; - out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_config[2]); + out_elem_num_ = out_elem_num_ * batch_size_; + out_elem_num_per_batch_ = static_cast(out_elem_num_ / batch_size_); out_prob_d_ = autoware::cuda_utils::make_unique(out_elem_num_); out_prob_h_ = autoware::cuda_utils::make_unique_host(out_elem_num_, cudaHostAllocPortable); @@ -298,29 +308,27 @@ TrtYoloX::TrtYoloX( output_strides_ = {8, 16, 32, 4}; } } else { - const auto out_scores_dims = trt_common_->getBindingDimensions(3); + const auto out_scores_dims = trt_common_->getTensorShape(3); max_detections_ = out_scores_dims.d[1]; - input_d_ = autoware::cuda_utils::make_unique(batch_config[2] * input_size); - out_num_detections_d_ = autoware::cuda_utils::make_unique(batch_config[2]); - out_boxes_d_ = - autoware::cuda_utils::make_unique(batch_config[2] * max_detections_ * 4); - out_scores_d_ = autoware::cuda_utils::make_unique(batch_config[2] * max_detections_); - out_classes_d_ = - autoware::cuda_utils::make_unique(batch_config[2] * max_detections_); + input_d_ = autoware::cuda_utils::make_unique(batch_size_ * input_size); + out_num_detections_d_ = autoware::cuda_utils::make_unique(batch_size_); + out_boxes_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_ * 4); + out_scores_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_); + out_classes_d_ = autoware::cuda_utils::make_unique(batch_size_ * max_detections_); } if (multitask_) { // Allocate buffer for segmentation segmentation_out_elem_num_ = 0; for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + trt_common_->getTensorShape(m + 2); // 0 : input, 1 : output for detections size_t out_elem_num = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); - out_elem_num = out_elem_num * batch_config[2]; + out_elem_num = out_elem_num * batch_size_; segmentation_out_elem_num_ += out_elem_num; } segmentation_out_elem_num_per_batch_ = - static_cast(segmentation_out_elem_num_ / batch_config[2]); + static_cast(segmentation_out_elem_num_ / batch_size_); segmentation_out_prob_d_ = autoware::cuda_utils::make_unique(segmentation_out_elem_num_); segmentation_out_prob_h_ = autoware::cuda_utils::make_unique_host( @@ -381,7 +389,7 @@ void TrtYoloX::initPreprocessBuffer(int width, int height) src_width_ = width; src_height_ = height; if (use_gpu_preprocess_) { - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); bool const hasRuntimeDim = std::any_of( input_dims.d, input_dims.d + input_dims.nbDims, [](int32_t input_dim) { return input_dim == -1; }); @@ -389,7 +397,9 @@ void TrtYoloX::initPreprocessBuffer(int width, int height) input_dims.d[0] = batch_size_; } if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } scales_.clear(); } const float input_height = static_cast(input_dims.d[2]); @@ -408,7 +418,7 @@ void TrtYoloX::initPreprocessBuffer(int width, int height) size_t argmax_out_elem_num = 0; for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + trt_common_->getTensorShape(m + 2); // 0 : input, 1 : output for detections const float scale = std::min( output_dims.d[3] / static_cast(width), output_dims.d[2] / static_cast(height)); @@ -435,7 +445,7 @@ void TrtYoloX::printProfiling(void) void TrtYoloX::preprocessGpu(const std::vector & images) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; for (const auto & image : images) { @@ -463,7 +473,9 @@ void TrtYoloX::preprocessGpu(const std::vector & images) src_height_ = height; } if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } scales_.clear(); } const float input_height = static_cast(input_dims.d[2]); @@ -489,7 +501,7 @@ void TrtYoloX::preprocessGpu(const std::vector & images) if (multitask_) { for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0: input, 1: output for detections + trt_common_->getTensorShape(m + 2); // 0: input, 1: output for detections const float scale = std::min( output_dims.d[3] / static_cast(image.cols), output_dims.d[2] / static_cast(image.rows)); @@ -524,9 +536,11 @@ void TrtYoloX::preprocessGpu(const std::vector & images) void TrtYoloX::preprocess(const std::vector & images) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector dst_images; @@ -561,9 +575,6 @@ bool TrtYoloX::doInference( if (!setCudaDeviceId(gpu_id_)) { return false; } - if (!trt_common_->isInitialized()) { - return false; - } if (use_gpu_preprocess_) { preprocessGpu(images); @@ -582,7 +593,7 @@ void TrtYoloX::preprocessWithRoiGpu( const std::vector & images, const std::vector & rois) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; for (const auto & image : images) { @@ -604,7 +615,9 @@ void TrtYoloX::preprocessWithRoiGpu( src_height_ = height; } if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -656,9 +669,11 @@ void TrtYoloX::preprocessWithRoi( const std::vector & images, const std::vector & rois) { const auto batch_size = images.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector dst_images; @@ -694,7 +709,7 @@ void TrtYoloX::preprocessWithRoi( void TrtYoloX::multiScalePreprocessGpu(const cv::Mat & image, const std::vector & rois) { const auto batch_size = rois.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; @@ -716,7 +731,9 @@ void TrtYoloX::multiScalePreprocessGpu(const cv::Mat & image, const std::vector< src_height_ = height; if (!image_buf_h_) { - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); @@ -764,9 +781,11 @@ void TrtYoloX::multiScalePreprocessGpu(const cv::Mat & image, const std::vector< void TrtYoloX::multiScalePreprocess(const cv::Mat & image, const std::vector & rois) { const auto batch_size = rois.size(); - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); input_dims.d[0] = batch_size; - trt_common_->setBindingDimensions(0, input_dims); + if (!trt_common_->setInputShape(0, input_dims)) { + return; + } const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector dst_images; @@ -800,9 +819,6 @@ void TrtYoloX::multiScalePreprocess(const cv::Mat & image, const std::vector & images, ObjectArrays & objects, const std::vector & rois) { - if (!trt_common_->isInitialized()) { - return false; - } if (use_gpu_preprocess_) { preprocessWithRoiGpu(images, rois); } else { @@ -821,9 +837,6 @@ bool TrtYoloX::doInferenceWithRoi( bool TrtYoloX::doMultiScaleInference( const cv::Mat & image, ObjectArrays & objects, const std::vector & rois) { - if (!trt_common_->isInitialized()) { - return false; - } if (use_gpu_preprocess_) { multiScalePreprocessGpu(image, rois); } else { @@ -843,8 +856,10 @@ bool TrtYoloX::feedforward(const std::vector & images, ObjectArrays & o std::vector buffers = { input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(), out_classes_d_.get()}; - - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); const auto batch_size = images.size(); auto out_num_detections = std::make_unique(batch_size); @@ -897,7 +912,11 @@ bool TrtYoloX::feedforwardAndDecode( if (multitask_) { buffers = {input_d_.get(), out_prob_d_.get(), segmentation_out_prob_d_.get()}; } - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); const auto batch_size = images.size(); @@ -927,7 +946,7 @@ bool TrtYoloX::feedforwardAndDecode( static_cast(segmentation_out_elem_num_ / segmentation_out_elem_num_per_batch_); for (int m = 0; m < multitask_; m++) { const auto output_dims = - trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + trt_common_->getTensorShape(m + 2); // 0 : input, 1 : output for detections size_t out_elem_num = std::accumulate( output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); out_elem_num = out_elem_num * batch; @@ -966,7 +985,10 @@ bool TrtYoloX::multiScaleFeedforward(const cv::Mat & image, int batch_size, Obje input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(), out_classes_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); auto out_num_detections = std::make_unique(batch_size); auto out_boxes = std::make_unique(4 * batch_size * max_detections_); @@ -1014,7 +1036,10 @@ bool TrtYoloX::multiScaleFeedforwardAndDecode( const cv::Mat & image, int batch_size, ObjectArrays & objects) { std::vector buffers = {input_d_.get(), out_prob_d_.get()}; - trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); + if (!trt_common_->setTensorsAddresses(buffers)) { + return false; + } + trt_common_->enqueueV3(*stream_); CHECK_CUDA_ERROR(cudaMemcpyAsync( out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost, @@ -1036,7 +1061,7 @@ void TrtYoloX::decodeOutputs( float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const { ObjectArray proposals; - auto input_dims = trt_common_->getBindingDimensions(0); + auto input_dims = trt_common_->getTensorShape(0); const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); std::vector grid_strides; @@ -1292,4 +1317,9 @@ void TrtYoloX::getColorizedMask( } } +int TrtYoloX::getBatchSize() const +{ + return batch_size_; +} + } // namespace autoware::tensorrt_yolox diff --git a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp index 94a3a37a4d08f..2048279dcf3f1 100644 --- a/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp +++ b/perception/autoware_tensorrt_yolox/src/tensorrt_yolox_node.cpp @@ -83,19 +83,18 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options) roi_overlay_segment_labels_.ANIMAL = declare_parameter("roi_overlay_segment_label.ANIMAL"); replaceLabelMap(); - autoware::tensorrt_common::BuildConfig build_config( - calibration_algorithm, dla_core_id, quantize_first_layer, quantize_last_layer, - profile_per_layer, clip_value); + TrtCommonConfig trt_config( + model_path, precision, "", (1ULL << 30U), dla_core_id, profile_per_layer); + + CalibrationConfig calib_config( + calibration_algorithm, quantize_first_layer, quantize_last_layer, clip_value); const double norm_factor = 1.0; const std::string cache_dir = ""; - const autoware::tensorrt_common::BatchConfig batch_config{1, 1, 1}; - const size_t max_workspace_size = (1 << 30); trt_yolox_ = std::make_unique( - model_path, precision, label_map_.size(), score_threshold, nms_threshold, build_config, - preprocess_on_gpu, gpu_id, calibration_image_list_path, norm_factor, cache_dir, batch_config, - max_workspace_size, color_map_path); + trt_config, label_map_.size(), score_threshold, nms_threshold, preprocess_on_gpu, gpu_id, + calibration_image_list_path, norm_factor, cache_dir, color_map_path, calib_config); if (!trt_yolox_->isGPUInitialized()) { RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id); diff --git a/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp b/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp index 243e82c65dab9..63c0ee0cd3e8a 100644 --- a/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp +++ b/perception/autoware_tensorrt_yolox/src/yolox_single_image_inference_node.cpp @@ -46,7 +46,9 @@ class YoloXSingleImageInferenceNode : public rclcpp::Node const auto output_image_path = declare_parameter("output_image_path", p.string() + "_detect" + ext); - auto trt_yolox = std::make_unique(model_path, precision); + auto trt_config = tensorrt_common::TrtCommonConfig(model_path, precision); + + auto trt_yolox = std::make_unique(trt_config); auto image = cv::imread(image_path); tensorrt_yolox::ObjectArrays objects; std::vector masks; diff --git a/perception/autoware_traffic_light_classifier/CMakeLists.txt b/perception/autoware_traffic_light_classifier/CMakeLists.txt index d132577743ab0..181037caebfa5 100644 --- a/perception/autoware_traffic_light_classifier/CMakeLists.txt +++ b/perception/autoware_traffic_light_classifier/CMakeLists.txt @@ -34,7 +34,6 @@ if(NVINFER AND NVONNXPARSER AND NVINFER_PLUGIN) if(CUDA_VERBOSE) message(STATUS "TensorRT is available!") message(STATUS "NVINFER: ${NVINFER}") - message(STATUS "NVPARSERS: ${NVPARSERS}") message(STATUS "NVINFER_PLUGIN: ${NVINFER_PLUGIN}") message(STATUS "NVONNXPARSER: ${NVONNXPARSER}") endif() diff --git a/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp b/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp index f9bf2509aa2ae..d47cb1500fffd 100644 --- a/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp +++ b/perception/autoware_traffic_light_classifier/src/classifier/cnn_classifier.cpp @@ -51,13 +51,10 @@ CNNClassifier::CNNClassifier(rclcpp::Node * node_ptr) : node_ptr_(node_ptr) } readLabelfile(label_file_path, labels_); - nvinfer1::Dims input_dim = autoware::tensorrt_common::get_input_dims(model_file_path); - assert(input_dim.d[0] > 0); - batch_size_ = input_dim.d[0]; - autoware::tensorrt_common::BatchConfig batch_config{batch_size_, batch_size_, batch_size_}; classifier_ = std::make_unique( - model_file_path, precision, batch_config, mean_, std_); + model_file_path, precision, mean_, std_); + batch_size_ = classifier_->getBatchSize(); if (node_ptr_->declare_parameter("build_only", false)) { RCLCPP_INFO(node_ptr_->get_logger(), "TensorRT engine is built and shutdown node."); rclcpp::shutdown(); diff --git a/perception/autoware_traffic_light_fine_detector/CMakeLists.txt b/perception/autoware_traffic_light_fine_detector/CMakeLists.txt index 80d6e43c098ed..dffbab0e77681 100644 --- a/perception/autoware_traffic_light_fine_detector/CMakeLists.txt +++ b/perception/autoware_traffic_light_fine_detector/CMakeLists.txt @@ -39,7 +39,6 @@ if(NVINFER AND NVONNXPARSER AND NVINFER_PLUGIN) if(CUDA_VERBOSE) message(STATUS "TensorRT is available!") message(STATUS "NVINFER: ${NVINFER}") - message(STATUS "NVPARSERS: ${NVPARSERS}") message(STATUS "NVINFER_PLUGIN: ${NVINFER_PLUGIN}") message(STATUS "NVONNXPARSER: ${NVONNXPARSER}") endif() diff --git a/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp b/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp index 37ffca4a9526c..d9e0471a43d62 100644 --- a/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp +++ b/perception/autoware_traffic_light_fine_detector/src/traffic_light_fine_detector_node.cpp @@ -75,21 +75,18 @@ TrafficLightFineDetectorNode::TrafficLightFineDetectorNode(const rclcpp::NodeOpt RCLCPP_ERROR(this->get_logger(), "Could not find tlr id"); } - const autoware::tensorrt_common::BuildConfig build_config = - autoware::tensorrt_common::BuildConfig("MinMax", -1, false, false, false, 0.0); - const bool cuda_preprocess = true; const std::string calib_image_list = ""; const double scale = 1.0; const std::string cache_dir = ""; - nvinfer1::Dims input_dim = autoware::tensorrt_common::get_input_dims(model_path); - assert(input_dim.d[0] > 0); - batch_size_ = input_dim.d[0]; - const autoware::tensorrt_common::BatchConfig batch_config{batch_size_, batch_size_, batch_size_}; + + auto trt_config = autoware::tensorrt_common::TrtCommonConfig(model_path, precision); trt_yolox_ = std::make_unique( - model_path, precision, num_class, score_thresh_, nms_threshold, build_config, cuda_preprocess, - gpu_id, calib_image_list, scale, cache_dir, batch_config); + trt_config, num_class, score_thresh_, nms_threshold, cuda_preprocess, gpu_id, calib_image_list, + scale, cache_dir); + + batch_size_ = trt_yolox_->getBatchSize(); if (!trt_yolox_->isGPUInitialized()) { RCLCPP_ERROR(this->get_logger(), "GPU %d does not exist or is not suitable.", gpu_id); From 5063ded1e6eef0a00fcf4fe9a63cd1aff44e321f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Dec 2024 07:54:52 +0000 Subject: [PATCH 02/17] style(pre-commit): autofix --- .../include/autoware/tensorrt_common/conv_profiler.hpp | 2 +- .../include/autoware/tensorrt_common/utils.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp index 4e567ba0b0e18..66e17a30588fc 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -65,7 +65,7 @@ class ConvProfiler : public tensorrt_common::Profiler */ explicit ConvProfiler( const std::vector & src_profilers = std::vector()) - : Profiler(src_profilers){}; + : Profiler(src_profilers) {}; /** * @brief Set per-layer profile information for model. diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp index f75821b898fa3..6701e81e5ef16 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp @@ -404,4 +404,4 @@ struct CalibrationConfig } // namespace tensorrt_common } // namespace autoware -#endif // AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ \ No newline at end of file +#endif // AUTOWARE__TENSORRT_COMMON__UTILS_HPP_ From 2941f1fed360c8c4e1a2b7a58e3dbbcbcd1ca803 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 25 Dec 2024 10:14:29 +0900 Subject: [PATCH 03/17] style(autoware_tensorrt_common): linting Signed-off-by: Amadeusz Szymko --- .../include/autoware/tensorrt_common/conv_profiler.hpp | 4 +++- .../include/autoware/tensorrt_common/profiler.hpp | 2 +- .../include/autoware/tensorrt_common/tensorrt_conv_calib.hpp | 1 + .../include/autoware/tensorrt_common/utils.hpp | 1 + perception/autoware_tensorrt_common/src/profiler.cpp | 3 +++ 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp index 66e17a30588fc..a9aa92c357b9b 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -65,7 +65,9 @@ class ConvProfiler : public tensorrt_common::Profiler */ explicit ConvProfiler( const std::vector & src_profilers = std::vector()) - : Profiler(src_profilers) {}; + : Profiler(src_profilers) + { + } /** * @brief Set per-layer profile information for model. diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp index d82eab8c5f89b..c5683f0aa9125 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp @@ -51,7 +51,7 @@ class Profiler : public nvinfer1::IProfiler * * @param[in] src_profilers Source profilers to merge. */ - Profiler(const std::vector & src_profilers = std::vector()); + explicit Profiler(const std::vector & src_profilers = std::vector()); /** * @brief Report layer time. diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp index 2b87bb08dc22a..44a944031086e 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_conv_calib.hpp @@ -23,6 +23,7 @@ #include #include +#include #include namespace autoware diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp index 6701e81e5ef16..be640f52147d9 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/utils.hpp @@ -32,6 +32,7 @@ namespace fs = ::std::experimental::filesystem; #include #include #include +#include namespace autoware { diff --git a/perception/autoware_tensorrt_common/src/profiler.cpp b/perception/autoware_tensorrt_common/src/profiler.cpp index bf630abb5d83f..4720ad1015769 100644 --- a/perception/autoware_tensorrt_common/src/profiler.cpp +++ b/perception/autoware_tensorrt_common/src/profiler.cpp @@ -14,7 +14,10 @@ #include +#include #include +#include +#include namespace autoware { From 75f82f1f563fdb4dee0594bcc00a97c21624a1c5 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 7 Jan 2025 15:35:31 +0900 Subject: [PATCH 04/17] style(autoware_lidar_centerpoint): typo Co-authored-by: Kenzo Lobos Tsunekawa --- perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp index 0d5c646229d44..c7c35e6335d8a 100644 --- a/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp +++ b/perception/autoware_lidar_centerpoint/lib/centerpoint_trt.cpp @@ -111,7 +111,7 @@ void CenterPointTRT::initPtr() void CenterPointTRT::initTrt( const TrtCommonConfig & encoder_param, const TrtCommonConfig & head_param) { - /// encoder input profile + // encoder input profile auto enc_in_dims = nvinfer1::Dims{ 3, {static_cast(config_.max_voxel_size_), From 0f19baba667f24bd54f67f45dbcc2bf3b631f11f Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 7 Jan 2025 15:50:38 +0900 Subject: [PATCH 05/17] docs(autoware_tensorrt_common): grammar Co-authored-by: Kenzo Lobos Tsunekawa --- perception/autoware_tensorrt_common/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perception/autoware_tensorrt_common/README.md b/perception/autoware_tensorrt_common/README.md index cf19b187337b2..54bfccdfcb561 100644 --- a/perception/autoware_tensorrt_common/README.md +++ b/perception/autoware_tensorrt_common/README.md @@ -4,7 +4,7 @@ This package provides a high-level API to work with TensorRT. This library simpl ## Usage -Here is example usage of the library. For full API documentation, please refer to the doxygen documentation (see header [file](include/autoware/tensorrt_common/tensorrt_common.hpp)). +Here is an example usage of the library. For the full API documentation, please refer to the doxygen documentation (see header [file](include/autoware/tensorrt_common/tensorrt_common.hpp)). ```c++ #include From fa498ad3b8b3208cda49ca8f03a4301b42033e3b Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 7 Jan 2025 15:52:39 +0900 Subject: [PATCH 06/17] fix(autoware_lidar_transfusion): reuse cast variable Signed-off-by: Amadeusz Szymko --- perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp b/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp index 339b338244d77..dd075a45cb29c 100644 --- a/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp +++ b/perception/autoware_lidar_transfusion/lib/transfusion_trt.cpp @@ -240,7 +240,7 @@ bool TransfusionTRT::preprocess( success &= network_trt_ptr_->setTensor( "voxels", voxel_features_d_.get(), nvinfer1::Dims3{ - static_cast(params_input), config_.max_num_points_per_pillar_, + params_input_i32, config_.max_num_points_per_pillar_, static_cast(config_.num_point_feature_size_)}); success &= network_trt_ptr_->setTensor( "num_points", voxel_num_d_.get(), nvinfer1::Dims{1, {params_input_i32}}); From 76cff713fbac98f9827c81656246b659cefcbe58 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 7 Jan 2025 16:56:47 +0900 Subject: [PATCH 07/17] fix(autoware_tensorrt_common): remove deprecated inference API Signed-off-by: Amadeusz Szymko --- .../tensorrt_common/tensorrt_common.hpp | 12 ----------- .../src/tensorrt_common.cpp | 20 ------------------- 2 files changed, 32 deletions(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp index 6393a34c36d7a..9355732962e23 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/tensorrt_common.hpp @@ -280,18 +280,6 @@ class TrtCommon // NOLINT */ [[nodiscard]] std::shared_ptr getLogger(); - /** - * @brief Execute inference via TensorRT context. - * - * @param[in] bindings input/output tensor addresses. - * @param[in] stream CUDA stream. - * @param[in] input_consumed Signal whether input buffers can be refilled with new data - * @return Whether inference is successful. - */ -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 - bool enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed); -#endif - /** * @brief Execute inference via TensorRT context. * diff --git a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp index 601c6f108bd3b..f2f9cb7273b13 100644 --- a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp +++ b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp @@ -396,26 +396,6 @@ std::shared_ptr TrtCommon::getLogger() return logger_; } -#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH < 10000 -bool TrtCommon::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed) -{ - if (!context_) { - logger_->log(nvinfer1::ILogger::Severity::kERROR, "Context is not initialized"); - return false; - } - if (trt_config_->profile_per_layer) { - auto inference_start = std::chrono::high_resolution_clock::now(); - auto success = context_->enqueueV2(bindings, stream, input_consumed); - auto inference_end = std::chrono::high_resolution_clock::now(); - host_profiler_->reportLayerTime( - "inference_host", - std::chrono::duration(inference_end - inference_start).count()); - return success; - } - return context_->enqueueV2(bindings, stream, input_consumed); -} -#endif - bool TrtCommon::enqueueV3(cudaStream_t stream) { if (!context_) { From 4112498ba416652c7fea95a150027422c0a7c0d7 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 7 Jan 2025 17:19:32 +0900 Subject: [PATCH 08/17] style(autoware_tensorrt_common): grammar Co-authored-by: Kenzo Lobos Tsunekawa --- perception/autoware_tensorrt_common/src/tensorrt_common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp index f2f9cb7273b13..244b76ad4c546 100644 --- a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp +++ b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp @@ -140,7 +140,7 @@ bool TrtCommon::setup(ProfileDimsPtr profile_dims, NetworkIOPtr network_io) if (!validateNetworkIO() || !validateProfileDims()) { logger_->log( nvinfer1::ILogger::Severity::kERROR, - "Final network validation failed. Possibly input / output of currently " + "Final network validation failed. Possibly the input / output of the currently " "deployed model has been changed. Check your configuration file with the current model."); return false; } From 2f03ff30fb8e6aa3afbadc27d1bf35be4dd970b8 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 7 Jan 2025 17:19:48 +0900 Subject: [PATCH 09/17] style(autoware_tensorrt_common): grammar Co-authored-by: Kenzo Lobos Tsunekawa --- perception/autoware_tensorrt_common/src/tensorrt_common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp index 244b76ad4c546..ba422277416ab 100644 --- a/perception/autoware_tensorrt_common/src/tensorrt_common.cpp +++ b/perception/autoware_tensorrt_common/src/tensorrt_common.cpp @@ -141,7 +141,7 @@ bool TrtCommon::setup(ProfileDimsPtr profile_dims, NetworkIOPtr network_io) logger_->log( nvinfer1::ILogger::Severity::kERROR, "Final network validation failed. Possibly the input / output of the currently " - "deployed model has been changed. Check your configuration file with the current model."); + "deployed model has changed. Check your configuration file with the current model."); return false; } From ec72c657f53ee288177f9ce16c7e93eff65b29f4 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Jan 2025 09:09:21 +0900 Subject: [PATCH 10/17] fix(autoware_tensorrt_common): const pointer Signed-off-by: Amadeusz Szymko --- .../include/autoware/tensorrt_common/conv_profiler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp index a9aa92c357b9b..71b40942c5429 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -74,7 +74,7 @@ class ConvProfiler : public tensorrt_common::Profiler * * @param[in] layer Layer to set profile information. */ - void setProfDict(nvinfer1::ILayer * layer) noexcept final + void setProfDict(nvinfer1::ILayer * const layer) noexcept final { std::string name = layer->getName(); layer_dict_[name]; From 45be672e7faf5c2294fac0c47b2f785290736f40 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Jan 2025 09:12:23 +0900 Subject: [PATCH 11/17] fix(autoware_tensorrt_common): remove unused method declaration Signed-off-by: Amadeusz Szymko --- .../include/autoware/tensorrt_common/profiler.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp index c5683f0aa9125..caaffebc8d99d 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp @@ -75,13 +75,6 @@ class Profiler : public nvinfer1::IProfiler */ virtual void setProfDict([[maybe_unused]] nvinfer1::ILayer * layer) noexcept {}; - /** - * @brief Get printable per-layer profile information for model. - * - * @param[in] format Format for layer information. - * @return Layer information. - */ - std::string getLayerInformation(nvinfer1::LayerInformationFormat format); /** * @brief Output Profiler to ostream. From e155555faa7ba0b32bb9db0d53bf0940ed488806 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 00:20:15 +0000 Subject: [PATCH 12/17] style(pre-commit): autofix --- .../include/autoware/tensorrt_common/profiler.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp index caaffebc8d99d..d45351f97fd01 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/profiler.hpp @@ -75,7 +75,6 @@ class Profiler : public nvinfer1::IProfiler */ virtual void setProfDict([[maybe_unused]] nvinfer1::ILayer * layer) noexcept {}; - /** * @brief Output Profiler to ostream. * From f9db3210c08091b437867e50c7a2073ed8ebfe67 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Jan 2025 09:28:35 +0900 Subject: [PATCH 13/17] refactor(autoware_tensorrt_common): readability Co-authored-by: Kotaro Uetake <60615504+ktro2828@users.noreply.github.com> --- perception/autoware_tensorrt_common/src/profiler.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/perception/autoware_tensorrt_common/src/profiler.cpp b/perception/autoware_tensorrt_common/src/profiler.cpp index 4720ad1015769..2cef93c655a36 100644 --- a/perception/autoware_tensorrt_common/src/profiler.cpp +++ b/perception/autoware_tensorrt_common/src/profiler.cpp @@ -28,13 +28,13 @@ Profiler::Profiler(const std::vector & src_profilers) { index_ = 0; for (const auto & src_profiler : src_profilers) { - for (const auto & rec : src_profiler.profile_) { - auto it = profile_.find(rec.first); + for (const auto & [name, record] : src_profiler.profile_) { + auto it = profile_.find(name); if (it == profile_.end()) { - profile_.insert(rec); + profile_.emplace(name, record); } else { - it->second.time += rec.second.time; - it->second.count += rec.second.count; + it->second.time += record.time; + it->second.count += record.count; } } } From b707837782d73ce261c951c674937e0c02c96230 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Jan 2025 09:37:33 +0900 Subject: [PATCH 14/17] fix(autoware_tensorrt_common): return if layer not registered Signed-off-by: Amadeusz Szymko --- perception/autoware_tensorrt_common/src/profiler.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/perception/autoware_tensorrt_common/src/profiler.cpp b/perception/autoware_tensorrt_common/src/profiler.cpp index 2cef93c655a36..433cca51f2b4f 100644 --- a/perception/autoware_tensorrt_common/src/profiler.cpp +++ b/perception/autoware_tensorrt_common/src/profiler.cpp @@ -42,6 +42,9 @@ Profiler::Profiler(const std::vector & src_profilers) void Profiler::reportLayerTime(const char * layerName, float ms) noexcept { + if (profile_.find(layerName) == profile_.end()) { + return; + } profile_[layerName].count++; profile_[layerName].time += ms; if (profile_[layerName].min_time == -1.0) { From e9a9369b55e344ed0e17af1f7814d794aa242473 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Jan 2025 09:52:21 +0900 Subject: [PATCH 15/17] refactor(autoware_tensorrt_common): readability Co-authored-by: Kotaro Uetake <60615504+ktro2828@users.noreply.github.com> --- .../tensorrt_common/conv_profiler.hpp | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp index 71b40942c5429..2576a95216375 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -76,27 +76,24 @@ class ConvProfiler : public tensorrt_common::Profiler */ void setProfDict(nvinfer1::ILayer * const layer) noexcept final { - std::string name = layer->getName(); - layer_dict_[name]; - layer_dict_[name].type = layer->getType(); - if (layer->getType() == nvinfer1::LayerType::kCONVOLUTION) { + if (const auto type = layer->getType(); type == nvinfer1::LayerType::kCONVOLUTION) { + const auto name = layer->getName(); auto conv = dynamic_cast(layer); + nvinfer1::ITensor * in = layer->getInput(0); - nvinfer1::Dims dim_in = in->getDimensions(); + nvinfer1::Dims in_dim = in->getDimensions(); + nvinfer1::ITensor * out = layer->getOutput(0); - nvinfer1::Dims dim_out = out->getDimensions(); + nvinfer1::Dims out_dim = out->getDimensions(); + nvinfer1::Dims k_dims = conv->getKernelSizeNd(); nvinfer1::Dims s_dims = conv->getStrideNd(); - int groups = conv->getNbGroups(); - int stride = s_dims.d[0]; - int kernel = k_dims.d[0]; - layer_dict_[name].in_c = dim_in.d[1]; - layer_dict_[name].out_c = dim_out.d[1]; - layer_dict_[name].w = dim_in.d[3]; - layer_dict_[name].h = dim_in.d[2]; - layer_dict_[name].k = kernel; - layer_dict_[name].stride = stride; - layer_dict_[name].groups = groups; + + int32_t kernel = k_dims.d[0]; + int32_t stride = s_dims.d[0]; + int32_t groups = conv->getNbGroups(); + + layer_dict_.insert_or_assign(name, ConvLayerInfo{in_dim.d[1], out_dim.d[1], in_dim.d[3], in_dim.d[2], kernel, stride, groups, type}); } } From 7ebc429d15d53ae9550c938708ec4b23b42fbf2b Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Jan 2025 09:53:02 +0900 Subject: [PATCH 16/17] fix(autoware_tensorrt_common): rename struct Signed-off-by: Amadeusz Szymko --- .../include/autoware/tensorrt_common/conv_profiler.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp index 2576a95216375..e20003ece00b9 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -28,10 +28,10 @@ namespace autoware namespace tensorrt_common { /** - * @struct LayerInfo + * @struct ConvLayerInfo * @brief Information of a layer. */ -struct LayerInfo +struct ConvLayerInfo { //! @brief Input channel. int in_c; @@ -99,7 +99,7 @@ class ConvProfiler : public tensorrt_common::Profiler private: //! @brief Per-layer information. - std::map layer_dict_; + std::map layer_dict_; }; } // namespace tensorrt_common } // namespace autoware From cd234d68e9165d222da40634f80f18f986e14611 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 00:55:02 +0000 Subject: [PATCH 17/17] style(pre-commit): autofix --- .../include/autoware/tensorrt_common/conv_profiler.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp index e20003ece00b9..0b58a797d8b9b 100644 --- a/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp +++ b/perception/autoware_tensorrt_common/include/autoware/tensorrt_common/conv_profiler.hpp @@ -79,7 +79,7 @@ class ConvProfiler : public tensorrt_common::Profiler if (const auto type = layer->getType(); type == nvinfer1::LayerType::kCONVOLUTION) { const auto name = layer->getName(); auto conv = dynamic_cast(layer); - + nvinfer1::ITensor * in = layer->getInput(0); nvinfer1::Dims in_dim = in->getDimensions(); @@ -93,7 +93,9 @@ class ConvProfiler : public tensorrt_common::Profiler int32_t stride = s_dims.d[0]; int32_t groups = conv->getNbGroups(); - layer_dict_.insert_or_assign(name, ConvLayerInfo{in_dim.d[1], out_dim.d[1], in_dim.d[3], in_dim.d[2], kernel, stride, groups, type}); + layer_dict_.insert_or_assign( + name, ConvLayerInfo{ + in_dim.d[1], out_dim.d[1], in_dim.d[3], in_dim.d[2], kernel, stride, groups, type}); } }