diff --git a/perception/autoware_lidar_bevfusion/CMakeLists.txt b/perception/autoware_lidar_bevfusion/CMakeLists.txt
new file mode 100644
index 0000000000000..7761c37d9b914
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/CMakeLists.txt
@@ -0,0 +1,214 @@
+cmake_minimum_required(VERSION 3.14)
+project(autoware_lidar_bevfusion)
+
+find_package(autoware_cmake REQUIRED)
+autoware_package()
+
+add_compile_options(-Wno-deprecated-declarations)
+
+option(CUDA_VERBOSE "Verbose output of CUDA modules" OFF)
+
+# set flags for CUDA availability
+option(CUDA_AVAIL "CUDA available" OFF)
+find_package(CUDA)
+if(CUDA_FOUND)
+  find_library(CUBLAS_LIBRARIES cublas HINTS
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib
+  )
+  if(CUDA_VERBOSE)
+    message("CUDA is available!")
+    message("CUDA Libs: ${CUDA_LIBRARIES}")
+    message("CUDA Headers: ${CUDA_INCLUDE_DIRS}")
+  endif()
+  # Note: cublas_device was deprecated in CUDA version 9.2
+  # https://forums.developer.nvidia.com/t/where-can-i-find-libcublas-device-so-or-libcublas-device-a/67251/4
+  # In LibTorch, CUDA_cublas_device_LIBRARY is used.
+  unset(CUDA_cublas_device_LIBRARY CACHE)
+  set(CUDA_AVAIL ON)
+else()
+  message("CUDA NOT FOUND")
+  set(CUDA_AVAIL OFF)
+endif()
+
+# set flags for TensorRT availability
+option(TRT_AVAIL "TensorRT available" OFF)
+# try to find the TensorRT modules
+find_library(NVINFER nvinfer)
+find_library(NVONNXPARSER nvonnxparser)
+if(NVINFER AND NVONNXPARSER)
+  if(CUDA_VERBOSE)
+    message("TensorRT is available!")
+    message("NVINFER: ${NVINFER}")
+    message("NVONNXPARSER: ${NVONNXPARSER}")
+  endif()
+  set(TRT_AVAIL ON)
+else()
+  message("TensorRT is NOT Available")
+  set(TRT_AVAIL OFF)
+endif()
+
+# set flags for CUDNN availability
+option(CUDNN_AVAIL "CUDNN available" OFF)
+# try to find the CUDNN module
+find_library(CUDNN_LIBRARY
+NAMES libcudnn.so${__cudnn_ver_suffix} libcudnn${__cudnn_ver_suffix}.dylib ${__cudnn_lib_win_name}
+PATHS $ENV{LD_LIBRARY_PATH} ${__libpath_cudart} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX}
+PATH_SUFFIXES lib lib64 bin
+DOC "CUDNN library."
+)
+if(CUDNN_LIBRARY)
+  if(CUDA_VERBOSE)
+    message(STATUS "CUDNN is available!")
+    message(STATUS "CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
+  endif()
+  set(CUDNN_AVAIL ON)
+else()
+  message("CUDNN is NOT Available")
+  set(CUDNN_AVAIL OFF)
+endif()
+
+# set flags for spconv availability
+option(SPCONV_AVAIL "spconv available" OFF)
+# try to find spconv
+find_package(cumm)
+find_package(spconv)
+if(${cumm_FOUND} AND ${spconv_FOUND})
+  message("spconv is available!")
+  set(SPCONV_AVAIL ON)
+else()
+  message("spconv is NOT Available")
+  set(SPCONV_AVAIL OFF)
+endif()
+
+if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL AND SPCONV_AVAIL)
+  find_package(ament_cmake_auto REQUIRED)
+  ament_auto_find_build_dependencies()
+
+  include_directories(
+    include
+    ${CUDA_INCLUDE_DIRS}
+  )
+
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G")
+  endif()
+
+  # Find Eigen3
+  find_package(Eigen3 3.3 REQUIRED NO_MODULE)
+
+  add_definitions("-DTV_CUDA")
+
+  # cSpell:ignore expt
+  list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 1675 --extended-lambda")
+
+  cuda_add_library(bev_ops SHARED
+    src/bev_ops/bev_pool_cuda.cu
+  )
+
+  add_library(autoware_tensorrt_plugins SHARED
+    src/tensorrt_plugins/plugin_utils.cpp
+    src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp
+    src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp
+    src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.cpp
+    src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.cpp
+    src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp
+    src/tensorrt_plugins/implicit_gemm_plugin.cpp
+
+    src/tensorrt_plugins/plugin_registration.cpp
+  )
+
+  target_compile_definitions(autoware_tensorrt_plugins PRIVATE _GLIBCXX_USE_CXX11_ABI=1)
+
+  target_link_libraries(autoware_tensorrt_plugins PRIVATE
+    ${NVINFER}
+    CUDA::cudart
+    bev_ops
+    spconv::spconv
+  )
+
+  cuda_add_library(${PROJECT_NAME}_cuda_lib SHARED
+    lib/postprocess/circle_nms_kernel.cu
+    lib/postprocess/postprocess_kernel.cu
+    lib/preprocess/preprocess_kernel.cu
+  )
+
+  target_link_libraries(${PROJECT_NAME}_cuda_lib
+    spconv::spconv
+  )
+
+  target_include_directories(${PROJECT_NAME}_cuda_lib
+    SYSTEM PUBLIC
+    ${autoware_cuda_utils_INCLUDE_DIRS}
+  )
+
+  ament_auto_add_library(${PROJECT_NAME}_lib SHARED
+    lib/detection_class_remapper.cpp
+    lib/postprocess/non_maximum_suppression.cpp
+    lib/preprocess/voxel_generator.cpp
+    lib/preprocess/pointcloud_densification.cpp
+    lib/preprocess/precomputed_features.cpp
+    lib/ros_utils.cpp
+    lib/bevfusion_trt.cpp
+  )
+
+  target_compile_definitions(${PROJECT_NAME}_lib PRIVATE
+    TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+  )
+
+  target_link_libraries(${PROJECT_NAME}_lib
+    ${NVINFER}
+    ${NVONNXPARSER}
+    ${CUDA_LIBRARIES}
+    ${CUBLAS_LIBRARIES}
+    ${CUDA_curand_LIBRARY}
+    ${CUDNN_LIBRARY}
+    ${PROJECT_NAME}_cuda_lib
+  )
+
+  # To suppress the unknown-pragmas error. The root cause is the CUB library in CUDA 11.6.
+  # This issue was fixed by https://github.com/NVIDIA/cub/commit/7d608bf1dc14553e2fb219eabeed80b76621b6fe
+  target_include_directories(${PROJECT_NAME}_lib
+    SYSTEM PUBLIC
+    ${CUDA_INCLUDE_DIRS}
+    ${autoware_point_types_INCLUDE_DIRS}
+  )
+
+  ament_auto_add_library(${PROJECT_NAME}_component SHARED
+    src/lidar_bevfusion_node.cpp
+  )
+
+  target_link_libraries(${PROJECT_NAME}_component
+    ${PROJECT_NAME}_lib
+  )
+
+  rclcpp_components_register_node(${PROJECT_NAME}_component
+    PLUGIN "autoware::lidar_bevfusion::LidarBEVFusionNode"
+    EXECUTABLE ${PROJECT_NAME}_node
+  )
+
+  install(
+    TARGETS ${PROJECT_NAME}_cuda_lib
+    DESTINATION lib
+  )
+
+  install(
+    TARGETS autoware_tensorrt_plugins
+    DESTINATION share/${PROJECT_NAME}/plugins
+  )
+
+  ament_auto_package(
+    INSTALL_TO_SHARE
+    launch
+    config
+  )
+
+else()
+  find_package(ament_cmake_auto REQUIRED)
+  ament_auto_find_build_dependencies()
+
+  ament_auto_package(
+    INSTALL_TO_SHARE
+    launch
+  )
+endif()
diff --git a/perception/autoware_lidar_bevfusion/README.md b/perception/autoware_lidar_bevfusion/README.md
new file mode 100644
index 0000000000000..a6fb95cb752d8
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/README.md
@@ -0,0 +1,81 @@
+# autoware_lidar_bevfusion
+
+## Purpose
+
+The `autoware_lidar_bevfusion` package is used for 3D object detection based on camera-lidar fusion.
+
+## Inner-workings / Algorithms
+
+This package implements a TensorRT-powered inference node for BEVFusion [1].
+The sparse convolution backend is [spconv](https://github.com/traveller59/spconv).
+Autoware installs it automatically in its setup script. If needed, the user can also build and install it by following [these instructions](https://github.com/autowarefoundation/spconv_cpp).
+
+## Inputs / Outputs
+
+### Input
+
+| Name                   | Type                            | Description               |
+| ---------------------- | ------------------------------- | ------------------------- |
+| `~/input/pointcloud`   | `sensor_msgs::msg::PointCloud2` | Input pointcloud topic.   |
+| `~/input/image*`       | `sensor_msgs::msg::Image`       | Input image topics.       |
+| `~/input/camera_info*` | `sensor_msgs::msg::CameraInfo`  | Input camera info topics. |
+
+### Output
+
+| Name                                    | Type                                             | Description                 |
+| --------------------------------------- | ------------------------------------------------ | --------------------------- |
+| `~/output/objects`                      | `autoware_perception_msgs::msg::DetectedObjects` | Detected objects.           |
+| `debug/cyclic_time_ms`                  | `tier4_debug_msgs::msg::Float64Stamped`          | Cyclic time (ms).           |
+| `debug/pipeline_latency_ms`             | `tier4_debug_msgs::msg::Float64Stamped`          | Pipeline latency time (ms). |
+| `debug/processing_time/preprocess_ms`   | `tier4_debug_msgs::msg::Float64Stamped`          | Preprocess (ms).            |
+| `debug/processing_time/inference_ms`    | `tier4_debug_msgs::msg::Float64Stamped`          | Inference time (ms).        |
+| `debug/processing_time/postprocess_ms`  | `tier4_debug_msgs::msg::Float64Stamped`          | Postprocess time (ms).      |
+| `debug/processing_time/total_ms`        | `tier4_debug_msgs::msg::Float64Stamped`          | Total processing time (ms).
|
+
+## Parameters
+
+### BEVFusion node
+
+{{ json_to_markdown("perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json") }}
+
+### BEVFusion model
+
+{{ json_to_markdown("perception/autoware_lidar_bevfusion/schema/bevfusion_ml_package.schema.json") }}
+
+### Detection class remapper
+
+{{ json_to_markdown("perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json") }}
+
+### The `build_only` option
+
+The `autoware_lidar_bevfusion` node has a `build_only` option to build the TensorRT engine file from the specified ONNX file, after which the program exits.
+
+```bash
+ros2 launch autoware_lidar_bevfusion lidar_bevfusion.launch.xml build_only:=true
+```
+
+### The `log_level` option
+
+The default logging severity level for `autoware_lidar_bevfusion` is `info`. For debugging purposes, the developer may decrease the severity level using the `log_level` parameter:
+
+```bash
+ros2 launch autoware_lidar_bevfusion lidar_bevfusion.launch.xml log_level:=debug
+```
+
+## Assumptions / Known limits
+
+This node assumes that the input pointcloud follows the `PointXYZIRC` layout defined in `autoware_point_types`.
+
+## Trained Models
+
+TODO
+
+### Changelog
+
+## References/External links
+
+[1] Zhijian Liu, Haotian Tang, Alexander Amini, Xinyu Yang, Huizi Mao, Daniela Rus, and Song Han. "BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation." In 2023 IEEE International Conference on Robotics and Automation (ICRA).
+
+## (Optional) Future extensions / Unimplemented parts
+
+Although this node can perform camera-lidar fusion, and is the first method in Autoware to use images together with lidar data for inference, the package structure and its full integration into the Autoware pipeline are left for future work. In its current structure, it can be employed without any changes as a lidar-based detector.
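For reference, the layout assumption above can also be checked at compile time. A minimal sketch, assuming only the field set documented here (the authoritative definitions are `autoware::point_types::PointXYZIRC` and this package's `InputPointType`; the struct below merely mirrors them for illustration):

```cpp
#include <cstddef>
#include <cstdint>

// Mirror of the PointXYZIRC layout this node expects (illustrative only).
struct PointXYZIRCExample
{
  float x;
  float y;
  float z;
  std::uint8_t intensity;
  std::uint8_t return_type;
  std::uint16_t channel;
};

// 4-byte floats for x/y/z put intensity at offset 12; the struct is 16 bytes.
static_assert(offsetof(PointXYZIRCExample, intensity) == 12);
static_assert(sizeof(PointXYZIRCExample) == 16);
```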
diff --git a/perception/autoware_lidar_bevfusion/config/bevfusion.param.yaml b/perception/autoware_lidar_bevfusion/config/bevfusion.param.yaml new file mode 100644 index 0000000000000..be2ef9e93fe73 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/config/bevfusion.param.yaml @@ -0,0 +1,23 @@ +/**: + ros__parameters: + # modality + sensor_fusion: true + # non-network params + max_camera_lidar_delay: 0.12 + # plugins + plugins_path: $(find-pkg-share autoware_lidar_bevfusion)/plugins/libautoware_tensorrt_plugins.so + # network + trt_precision: fp16 + cloud_capacity: 2000000 + onnx_path: "$(var model_path)/bevfusion_camera_lidar_v2.onnx" + engine_path: "$(var model_path)/bevfusion_camera_lidar_v2.engine" + # pre-process params + densification_num_past_frames: 0 + densification_world_frame_id: map + # post-process params + circle_nms_dist_threshold: 0.5 + iou_nms_target_class_names: ["CAR"] + iou_nms_search_distance_2d: 10.0 + iou_nms_threshold: 0.1 + yaw_norm_thresholds: [0.3, 0.3, 0.3, 0.3, 0.0] # refers to the class_names + score_threshold: 0.1 diff --git a/perception/autoware_lidar_bevfusion/config/bevfusion_ml_package.param.yaml b/perception/autoware_lidar_bevfusion/config/bevfusion_ml_package.param.yaml new file mode 100644 index 0000000000000..4e767a99a3c20 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/config/bevfusion_ml_package.param.yaml @@ -0,0 +1,24 @@ +/**: + ros__parameters: + class_names: ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"] + voxels_num: [1, 128000, 256000] # [min, opt, max] + point_cloud_range: [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0] # [x_min, y_min, z_min, x_max, y_max, z_max] + voxel_size: [0.17, 0.17, 0.2] # [x, y, z] + num_proposals: 500 + out_size_factor: 8 + max_points_per_voxel: 10 + + d_bound: [1.0, 166.2, 1.4] + x_bound: [-122.4, 122.4, 0.68] + y_bound: [-122.4, 122.4, 0.68] + z_bound: [-10.0, 10.0, 20.0] + num_cameras: 6 + raw_image_height: 1080 + raw_image_width: 1440 + img_aug_scale_x: 0.489 + img_aug_scale_y: 0.489 + roi_height: 384 + roi_width: 704 + features_height: 48 + features_width: 88 + num_depth_features: 118 diff --git a/perception/autoware_lidar_bevfusion/config/detection_class_remapper.param.yaml b/perception/autoware_lidar_bevfusion/config/detection_class_remapper.param.yaml new file mode 100644 index 0000000000000..40275d750f0d9 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/config/detection_class_remapper.param.yaml @@ -0,0 +1,38 @@ +/**: + ros__parameters: + allow_remapping_by_area_matrix: + # NOTE(knzo25): We turn all vehicles into trailers if they go over 3x12 [m^2]. + # NOTE(knzo25): We turn cars into trucks if they have an area between 2.2 x 5.5 and 3.0 * 12.0 [m^2] + # row: original class. 
column: class to remap to + #UNKNOWN, CAR, TRUCK, BUS, TRAILER, MOTORBIKE, BICYCLE,PEDESTRIAN + [0, 0, 0, 0, 0, 0, 0, 0, #UNKNOWN + 0, 0, 1, 0, 1, 0, 0, 0, #CAR + 0, 0, 0, 0, 1, 0, 0, 0, #TRUCK + 0, 0, 0, 0, 1, 0, 0, 0, #BUS + 0, 0, 0, 0, 0, 0, 0, 0, #TRAILER + 0, 0, 0, 0, 0, 0, 0, 0, #MOTORBIKE + 0, 0, 0, 0, 0, 0, 0, 0, #BICYCLE + 0, 0, 0, 0, 0, 0, 0, 0] #PEDESTRIAN + + min_area_matrix: + #UNKNOWN, CAR, TRUCK, BUS, TRAILER, MOTORBIKE, BICYCLE, PEDESTRIAN + [ 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #UNKNOWN + 0.000, 0.000, 12.100, 0.000, 36.000, 0.000, 0.000, 0.000, #CAR + 0.000, 0.000, 0.000, 0.000, 36.000, 0.000, 0.000, 0.000, #TRUCK + 0.000, 0.000, 0.000, 0.000, 36.000, 0.000, 0.000, 0.000, #BUS + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #TRAILER + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #MOTORBIKE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #BICYCLE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000] #PEDESTRIAN + + + max_area_matrix: + #UNKNOWN, CAR, TRUCK, BUS, TRAILER, MOTORBIKE, BICYCLE, PEDESTRIAN + [ 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #UNKNOWN + 0.000, 0.000, 36.000, 0.000, 999.999, 0.000, 0.000, 0.000, #CAR + 0.000, 0.000, 0.000, 0.000, 999.999, 0.000, 0.000, 0.000, #TRUCK + 0.000, 0.000, 0.000, 0.000, 999.999, 0.000, 0.000, 0.000, #BUS + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #TRAILER + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #MOTORBIKE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #BICYCLE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000] #PEDESTRIAN diff --git a/perception/autoware_lidar_bevfusion/include/autoware/bev_ops/bev_pool_cuda.hpp b/perception/autoware_lidar_bevfusion/include/autoware/bev_ops/bev_pool_cuda.hpp new file mode 100644 index 0000000000000..fd74dd79aa3ed --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/bev_ops/bev_pool_cuda.hpp @@ -0,0 +1,28 @@ +// Copyright 2025 (c) OpenMMLab. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// Modified from
+// https://github.com/open-mmlab/mmdetection3d/blob/main/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.cu
+// https://github.com/mit-han-lab/bevfusion/blob/main/mmdet3d/ops/bev_pool/src/bev_pool_cuda.cu
+
+#ifndef AUTOWARE__BEV_OPS__BEV_POOL_CUDA_HPP_
+#define AUTOWARE__BEV_OPS__BEV_POOL_CUDA_HPP_
+
+#include <cuda_runtime_api.h>
+
+void bev_pool(
+  int b, int d, int h, int w, int n, int c, int n_intervals, const float * x,
+  const int * geom_feats, const int * interval_starts, const int * interval_lengths, float * out,
+  cudaStream_t & stream);
+
+#endif  // AUTOWARE__BEV_OPS__BEV_POOL_CUDA_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_config.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_config.hpp
new file mode 100644
index 0000000000000..6e659fb356554
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_config.hpp
@@ -0,0 +1,194 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_CONFIG_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_CONFIG_HPP_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace autoware::lidar_bevfusion
+{
+
+class BEVFusionConfig
+{
+public:
+  BEVFusionConfig(
+    const bool sensor_fusion, const std::string & plugins_path, const std::int64_t out_size_factor,
+    const std::int64_t cloud_capacity, const std::int64_t max_points_per_voxel,
+    const std::vector<std::int64_t> & voxels_num, const std::vector<float> & point_cloud_range,
+    const std::vector<float> & voxel_size, const std::vector<float> & d_bound,
+    const std::vector<float> & x_bound, const std::vector<float> & y_bound,
+    const std::vector<float> & z_bound, const std::int64_t num_cameras,
+    const std::int64_t raw_image_height, const std::int64_t raw_image_width,
+    const float img_aug_scale_x, const float img_aug_scale_y, const std::int64_t roi_height,
+    const std::int64_t roi_width, const std::int64_t features_height,
+    const std::int64_t features_width, const std::int64_t num_depth_features,
+    const std::int64_t num_proposals, const float circle_nms_dist_threshold,
+    const std::vector<double> & yaw_norm_thresholds, const float score_threshold)
+  {
+    sensor_fusion_ = sensor_fusion;
+    plugins_path_ = plugins_path;
+
+    out_size_factor_ = out_size_factor;
+
+    cloud_capacity_ = cloud_capacity;
+    max_points_per_voxel_ = max_points_per_voxel;
+
+    if (voxels_num.size() == 3) {
+      min_num_voxels_ = voxels_num[0];
+      max_num_voxels_ = voxels_num[2];
+
+      voxels_num_[0] = voxels_num[0];
+      voxels_num_[1] = voxels_num[1];
+      voxels_num_[2] = voxels_num[2];
+    }
+    if (point_cloud_range.size() == 6) {
+      min_x_range_ = point_cloud_range[0];
+      min_y_range_ = point_cloud_range[1];
+      min_z_range_ = point_cloud_range[2];
+      max_x_range_ = point_cloud_range[3];
+      max_y_range_ = point_cloud_range[4];
+      max_z_range_ = point_cloud_range[5];
+    }
+    if (voxel_size.size() == 3) {
+      voxel_x_size_ = voxel_size[0];
+      voxel_y_size_ = voxel_size[1];
+      voxel_z_size_ =
voxel_size[2];
+    }
+    if (d_bound.size() == 3 && x_bound.size() == 3 && y_bound.size() == 3 && z_bound.size() == 3) {
+      d_bound_ = d_bound;
+      x_bound_ = x_bound;
+      y_bound_ = y_bound;
+      z_bound_ = z_bound;
+    }
+
+    num_cameras_ = num_cameras;
+    raw_image_height_ = raw_image_height;
+    raw_image_width_ = raw_image_width;
+    img_aug_scale_x_ = img_aug_scale_x;
+    img_aug_scale_y_ = img_aug_scale_y;
+    roi_height_ = roi_height;
+    roi_width_ = roi_width;
+    features_height_ = features_height;
+    features_width_ = features_width;
+    num_depth_features_ = num_depth_features;
+    resized_height_ = raw_image_height_ * img_aug_scale_y_;
+    resized_width_ = raw_image_width_ * img_aug_scale_x_;
+
+    if (num_proposals > 0) {
+      num_proposals_ = num_proposals;
+    }
+    if (score_threshold > 0.0) {
+      score_threshold_ = score_threshold;
+    }
+    if (circle_nms_dist_threshold > 0.0) {
+      circle_nms_dist_threshold_ = circle_nms_dist_threshold;
+    }
+    yaw_norm_thresholds_ =
+      std::vector<float>(yaw_norm_thresholds.begin(), yaw_norm_thresholds.end());
+    for (auto & yaw_norm_threshold : yaw_norm_thresholds_) {
+      yaw_norm_threshold =
+        (yaw_norm_threshold >= 0.0 && yaw_norm_threshold < 1.0) ? yaw_norm_threshold : 0.0;
+    }
+    grid_x_size_ = static_cast<std::int64_t>((max_x_range_ - min_x_range_) / voxel_x_size_);
+    grid_y_size_ = static_cast<std::int64_t>((max_y_range_ - min_y_range_) / voxel_y_size_);
+    grid_z_size_ = static_cast<std::int64_t>((max_z_range_ - min_z_range_) / voxel_z_size_);
+  }
+
+  ///// MODALITY /////
+  bool sensor_fusion_{};
+
+  // CUDA parameters
+  const std::uint32_t threads_per_block_{256};  // number of threads per block
+
+  // TensorRT parameters
+  std::string plugins_path_{};
+
+  ///// NETWORK PARAMETERS /////
+
+  // Common network parameters
+  std::int64_t out_size_factor_{};
+
+  std::int64_t cloud_capacity_{};
+  std::int64_t min_num_voxels_{};
+  std::int64_t max_num_voxels_{};
+  std::int64_t max_points_per_voxel_{};
+  const std::int64_t num_point_feature_size_{5};  // x, y, z, intensity, time lag
+
+  // Pointcloud range in meters
+  float min_x_range_{};
+  float max_x_range_{};
+  float min_y_range_{};
+  float max_y_range_{};
+  float min_z_range_{};
+  float max_z_range_{};
+
+  // Voxel size in meters
+  float voxel_x_size_{};
+  float voxel_y_size_{};
+  float voxel_z_size_{};
+
+  // Grid size
+  std::int64_t grid_x_size_{};
+  std::int64_t grid_y_size_{};
+  std::int64_t grid_z_size_{};
+
+  // Camera branch parameters
+  std::vector<float> d_bound_{};
+  std::vector<float> x_bound_{};
+  std::vector<float> y_bound_{};
+  std::vector<float> z_bound_{};
+
+  std::int64_t num_cameras_{};
+  std::int64_t raw_image_height_{};
+  std::int64_t raw_image_width_{};
+
+  float img_aug_scale_x_{};
+  float img_aug_scale_y_{};
+
+  std::int64_t roi_height_{};
+  std::int64_t roi_width_{};
+
+  std::int64_t resized_height_{};
+  std::int64_t resized_width_{};
+
+  std::int64_t features_height_{};
+  std::int64_t features_width_{};
+  std::int64_t num_depth_features_{};
+
+  // Head parameters
+  std::int64_t num_proposals_{};
+  const std::size_t num_classes_{5};
+
+  // Post processing parameters
+
+  // the score threshold for classification
+  float score_threshold_{};
+
+  float circle_nms_dist_threshold_{};
+  std::vector<float> yaw_norm_thresholds_{};
+  // the decoded box layout: (x, y, z, w, l, h, yaw, vx, vy)
+  const std::int64_t num_box_values_{10};
+
+  ///// RUNTIME DIMENSIONS /////
+  std::array<std::int64_t, 3> voxels_num_{};
+};
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_CONFIG_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_trt.hpp
b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_trt.hpp new file mode 100644 index 0000000000000..9a20a90ba39fc --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_trt.hpp @@ -0,0 +1,150 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_TRT_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_TRT_HPP_ + +#include "autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp" +#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp" +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" +#include "autoware/lidar_bevfusion/preprocess/voxel_generator.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" +#include "autoware/lidar_bevfusion/visibility_control.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +using autoware::cuda_utils::CudaUniquePtr; + +class NetworkParam +{ +public: + NetworkParam(std::string onnx_path, std::string engine_path, std::string trt_precision) + : onnx_path_(std::move(onnx_path)), + engine_path_(std::move(engine_path)), + trt_precision_(std::move(trt_precision)) + { + } + + std::string onnx_path() const { return onnx_path_; } + std::string engine_path() const { return engine_path_; } + std::string trt_precision() const { return trt_precision_; } + +private: + std::string onnx_path_; + std::string engine_path_; + std::string trt_precision_; +}; + +class LIDAR_BEVFUSION_PUBLIC BEVFusionTRT +{ +public: + using Matrix4fRowM = Eigen::Matrix; + + explicit BEVFusionTRT( + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, const BEVFusionConfig & config); + virtual ~BEVFusionTRT(); + + bool detect( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, + std::vector & det_boxes3d, std::unordered_map & proc_timing); + + void setIntrinsicsExtrinsics( + std::vector & camera_info_vector, + std::vector & lidar2camera_vector); + +protected: + void initPtr(); + void initTrt(const tensorrt_common::TrtCommonConfig & trt_config); + + bool preProcess( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & pc_msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer); + + bool inference(); + + bool postProcess(std::vector & det_boxes3d); + + std::unique_ptr network_trt_ptr_{nullptr}; + std::unique_ptr vg_ptr_{nullptr}; + std::unique_ptr> stop_watch_ptr_{ + nullptr}; + std::unique_ptr pre_ptr_{nullptr}; + std::unique_ptr post_ptr_{nullptr}; + cudaStream_t stream_{nullptr}; + std::vector camera_streams_{}; + + BEVFusionConfig config_; + std::vector roi_start_y_vector_; + + // pre-process inputs + + unsigned int 
voxel_features_size_{0}; + unsigned int voxel_coords_size_{0}; + unsigned int bbox_pred_size_{0}; + + // lidar buffers + CudaUniquePtr points_d_{nullptr}; + CudaUniquePtr voxel_features_d_{nullptr}; + CudaUniquePtr voxel_coords_d_{nullptr}; + CudaUniquePtr num_points_per_voxel_d_{nullptr}; + + // pre computed tensors + std::int64_t num_geom_feats_{}; + std::int64_t num_kept_{}; + std::int64_t num_ranks_{}; + std::int64_t num_indices_{}; + CudaUniquePtr lidar2image_d_{}; + CudaUniquePtr geom_feats_d_{}; + CudaUniquePtr kept_d_{}; + CudaUniquePtr ranks_d_{}; + CudaUniquePtr indices_d_{}; + + // image buffers + CudaUniquePtr roi_tensor_d_{nullptr}; + std::vector> image_buffers_d_{}; + CudaUniquePtr camera_masks_d_{nullptr}; + + // output buffers + CudaUniquePtr label_pred_output_d_{nullptr}; + CudaUniquePtr bbox_pred_output_d_{nullptr}; + CudaUniquePtr score_output_d_{nullptr}; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_TRT_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/detection_class_remapper.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/detection_class_remapper.hpp new file mode 100644 index 0000000000000..5658684f9790e --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/detection_class_remapper.hpp @@ -0,0 +1,48 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__DETECTION_CLASS_REMAPPER_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__DETECTION_CLASS_REMAPPER_HPP_ + +#include + +#include +#include +#include +#include + +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class DetectionClassRemapper +{ +public: + void setParameters( + const std::vector & allow_remapping_by_area_matrix, + const std::vector & min_area_matrix, const std::vector & max_area_matrix); + void mapClasses(autoware_perception_msgs::msg::DetectedObjects & msg); + +protected: + Eigen::Matrix allow_remapping_by_area_matrix_; + Eigen::MatrixXd min_area_matrix_; + Eigen::MatrixXd max_area_matrix_; + int num_labels_; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__DETECTION_CLASS_REMAPPER_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/lidar_bevfusion_node.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/lidar_bevfusion_node.hpp new file mode 100644 index 0000000000000..1ee0d2c0071cf --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/lidar_bevfusion_node.hpp @@ -0,0 +1,97 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__LIDAR_BEVFUSION_NODE_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__LIDAR_BEVFUSION_NODE_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_trt.hpp" +#include "autoware/lidar_bevfusion/detection_class_remapper.hpp" +#include "autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp" +#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp" +#include "autoware/lidar_bevfusion/visibility_control.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class LIDAR_BEVFUSION_PUBLIC LidarBEVFusionNode : public rclcpp::Node +{ +public: + using Matrix4f = Eigen::Matrix; + + explicit LidarBEVFusionNode(const rclcpp::NodeOptions & options); + +private: + void cloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg); + void imageCallback(const sensor_msgs::msg::Image::ConstSharedPtr msg, std::size_t camera_id); + void cameraInfoCallback(const sensor_msgs::msg::CameraInfo & msg, std::size_t camera_id); + + rclcpp::Subscription::ConstSharedPtr cloud_sub_{nullptr}; + std::vector::ConstSharedPtr> image_subs_; + std::vector::ConstSharedPtr> camera_info_subs_; + rclcpp::Publisher::SharedPtr objects_pub_{ + nullptr}; + + tf2_ros::Buffer tf_buffer_; + tf2_ros::TransformListener tf_listener_{tf_buffer_}; + + DetectionClassRemapper detection_class_remapper_; + float score_threshold_{0.0}; + std::vector class_names_; + std::optional lidar_frame_; + float max_camera_lidar_delay_; + + std::vector image_msgs_; + std::vector camera_masks_; + std::vector> camera_info_msgs_; + std::vector> lidar2camera_extrinsics_; + + bool sensor_fusion_{false}; + bool images_available_{false}; + bool intrinsics_available_{false}; + bool extrinsics_available_{false}; + bool intrinsics_extrinsics_precomputed_{false}; + + NonMaximumSuppression iou_bev_nms_; + + std::unique_ptr detector_ptr_{nullptr}; + + // debugger + std::unique_ptr> stop_watch_ptr_{ + nullptr}; + std::unique_ptr debug_publisher_ptr_{nullptr}; + std::unique_ptr published_time_pub_{nullptr}; +}; +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__LIDAR_BEVFUSION_NODE_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp new file mode 100644 index 0000000000000..f6b00c19c5f06 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp @@ -0,0 +1,34 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
+
+#include "autoware/lidar_bevfusion/utils.hpp"
+
+#include
+
+#include
+
+namespace autoware::lidar_bevfusion
+{
+// Non-maximum suppression (NMS) uses the distance on the xy plane instead of
+// intersection over union (IoU) to suppress overlapped objects.
+std::size_t circleNMS(
+  thrust::device_vector<Box3D> & boxes3d, const float distance_threshold,
+  thrust::device_vector<bool> & keep_mask, cudaStream_t stream);
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp
new file mode 100644
index 0000000000000..c8d64a76b41c6
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp
@@ -0,0 +1,82 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
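A hedged usage sketch of the `circleNMS` declaration above. `Box3D` comes from this package's `utils.hpp`; the sorted-by-descending-score precondition is an assumption here, based on how greedy circle NMS is conventionally applied:

```cpp
#include "autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp"

#include <thrust/device_vector.h>

namespace autoware::lidar_bevfusion
{
// Minimal sketch: keep only boxes whose BEV center is not within
// distance_threshold of a higher-scoring box. Assumes boxes3d is already
// sorted by descending score.
std::size_t run_circle_nms(thrust::device_vector<Box3D> & boxes3d, cudaStream_t stream)
{
  thrust::device_vector<bool> keep_mask(boxes3d.size());
  const float distance_threshold = 0.5f;  // circle_nms_dist_threshold in bevfusion.param.yaml
  return circleNMS(boxes3d, distance_threshold, keep_mask, stream);
}
}  // namespace autoware::lidar_bevfusion
```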
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
+
+#include "autoware/lidar_bevfusion/ros_utils.hpp"
+
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace autoware::lidar_bevfusion
+{
+using autoware_perception_msgs::msg::DetectedObject;
+
+// TODO(knzo25): Update this implementation to match changes from centerpoint
+enum class NMS_TYPE {
+  IoU_BEV
+  // IoU_3D
+  // Distance_2D
+  // Distance_3D
+};
+
+struct NMSParams
+{
+  NMS_TYPE nms_type_{};
+  std::vector<std::string> target_class_names_{};
+  double search_distance_2d_{};
+  double iou_threshold_{};
+  // double distance_threshold_{};
+};
+
+inline std::vector<bool> classNamesToBooleanMask(const std::vector<std::string> & class_names)
+{
+  std::vector<bool> mask;
+  constexpr std::size_t num_object_classification = 8;
+  mask.resize(num_object_classification);
+  for (const auto & class_name : class_names) {
+    const auto semantic_type = getSemanticType(class_name);
+    mask.at(semantic_type) = true;
+  }
+
+  return mask;
+}
+
+class NonMaximumSuppression
+{
+public:
+  void setParameters(const NMSParams &);
+
+  std::vector<DetectedObject> apply(const std::vector<DetectedObject> &);
+
+private:
+  bool isTargetLabel(const std::uint8_t);
+
+  bool isTargetPairObject(const DetectedObject &, const DetectedObject &);
+
+  Eigen::MatrixXd generateIoUMatrix(const std::vector<DetectedObject> &);
+
+  NMSParams params_{};
+  std::vector<bool> target_class_mask_{};
+};
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp
new file mode 100644
index 0000000000000..4a7c8845e85ee
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp
@@ -0,0 +1,47 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
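For illustration, wiring the `NMSParams` above from the values shipped in `bevfusion.param.yaml` might look like the following sketch (the node performs the equivalent wiring when it reads its parameters):

```cpp
#include "autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp"

namespace autoware::lidar_bevfusion
{
// Sketch: configure IoU-based BEV NMS and apply it to the decoded detections.
std::vector<DetectedObject> suppress_overlaps(const std::vector<DetectedObject> & objects)
{
  NMSParams params;
  params.nms_type_ = NMS_TYPE::IoU_BEV;
  params.target_class_names_ = {"CAR"};  // iou_nms_target_class_names
  params.search_distance_2d_ = 10.0;     // iou_nms_search_distance_2d
  params.iou_threshold_ = 0.1;           // iou_nms_threshold

  NonMaximumSuppression nms;
  nms.setParameters(params);
  return nms.apply(objects);
}
}  // namespace autoware::lidar_bevfusion
```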
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
+
+#include "autoware/lidar_bevfusion/bevfusion_config.hpp"
+#include "autoware/lidar_bevfusion/utils.hpp"
+
+#include
+#include
+
+#include
+
+namespace autoware::lidar_bevfusion
+{
+
+class PostprocessCuda
+{
+public:
+  explicit PostprocessCuda(const BEVFusionConfig & config, cudaStream_t & stream);
+
+  cudaError_t generateDetectedBoxes3D_launch(
+    const std::int64_t * label_pred_output, const float * bbox_pred_output,
+    const float * score_output, std::vector<Box3D> & det_boxes3d, cudaStream_t stream);
+
+private:
+  BEVFusionConfig config_;
+  cudaStream_t stream_;
+  cudaStream_t stream_event_;
+  cudaEvent_t start_, stop_;
+};
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/point_type.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/point_type.hpp
new file mode 100644
index 0000000000000..35cf87e7d6c3d
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/point_type.hpp
@@ -0,0 +1,35 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINT_TYPE_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINT_TYPE_HPP_
+
+#include <cstdint>
+
+namespace autoware::lidar_bevfusion
+{
+
+struct InputPointType
+{
+  float x;
+  float y;
+  float z;
+  std::uint8_t intensity;
+  std::uint8_t return_type;
+  std::uint16_t channel;
+};
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINT_TYPE_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp
new file mode 100644
index 0000000000000..c02ba430568cf
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp
@@ -0,0 +1,104 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
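A hedged sketch of how `generateDetectedBoxes3D_launch` above is typically driven after inference. The device pointers are the TensorRT output bindings; their names here are illustrative only:

```cpp
#include "autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp"

namespace autoware::lidar_bevfusion
{
// Sketch: decode the three network output bindings into Box3D candidates.
std::vector<Box3D> decode_detections(
  PostprocessCuda & postprocess, const std::int64_t * label_pred_d, const float * bbox_pred_d,
  const float * score_d, cudaStream_t stream)
{
  std::vector<Box3D> det_boxes3d;
  const cudaError_t err = postprocess.generateDetectedBoxes3D_launch(
    label_pred_d, bbox_pred_d, score_d, det_boxes3d, stream);
  // Treat a launch failure as "no detections" rather than crashing the node.
  return err == cudaSuccess ? det_boxes3d : std::vector<Box3D>{};
}
}  // namespace autoware::lidar_bevfusion
```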
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
+
+#include "autoware/lidar_bevfusion/preprocess/point_type.hpp"
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace autoware::lidar_bevfusion
+{
+
+using autoware::cuda_utils::CudaUniquePtr;
+
+class DensificationParam
+{
+public:
+  DensificationParam(std::string world_frame_id, const unsigned int num_past_frames)
+  : world_frame_id_(std::move(world_frame_id)),
+    pointcloud_cache_size_(num_past_frames + /*current frame*/ 1)
+  {
+  }
+
+  std::string world_frame_id() const { return world_frame_id_; }
+  unsigned int pointcloud_cache_size() const { return pointcloud_cache_size_; }
+
+private:
+  std::string world_frame_id_;
+  unsigned int pointcloud_cache_size_{1};
+};
+
+struct PointCloudWithTransform
+{
+  CudaUniquePtr<InputPointType[]> data_d{nullptr};
+  std_msgs::msg::Header header;
+  std::size_t num_points{0};
+  Eigen::Affine3f affine_past2world;
+};
+
+class PointCloudDensification
+{
+public:
+  explicit PointCloudDensification(const DensificationParam & param, cudaStream_t & stream);
+
+  bool enqueuePointCloud(
+    const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer);
+
+  double getCurrentTimestamp() const { return current_timestamp_; }
+  Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
+  std::list<PointCloudWithTransform>::iterator getPointCloudCacheIter()
+  {
+    return pointcloud_cache_.begin();
+  }
+  bool isCacheEnd(std::list<PointCloudWithTransform>::iterator iter)
+  {
+    return iter == pointcloud_cache_.end();
+  }
+  std::size_t getIdx(std::list<PointCloudWithTransform>::iterator iter)
+  {
+    return std::distance(pointcloud_cache_.begin(), iter);
+  }
+  std::size_t getCacheSize()
+  {
+    return std::distance(pointcloud_cache_.begin(), pointcloud_cache_.end());
+  }
+  unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }
+
+private:
+  void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
+  void dequeue();
+
+  DensificationParam param_;
+  double current_timestamp_{0.0};
+  Eigen::Affine3f affine_world2current_;
+  std::list<PointCloudWithTransform> pointcloud_cache_;
+  cudaStream_t stream_;
+};
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/precomputed_features.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/precomputed_features.hpp
new file mode 100644
index 0000000000000..7b5eff750a56b
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/precomputed_features.hpp
@@ -0,0 +1,89 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
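The cache semantics above follow from the constructor: the cache holds `num_past_frames + 1` clouds. A hedged sketch, assuming the shipped default `densification_num_past_frames: 0` and a caller that already owns the message, TF buffer, and stream:

```cpp
#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp"

namespace autoware::lidar_bevfusion
{
// Sketch: with num_past_frames = 0 the cache holds exactly one cloud, the
// current frame; each extra past frame grows the cache by one.
void densify_example(
  const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer,
  cudaStream_t & stream)
{
  DensificationParam param("map", /*num_past_frames=*/0);
  PointCloudDensification densification(param, stream);

  if (!densification.enqueuePointCloud(msg, tf_buffer)) {
    return;  // TF lookup to the world frame failed
  }
  for (auto it = densification.getPointCloudCacheIter(); !densification.isCacheEnd(it); ++it) {
    // every cached cloud keeps the transform mapping it back to the world frame
    const Eigen::Affine3f & affine_past2world = it->affine_past2world;
    (void)affine_past2world;
  }
}
}  // namespace autoware::lidar_bevfusion
```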
+ +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PRECOMPUTED_FEATURES_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PRECOMPUTED_FEATURES_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +using Tensor1D = Eigen::Tensor; +using Tensor2D = Eigen::Tensor; +using Tensor3D = Eigen::Tensor; +using Tensor4D = Eigen::Tensor; +using Tensor5D = Eigen::Tensor; +using Tensor6D = Eigen::Tensor; +using Tensor7D = Eigen::Tensor; + +using TensorMap1D = Eigen::TensorMap; +using TensorMap2D = Eigen::TensorMap; +using TensorMap3D = Eigen::TensorMap; +using TensorMap4D = Eigen::TensorMap; +using TensorMap5D = Eigen::TensorMap; +using TensorMap6D = Eigen::TensorMap; +using TensorMap7D = Eigen::TensorMap; + +using Matrix4fRowM = Eigen::Matrix; +using Matrix3fRowM = Eigen::Matrix; + +using Vector3fRowM = Eigen::Matrix; + +Tensor4D create_frustum(const BEVFusionConfig & config); + +Tensor5D get_geometry( + const Tensor4D & frustum, // [D, H, W, 3] + const Tensor3D & camera2lidar_rots, // [N, 3, 3] + const Tensor2D & camera2lidar_trans, // [N, 3] + const Tensor3D & intrinsics_inverse, // [N, 3, 3] + const Tensor3D & post_rots_inverse, // [N, 3, 3] + const Tensor2D & post_trans // [N, 3] +); + +// Define the function +std::tuple< + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix > +bev_pool_aux(const Tensor5D & geom_feats_input, const BEVFusionConfig & config); + +std::tuple< + Eigen::VectorXf, // lidar2image + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix > +precompute_features( + const std::vector & lidar2camera_transforms, + const std::vector & camera_aug_matrices, + const std::vector & camera_info_vector, + const BEVFusionConfig & config); + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PRECOMPUTED_FEATURES_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp new file mode 100644 index 0000000000000..1c3afae5cb585 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp @@ -0,0 +1,76 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" +#include "autoware/lidar_bevfusion/preprocess/point_type.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" + +#include + +#include +#include + +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class PreprocessCuda +{ +public: + PreprocessCuda(const BEVFusionConfig & config, cudaStream_t & stream, bool allocate_buffers); + + cudaError_t generateSweepPoints_launch( + const InputPointType * input_data, std::size_t points_size, float time_lag, + const float * transform, float * output_points); + + std::size_t generateVoxels( + const float * points, unsigned int points_size, float * voxel_features, + std::int32_t * voxel_coords, std::int32_t * num_points_per_voxel); + + cudaError_t resize_and_extract_roi_launch( + const std::uint8_t * input_img, std::uint8_t * output_img, int H, int W, int H2, int W2, int H3, + int W3, int y_start, int x_start, cudaStream_t stream); + +private: + BEVFusionConfig config_; + cudaStream_t stream_; + + tv::Tensor hash_key_value_; + tv::Tensor point_indices_data_; + tv::Tensor points_voxel_id_; +}; +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/voxel_generator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/voxel_generator.hpp new file mode 100644 index 0000000000000..7b90c4e86de06 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/voxel_generator.hpp @@ -0,0 +1,64 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
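The `resize_and_extract_roi_launch` declaration above ties together the image-branch parameters from `bevfusion_ml_package.param.yaml`. A numeric sketch of that geometry (the per-camera crop offset corresponds to the `roi_start_y_vector_` member of the detector; the start row values themselves are not shown in this excerpt):

```cpp
#include <cstdio>

// Sketch: the raw image is scaled by the augmentation factors, then a
// roi_width x roi_height window is cropped from the resized image.
int main()
{
  const int raw_image_width = 1440, raw_image_height = 1080;
  const float img_aug_scale_x = 0.489f, img_aug_scale_y = 0.489f;

  const int resized_width = static_cast<int>(raw_image_width * img_aug_scale_x);    // 704
  const int resized_height = static_cast<int>(raw_image_height * img_aug_scale_y);  // 528

  // roi_width (704) keeps the full resized width; roi_height (384) crops
  // 528 - 384 = 144 rows, with the start row chosen per camera.
  std::printf("resized: %dx%d\n", resized_width, resized_height);
}
```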
+ +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" +#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp" +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" +#include "autoware/lidar_bevfusion/ros_utils.hpp" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class VoxelGenerator +{ +public: + explicit VoxelGenerator( + const DensificationParam & densification_param, const BEVFusionConfig & config, + cudaStream_t & stream); + std::size_t generateSweepPoints(CudaUniquePtr & points_d); + bool enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer); + void initCloudInfo(const sensor_msgs::msg::PointCloud2 & msg); + std::tuple getFieldInfo( + const sensor_msgs::msg::PointCloud2 & msg, const std::string & field_name); + +private: + std::unique_ptr pd_ptr_{nullptr}; + std::unique_ptr pre_ptr_{nullptr}; + BEVFusionConfig config_; + CudaUniquePtr affine_past2current_d_{nullptr}; + std::vector points_; + cudaStream_t stream_; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/ros_utils.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/ros_utils.hpp new file mode 100644 index 0000000000000..62128024a15aa --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/ros_utils.hpp @@ -0,0 +1,67 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
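A hedged sketch of the per-frame flow through the `VoxelGenerator` above: enqueue the new cloud (which densifies it against the cache), then produce the flattened sweep points. It assumes the points buffer is a `CudaUniquePtr<float[]>`, matching the `num_point_feature_size_ = 5` layout (x, y, z, intensity, time lag) in the config:

```cpp
#include "autoware/lidar_bevfusion/preprocess/voxel_generator.hpp"

namespace autoware::lidar_bevfusion
{
// Sketch: returns true when at least one densified point was produced.
bool generate_points(
  VoxelGenerator & voxel_generator, const sensor_msgs::msg::PointCloud2 & msg,
  const tf2_ros::Buffer & tf_buffer, CudaUniquePtr<float[]> & points_d)
{
  if (!voxel_generator.enqueuePointCloud(msg, tf_buffer)) {
    return false;  // e.g. the TF lookup needed for densification failed
  }
  return voxel_generator.generateSweepPoints(points_d) > 0;
}
}  // namespace autoware::lidar_bevfusion
```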
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__ROS_UTILS_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__ROS_UTILS_HPP_
+
+#include "autoware/lidar_bevfusion/preprocess/point_type.hpp"
+#include "autoware/lidar_bevfusion/utils.hpp"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#define CHECK_OFFSET(structure1, structure2, field)                          \
+  static_assert(                                                             \
+    offsetof(structure1, field) == offsetof(structure2, field),              \
+    "Offset of " #field " in " #structure1 " does not match the one in " #structure2 ".")
+#define CHECK_TYPE(structure1, structure2, field)                            \
+  static_assert(                                                             \
+    std::is_same_v<decltype(structure1::field), decltype(structure2::field)>, \
+    "Type of " #field " in " #structure1 " does not match the one in " #structure2 ".")
+#define CHECK_FIELD(structure1, structure2, field) \
+  CHECK_OFFSET(structure1, structure2, field);     \
+  CHECK_TYPE(structure1, structure2, field)
+
+namespace autoware::lidar_bevfusion
+{
+using sensor_msgs::msg::PointField;
+
+CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, x);
+CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, y);
+CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, z);
+CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, intensity);
+static_assert(sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRC));
+
+// TODO(knzo25): will move this from the pointcloud preprocessor to autoware_point_types after
+// this is merged
+bool is_data_layout_compatible_with_point_xyzirc(const sensor_msgs::msg::PointCloud2 & input);
+
+void box3DToDetectedObject(
+  const Box3D & box3d, const std::vector<std::string> & class_names,
+  autoware_perception_msgs::msg::DetectedObject & obj);
+
+uint8_t getSemanticType(const std::string & class_name);
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__ROS_UTILS_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/utils.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/utils.hpp
new file mode 100644
index 0000000000000..e0cbe9a7ebabf
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/utils.hpp
@@ -0,0 +1,57 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
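A toy illustration of the `CHECK_FIELD` pattern above: two structs that agree on a field's offset and type pass, while any mismatch fails at compile time instead of silently corrupting point data at runtime. The structs here are hypothetical:

```cpp
#include "autoware/lidar_bevfusion/ros_utils.hpp"

struct PackedA
{
  float x;
  float y;
};

struct PackedB
{
  float x;
  float y;
};

CHECK_FIELD(PackedA, PackedB, x);  // OK: same offset, same type
CHECK_FIELD(PackedA, PackedB, y);
// Declaring y as double in PackedB would trip CHECK_OFFSET/CHECK_TYPE here.
```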
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__UTILS_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__UTILS_HPP_
+
+#include
+#include
+#include
+#include
+
+namespace autoware::lidar_bevfusion
+{
+
+struct Box3D
+{
+  int label;
+  float score;
+  float x;
+  float y;
+  float z;
+  float width;
+  float length;
+  float height;
+  float yaw;
+  float vx;
+  float vy;
+};
+
+// cspell: ignore divup
+template <typename T1, typename T2>
+unsigned int divup(const T1 a, const T2 b)
+{
+  if (a == 0) {
+    throw std::runtime_error("The dividend of divup must be positive.");
+  }
+  if (b == 0) {
+    throw std::runtime_error("The divisor of divup must be positive.");
+  }
+
+  return (a + b - 1) / b;
+}
+
+}  // namespace autoware::lidar_bevfusion
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__UTILS_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/visibility_control.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/visibility_control.hpp
new file mode 100644
index 0000000000000..96e64e770ded3
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/visibility_control.hpp
@@ -0,0 +1,37 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__LIDAR_BEVFUSION__VISIBILITY_CONTROL_HPP_
+#define AUTOWARE__LIDAR_BEVFUSION__VISIBILITY_CONTROL_HPP_
+
+////////////////////////////////////////////////////////////////////////////////
+#if defined(__WIN32)
+#if defined(LIDAR_BEVFUSION_BUILDING_DLL) || defined(LIDAR_BEVFUSION_EXPORTS)
+#define LIDAR_BEVFUSION_PUBLIC __declspec(dllexport)
+#define LIDAR_BEVFUSION_LOCAL
+#else  // defined(LIDAR_BEVFUSION_BUILDING_DLL) || defined(LIDAR_BEVFUSION_EXPORTS)
+#define LIDAR_BEVFUSION_PUBLIC __declspec(dllimport)
+#define LIDAR_BEVFUSION_LOCAL
+#endif  // defined(LIDAR_BEVFUSION_BUILDING_DLL) || defined(LIDAR_BEVFUSION_EXPORTS)
+#elif defined(__linux__)
+#define LIDAR_BEVFUSION_PUBLIC __attribute__((visibility("default")))
+#define LIDAR_BEVFUSION_LOCAL __attribute__((visibility("hidden")))
+#elif defined(__APPLE__)
+#define LIDAR_BEVFUSION_PUBLIC __attribute__((visibility("default")))
+#define LIDAR_BEVFUSION_LOCAL __attribute__((visibility("hidden")))
+#else
+#error "Unsupported Build Configuration"
+#endif
+
+#endif  // AUTOWARE__LIDAR_BEVFUSION__VISIBILITY_CONTROL_HPP_
diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.hpp
new file mode 100644
index 0000000000000..65e9c2ffb8b4a
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.hpp
@@ -0,0 +1,140 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__GET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__GET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_HPP_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +constexpr char const * const kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_NAME{ + "GetIndicePairsImplicitGemm"}; // cSpell:ignore Indice +constexpr char const * const kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_VERSION{"1"}; +constexpr char const * const kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_NAMESPACE{""}; + +namespace nvinfer1 +{ +namespace plugin +{ + +struct GetIndicesPairsImplicitGemmParameters +{ + std::int32_t batch_size; + std::int32_t algo; + std::int32_t is_train; + std::vector dilation; + std::vector ksize; + std::vector out_padding; + std::vector padding; + std::vector spatial_shape; + std::vector stride; + std::int32_t subm; // cSpell:ignore subm + std::int32_t transpose; + + nvinfer1::Dims dilation_dims; + nvinfer1::Dims ksize_dims; + nvinfer1::Dims out_padding_dims; + nvinfer1::Dims padding_dims; + nvinfer1::Dims spatial_shape_dims; + nvinfer1::Dims stride_dims; +}; + +class GetIndicesPairsImplicitGemmPlugin : public IPluginV3, + public IPluginV3OneCore, + public IPluginV3OneBuild, + public IPluginV3OneRuntime +{ +public: + GetIndicesPairsImplicitGemmPlugin( + const std::string & name, GetIndicesPairsImplicitGemmParameters const & params); + + ~GetIndicesPairsImplicitGemmPlugin() override = default; + + // IPluginV3 Methods + + IPluginCapability * getCapabilityInterface(PluginCapabilityType type) noexcept override; + + IPluginV3 * clone() noexcept override; + + // IPluginV3OneCore Methods + + char const * getPluginName() const noexcept override; + + char const * getPluginVersion() const noexcept override; + + char const * getPluginNamespace() const noexcept override; + + // IPluginV3OneBuild Methods + + std::int32_t getNbOutputs() const noexcept override; + + std::int32_t configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, + DynamicPluginTensorDesc const * out, std::int32_t num_outputs) noexcept override; + + bool supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept override; + + std::int32_t getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept override; + + std::int32_t getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, DimsExprs const * shape_inputs, + std::int32_t num_shape_inputs, DimsExprs * outputs, std::int32_t num_outputs, + IExprBuilder & expr_builder) noexcept override; + + // IPluginV3OneRuntime Methods + + std::int32_t enqueue( + PluginTensorDesc const * input_desc, PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + + std::int32_t onShapeChange( + PluginTensorDesc const * in, std::int32_t num_inputs, PluginTensorDesc const * out, + std::int32_t num_outputs) noexcept override; 
+ + IPluginV3 * attachToContext(IPluginResourceContext * context) noexcept override; + + PluginFieldCollection const * getFieldsToSerialize() noexcept override; + + std::size_t getWorkspaceSize( + DynamicPluginTensorDesc const * inputs, std::int32_t num_inputs, + DynamicPluginTensorDesc const * outputs, std::int32_t num_outputs) const noexcept override; + +private: + void initFieldsToSerialize(); + + // upper bound of number of output indices. needed to bound memory usage. + static constexpr int out_indices_num_limit_{256000}; + + std::string layer_name_; + GetIndicesPairsImplicitGemmParameters params_; + std::vector data_to_serialize_; + nvinfer1::PluginFieldCollection fc_to_serialize_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__GET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.hpp new file mode 100644 index 0000000000000..99d9a6225fda2 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.hpp @@ -0,0 +1,65 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__GET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__GET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ + +#include "autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.hpp" + +#include + +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +// Plugin factory class. 
+class GetIndicesPairsImplicitGemmPluginCreator : public nvinfer1::IPluginCreatorV3One +{ +public: + GetIndicesPairsImplicitGemmPluginCreator(); + + ~GetIndicesPairsImplicitGemmPluginCreator() override = default; + + char const * getPluginNamespace() const noexcept override + { + return kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_NAMESPACE; + } + + char const * getPluginName() const noexcept override + { + return kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_NAME; + } + + char const * getPluginVersion() const noexcept override + { + return kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_VERSION; + } + + nvinfer1::PluginFieldCollection const * getFieldNames() noexcept override; + + IPluginV3 * createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept override; + +private: + nvinfer1::PluginFieldCollection fc_; + std::vector plugin_attributes_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__GET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin.hpp new file mode 100644 index 0000000000000..783c6a8902485 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin.hpp @@ -0,0 +1,139 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
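[Reviewer note] Both plugin classes in this PR inherit the three IPluginV3 "One" capability interfaces and hand them out through getCapabilityInterface(). A minimal sketch of the usual dispatch, assuming TensorRT 10's PluginCapabilityType enum (the PR's actual implementations live in src/tensorrt_plugins/*.cpp and may differ in detail):

  nvinfer1::IPluginCapability * ExamplePlugin::getCapabilityInterface(
    nvinfer1::PluginCapabilityType type) noexcept
  {
    // Each capability is served by one of the "One" base classes of this same object.
    if (type == nvinfer1::PluginCapabilityType::kBUILD) {
      return static_cast<nvinfer1::IPluginV3OneBuild *>(this);
    }
    if (type == nvinfer1::PluginCapabilityType::kRUNTIME) {
      return static_cast<nvinfer1::IPluginV3OneRuntime *>(this);
    }
    return static_cast<nvinfer1::IPluginV3OneCore *>(this);  // kCORE
  }

ExamplePlugin is a stand-in name; the same pattern applies to GetIndicesPairsImplicitGemmPlugin above and ImplicitGemmPlugin below.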
+ +#ifndef AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_HPP_ + +#include +#include +#include +#include // cSpell:ignore spconvlib +#include + +#include +#include +#include +#include +#include +#include +#include + +constexpr char const * const kIMPLICIT_GEMM_PLUGIN_NAME{"ImplicitGemm"}; +constexpr char const * const kIMPLICIT_GEMM_PLUGIN_VERSION{"1"}; +constexpr char const * const kIMPLICIT_GEMM_PLUGIN_NAMESPACE{""}; + +namespace nvinfer1 +{ +namespace plugin +{ + +struct ImplicitGemmParameters +{ + float act_alpha; + float act_beta; + std::int64_t is_subm; // cSpell:ignore subm + std::int64_t is_train; + float output_add_scale; + float output_scale; +}; + +class ImplicitGemmPlugin : public IPluginV3, + public IPluginV3OneCore, + public IPluginV3OneBuild, + public IPluginV3OneRuntime +{ +public: + using ConvTunerSimple = spconvlib::spconv::csrc::sparse::convops::spops::ConvTuner; + ImplicitGemmPlugin(const std::string & name, ImplicitGemmParameters const & params); + + ~ImplicitGemmPlugin() override = default; + + // IPluginV3 Methods + + IPluginCapability * getCapabilityInterface(PluginCapabilityType type) noexcept override; + + IPluginV3 * clone() noexcept override; + + // IPluginV3OneCore Methods + + char const * getPluginName() const noexcept override; + + char const * getPluginVersion() const noexcept override; + + char const * getPluginNamespace() const noexcept override; + + // IPluginV3OneBuild Methods + + std::int32_t getNbOutputs() const noexcept override; + + std::int32_t configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, + DynamicPluginTensorDesc const * out, std::int32_t num_outputs) noexcept override; + + bool supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept override; + + std::int32_t getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept override; + + std::int32_t getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, DimsExprs const * shape_inputs, + std::int32_t num_shape_inputs, DimsExprs * outputs, std::int32_t num_outputs, + IExprBuilder & expr_builder) noexcept override; + + // IPluginV3OneRuntime Methods + + std::int32_t enqueue( + PluginTensorDesc const * input_desc, PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + + std::int32_t onShapeChange( + PluginTensorDesc const * in, std::int32_t num_inputs, PluginTensorDesc const * out, + std::int32_t num_outputs) noexcept override; + + IPluginV3 * attachToContext(IPluginResourceContext * context) noexcept override; + + PluginFieldCollection const * getFieldsToSerialize() noexcept override; + + std::size_t getWorkspaceSize( + DynamicPluginTensorDesc const * inputs, std::int32_t num_inputs, + DynamicPluginTensorDesc const * outputs, std::int32_t num_outputs) const noexcept override; + +private: + static constexpr std::int32_t INOUT_IN_FEATURES_INDEX = 0; + static constexpr std::int32_t INOUT_FILTERS_INDEX = 1; + static constexpr std::int32_t INOUT_PAIR_FWD_INDEX = 2; + static constexpr std::int32_t INOUT_PAIR_MASK_FWD_SPLITS_INDEX = 3; + static constexpr std::int32_t INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX = 4; + static constexpr std::int32_t INOUT_OUT_FEATURES_INDEX = 5; + + void initFieldsToSerialize(); + + std::string 
layer_name_; + ImplicitGemmParameters params_; + std::tuple arch_; + std::vector data_to_serialize_; + nvinfer1::PluginFieldCollection fc_to_serialize_; + + std::unique_ptr tuner_fp32_ptr_{}; + std::unique_ptr tuner_fp16_ptr_{}; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp new file mode 100644 index 0000000000000..c6450b2e9b1ab --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp @@ -0,0 +1,59 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ + +#include "autoware/tensorrt_plugins/implicit_gemm_plugin.hpp" + +#include + +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +// Plugin factory class. +class ImplicitGemmPluginCreator : public nvinfer1::IPluginCreatorV3One +{ +public: + ImplicitGemmPluginCreator(); + + ~ImplicitGemmPluginCreator() override = default; + + char const * getPluginNamespace() const noexcept override + { + return kIMPLICIT_GEMM_PLUGIN_NAMESPACE; + } + + char const * getPluginName() const noexcept override { return kIMPLICIT_GEMM_PLUGIN_NAME; } + + char const * getPluginVersion() const noexcept override { return kIMPLICIT_GEMM_PLUGIN_VERSION; } + + nvinfer1::PluginFieldCollection const * getFieldNames() noexcept override; + + IPluginV3 * createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept override; + +private: + nvinfer1::PluginFieldCollection fc_; + std::vector plugin_attributes_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_registration.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_registration.hpp new file mode 100644 index 0000000000000..4ce6e28a205aa --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_registration.hpp @@ -0,0 +1,29 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__PLUGIN_REGISTRATION_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__PLUGIN_REGISTRATION_HPP_ + +#include + +#include + +// These are the functions that TensorRT library will call at the runtime. + +extern "C" void setLoggerFinder(nvinfer1::ILoggerFinder * finder); + +extern "C" nvinfer1::IPluginCreatorInterface * const * getPluginCreators( + std::int32_t & num_creators); + +#endif // AUTOWARE__TENSORRT_PLUGINS__PLUGIN_REGISTRATION_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_utils.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_utils.hpp new file mode 100644 index 0000000000000..c368f796b714d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_utils.hpp @@ -0,0 +1,35 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__PLUGIN_UTILS_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__PLUGIN_UTILS_HPP_ + +#include + +#include +#include + +void caughtError(std::exception const & e); + +void logDebug(char const * msg); + +void logInfo(char const * msg); + +#define PLUGIN_ASSERT(val) reportAssertion((val), #val, __FILE__, __LINE__) +void reportAssertion(bool success, char const * msg, char const * file, std::int32_t line); + +#define PLUGIN_VALIDATE(val) reportValidation((val), #val, __FILE__, __LINE__) +void reportValidation(bool success, char const * msg, char const * file, std::int32_t line); + +#endif // AUTOWARE__TENSORRT_PLUGINS__PLUGIN_UTILS_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp new file mode 100644 index 0000000000000..e488b677ce90f --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp @@ -0,0 +1,123 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
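[Reviewer note] plugin_registration.hpp above declares the two C entry points TensorRT looks up when it opens a plugin shared library. A sketch of how a consumer could load the resulting library at runtime, assuming a TensorRT version that provides IPluginRegistry::loadLibrary (8.6 or newer); the literal file name is hypothetical, and the node itself forwards config_.plugins_path_ through TrtCommon instead:

  #include <NvInferRuntime.h>

  void loadBevfusionPlugins()
  {
    // loadLibrary() opens the shared object and invokes its exported
    // setLoggerFinder() and getPluginCreators() functions, registering
    // every returned creator with the global registry.
    auto * registry = getPluginRegistry();
    registry->loadLibrary("libautoware_tensorrt_plugins.so");
  }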
+ +#ifndef AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_HPP_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +constexpr char const * const kQUICK_CUMSUM_CUDA_PLUGIN_NAME{"QuickCumsumCuda"}; +constexpr char const * const kQUICK_CUMSUM_CUDA_PLUGIN_VERSION{"1"}; +constexpr char const * const kQUICK_CUMSUM_CUDA_PLUGIN_NAMESPACE{""}; + +namespace nvinfer1 +{ +namespace plugin +{ + +struct QuickCumsumCudaParameters +{ + std::int32_t batch_size; + std::int32_t dimension; + std::int32_t height; + std::int32_t width; +}; + +class QuickCumsumCudaPlugin : public IPluginV3, + public IPluginV3OneCore, + public IPluginV3OneBuild, + public IPluginV3OneRuntime +{ +public: + QuickCumsumCudaPlugin(const std::string & name, QuickCumsumCudaParameters const & params); + + ~QuickCumsumCudaPlugin() override = default; + + // IPluginV3 Methods + + IPluginCapability * getCapabilityInterface(PluginCapabilityType type) noexcept override; + + IPluginV3 * clone() noexcept override; + + // IPluginV3OneCore Methods + + char const * getPluginName() const noexcept override; + + char const * getPluginVersion() const noexcept override; + + char const * getPluginNamespace() const noexcept override; + + // IPluginV3OneBuild Methods + + std::int32_t getNbOutputs() const noexcept override; + + std::int32_t configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, + DynamicPluginTensorDesc const * out, std::int32_t num_outputs) noexcept override; + + bool supportsFormatCombination( + int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept override; + + std::int32_t getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept override; + + std::int32_t getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, DimsExprs const * shape_inputs, + std::int32_t num_shape_inputs, DimsExprs * outputs, std::int32_t num_outputs, + IExprBuilder & expr_builder) noexcept override; + + // IPluginV3OneRuntime Methods + + std::int32_t enqueue( + PluginTensorDesc const * input_desc, PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + + std::int32_t onShapeChange( + PluginTensorDesc const * in, std::int32_t num_inputs, PluginTensorDesc const * out, + std::int32_t num_outputs) noexcept override; + + IPluginV3 * attachToContext(IPluginResourceContext * context) noexcept override; + + PluginFieldCollection const * getFieldsToSerialize() noexcept override; + + std::size_t getWorkspaceSize( + DynamicPluginTensorDesc const * inputs, std::int32_t num_inputs, + DynamicPluginTensorDesc const * outputs, std::int32_t num_outputs) const noexcept override; + +private: + // TensorRT plugin parameters. 
+ std::string layer_name_; + QuickCumsumCudaParameters params_; + + void initFieldsToSerialize(); + + std::vector data_to_serialize_; + nvinfer1::PluginFieldCollection fc_to_serialize_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp new file mode 100644 index 0000000000000..c7ddc4e6f9f0d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp @@ -0,0 +1,62 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_CREATOR_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_CREATOR_HPP_ + +#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp" + +#include + +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +// Plugin factory class. +class QuickCumsumCudaPluginCreator : public nvinfer1::IPluginCreatorV3One +{ +public: + QuickCumsumCudaPluginCreator(); + + ~QuickCumsumCudaPluginCreator() override = default; + + char const * getPluginNamespace() const noexcept override + { + return kQUICK_CUMSUM_CUDA_PLUGIN_NAMESPACE; + } + + char const * getPluginName() const noexcept override { return kQUICK_CUMSUM_CUDA_PLUGIN_NAME; } + + char const * getPluginVersion() const noexcept override + { + return kQUICK_CUMSUM_CUDA_PLUGIN_VERSION; + } + + nvinfer1::PluginFieldCollection const * getFieldNames() noexcept override; + + IPluginV3 * createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept override; + +private: + nvinfer1::PluginFieldCollection fc_; + std::vector plugin_attributes_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_CREATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/launch/lidar_bevfusion.launch.xml b/perception/autoware_lidar_bevfusion/launch/lidar_bevfusion.launch.xml new file mode 100644 index 0000000000000..a2cecb2f72f62 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/launch/lidar_bevfusion.launch.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_lidar_bevfusion/lib/bevfusion_trt.cpp b/perception/autoware_lidar_bevfusion/lib/bevfusion_trt.cpp new file mode 100644 index 0000000000000..a7a00771d6b71 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/bevfusion_trt.cpp @@ -0,0 +1,532 @@ +// Copyright 2025 TIER IV, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/lidar_bevfusion/bevfusion_trt.hpp" + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" +#include "autoware/lidar_bevfusion/preprocess/point_type.hpp" +#include "autoware/lidar_bevfusion/preprocess/precomputed_features.hpp" +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +BEVFusionTRT::BEVFusionTRT( + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, const BEVFusionConfig & config) +: config_(config) +{ + vg_ptr_ = std::make_unique(densification_param, config_, stream_); + + stop_watch_ptr_ = + std::make_unique>(); + stop_watch_ptr_->tic("processing/inner"); + + initPtr(); + initTrt(trt_config); + + CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); + + camera_streams_.resize(config_.num_cameras_); + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + CHECK_CUDA_ERROR(cudaStreamCreate(&camera_streams_[i])); + } +} + +BEVFusionTRT::~BEVFusionTRT() +{ + if (stream_) { + cudaStreamSynchronize(stream_); + cudaStreamDestroy(stream_); + } + + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + if (camera_streams_[i]) { + cudaStreamSynchronize(camera_streams_[i]); + cudaStreamDestroy(camera_streams_[i]); + } + } +} + +void BEVFusionTRT::initPtr() +{ + // point cloud to voxels + voxel_features_size_ = + config_.max_num_voxels_ * config_.max_points_per_voxel_ * config_.num_point_feature_size_; + voxel_coords_size_ = 3 * config_.max_num_voxels_; + + // output of TRT -- input of post-process + bbox_pred_size_ = config_.num_proposals_ * config_.num_box_values_; + label_pred_output_d_ = autoware::cuda_utils::make_unique(config_.num_proposals_); + bbox_pred_output_d_ = autoware::cuda_utils::make_unique(bbox_pred_size_); + score_output_d_ = autoware::cuda_utils::make_unique(config_.num_proposals_); + + // lidar branch + voxel_features_d_ = autoware::cuda_utils::make_unique(voxel_features_size_); + voxel_coords_d_ = autoware::cuda_utils::make_unique(voxel_coords_size_); + num_points_per_voxel_d_ = + autoware::cuda_utils::make_unique(config_.max_num_voxels_); + points_d_ = autoware::cuda_utils::make_unique( + config_.cloud_capacity_ * config_.num_point_feature_size_); + + // pre computed tensors + if (config_.sensor_fusion_) { + lidar2image_d_ = autoware::cuda_utils::make_unique(config_.num_cameras_ * 4 * 4); + std::int64_t num_geom_feats = config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * config_.num_depth_features_; + geom_feats_d_ = autoware::cuda_utils::make_unique(4 * num_geom_feats); + kept_d_ = autoware::cuda_utils::make_unique(num_geom_feats); + ranks_d_ = autoware::cuda_utils::make_unique(num_geom_feats); + indices_d_ = autoware::cuda_utils::make_unique(num_geom_feats); + + // image branch + roi_tensor_d_ = 
autoware::cuda_utils::make_unique( + config_.num_cameras_ * config_.roi_height_ * config_.roi_width_ * 3); + for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { + image_buffers_d_.emplace_back(autoware::cuda_utils::make_unique( + config_.raw_image_height_ * config_.raw_image_width_ * 3)); + } + camera_masks_d_ = autoware::cuda_utils::make_unique(config_.num_cameras_); + } + + pre_ptr_ = std::make_unique(config_, stream_, true); + post_ptr_ = std::make_unique(config_, stream_); +} + +void BEVFusionTRT::initTrt(const tensorrt_common::TrtCommonConfig & trt_config) +{ + std::vector network_io; + + // Lidar branch + network_io.emplace_back( + "voxels", + nvinfer1::Dims{3, {-1, config_.max_points_per_voxel_, config_.num_point_feature_size_}}); + network_io.emplace_back("num_points_per_voxel", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back("coors", nvinfer1::Dims{2, {-1, 3}}); + + // Camera branch + if (config_.sensor_fusion_) { + network_io.emplace_back("points", nvinfer1::Dims{2, {-1, 5}}); + network_io.emplace_back("camera_mask", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back( + "imgs", nvinfer1::Dims{4, {-1, 3, config_.roi_height_, config_.roi_width_}}); + network_io.emplace_back("lidar2image", nvinfer1::Dims{3, {-1, 4, 4}}); // 4x4 matrix + + network_io.emplace_back("geom_feats", nvinfer1::Dims{2, {-1, 4}}); + network_io.emplace_back("kept", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back("ranks", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back("indices", nvinfer1::Dims{1, {-1}}); + } + + // Outputs + network_io.emplace_back( + "bbox_pred", nvinfer1::Dims{2, {config_.num_box_values_, config_.num_proposals_}}); + network_io.emplace_back("score", nvinfer1::Dims{1, {config_.num_proposals_}}); + network_io.emplace_back("label_pred", nvinfer1::Dims{1, {config_.num_proposals_}}); + + std::vector profile_dims; + + // Lidar branch + profile_dims.emplace_back( + "voxels", + nvinfer1::Dims{ + 3, {config_.voxels_num_[0], config_.max_points_per_voxel_, config_.num_point_feature_size_}}, + nvinfer1::Dims{ + 3, {config_.voxels_num_[1], config_.max_points_per_voxel_, config_.num_point_feature_size_}}, + nvinfer1::Dims{ + 3, {config_.voxels_num_[2], config_.max_points_per_voxel_, config_.num_point_feature_size_}}); + + profile_dims.emplace_back( + "num_points_per_voxel", nvinfer1::Dims{1, {config_.voxels_num_[0]}}, + nvinfer1::Dims{1, {config_.voxels_num_[1]}}, nvinfer1::Dims{1, {config_.voxels_num_[2]}}); + + profile_dims.emplace_back( + "coors", nvinfer1::Dims{2, {config_.voxels_num_[0], 3}}, + nvinfer1::Dims{2, {config_.voxels_num_[1], 3}}, nvinfer1::Dims{2, {config_.voxels_num_[2], 3}}); + + // Camera branch + if (config_.sensor_fusion_) { + profile_dims.emplace_back( + "points", nvinfer1::Dims{2, {config_.voxels_num_[0], 5}}, + nvinfer1::Dims{2, {config_.voxels_num_[1], 5}}, + nvinfer1::Dims{2, {config_.voxels_num_[2], 5}}); + + profile_dims.emplace_back( + "camera_mask", nvinfer1::Dims{1, {1}}, nvinfer1::Dims{1, {config_.num_cameras_}}, + nvinfer1::Dims{1, {config_.num_cameras_}}); + + profile_dims.emplace_back( + "imgs", nvinfer1::Dims{4, {1, 3, config_.roi_height_, config_.roi_width_}}, + nvinfer1::Dims{4, {config_.num_cameras_, 3, config_.roi_height_, config_.roi_width_}}, + nvinfer1::Dims{4, {config_.num_cameras_, 3, config_.roi_height_, config_.roi_width_}}); + profile_dims.emplace_back( + "lidar2image", nvinfer1::Dims{3, {1, 4, 4}}, nvinfer1::Dims{3, {config_.num_cameras_, 4, 4}}, + nvinfer1::Dims{3, {config_.num_cameras_, 4, 4}}); + + const 
std::int64_t num_geom_feats = config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * config_.num_depth_features_; + + profile_dims.emplace_back( + "geom_feats", nvinfer1::Dims{2, {0, 4}}, nvinfer1::Dims{2, {num_geom_feats, 4}}, + nvinfer1::Dims{2, {num_geom_feats, 4}}); + + profile_dims.emplace_back( + "kept", nvinfer1::Dims{1, {0}}, nvinfer1::Dims{1, {num_geom_feats}}, + nvinfer1::Dims{1, {num_geom_feats}}); + + profile_dims.emplace_back( + "ranks", nvinfer1::Dims{1, {0}}, nvinfer1::Dims{1, {num_geom_feats}}, + nvinfer1::Dims{1, {num_geom_feats}}); + + profile_dims.emplace_back( + "indices", nvinfer1::Dims{1, {0}}, nvinfer1::Dims{1, {num_geom_feats}}, + nvinfer1::Dims{1, {num_geom_feats}}); + } + + auto network_io_ptr = + std::make_unique>(network_io); + auto profile_dims_ptr = + std::make_unique>(profile_dims); + + network_trt_ptr_ = std::make_unique( + trt_config, std::make_shared(), + std::vector{config_.plugins_path_}); + + if (!network_trt_ptr_->setup(std::move(profile_dims_ptr), std::move(network_io_ptr))) { + throw std::runtime_error("Failed to setup TRT engine." + config_.plugins_path_); + } + + if (config_.sensor_fusion_) { + /* nvinfer1::Dims input_imgs_shape; + input_imgs_shape.nbDims = 4; + input_imgs_shape.d[0] = config_.num_cameras_; + input_imgs_shape.d[1] = 3; + input_imgs_shape.d[2] = config_.roi_height_; + input_imgs_shape.d[3] = config_.roi_width_; + + nvinfer1::Dims input_lidar2image_shape; + input_lidar2image_shape.nbDims = 3; + input_lidar2image_shape.d[0] = config_.num_cameras_; + input_lidar2image_shape.d[1] = 4; + input_lidar2image_shape.d[2] = 4; + + nvinfer1::Dims input_geom_feats_shape; + input_geom_feats_shape.nbDims = 2; + input_geom_feats_shape.d[0] = num_ranks_; + input_geom_feats_shape.d[1] = 4; + + nvinfer1::Dims input_kept_shape; + input_kept_shape.nbDims = 1; + input_kept_shape.d[0] = num_kept_; + + nvinfer1::Dims input_ranks_shape; + input_ranks_shape.nbDims = 1; + input_ranks_shape.d[0] = num_ranks_; + + nvinfer1::Dims input_indices_shape; + input_indices_shape.nbDims = 1; + input_indices_shape.d[0] = num_indices_; */ + + std::vector lidar2image_host(config_.num_cameras_ * 4 * 4); + cudaMemcpy( + lidar2image_host.data(), lidar2image_d_.get(), config_.num_cameras_ * 4 * 4 * sizeof(float), + cudaMemcpyDeviceToHost); + + network_trt_ptr_->setInputShape("camera_mask", nvinfer1::Dims{1, {config_.num_cameras_}}); + network_trt_ptr_->setInputShape( + "imgs", + nvinfer1::Dims{4, {config_.num_cameras_, 3, config_.roi_height_, config_.roi_width_}}); + network_trt_ptr_->setInputShape("lidar2image", nvinfer1::Dims{3, {config_.num_cameras_, 4, 4}}); + network_trt_ptr_->setInputShape("geom_feats", nvinfer1::Dims{2, {num_ranks_, 4}}); + network_trt_ptr_->setInputShape("kept", nvinfer1::Dims{1, {num_kept_}}); + network_trt_ptr_->setInputShape("ranks", nvinfer1::Dims{1, {num_ranks_}}); + network_trt_ptr_->setInputShape("indices", nvinfer1::Dims{1, {num_indices_}}); + } + + network_trt_ptr_->setTensorAddress("voxels", voxel_features_d_.get()); + network_trt_ptr_->setTensorAddress("num_points_per_voxel", num_points_per_voxel_d_.get()); + network_trt_ptr_->setTensorAddress("coors", voxel_coords_d_.get()); + + if (config_.sensor_fusion_) { + network_trt_ptr_->setTensorAddress("points", points_d_.get()); + network_trt_ptr_->setTensorAddress("camera_mask", camera_masks_d_.get()); + network_trt_ptr_->setTensorAddress("imgs", roi_tensor_d_.get()); + network_trt_ptr_->setTensorAddress("lidar2image", lidar2image_d_.get()); + 
network_trt_ptr_->setTensorAddress("geom_feats", geom_feats_d_.get()); + network_trt_ptr_->setTensorAddress("kept", kept_d_.get()); + network_trt_ptr_->setTensorAddress("ranks", ranks_d_.get()); + network_trt_ptr_->setTensorAddress("indices", indices_d_.get()); + } + + network_trt_ptr_->setTensorAddress("label_pred", label_pred_output_d_.get()); + network_trt_ptr_->setTensorAddress("bbox_pred", bbox_pred_output_d_.get()); + network_trt_ptr_->setTensorAddress("score", score_output_d_.get()); +} + +bool BEVFusionTRT::detect( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & pc_msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, + std::vector & det_boxes3d, std::unordered_map & proc_timing) +{ + stop_watch_ptr_->toc("processing/inner", true); + if (!preProcess(pc_msg, image_msgs, camera_masks, tf_buffer)) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Pre-process failed. Skipping detection."); + return false; + } + proc_timing.emplace( + "debug/processing_time/preprocess_ms", stop_watch_ptr_->toc("processing/inner", true)); + + if (!inference()) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Inference failed. Skipping detection."); + return false; + } + proc_timing.emplace( + "debug/processing_time/inference_ms", stop_watch_ptr_->toc("processing/inner", true)); + + if (!postProcess(det_boxes3d)) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Post-process failed. Skipping detection"); + return false; + } + proc_timing.emplace( + "debug/processing_time/postprocess_ms", stop_watch_ptr_->toc("processing/inner", true)); + + return true; +} + +void BEVFusionTRT::setIntrinsicsExtrinsics( + std::vector & camera_info_vector, + std::vector & lidar2camera_vector) +{ + roi_start_y_vector_.clear(); + std::vector img_aug_matrices; + + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + float fx = camera_info_vector[i].p[0]; + float fy = camera_info_vector[i].p[5]; + float cx = camera_info_vector[i].p[2]; + float cy = camera_info_vector[i].p[6]; + + Matrix4fRowM camera2lidar_matrix = lidar2camera_vector[i].inverse(); + float r31 = camera2lidar_matrix(2, 0); + float r32 = camera2lidar_matrix(2, 1); + float r33 = camera2lidar_matrix(2, 2); + + float yl = cy + cx * (fy / fx) * (r31 / r32) - fy * (r33 / r32); + float yr = + cy + (cx - config_.raw_image_width_ + 1) * (fy / fx) * (r31 / r32) - fy * (r33 / r32); + float yh = std::max(0.0f, std::min(yr, yl)); + float yh_resized = yh * config_.img_aug_scale_y_; + int crop_h = static_cast( + std::min(yh_resized, static_cast(config_.resized_height_ - config_.roi_height_))); + + Matrix4fRowM img_aug_matrix = Matrix4fRowM::Identity(); + img_aug_matrix(0, 0) = config_.img_aug_scale_x_; + img_aug_matrix(1, 1) = config_.img_aug_scale_y_; + img_aug_matrix(1, 3) = -static_cast(crop_h); + + img_aug_matrices.push_back(img_aug_matrix); + roi_start_y_vector_.push_back(crop_h); + } + + auto [lidar2images_flattened, geom_feats, kept, ranks, indices] = + precompute_features(lidar2camera_vector, img_aug_matrices, camera_info_vector, config_); + + assert(static_cast(lidar2images_flattened.size()) == config_.num_cameras_ * 4 * 4); + + assert( + static_cast(geom_feats.size()) <= + config_.num_cameras_ * 4 * config_.features_height_ * config_.features_width_ * + config_.num_depth_features_); + assert( + static_cast(kept.size()) == config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * + config_.num_depth_features_); + assert( + 
static_cast(ranks.size()) <= config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * + config_.num_depth_features_); + assert( + static_cast(indices.size()) <= config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * + config_.num_depth_features_); + + num_geom_feats_ = static_cast(geom_feats.size()); + num_kept_ = static_cast(kept.size()); + num_ranks_ = static_cast(ranks.size()); + num_indices_ = static_cast(indices.size()); + + assert(num_geom_feats_ == 4 * num_ranks_); + assert(num_ranks_ == num_indices_); + + cudaMemcpy( + lidar2image_d_.get(), lidar2images_flattened.data(), + config_.num_cameras_ * 4 * 4 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy( + geom_feats_d_.get(), geom_feats.data(), num_geom_feats_ * sizeof(std::int32_t), + cudaMemcpyHostToDevice); + cudaMemcpy(kept_d_.get(), kept.data(), num_kept_ * sizeof(std::uint8_t), cudaMemcpyHostToDevice); + cudaMemcpy( + ranks_d_.get(), ranks.data(), num_ranks_ * sizeof(std::int64_t), cudaMemcpyHostToDevice); + cudaMemcpy( + indices_d_.get(), indices.data(), num_indices_ * sizeof(std::int64_t), cudaMemcpyHostToDevice); +} + +bool BEVFusionTRT::preProcess( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & pc_msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer) +{ + using autoware::cuda_utils::clear_async; + + if (!is_data_layout_compatible_with_point_xyzirc(*pc_msg)) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Invalid point type. Skipping detection."); + return false; + } + + if (!vg_ptr_->enqueuePointCloud(*pc_msg, tf_buffer)) { + return false; + } + + // TODO(knzo25): these should be able to be removed as they are filled by TensorRT + clear_async(label_pred_output_d_.get(), config_.num_proposals_, stream_); + clear_async(bbox_pred_output_d_.get(), bbox_pred_size_, stream_); + clear_async(score_output_d_.get(), config_.num_proposals_, stream_); + + clear_async(voxel_features_d_.get(), voxel_features_size_, stream_); + clear_async(voxel_coords_d_.get(), voxel_coords_size_, stream_); + clear_async(num_points_per_voxel_d_.get(), config_.max_num_voxels_, stream_); + clear_async(points_d_.get(), config_.cloud_capacity_ * config_.num_point_feature_size_, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + // TODO(knzo25): move this to each image callback + if (config_.sensor_fusion_) { + int start_x = + std::max(0, static_cast(config_.resized_width_) - static_cast(config_.roi_width_)) / + 2; + + for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { + int start_y = roi_start_y_vector_[camera_id]; + cudaMemcpyAsync( + image_buffers_d_[camera_id].get(), image_msgs[camera_id]->data.data(), + config_.raw_image_height_ * config_.raw_image_width_ * 3, cudaMemcpyHostToDevice, stream_); + + pre_ptr_->resize_and_extract_roi_launch( + image_buffers_d_[camera_id].get(), + &roi_tensor_d_[camera_id * config_.roi_height_ * config_.roi_width_ * 3], + config_.raw_image_height_, config_.raw_image_width_, config_.resized_height_, + config_.resized_width_, config_.roi_height_, config_.roi_width_, start_y, start_x, + camera_streams_[camera_id]); + } + + cudaMemcpyAsync( + camera_masks_d_.get(), camera_masks.data(), config_.num_cameras_ * sizeof(float), + cudaMemcpyHostToDevice, stream_); + } + + const auto num_points = vg_ptr_->generateSweepPoints(points_d_); + + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + auto num_voxels = static_cast(pre_ptr_->generateVoxels( + 
points_d_.get(), num_points, voxel_features_d_.get(), voxel_coords_d_.get(), + num_points_per_voxel_d_.get())); + + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + if (num_voxels < config_.min_num_voxels_) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), + "Too few voxels (" << num_voxels << ") for the actual optimization profile (" + << config_.min_num_voxels_ << ")"); + return false; + } + if (num_voxels > config_.max_num_voxels_) { + RCLCPP_ERROR_STREAM( + rclcpp::get_logger("lidar_bevfusion"), + "Actual number of voxels (" << num_voxels + << ") is over the limit for the actual optimization profile (" + << config_.max_num_voxels_ << "). Clipping to the limit."); + num_voxels = config_.max_num_voxels_; + } + + network_trt_ptr_->setInputShape( + "voxels", nvinfer1::Dims{ + 3, {num_voxels, config_.max_points_per_voxel_, config_.num_point_feature_size_}}); + network_trt_ptr_->setInputShape("num_points_per_voxel", nvinfer1::Dims{1, {num_voxels}}); + network_trt_ptr_->setInputShape("coors", nvinfer1::Dims{2, {num_voxels, 3}}); + + if (config_.sensor_fusion_) { + network_trt_ptr_->setInputShape( + "points", nvinfer1::Dims{2, {static_cast(num_points), 5}}); + network_trt_ptr_->setInputShape("geom_feats", nvinfer1::Dims{2, {num_ranks_, 4}}); + network_trt_ptr_->setInputShape("kept", nvinfer1::Dims{1, {num_kept_}}); + network_trt_ptr_->setInputShape("ranks", nvinfer1::Dims{1, {num_ranks_}}); + network_trt_ptr_->setInputShape("indices", nvinfer1::Dims{1, {num_indices_}}); + } + + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + cudaStreamSynchronize(camera_streams_[i]); + } + + return true; +} + +bool BEVFusionTRT::inference() +{ + auto status = network_trt_ptr_->enqueueV3(stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + if (!status) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Fail to enqueue and skip to detect."); + return false; + } + + return true; +} + +bool BEVFusionTRT::postProcess(std::vector & det_boxes3d) +{ + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + CHECK_CUDA_ERROR(post_ptr_->generateDetectedBoxes3D_launch( + label_pred_output_d_.get(), bbox_pred_output_d_.get(), score_output_d_.get(), det_boxes3d, + stream_)); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + return true; +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/detection_class_remapper.cpp b/perception/autoware_lidar_bevfusion/lib/detection_class_remapper.cpp new file mode 100644 index 0000000000000..e16aa0f0177b0 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/detection_class_remapper.cpp @@ -0,0 +1,74 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
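[Reviewer note] For the setParameters() implementation just below: the three matrices arrive flattened in row-major order, with entry (i, j) meaning "label i may be remapped to label j when the object's BEV area lies in [min_area(i, j), max_area(i, j)]". A hypothetical two-label example (values invented for illustration, not taken from this PR):

  std::vector<std::int64_t> allow_remapping = {
    0, 1,   // row 0: label 0 may become label 1
    0, 0};  // row 1: label 1 is never remapped
  std::vector<double> min_area = {0.0, 10.0, 0.0, 0.0};
  std::vector<double> max_area = {0.0, 40.0, 0.0, 0.0};
  // remapper.setParameters(allow_remapping, min_area, max_area);

The transpose after each Eigen::Map exists because Eigen is column-major by default while the inputs are serialized row-major.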
+
+#include <autoware/lidar_bevfusion/detection_class_remapper.hpp>
+
+#include <cassert>
+#include <cmath>
+#include <limits>
+namespace autoware::lidar_bevfusion
+{
+
+void DetectionClassRemapper::setParameters(
+  const std::vector<std::int64_t> & allow_remapping_by_area_matrix,
+  const std::vector<double> & min_area_matrix, const std::vector<double> & max_area_matrix)
+{
+  assert(allow_remapping_by_area_matrix.size() == min_area_matrix.size());
+  assert(allow_remapping_by_area_matrix.size() == max_area_matrix.size());
+  assert(
+    std::pow(static_cast<int>(std::sqrt(min_area_matrix.size())), 2) == min_area_matrix.size());
+
+  num_labels_ = static_cast<int>(std::sqrt(min_area_matrix.size()));
+
+  Eigen::Map<const Eigen::Matrix<std::int64_t, Eigen::Dynamic, Eigen::Dynamic>>
+    allow_remapping_by_area_matrix_tmp(
+      allow_remapping_by_area_matrix.data(), num_labels_, num_labels_);
+  allow_remapping_by_area_matrix_ = allow_remapping_by_area_matrix_tmp.transpose()
+                                      .cast<bool>();  // Eigen is column major by default
+
+  Eigen::Map<const Eigen::MatrixXd> min_area_matrix_tmp(
+    min_area_matrix.data(), num_labels_, num_labels_);
+  min_area_matrix_ = min_area_matrix_tmp.transpose();  // Eigen is column major by default
+
+  Eigen::Map<const Eigen::MatrixXd> max_area_matrix_tmp(
+    max_area_matrix.data(), num_labels_, num_labels_);
+  max_area_matrix_ = max_area_matrix_tmp.transpose();  // Eigen is column major by default
+
+  min_area_matrix_ = min_area_matrix_.unaryExpr(
+    [](double v) { return std::isfinite(v) ? v : std::numeric_limits<double>::max(); });
+  max_area_matrix_ = max_area_matrix_.unaryExpr(
+    [](double v) { return std::isfinite(v) ? v : std::numeric_limits<double>::max(); });
+}
+
+void DetectionClassRemapper::mapClasses(autoware_perception_msgs::msg::DetectedObjects & msg)
+{
+  for (auto & object : msg.objects) {
+    const float bev_area = object.shape.dimensions.x * object.shape.dimensions.y;
+
+    for (auto & classification : object.classification) {
+      auto & label = classification.label;
+
+      for (int i = 0; i < num_labels_; ++i) {
+        if (
+          allow_remapping_by_area_matrix_(label, i) && bev_area >= min_area_matrix_(label, i) &&
+          bev_area <= max_area_matrix_(label, i)) {
+          label = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace autoware::lidar_bevfusion
diff --git a/perception/autoware_lidar_bevfusion/lib/postprocess/circle_nms_kernel.cu b/perception/autoware_lidar_bevfusion/lib/postprocess/circle_nms_kernel.cu
new file mode 100644
index 0000000000000..d30bc06dfbe3d
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/lib/postprocess/circle_nms_kernel.cu
@@ -0,0 +1,147 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Modified from
+// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu
+
+/*
+3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
+Written by Shaoshuai Shi
+All Rights Reserved 2019-2020.
+*/ + +#include "autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" + +#include + +#include + +#include + +namespace +{ +const std::size_t THREADS_PER_BLOCK_NMS = 16; +} // namespace + +namespace autoware::lidar_bevfusion +{ + +__device__ inline float dist2dPow(const Box3D * a, const Box3D * b) +{ + return powf(a->x - b->x, 2) + powf(a->y - b->y, 2); +} + +// cspell: ignore divup +__global__ void circleNMS_Kernel( + const Box3D * __restrict__ boxes, const std::size_t num_boxes3d, const std::size_t col_blocks, + const float dist2d_pow_threshold, std::uint64_t * __restrict__ mask) +{ + // params: boxes (N,) + // params: mask (N, divup(N/THREADS_PER_BLOCK_NMS)) + + const auto row_start = blockIdx.y; + const auto col_start = blockIdx.x; + + if (row_start > col_start) return; + + const std::size_t row_size = + fminf(num_boxes3d - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const std::size_t col_size = + fminf(num_boxes3d - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ Box3D block_boxes[THREADS_PER_BLOCK_NMS]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x] = boxes[THREADS_PER_BLOCK_NMS * col_start + threadIdx.x]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const std::size_t cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const Box3D * cur_box = boxes + cur_box_idx; + + std::uint64_t t = 0; + std::size_t start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (std::size_t i = start; i < col_size; i++) { + if (dist2dPow(cur_box, block_boxes + i) < dist2d_pow_threshold) { + t |= 1ULL << i; + } + } + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +cudaError_t circleNMS_launch( + const thrust::device_vector & boxes3d, const std::size_t num_boxes3d, + std::size_t col_blocks, const float distance_threshold, + thrust::device_vector & mask, cudaStream_t stream) +{ + const float dist2d_pow_thres = powf(distance_threshold, 2); + + dim3 blocks(col_blocks, col_blocks); + dim3 threads(THREADS_PER_BLOCK_NMS); + circleNMS_Kernel<<>>( + thrust::raw_pointer_cast(boxes3d.data()), num_boxes3d, col_blocks, dist2d_pow_thres, + thrust::raw_pointer_cast(mask.data())); + + return cudaGetLastError(); +} + +std::size_t circleNMS( + thrust::device_vector & boxes3d, const float distance_threshold, + thrust::device_vector & keep_mask, cudaStream_t stream) +{ + const auto num_boxes3d = boxes3d.size(); + const auto col_blocks = divup(num_boxes3d, THREADS_PER_BLOCK_NMS); + thrust::device_vector mask_d(num_boxes3d * col_blocks); + + CHECK_CUDA_ERROR( + circleNMS_launch(boxes3d, num_boxes3d, col_blocks, distance_threshold, mask_d, stream)); + + // memcpy device to host + thrust::host_vector mask_h(mask_d.size()); + thrust::copy(mask_d.begin(), mask_d.end(), mask_h.begin()); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); + + // generate keep_mask + std::vector remv_h(col_blocks); + thrust::host_vector keep_mask_h(keep_mask.size()); + std::size_t num_to_keep = 0; + for (std::size_t i = 0; i < num_boxes3d; i++) { + auto nblock = i / THREADS_PER_BLOCK_NMS; + auto inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_h[nblock] & (1ULL << inblock))) { + keep_mask_h[i] = true; + num_to_keep++; + std::uint64_t * p = &mask_h[0] + i * col_blocks; + for (std::size_t j = nblock; j < col_blocks; j++) { + remv_h[j] |= p[j]; + } + } else { + keep_mask_h[i] = false; + } + } + + // memcpy host to device + keep_mask = keep_mask_h; + + return num_to_keep; +} + +} // 
namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/postprocess/non_maximum_suppression.cpp b/perception/autoware_lidar_bevfusion/lib/postprocess/non_maximum_suppression.cpp new file mode 100644 index 0000000000000..dd64a73890247 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/postprocess/non_maximum_suppression.cpp @@ -0,0 +1,114 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +void NonMaximumSuppression::setParameters(const NMSParams & params) +{ + assert(params.search_distance_2d_ >= 0.0); + assert(params.iou_threshold_ >= 0.0 && params.iou_threshold_ <= 1.0); + + params_ = params; + target_class_mask_ = classNamesToBooleanMask(params.target_class_names_); +} + +bool NonMaximumSuppression::isTargetLabel(const std::uint8_t label) +{ + if (label >= target_class_mask_.size()) { + return false; + } + + assert(label < target_class_mask_.size()); + + return target_class_mask_.at(label); +} + +bool NonMaximumSuppression::isTargetPairObject( + const DetectedObject & object1, const DetectedObject & object2) +{ + const auto label1 = + autoware::object_recognition_utils::getHighestProbLabel(object1.classification); + const auto label2 = + autoware::object_recognition_utils::getHighestProbLabel(object2.classification); + + if (isTargetLabel(label1) && isTargetLabel(label2)) { + return true; + } + + const auto search_sqr_dist_2d = params_.search_distance_2d_ * params_.search_distance_2d_; + const auto sqr_dist_2d = autoware::universe_utils::calcSquaredDistance2d( + autoware::object_recognition_utils::getPose(object1), + autoware::object_recognition_utils::getPose(object2)); + return sqr_dist_2d <= search_sqr_dist_2d; +} + +Eigen::MatrixXd NonMaximumSuppression::generateIoUMatrix( + const std::vector & input_objects) +{ + // NOTE: row = target objects to be suppressed, col = source objects to be compared + Eigen::MatrixXd triangular_matrix = + Eigen::MatrixXd::Zero(input_objects.size(), input_objects.size()); + for (std::size_t target_i = 0; target_i < input_objects.size(); ++target_i) { + for (std::size_t source_i = 0; source_i < target_i; ++source_i) { + const auto & target_obj = input_objects.at(target_i); + const auto & source_obj = input_objects.at(source_i); + if (!isTargetPairObject(target_obj, source_obj)) { + continue; + } + + if (params_.nms_type_ == NMS_TYPE::IoU_BEV) { + const double iou = autoware::object_recognition_utils::get2dIoU(target_obj, source_obj); + triangular_matrix(target_i, source_i) = iou; + // NOTE: If the target object has any objects with iou > iou_threshold, it + // will be suppressed regardless of later results. 
+ if (iou > params_.iou_threshold_) { + break; + } + } + } + } + + return triangular_matrix; +} + +std::vector NonMaximumSuppression::apply( + const std::vector & input_objects) +{ + Eigen::MatrixXd iou_matrix = generateIoUMatrix(input_objects); + + std::vector output_objects; + output_objects.reserve(input_objects.size()); + for (std::size_t i = 0; i < input_objects.size(); ++i) { + const auto value = iou_matrix.row(i).maxCoeff(); + if (params_.nms_type_ == NMS_TYPE::IoU_BEV) { + if (value <= params_.iou_threshold_) { + output_objects.emplace_back(input_objects.at(i)); + } + } + } + + return output_objects; +} +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/postprocess/postprocess_kernel.cu b/perception/autoware_lidar_bevfusion/lib/postprocess/postprocess_kernel.cu new file mode 100644 index 0000000000000..9ca15ba19e46d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/postprocess/postprocess_kernel.cu @@ -0,0 +1,141 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp" +#include "autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp" + +#include +#include +#include +#include + +#include + +namespace autoware::lidar_bevfusion +{ +struct is_score_greater +{ + is_score_greater(float t) : t_(t) {} + + __device__ bool operator()(const Box3D & b) { return b.score > t_; } + +private: + float t_{0.0}; +}; + +struct is_kept +{ + __device__ bool operator()(const bool keep) { return keep; } +}; + +struct score_greater +{ + __device__ bool operator()(const Box3D & lb, const Box3D & rb) { return lb.score > rb.score; } +}; + +__device__ inline float sigmoid(float x) +{ + return 1.0f / (1.0f + expf(-x)); +} + +__global__ void generateBoxes3D_kernel( + const std::int64_t * __restrict__ label_pred_output, const float * __restrict__ bbox_pred_output, + const float * __restrict__ score_output, const float voxel_size_x, const float voxel_size_y, + const float min_x_range, const float min_y_range, const int num_proposals, + const float out_size_factor, const float * __restrict__ yaw_norm_thresholds, + Box3D * __restrict__ det_boxes3d) +{ + int point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= num_proposals) { + return; + } + + const float yaw_sin = bbox_pred_output[6 * num_proposals + point_idx]; + const float yaw_cos = bbox_pred_output[7 * num_proposals + point_idx]; + const float yaw_norm = sqrtf(yaw_sin * yaw_sin + yaw_cos * yaw_cos); + const int label = static_cast(label_pred_output[point_idx]); + + det_boxes3d[point_idx].label = label; + det_boxes3d[point_idx].score = + yaw_norm >= yaw_norm_thresholds[label] ? 
+ det_boxes3d[point_idx].score = + yaw_norm >= yaw_norm_thresholds[label] ? score_output[point_idx] : 0.f; + + det_boxes3d[point_idx].x = + bbox_pred_output[0 * num_proposals + point_idx] * out_size_factor * voxel_size_x + min_x_range; + det_boxes3d[point_idx].y = + bbox_pred_output[1 * num_proposals + point_idx] * out_size_factor * voxel_size_y + min_y_range; + det_boxes3d[point_idx].z = bbox_pred_output[2 * num_proposals + point_idx]; + det_boxes3d[point_idx].length = expf(bbox_pred_output[3 * num_proposals + point_idx]); + det_boxes3d[point_idx].width = expf(bbox_pred_output[4 * num_proposals + point_idx]); + det_boxes3d[point_idx].height = expf(bbox_pred_output[5 * num_proposals + point_idx]); + det_boxes3d[point_idx].yaw = atan2f(yaw_sin, yaw_cos); + det_boxes3d[point_idx].vx = bbox_pred_output[8 * num_proposals + point_idx]; + det_boxes3d[point_idx].vy = bbox_pred_output[9 * num_proposals + point_idx]; +} + +PostprocessCuda::PostprocessCuda(const BEVFusionConfig & config, cudaStream_t & stream) +: config_(config), stream_(stream) +{ +} + +// cspell: ignore divup +cudaError_t PostprocessCuda::generateDetectedBoxes3D_launch( + const std::int64_t * label_pred_output, const float * bbox_pred_output, + const float * score_output, std::vector<Box3D> & det_boxes3d, cudaStream_t stream) +{ + dim3 threads = {config_.threads_per_block_}; + dim3 blocks = {divup(config_.num_proposals_, threads.x)}; + + auto boxes3d_d = thrust::device_vector<Box3D>(config_.num_proposals_); + auto yaw_norm_thresholds_d = thrust::device_vector<float>( + config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end()); + + generateBoxes3D_kernel<<<blocks, threads, 0, stream>>>( + label_pred_output, bbox_pred_output, score_output, config_.voxel_x_size_, config_.voxel_y_size_, + config_.min_x_range_, config_.min_y_range_, config_.num_proposals_, config_.out_size_factor_, + thrust::raw_pointer_cast(yaw_norm_thresholds_d.data()), + thrust::raw_pointer_cast(boxes3d_d.data())); + + // suppress by score + const auto num_det_boxes3d = thrust::count_if( + thrust::device, boxes3d_d.begin(), boxes3d_d.end(), is_score_greater(config_.score_threshold_)); + + if (num_det_boxes3d == 0) { + return cudaGetLastError(); + } + + thrust::device_vector<Box3D> det_boxes3d_d(num_det_boxes3d); + thrust::copy_if( + thrust::device, boxes3d_d.begin(), boxes3d_d.end(), det_boxes3d_d.begin(), + is_score_greater(config_.score_threshold_)); + + // sort by score + thrust::sort(det_boxes3d_d.begin(), det_boxes3d_d.end(), score_greater()); + + // suppress by NMS + thrust::device_vector<bool> final_keep_mask_d(num_det_boxes3d); + const auto num_final_det_boxes3d = + circleNMS(det_boxes3d_d, config_.circle_nms_dist_threshold_, final_keep_mask_d, stream); + thrust::device_vector<Box3D> final_det_boxes3d_d(num_final_det_boxes3d); + thrust::copy_if( + thrust::device, det_boxes3d_d.begin(), det_boxes3d_d.end(), final_keep_mask_d.begin(), + final_det_boxes3d_d.begin(), is_kept()); + + // memcpy device to host + det_boxes3d.resize(num_final_det_boxes3d); + thrust::copy(final_det_boxes3d_d.begin(), final_det_boxes3d_d.end(), det_boxes3d.begin()); + + return cudaGetLastError(); +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/preprocess/pointcloud_densification.cpp b/perception/autoware_lidar_bevfusion/lib/preprocess/pointcloud_densification.cpp new file mode 100644 index 0000000000000..1b25ac1ce3577 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/preprocess/pointcloud_densification.cpp @@ -0,0 +1,111 @@ +// Copyright 2025 TIER IV, Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp" + +#include +#include + +#include + +#include + +#include +#include + +namespace +{ + +boost::optional<geometry_msgs::msg::Transform> getTransform( + const tf2_ros::Buffer & tf_buffer, const std::string & target_frame_id, + const std::string & source_frame_id, const rclcpp::Time & time) +{ + try { + geometry_msgs::msg::TransformStamped transform_stamped; + transform_stamped = tf_buffer.lookupTransform( + target_frame_id, source_frame_id, time, rclcpp::Duration::from_seconds(0.5)); + return transform_stamped.transform; + } catch (tf2::TransformException & ex) { + RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_bevfusion"), ex.what()); + return boost::none; + } +} + +Eigen::Affine3f transformToEigen(const geometry_msgs::msg::Transform & t) +{ + Eigen::Affine3f a; + a.matrix() = tf2::transformToEigen(t).matrix().cast<float>(); + return a; +} + +} // namespace + +namespace autoware::lidar_bevfusion +{ + +PointCloudDensification::PointCloudDensification( + const DensificationParam & param, cudaStream_t & stream) +: param_(param), stream_(stream) +{ +} + +bool PointCloudDensification::enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer) +{ + const auto header = pointcloud_msg.header; + + if (param_.pointcloud_cache_size() > 1) { + auto transform_world2current = + getTransform(tf_buffer, header.frame_id, param_.world_frame_id(), header.stamp); + if (!transform_world2current) { + return false; + } + auto affine_world2current = transformToEigen(transform_world2current.get()); + + enqueue(pointcloud_msg, affine_world2current); + } else { + enqueue(pointcloud_msg, Eigen::Affine3f::Identity()); + } + + dequeue(); + + return true; +} + +void PointCloudDensification::enqueue( + const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current) +{ + affine_world2current_ = affine_world2current; + current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds(); + + auto data_d = autoware::cuda_utils::make_unique<InputPointType[]>(msg.width * msg.height); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + data_d.get(), msg.data.data(), sizeof(std::uint8_t) * msg.width * msg.height * msg.point_step, + cudaMemcpyHostToDevice, stream_)); + + PointCloudWithTransform pointcloud = { + std::move(data_d), msg.header, msg.width * msg.height, affine_world2current.inverse()}; + + pointcloud_cache_.push_front(std::move(pointcloud)); +}
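+ +// Evicts at most one sweep per call, so the cache never grows beyond +// pointcloud_cache_size() clouds.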
+void PointCloudDensification::dequeue() +{ + if (pointcloud_cache_.size() > param_.pointcloud_cache_size()) { + pointcloud_cache_.pop_back(); + } +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/preprocess/precomputed_features.cpp b/perception/autoware_lidar_bevfusion/lib/preprocess/precomputed_features.cpp new file mode 100644 index 0000000000000..fe634ecb0963b --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/preprocess/precomputed_features.cpp @@ -0,0 +1,376 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/lidar_bevfusion/preprocess/precomputed_features.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +Tensor4D create_frustum(const BEVFusionConfig & config) +{ + const float dbound_start = config.d_bound_[0]; + const float dbound_end = config.d_bound_[1]; + const float dbound_step = config.d_bound_[2]; + const int roi_height = static_cast<int>(config.roi_height_); // Image height + const int roi_width = static_cast<int>(config.roi_width_); // Image width + const int features_height = static_cast<int>(config.features_height_); // Feature height + const int features_width = static_cast<int>(config.features_width_); // Feature width + + // Compute D (number of depth layers) + int D = static_cast<int>(std::floor((dbound_end - dbound_start) / dbound_step)); + assert(static_cast<std::int64_t>(D) == config.num_depth_features_); + + Tensor1D ds_vec(D); + for (Eigen::Index i = 0; i < D; ++i) { + ds_vec(i) = dbound_start + i * dbound_step; + } + + // Create xs_lin and ys_lin tensors + Tensor1D xs_lin(features_width); + for (Eigen::Index i = 0; i < features_width; ++i) { + xs_lin(i) = static_cast<float>(i) * (roi_width - 1) / (features_width - 1); + } + + Tensor1D ys_lin(features_height); + for (Eigen::Index i = 0; i < features_height; ++i) { + ys_lin(i) = static_cast<float>(i) * (roi_height - 1) / (features_height - 1); + } + + // Reshape and broadcast ds_vec to (D, features_height, features_width) + Tensor3D ds_tensor = ds_vec.reshape(Eigen::array<Eigen::Index, 3>{D, 1, 1}); + Tensor3D ds_broadcast = + ds_tensor.broadcast(Eigen::array<Eigen::Index, 3>{1, features_height, features_width}); + + // Reshape and broadcast xs_lin to (1, 1, features_width) + Tensor3D xs_tensor = xs_lin.reshape(Eigen::array<Eigen::Index, 3>{1, 1, features_width}); + Tensor3D xs_broadcast = xs_tensor.broadcast(Eigen::array<Eigen::Index, 3>{D, features_height, 1}); + + // Reshape and broadcast ys_lin to (1, features_height, 1) + Tensor3D ys_tensor = ys_lin.reshape(Eigen::array<Eigen::Index, 3>{1, features_height, 1}); + Tensor3D ys_broadcast = ys_tensor.broadcast(Eigen::array<Eigen::Index, 3>{D, 1, features_width}); + + // Stack xs, ys, ds along the last dimension to form frustum + Tensor4D frustum(D, features_height, features_width, 3); + frustum.chip(0, 3) = xs_broadcast; + frustum.chip(1, 3) = ys_broadcast; + frustum.chip(2, 3) = ds_broadcast; + + return frustum; +} + +Tensor5D get_geometry( + const Tensor4D & frustum, // [D, H, W, 3] + const Tensor3D & camera2lidar_rots, // [N, 3, 3] + const Tensor2D & camera2lidar_trans, // [N, 3] + const Tensor3D & intrinsics_inverse, // [N, 3, 3] + const Tensor3D & post_rots_inverse, // [N, 3, 3] + const Tensor2D & post_trans // [N, 3] +) +{ + // Get dimensions + int N = camera2lidar_trans.dimension(0); + int D = frustum.dimension(0); + int H = frustum.dimension(1); + int W = frustum.dimension(2); + + // Reshape and broadcast frustum to [N, D, H, W, 3] + Tensor5D frustum_broadcast = frustum.reshape(Eigen::array<Eigen::Index, 5>{1, D, H, W, 3}) + .broadcast(Eigen::array<Eigen::Index, 5>{N, 1, 1, 1, 1});
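+ + // The frustum itself is camera independent; the per-camera image augmentation + // (post_trans / post_rots) is undone below before projecting into lidar coordinates.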
+ + // Reshape post_trans to [N, 1, 1, 1, 3] + Tensor5D post_trans_broadcast = post_trans.reshape(Eigen::array<Eigen::Index, 5>{N, 1, 1, 1, 3}) + .broadcast(Eigen::array<Eigen::Index, 5>{1, D, H, W, 1}); + + // Subtract post_trans from frustum + Tensor5D points = frustum_broadcast - post_trans_broadcast; + + // cSpell:ignore unsqueezed + // Unsqueeze points to [N, D, H, W, 3, 1] + Tensor6D points_unsqueezed = points.reshape(Eigen::array<Eigen::Index, 6>{N, D, H, W, 3, 1}); + + Tensor5D points_rotated(N, D, H, W, 3); + + for (Eigen::Index camera_index = 0; camera_index < N; camera_index++) { + Tensor2D post_rot_inverse = post_rots_inverse.chip(camera_index, 0); + Tensor4D points_sliced = points.chip(camera_index, 0); + + Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = { + Eigen::IndexPair<Eigen::Index>(3, 0)}; + Eigen::array<Eigen::Index, 2> shuffling({1, 0}); + Tensor2D post_rot_inverse_transposed = post_rot_inverse.shuffle(shuffling); + + Tensor4D points_sliced_rotated = + points_sliced.contract(post_rot_inverse_transposed, contract_dims); + points_rotated.chip(camera_index, 0) = points_sliced_rotated; + } + + // points_rotated already has shape [N, D, H, W, 3] + Tensor5D points_squeezed = points_rotated; + + Tensor5D points_xy = points_squeezed.slice( + Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 0}, Eigen::array<Eigen::Index, 5>{N, D, H, W, 2}); + Tensor5D points_z = points_squeezed.slice( + Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 2}, Eigen::array<Eigen::Index, 5>{N, D, H, W, 1}); + Tensor5D points_xy_scaled = + points_xy * points_z.broadcast(Eigen::array<Eigen::Index, 5>{1, 1, 1, 1, 2}); + + // Concatenate points_xy_scaled and points_z along the last dimension + Tensor5D points_cat(points_squeezed.dimensions()); + points_cat.slice( + Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 0}, Eigen::array<Eigen::Index, 5>{N, D, H, W, 2}) = + points_xy_scaled; + points_cat.slice( + Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 2}, Eigen::array<Eigen::Index, 5>{N, D, H, W, 1}) = + points_z; + + Tensor3D combine(intrinsics_inverse.dimensions()); + + for (Eigen::Index camera_index = 0; camera_index < N; camera_index++) { + Tensor2D camera2lidar_rot = camera2lidar_rots.chip(camera_index, 0); + Tensor2D intrinsics_inverse_aux = intrinsics_inverse.chip(camera_index, 0); + + Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = { + Eigen::IndexPair<Eigen::Index>(1, 0)}; + + Tensor2D combine_sliced = camera2lidar_rot.contract(intrinsics_inverse_aux, contract_dims); + combine.chip(camera_index, 0) = combine_sliced; + } + + // Reshape combine to [N, 1, 1, 1, 3, 3] + Tensor6D combine_reshaped = combine.reshape(Eigen::array<Eigen::Index, 6>{N, 1, 1, 1, 3, 3}); + + Tensor5D points_transformed(points_cat.dimensions()); + + for (Eigen::Index camera_index = 0; camera_index < N; camera_index++) { + Tensor2D combine_sliced = combine.chip(camera_index, 0); + Tensor4D points_sliced = points_cat.chip(camera_index, 0); + + Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = { + Eigen::IndexPair<Eigen::Index>(3, 0)}; + Eigen::array<Eigen::Index, 2> shuffling({1, 0}); + Tensor2D combined_transposed = combine_sliced.shuffle(shuffling); + + Tensor4D points_transformed_sliced = points_sliced.contract(combined_transposed, contract_dims); + points_transformed.chip(camera_index, 0) = points_transformed_sliced; + } + + // Squeeze the last dimension + Tensor5D points_squeezed_final = + points_transformed.reshape(Eigen::array<Eigen::Index, 5>{N, D, H, W, 3}); + + // Add camera2lidar_trans + Tensor5D camera2lidar_trans_broadcast = + camera2lidar_trans.reshape(Eigen::array<Eigen::Index, 5>{N, 1, 1, 1, 3}) + .broadcast(Eigen::array<Eigen::Index, 5>{1, D, H, W, 1}); + Tensor5D points_final = points_squeezed_final + camera2lidar_trans_broadcast; + + return points_final; +}
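+ +// bev_pool_aux maps every (camera, depth, pixel) sample of the geometry tensor to +// an integer BEV grid cell, drops samples outside the grid, and sorts the survivors +// by a linearized cell rank so that samples of the same cell become contiguous.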
+std::tuple< + Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4>, // geom_feats + Eigen::Matrix<bool, Eigen::Dynamic, 1>, // kept + Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1>, // ranks + Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1> // indices +> bev_pool_aux( + const Tensor5D & geom_feats_input, + const BEVFusionConfig & config) +{ + Eigen::Vector3f dx(config.x_bound_[2], config.y_bound_[2], config.z_bound_[2]); + Eigen::Vector3f bx( + config.x_bound_[0] + config.x_bound_[2] / 2.0, config.y_bound_[0] + config.y_bound_[2] / 2.0, + config.z_bound_[0] + config.z_bound_[2] / 2.0); + Eigen::Vector3i nx( + static_cast<std::int32_t>((config.x_bound_[1] - config.x_bound_[0]) / config.x_bound_[2]), + static_cast<std::int32_t>((config.y_bound_[1] - config.y_bound_[0]) / config.y_bound_[2]), + static_cast<std::int32_t>((config.z_bound_[1] - config.z_bound_[0]) / config.z_bound_[2])); + + // Get dimensions + Eigen::Index N = geom_feats_input.dimension(0); + Eigen::Index D = geom_feats_input.dimension(1); + Eigen::Index H = geom_feats_input.dimension(2); + Eigen::Index W = geom_feats_input.dimension(3); + Eigen::Index C = geom_feats_input.dimension(4); + assert(C == 3); + + Eigen::Index Nprime = N * D * H * W; + + TensorMap1D dx_map(dx.data(), 3); + TensorMap1D bx_map(bx.data(), 3); + + Tensor5D dx_broadcast = dx_map.reshape(Eigen::array<Eigen::Index, 5>{1, 1, 1, 1, C}) + .broadcast(Eigen::array<Eigen::Index, 5>{N, D, H, W, 1}); + + Tensor5D bx_broadcast = bx_map.reshape(Eigen::array<Eigen::Index, 5>{1, 1, 1, 1, 3}) + .broadcast(Eigen::array<Eigen::Index, 5>{N, D, H, W, 1}); + + Tensor5D geom_feats_aux = (geom_feats_input - (bx_broadcast - 0.5f * dx_broadcast)) / dx_broadcast; + + Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> geom_feats_map( + geom_feats_aux.data(), Nprime, C); + Eigen::Matrix<std::int32_t, Eigen::Dynamic, Eigen::Dynamic> geom_feats_int = + geom_feats_map.cast<std::int32_t>(); + + // Concatenate geom_feats_int and batch_ix along column + Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4> geom_feats_cat(Nprime, 4); + geom_feats_cat.block(0, 0, Nprime, C) = geom_feats_int; + geom_feats_cat.col(3).setZero(); + + // Filter out points outside box + Eigen::Matrix<bool, Eigen::Dynamic, 1> kept(Nprime); + for (Eigen::Index i = 0; i < Nprime; ++i) { + std::int32_t x = geom_feats_cat(i, 0); + std::int32_t y = geom_feats_cat(i, 1); + std::int32_t z = geom_feats_cat(i, 2); + + bool keep = (x >= 0) && (x < nx(0)) && (y >= 0) && (y < nx(1)) && (z >= 0) && (z < nx(2)); + kept(i) = keep; + } + + // Collect indices where kept is true + std::vector<Eigen::Index> kept_indices; + for (Eigen::Index i = 0; i < Nprime; ++i) { + if (kept(i)) { + kept_indices.push_back(i); + } + } + + // Create geom_feats_kept + Eigen::Index N_kept = kept_indices.size(); + Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4> geom_feats_kept(N_kept, 4); + for (Eigen::Index i = 0; i < N_kept; ++i) { + std::int64_t idx = kept_indices[i]; + geom_feats_kept.row(i) = geom_feats_cat.row(idx); + } + + // Compute ranks + Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1> ranks(N_kept); + std::int64_t factor0 = static_cast<std::int64_t>(W) * D; + std::int64_t factor1 = static_cast<std::int64_t>(D); + std::int64_t factor2 = 1; + + for (Eigen::Index i = 0; i < N_kept; ++i) { + std::int32_t x = geom_feats_kept(i, 0); + std::int32_t y = geom_feats_kept(i, 1); + std::int32_t z = geom_feats_kept(i, 2); + std::int64_t batch = geom_feats_kept(i, 3); + + std::int64_t rank = x * factor0 + y * factor1 + z * factor2 + batch; + ranks(i) = rank; + } + + // Sort ranks and get indices + std::vector<std::pair<std::int64_t, Eigen::Index>> rank_idx_pairs(N_kept); + for (Eigen::Index i = 0; i < N_kept; ++i) { + rank_idx_pairs[i] = std::make_pair(ranks(i), i); + } + std::sort(rank_idx_pairs.begin(), rank_idx_pairs.end());
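+ + // Sorting by (rank, index) pairs makes samples of the same BEV cell contiguous; + // ties are broken by the original index, keeping the order deterministic.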
+ + // Create sorted ranks and indices + Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1> ranks_sorted(N_kept); + Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1> indices(N_kept); + Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4> geom_feats_sorted(N_kept, 4); + for (Eigen::Index i = 0; i < N_kept; ++i) { + std::int64_t rank = rank_idx_pairs[i].first; + std::int64_t idx = rank_idx_pairs[i].second; + + ranks_sorted(i) = rank; + indices(i) = idx; + geom_feats_sorted.row(i) = geom_feats_kept.row(idx); + } + + // Return geom_feats_sorted, kept, ranks_sorted, indices + return std::make_tuple(geom_feats_sorted, kept, ranks_sorted, indices); +} + +std::tuple< + Eigen::VectorXf, Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4>, + Eigen::Matrix<bool, Eigen::Dynamic, 1>, + Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1>, + Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1>> +precompute_features( + const std::vector<Matrix4fRowM> & lidar2camera_transforms, + const std::vector<Matrix4fRowM> & camera_aug_matrices, + const std::vector<sensor_msgs::msg::CameraInfo> & camera_info_vector, + const BEVFusionConfig & config) +{ + Eigen::VectorXf lidar2images_flat(config.num_cameras_ * 4 * 4); + + Tensor4D frustum = autoware::lidar_bevfusion::create_frustum(config); + + Tensor3D camera2lidar_rotations(config.num_cameras_, 3, 3); + Tensor2D camera2lidar_translations(config.num_cameras_, 3); + Tensor3D intrinsics_inverse(config.num_cameras_, 3, 3); + Tensor3D post_rots_inverse(config.num_cameras_, 3, 3); + Tensor2D post_trans(config.num_cameras_, 3); + + for (std::int64_t camera_id = 0; camera_id < config.num_cameras_; camera_id++) { + Matrix4fRowM cam2image = Matrix4fRowM::Identity(); + cam2image(0, 0) = camera_info_vector[camera_id].p[0]; + cam2image(1, 1) = camera_info_vector[camera_id].p[5]; + cam2image(0, 2) = camera_info_vector[camera_id].p[2]; + cam2image(1, 2) = camera_info_vector[camera_id].p[6]; + + Matrix4fRowM img_aug_matrix = camera_aug_matrices[camera_id]; + Matrix4fRowM img_aug_matrix_inverse = img_aug_matrix.inverse().eval(); + + Matrix4fRowM lidar2camera = lidar2camera_transforms[camera_id]; + Matrix4fRowM camera2lidar = lidar2camera.inverse().eval(); + + Matrix4fRowM lidar2image = cam2image * lidar2camera; + Eigen::VectorXf lidar2image_flat(Eigen::Map<Eigen::VectorXf>(lidar2image.data(), 16)); + lidar2images_flat.segment(16 * camera_id, 16) = lidar2image_flat; + + Matrix3fRowM camera2lidar_rot = camera2lidar.block<3, 3>(0, 0); + Vector3fRowM camera2lidar_trans = camera2lidar.block<3, 1>(0, 3); + Matrix3fRowM intrinsics_inverse_matrix = cam2image.inverse().eval().block<3, 3>(0, 0); + Matrix3fRowM post_rots_inv = img_aug_matrix_inverse.block<3, 3>(0, 0); + Vector3fRowM post_trans_vec = img_aug_matrix.block<3, 1>(0, 3); + + TensorMap2D camera2lidar_rot_tensor(camera2lidar_rot.data(), 3, 3); + TensorMap1D camera2lidar_trans_tensor(camera2lidar_trans.data(), 3); + TensorMap2D intrinsics_inverse_tensor(intrinsics_inverse_matrix.data(), 3, 3); + TensorMap2D post_rots_inv_tensor(post_rots_inv.data(), 3, 3); + TensorMap1D post_trans_tensor(post_trans_vec.data(), 3); + + camera2lidar_rotations.chip(camera_id, 0) = camera2lidar_rot_tensor; + camera2lidar_translations.chip(camera_id, 0) = camera2lidar_trans_tensor; + intrinsics_inverse.chip(camera_id, 0) = intrinsics_inverse_tensor; + post_rots_inverse.chip(camera_id, 0) = post_rots_inv_tensor; + post_trans.chip(camera_id, 0) = post_trans_tensor; + } + + Tensor5D geometry = get_geometry( + frustum, // [D, H, W, 3] + camera2lidar_rotations, // [N, 3, 3] + camera2lidar_translations, // [N, 3] + intrinsics_inverse, // [N, 3, 3] + post_rots_inverse, // [N, 3, 3] + post_trans // [N, 3] + ); + + auto [geom_feats, kept, ranks, indices] = bev_pool_aux(geometry, config); + + return std::make_tuple(lidar2images_flat, geom_feats, kept, ranks, indices); +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/preprocess/preprocess_kernel.cu b/perception/autoware_lidar_bevfusion/lib/preprocess/preprocess_kernel.cu new file mode 100644 index 0000000000000..2a3289b4e508b --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/preprocess/preprocess_kernel.cu @@ -0,0 +1,228 @@ +// Copyright 2025 TIER IV, Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "autoware/lidar_bevfusion/preprocess/point_type.hpp" +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" + +#include + +#include // cSpell:ignore spconvlib + +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +PreprocessCuda::PreprocessCuda( + const BEVFusionConfig & config, cudaStream_t & stream, bool allocate_buffers) +: stream_(stream), config_(config) +{ + if (allocate_buffers) { + hash_key_value_ = tv::empty({config.cloud_capacity_ * 2}, tv::custom128, 0); + point_indices_data_ = tv::empty({config.cloud_capacity_}, tv::int64, 0); + points_voxel_id_ = tv::empty({config.cloud_capacity_}, tv::int64, 0); + } +} + +__global__ void generateSweepPoints_kernel( + const InputPointType * __restrict__ input_points, std::size_t points_size, float time_lag, + const float * transform_array, int num_features, float * __restrict__ output_points) +{ + int point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= points_size) return; + + const InputPointType * input_point = &input_points[point_idx]; + float input_x = input_point->x; + float input_y = input_point->y; + float input_z = input_point->z; + float input_intensity = static_cast<float>(input_point->intensity); + + output_points[point_idx * num_features] = transform_array[0] * input_x + + transform_array[4] * input_y + + transform_array[8] * input_z + transform_array[12]; + output_points[point_idx * num_features + 1] = transform_array[1] * input_x + + transform_array[5] * input_y + + transform_array[9] * input_z + transform_array[13]; + output_points[point_idx * num_features + 2] = transform_array[2] * input_x + + transform_array[6] * input_y + + transform_array[10] * input_z + transform_array[14]; + output_points[point_idx * num_features + 3] = input_intensity; + output_points[point_idx * num_features + 4] = time_lag; +}
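+ +// Launch wrapper: one thread per point; divup sizes the grid so a final partial +// block covers the remainder of the cloud.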
+cudaError_t PreprocessCuda::generateSweepPoints_launch( + const InputPointType * input_data, std::size_t points_size, float time_lag, + const float * transform_array, float * output_points) +{ + dim3 blocks(divup(points_size, config_.threads_per_block_)); + dim3 threads(config_.threads_per_block_); + + generateSweepPoints_kernel<<<blocks, threads, 0, stream_>>>( + input_data, points_size, time_lag, transform_array, config_.num_point_feature_size_, + output_points); + + cudaError_t err = cudaGetLastError(); + return err; +} + +std::size_t PreprocessCuda::generateVoxels( + const float * points, unsigned int points_size, float * voxel_features, + std::int32_t * voxel_coords, std::int32_t * num_points_per_voxel) +{ + using Point2VoxelGPU3D = spconvlib::spconv::csrc::sparse::all::ops3d::Point2Voxel; + + std::array<float, 3> vsize_xyz{ + config_.voxel_z_size_, config_.voxel_y_size_, config_.voxel_x_size_}; + + std::array<std::int32_t, 3> grid_size{ + static_cast<std::int32_t>(config_.grid_z_size_), + static_cast<std::int32_t>(config_.grid_y_size_), + static_cast<std::int32_t>(config_.grid_x_size_)}; + + std::array<std::int32_t, 3> grid_stride{ + static_cast<std::int32_t>(config_.grid_y_size_ * config_.grid_x_size_), + static_cast<std::int32_t>(config_.grid_x_size_), 1}; + + std::array<float, 6> coors_range{config_.min_z_range_, config_.min_y_range_, + config_.min_x_range_, config_.max_z_range_, + config_.max_y_range_, config_.max_x_range_}; + + tv::Tensor pc = tv::from_blob( + points, {static_cast<std::int64_t>(points_size), config_.num_point_feature_size_}, tv::float32, + 0); + + auto point_limit = pc.dim(0); + + tv::Tensor voxels_padded = tv::from_blob( + voxel_features, {config_.max_num_voxels_, config_.max_points_per_voxel_, pc.dim(1)}, + tv::float32, 0); + + tv::Tensor indices_padded_no_batch = + tv::from_blob(voxel_coords, {config_.max_num_voxels_, 3}, tv::int32, 0); + + tv::Tensor num_points_per_voxel_tensor = + tv::from_blob(num_points_per_voxel, {config_.max_num_voxels_}, tv::int32, 0); + + auto p2v_res = Point2VoxelGPU3D::point_to_voxel_hash_static( + pc, voxels_padded, indices_padded_no_batch, num_points_per_voxel_tensor, hash_key_value_, + point_indices_data_, points_voxel_id_, vsize_xyz, grid_size, grid_stride, coors_range, true, + false, reinterpret_cast<std::uintptr_t>(stream_)); + + std::size_t real_num_voxels = static_cast<std::size_t>(std::get<0>(p2v_res).dim(0)); + + return real_num_voxels; +} + +__global__ void resize_and_extract_roi( + const std::uint8_t * __restrict__ input_img, std::uint8_t * __restrict__ output_img, int H, + int W, // Original image dimensions (Height, Width) + int H2, int W2, // Resized image dimensions (Height, Width) + int H3, int W3, // ROI dimensions (Height, Width) + int y_start, int x_start) // ROI top-left coordinates in resized image +{ + // Calculate the global thread indices + int i = blockIdx.x * blockDim.x + threadIdx.x; // Width index in output (ROI) image + int j = blockIdx.y * blockDim.y + threadIdx.y; // Height index in output (ROI) image + + // Check if the thread corresponds to a valid pixel in the ROI + if (i >= W3 || j >= H3) return; + + // Compute scaling factors from original to resized image + float scale_y = static_cast<float>(H) / H2; + float scale_x = static_cast<float>(W) / W2; + + // Map output pixel (i, j) in ROI to position in resized image + int i_resized = i + x_start; + int j_resized = j + y_start; + + // Map position in resized image back to position in original image + float i_original = (i_resized + 0.5f) * scale_x - 0.5f; + float j_original = (j_resized + 0.5f) * scale_y - 0.5f; + + // Compute coordinates for bilinear interpolation + int i0 = static_cast<int>(floorf(i_original)); + int j0 = static_cast<int>(floorf(j_original)); + int i1 = i0 + 1; + int j1 = j0 + 1; + + // Compute interpolation weights + float di = i_original - i0; + float dj = j_original - j0; + + float w00 = (1.0f - di) * (1.0f - dj); + float w01 = (1.0f - di) * dj; + float w10 = di * (1.0f - dj); + float w11 = di * dj; + + // Loop over the three color channels + for (int c = 0; c < 3; ++c) { + float v00 = 0.0f, v01 = 0.0f, v10 = 0.0f, v11 = 0.0f;
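+ + // Neighbors that fall outside the source image keep their zero initialization, + // i.e. the interpolation zero-pads at the borders instead of clamping.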
+ + // Boundary checks for each neighboring pixel + if (i0 >= 0 && i0 < W && j0 >= 0 && j0 < H) + v00 = static_cast<float>(input_img[(j0 * W + i0) * 3 + c]); + if (i0 >= 0 && i0 < W && j1 >= 0 && j1 < H) + v01 = static_cast<float>(input_img[(j1 * W + i0) * 3 + c]); + if (i1 >= 0 && i1 < W && j0 >= 0 && j0 < H) + v10 = static_cast<float>(input_img[(j0 * W + i1) * 3 + c]); + if (i1 >= 0 && i1 < W && j1 >= 0 && j1 < H) + v11 = static_cast<float>(input_img[(j1 * W + i1) * 3 + c]); + + // Compute the interpolated pixel value + float value = w00 * v00 + w01 * v01 + w10 * v10 + w11 * v11; + + // Store the result in the output ROI image + output_img[(j * W3 + i) * 3 + c] = static_cast<std::uint8_t>(value); + } +} + +cudaError_t PreprocessCuda::resize_and_extract_roi_launch( + const std::uint8_t * input_img, std::uint8_t * output_img, int H, + int W, // Original image dimensions + int H2, int W2, // Resized image dimensions + int H3, int W3, // ROI dimensions + int y_start, int x_start, // ROI top-left coordinates in resized image + cudaStream_t stream) +{ + // Define the block and grid dimensions + dim3 threads(16, 16); + dim3 blocks(divup(W3, threads.x), divup(H3, threads.y)); + + // Launch the kernel + resize_and_extract_roi<<<blocks, threads, 0, stream>>>( + input_img, output_img, H, W, H2, W2, H3, W3, y_start, x_start); + + // Check for errors + return cudaGetLastError(); +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/preprocess/voxel_generator.cpp b/perception/autoware_lidar_bevfusion/lib/preprocess/voxel_generator.cpp new file mode 100644 index 0000000000000..6ad6558490dc5 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/preprocess/voxel_generator.cpp @@ -0,0 +1,89 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "autoware/lidar_bevfusion/preprocess/voxel_generator.hpp" + +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" + +#include + +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +VoxelGenerator::VoxelGenerator( + const DensificationParam & densification_param, const BEVFusionConfig & config, + cudaStream_t & stream) +: config_(config), stream_(stream) +{ + pd_ptr_ = std::make_unique(densification_param, stream_); + + pre_ptr_ = std::make_unique(config_, stream_, false); + + affine_past2current_d_ = + autoware::cuda_utils::make_unique(Eigen::Affine3f::MatrixType::SizeAtCompileTime); +} + +bool VoxelGenerator::enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer) +{ + return pd_ptr_->enqueuePointCloud(msg, tf_buffer); +} + +std::size_t VoxelGenerator::generateSweepPoints(CudaUniquePtr & points_d) +{ + std::size_t point_counter{0}; + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter); + pc_cache_iter++) { + auto sweep_num_points = pc_cache_iter->num_points; + auto output_offset = point_counter * config_.num_point_feature_size_; + + if (point_counter + sweep_num_points >= static_cast(config_.cloud_capacity_)) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Exceeding cloud capacity. Used " + << pd_ptr_->getIdx(pc_cache_iter) << " out of " + << pd_ptr_->getCacheSize() << " sweep(s)"); + break; + } + + auto affine_past2current = + pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world; + static_assert(std::is_same::value); + static_assert(!Eigen::Matrix4f::IsRowMajor, "matrices should be col-major."); + + float time_lag = static_cast( + pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds()); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + affine_past2current_d_.get(), affine_past2current.data(), + Eigen::Affine3f::MatrixType::SizeAtCompileTime * sizeof(float), cudaMemcpyHostToDevice, + stream_)); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + pre_ptr_->generateSweepPoints_launch( + pc_cache_iter->data_d.get(), sweep_num_points, time_lag, affine_past2current_d_.get(), + points_d.get() + output_offset); + point_counter += sweep_num_points; + } + + return point_counter; +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/ros_utils.cpp b/perception/autoware_lidar_bevfusion/lib/ros_utils.cpp new file mode 100644 index 0000000000000..22a396a067d0a --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/ros_utils.cpp @@ -0,0 +1,126 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "autoware/lidar_bevfusion/ros_utils.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +using Label = autoware_perception_msgs::msg::ObjectClassification; + +bool is_data_layout_compatible_with_point_xyzirc(const sensor_msgs::msg::PointCloud2 & input) +{ + using PointIndex = autoware::point_types::PointXYZIRCIndex; + using autoware::point_types::PointXYZIRC; + if (input.fields.size() < 6) { + return false; + } + bool same_layout = true; + const auto & field_x = input.fields.at(static_cast<std::size_t>(PointIndex::X)); + same_layout &= field_x.name == "x"; + same_layout &= field_x.offset == offsetof(PointXYZIRC, x); + same_layout &= field_x.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_x.count == 1; + const auto & field_y = input.fields.at(static_cast<std::size_t>(PointIndex::Y)); + same_layout &= field_y.name == "y"; + same_layout &= field_y.offset == offsetof(PointXYZIRC, y); + same_layout &= field_y.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_y.count == 1; + const auto & field_z = input.fields.at(static_cast<std::size_t>(PointIndex::Z)); + same_layout &= field_z.name == "z"; + same_layout &= field_z.offset == offsetof(PointXYZIRC, z); + same_layout &= field_z.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_z.count == 1; + const auto & field_intensity = input.fields.at(static_cast<std::size_t>(PointIndex::Intensity)); + same_layout &= field_intensity.name == "intensity"; + same_layout &= field_intensity.offset == offsetof(PointXYZIRC, intensity); + same_layout &= field_intensity.datatype == sensor_msgs::msg::PointField::UINT8; + same_layout &= field_intensity.count == 1; + const auto & field_return_type = input.fields.at(static_cast<std::size_t>(PointIndex::ReturnType)); + same_layout &= field_return_type.name == "return_type"; + same_layout &= field_return_type.offset == offsetof(PointXYZIRC, return_type); + same_layout &= field_return_type.datatype == sensor_msgs::msg::PointField::UINT8; + same_layout &= field_return_type.count == 1; + const auto & field_ring = input.fields.at(static_cast<std::size_t>(PointIndex::Channel)); + same_layout &= field_ring.name == "channel"; + same_layout &= field_ring.offset == offsetof(PointXYZIRC, channel); + same_layout &= field_ring.datatype == sensor_msgs::msg::PointField::UINT16; + same_layout &= field_ring.count == 1; + + return same_layout; +} + +void box3DToDetectedObject( + const Box3D & box3d, const std::vector & class_names, + autoware_perception_msgs::msg::DetectedObject & obj) +{ + obj.existence_probability = box3d.score; + + // classification + autoware_perception_msgs::msg::ObjectClassification classification; + classification.probability = 1.0f; + if (box3d.label >= 0 && static_cast<std::size_t>(box3d.label) < class_names.size()) { + classification.label = getSemanticType(class_names[box3d.label]); + } else { + classification.label = Label::UNKNOWN; + RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_bevfusion"), "Unexpected label: UNKNOWN is set."); + } + + if (autoware::object_recognition_utils::isCarLikeVehicle(classification.label)) { + obj.kinematics.orientation_availability = + autoware_perception_msgs::msg::DetectedObjectKinematics::SIGN_UNKNOWN; + } + + obj.classification.emplace_back(classification); + + // pose and shape + obj.kinematics.pose_with_covariance.pose.position = + autoware::universe_utils::createPoint(box3d.x, box3d.y, box3d.z); + obj.kinematics.pose_with_covariance.pose.orientation = + autoware::universe_utils::createQuaternionFromYaw(box3d.yaw);
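+ // Only yaw is recoverable from the network output, so the orientation encodes + // yaw alone.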
+ obj.shape.type = autoware_perception_msgs::msg::Shape::BOUNDING_BOX; + obj.shape.dimensions = + autoware::universe_utils::createTranslation(box3d.length, box3d.width, box3d.height); +} + +std::uint8_t getSemanticType(const std::string & class_name) +{ + if (class_name == "CAR") { + return Label::CAR; + } else if (class_name == "TRUCK") { + return Label::TRUCK; + } else if (class_name == "BUS") { + return Label::BUS; + } else if (class_name == "TRAILER") { + return Label::TRAILER; + } else if (class_name == "MOTORCYCLE") { + return Label::MOTORCYCLE; + } else if (class_name == "BICYCLE") { + return Label::BICYCLE; + } else if (class_name == "PEDESTRIAN") { + return Label::PEDESTRIAN; + } else { // CONSTRUCTION_VEHICLE, BARRIER, TRAFFIC_CONE + return Label::UNKNOWN; + } +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/package.xml b/perception/autoware_lidar_bevfusion/package.xml new file mode 100644 index 0000000000000..eb38aa9a07f99 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/package.xml @@ -0,0 +1,36 @@ + + + + autoware_lidar_bevfusion + 1.0.0 + The autoware_lidar_bevfusion package + Kenzo Lobos-Tsunekawa + Apache License 2.0 + + ament_cmake_auto + autoware_cmake + + autoware_cuda_utils + autoware_object_recognition_utils + autoware_perception_msgs + autoware_point_types + autoware_tensorrt_common + autoware_universe_utils + launch_ros + pcl_ros + rclcpp + rclcpp_components + tf2_eigen + tf2_geometry_msgs + tf2_sensor_msgs + + autoware_image_transport_decompressor + + ament_cmake_ros + ament_lint_auto + autoware_lint_common + + + ament_cmake + + diff --git a/perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json b/perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json new file mode 100644 index 0000000000000..a3af7a6da7e82 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json @@ -0,0 +1,121 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for LiDAR BEVFusion Node", + "$comment": "This schema is not considered in CI workflow.
See https://github.com/autowarefoundation/autoware.universe/pull/8205#issuecomment-2255074224.", + "type": "object", + "definitions": { + "bevfusion": { + "type": "object", + "properties": { + "trt_precision": { + "type": "string", + "description": "Precision of the TensorRT engine.", + "default": "fp16", + "enum": ["fp16", "fp32"] + }, + "cloud_capacity": { + "type": "integer", + "description": "Capacity of the point cloud buffer (should be set to at least the maximum theoretical number of points).", + "default": 2000000, + "minimum": 1 + }, + "onnx_path": { + "type": "string", + "description": "Path to the ONNX model file.", + "default": "$(var model_path)/bevfusion.onnx", + "pattern": "\\.onnx$" + }, + "engine_path": { + "type": "string", + "description": "Path to the TensorRT engine file.", + "default": "$(var model_path)/bevfusion.engine", + "pattern": "\\.engine$" + }, + "densification_num_past_frames": { + "type": "integer", + "description": "Number of past frames to fuse into the current input frame.", + "default": 1, + "minimum": 0 + }, + "densification_world_frame_id": { + "type": "string", + "description": "Frame id of the world coordinate system used for densification.", + "default": "map" + }, + "circle_nms_dist_threshold": { + "type": "number", + "description": "Distance threshold between detections in circle NMS.", + "default": 0.5, + "minimum": 0.0 + }, + "iou_nms_target_class_names": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Class names targeted by IoU-based NMS.", + "default": ["CAR"], + "uniqueItems": true + }, + "iou_nms_search_distance_2d": { + "type": "number", + "description": "Maximum 2D distance within which nearby objects are compared.", + "default": 10.0, + "minimum": 0.0 + }, + "iou_nms_threshold": { + "type": "number", + "description": "IoU threshold used for suppression.", + "default": 0.1, + "minimum": 0.0, + "maximum": 1.0 + }, + "yaw_norm_thresholds": { + "type": "array", + "items": { + "type": "number", + "minimum": 0.0, + "maximum": 1.0 + }, + "description": "Per-class thresholds on the norm of the predicted direction vector; objects below the threshold are ignored.", + "default": [0.3, 0.3, 0.3, 0.3, 0.0] + }, + "score_threshold": { + "type": "number", + "description": "Confidence score threshold; objects scoring below it are ignored.", + "default": 0.2, + "minimum": 0.0 + } + }, + "required": [ + "trt_precision", + "cloud_capacity", + "onnx_path", + "engine_path", + "densification_num_past_frames", + "densification_world_frame_id", + "circle_nms_dist_threshold", + "iou_nms_target_class_names", + "iou_nms_search_distance_2d", + "iou_nms_threshold", + "yaw_norm_thresholds", + "score_threshold" + ], + "additionalProperties": false + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/bevfusion" + } + }, + "required": ["ros__parameters"], + "additionalProperties": false + } + }, + "required": ["/**"], + "additionalProperties": false +} diff --git a/perception/autoware_lidar_bevfusion/schema/bevfusion_ml_package.schema.json b/perception/autoware_lidar_bevfusion/schema/bevfusion_ml_package.schema.json new file mode 100644 index 0000000000000..106c8c2a29502 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/schema/bevfusion_ml_package.schema.json @@ -0,0 +1,206 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for BEVFusion", + "type": "object", + "definitions": {
+ "bevfusion": { + "type": "object", + "properties": { + "class_names": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Predicted classes' names.", + "default": ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"], + "uniqueItems": true + }, + "voxels_num": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Voxel ranges used during inference [min, opt, max].", + "default": [5000, 30000, 60000] + }, + "point_cloud_range": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Range in meters of the pointcloud in meters [min_x, min_y, min_z, max_x, max_y, max_z].", + "default": [-76.8, -76.8, -3.0, 76.8, 76.8, 5.0], + "minItems": 6, + "maxItems": 6 + }, + "voxel_size": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Voxels size [x, y, z] in meters.", + "default": [0.3, 0.3, 8.0], + "minItems": 3, + "maxItems": 3 + }, + "num_proposals": { + "type": "integer", + "description": "Number of object proposals.", + "default": 500, + "minimum": 1 + }, + + "out_size_factor": { + "type": "integer", + "description": "Output size factor using in the network.", + "default": 8, + "minimum": 1 + }, + "max_points_per_voxel": { + "type": "integer", + "description": "Maximum number of points that a voxel can hold.", + "default": 10, + "minimum": 1 + }, + "d_bound": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Distance bounds used in the view transform in meters (min, max, and step).", + "default": [1.0, 166.2, 1.4], + "minItems": 3, + "maxItems": 3 + }, + "x_bound": { + "type": "array", + "items": { + "type": "number" + }, + "description": "x-axis bounds used in the view transform in meters (min, max, and step).", + "default": [-122.4, 122.4, 0.68], + "minItems": 3, + "maxItems": 3 + }, + "y_bound": { + "type": "array", + "items": { + "type": "number" + }, + "description": "y-axis bounds used in the view transform in meters (min, max, and step).", + "default": [-122.4, 122.4, 0.68], + "minItems": 3, + "maxItems": 3 + }, + "z_bound": { + "type": "array", + "items": { + "type": "number" + }, + "description": "z-axis bounds used in the view transform in meters (min, max, and step).", + "default": [-10.0, 10.0, 20.0], + "minItems": 3, + "maxItems": 3 + }, + "num_cameras": { + "type": "integer", + "description": "Number of cameras to use.", + "default": 6, + "minimum": 0 + }, + "raw_image_height": { + "type": "integer", + "description": "Raw image height in pixels.", + "default": 1080, + "minimum": 0 + }, + "raw_image_width": { + "type": "integer", + "description": "Raw image width in pixels.", + "default": 1440, + "minimum": 0 + }, + "img_aug_scale_x": { + "type": "number", + "description": "Raw image scaling before ROI extraction.", + "default": 0.489, + "minimum": 0.0, + "maximum": 1.0 + }, + "img_aug_scale_y": { + "type": "number", + "description": "Raw image scaling before ROI extraction.", + "default": 0.489, + "minimum": 0.0, + "maximum": 1.0 + }, + "roi_height": { + "type": "integer", + "description": "ROI image height (input to the network) in pixels.", + "default": 384, + "minimum": 0 + }, + "roi_width": { + "type": "integer", + "description": "ROI image width (input to the network) in pixels.", + "default": 704, + "minimum": 0 + }, + "features_height": { + "type": "integer", + "description": "Image features height (output of the image backbone) in pixels.", + "default": 48, + "minimum": 0 + }, + "features_width": { + "type": "integer", + "description": "Image features width 
(output of the image backbone) in pixels.", + "default": 88, + "minimum": 0 + }, + "num_depth_features": { + "type": "integer", + "description": "Number of depth features used in the view transform.", + "default": 118, + "minimum": 0 + } + }, + "required": [ + "class_names", + "voxels_num", + "point_cloud_range", + "voxel_size", + "num_proposals", + "d_bound", + "x_bound", + "y_bound", + "z_bound", + "num_cameras", + "raw_image_height", + "raw_image_width", + "img_aug_scale_x", + "img_aug_scale_y", + "roi_height", + "roi_width", + "features_height", + "features_width", + "num_depth_features" + ], + "additionalProperties": false + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/bevfusion" + } + }, + "required": ["ros__parameters"], + "additionalProperties": false + } + }, + "required": ["/**"], + "additionalProperties": false +} diff --git a/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json b/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json new file mode 100644 index 0000000000000..8aaefa295fcc0 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json @@ -0,0 +1,72 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for Detection Class Remapper", + "type": "object", + "definitions": { + "detection_class_remapper": { + "type": "object", + "properties": { + "allow_remapping_by_area_matrix": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Whether to allow remapping of classes. The order of 8x8 matrix classes comes from ObjectClassification msg.", + "default": [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0 + ], + "minItems": 64, + "maxItems": 64 + }, + "min_area_matrix": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Minimum area for specific class to consider class remapping.", + "default": [ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.1, 0.0, 36.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 36.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 36.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + ], + "minItems": 64, + "maxItems": 64 + }, + "max_area_matrix": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Maximum area for specific class to consider class remapping.", + "default": [ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 36.0, 0.0, 999.999, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 999.999, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + ], + "minItems": 64, + "maxItems": 64 + } + }, + "required": ["allow_remapping_by_area_matrix", "min_area_matrix", "max_area_matrix"], + "additionalProperties": false + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/detection_class_remapper" + } + }, + "required": ["ros__parameters"], + "additionalProperties": false + } + }, + "required": ["/**"], + "additionalProperties": false +} diff --git 
a/perception/autoware_lidar_bevfusion/src/bev_ops/bev_pool_cuda.cu b/perception/autoware_lidar_bevfusion/src/bev_ops/bev_pool_cuda.cu new file mode 100644 index 0000000000000..b4e2be9da0000 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/bev_ops/bev_pool_cuda.cu @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/open-mmlab/mmdetection3d/blob/main/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.cu +// https://github.com/mit-han-lab/bevfusion/blob/main/mmdet3d/ops/bev_pool/src/bev_pool_cuda.cu +// Available under Apache-2.0 license + +#include +#include + +/* + Function: pillar pooling + Args: + b : batch size + d : depth of the feature map + h : height of pooled feature map + w : width of pooled feature map + n : number of input points + c : number of channels + n_intervals : number of unique points + x : input features, FloatTensor[n, c] + geom_feats : input coordinates, IntTensor[n, 4] + interval_starts : starting position of each pooled point, IntTensor[n_intervals] + interval_lengths : number of points in each pooled point, IntTensor[n_intervals] + out : output features, FloatTensor[b, d, h, w, c] +*/ +__global__ void bev_pool_kernel( + int b, int d, int h, int w, int n, int c, int n_intervals, const float * __restrict__ x, + const int * __restrict__ geom_feats, const int * __restrict__ interval_starts, + const int * __restrict__ interval_lengths, float * __restrict__ out) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int index = idx / c; + int cur_c = idx % c; + if (index >= n_intervals) return; + int interval_start = interval_starts[index]; + int interval_length = interval_lengths[index]; + + const int * cur_geom_feats = geom_feats + interval_start * 4; + const float * cur_x = x + interval_start * c + cur_c; + float * cur_out = out + cur_geom_feats[3] * d * h * w * c + cur_geom_feats[2] * h * w * c + + cur_geom_feats[0] * w * c + cur_geom_feats[1] * c + cur_c; + float psum = 0; + for (int i = 0; i < interval_length; i++) { + psum += cur_x[i * c]; + } + + *cur_out = psum; +} + +void bev_pool( + int b, int d, int h, int w, int n, int c, int n_intervals, const float * x, + const int * geom_feats, const int * interval_starts, const int * interval_lengths, float * out, + cudaStream_t & stream) +{ + bev_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256, 0, stream>>>( + b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out); +} diff --git a/perception/autoware_lidar_bevfusion/src/lidar_bevfusion_node.cpp b/perception/autoware_lidar_bevfusion/src/lidar_bevfusion_node.cpp new file mode 100644 index 0000000000000..9ceb246a6cfdb --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/lidar_bevfusion_node.cpp @@ -0,0 +1,341 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "autoware/lidar_bevfusion/lidar_bevfusion_node.hpp" + +#include "autoware/lidar_bevfusion/utils.hpp" + +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +LidarBEVFusionNode::LidarBEVFusionNode(const rclcpp::NodeOptions & options) +: Node("lidar_bevfusion", options), tf_buffer_(this->get_clock()) +{ + auto descriptor = rcl_interfaces::msg::ParameterDescriptor{}.set__read_only(true); + + // Modality + sensor_fusion_ = this->declare_parameter<bool>("sensor_fusion", descriptor); + + // Non network parameters + max_camera_lidar_delay_ = this->declare_parameter<double>("max_camera_lidar_delay", descriptor); + + // TensorRT parameters + const std::string plugins_path = this->declare_parameter<std::string>("plugins_path", descriptor); + + // Network parameters + const std::string onnx_path = this->declare_parameter<std::string>("onnx_path", descriptor); + const std::string engine_path = this->declare_parameter<std::string>("engine_path", descriptor); + const std::string trt_precision = + this->declare_parameter<std::string>("trt_precision", descriptor); + + // Common parameters + const auto out_size_factor = this->declare_parameter<std::int64_t>("out_size_factor", descriptor); + + auto to_float_vector = [](const auto & v) -> std::vector<float> { + return std::vector<float>(v.begin(), v.end()); + }; + + // Lidar branch parameters + const auto cloud_capacity = this->declare_parameter<std::int64_t>("cloud_capacity", descriptor); + const auto max_points_per_voxel = + this->declare_parameter<std::int64_t>("max_points_per_voxel", descriptor); + const auto voxels_num = + this->declare_parameter<std::vector<std::int64_t>>("voxels_num", descriptor); + const auto point_cloud_range = + to_float_vector(this->declare_parameter<std::vector<double>>("point_cloud_range", descriptor)); + const auto voxel_size = + to_float_vector(this->declare_parameter<std::vector<double>>("voxel_size", descriptor)); + + // Camera branch parameters + const auto d_bound = + to_float_vector(this->declare_parameter<std::vector<double>>("d_bound", descriptor)); + const auto x_bound = + to_float_vector(this->declare_parameter<std::vector<double>>("x_bound", descriptor)); + const auto y_bound = + to_float_vector(this->declare_parameter<std::vector<double>>("y_bound", descriptor)); + const auto z_bound = + to_float_vector(this->declare_parameter<std::vector<double>>("z_bound", descriptor)); + const auto num_cameras = this->declare_parameter<std::int64_t>("num_cameras", descriptor); + const auto raw_image_height = + this->declare_parameter<std::int64_t>("raw_image_height", descriptor); + const auto raw_image_width = this->declare_parameter<std::int64_t>("raw_image_width", descriptor); + const auto img_aug_scale_x = this->declare_parameter<double>("img_aug_scale_x", descriptor); + const auto img_aug_scale_y = this->declare_parameter<double>("img_aug_scale_y", descriptor); + const auto roi_height = this->declare_parameter<std::int64_t>("roi_height", descriptor); + const auto roi_width = this->declare_parameter<std::int64_t>("roi_width", descriptor); + const auto features_height = this->declare_parameter<std::int64_t>("features_height", descriptor); + const auto features_width = this->declare_parameter<std::int64_t>("features_width", descriptor); + const auto num_depth_features = this->declare_parameter<std::int64_t>("num_depth_features", descriptor); + + // Head parameters + const auto num_proposals = this->declare_parameter<std::int64_t>("num_proposals", descriptor); + class_names_ = this->declare_parameter<std::vector<std::string>>("class_names", descriptor); + + if (point_cloud_range.size() != 6) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), + "The size of point_cloud_range != 6; the default parameters will be used."); + } + if (voxel_size.size() != 3) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), + "The size of voxel_size != 3; the default parameters will be used."); + }
+  // pre-process
+  const std::string densification_world_frame_id =
+    this->declare_parameter<std::string>("densification_world_frame_id", descriptor);
+  const int densification_num_past_frames = static_cast<int>(
+    this->declare_parameter<std::int64_t>("densification_num_past_frames", descriptor));
+
+  // post-process
+  const float circle_nms_dist_threshold =
+    static_cast<float>(this->declare_parameter<double>("circle_nms_dist_threshold", descriptor));
+  {  // IoU NMS
+    NMSParams p;
+    p.nms_type_ = NMS_TYPE::IoU_BEV;
+    p.target_class_names_ = this->declare_parameter<std::vector<std::string>>(
+      "iou_nms_target_class_names", descriptor);
+    p.search_distance_2d_ =
+      this->declare_parameter<double>("iou_nms_search_distance_2d", descriptor);
+    p.iou_threshold_ = this->declare_parameter<double>("iou_nms_threshold", descriptor);
+    iou_bev_nms_.setParameters(p);
+  }
+  const auto yaw_norm_thresholds =
+    this->declare_parameter<std::vector<double>>("yaw_norm_thresholds", descriptor);
+  const float score_threshold =
+    static_cast<float>(this->declare_parameter<double>("score_threshold", descriptor));
+
+  DensificationParam densification_param(
+    densification_world_frame_id, densification_num_past_frames);
+
+  BEVFusionConfig config(
+    sensor_fusion_, plugins_path, out_size_factor, cloud_capacity, max_points_per_voxel,
+    voxels_num, point_cloud_range, voxel_size, d_bound, x_bound, y_bound, z_bound, num_cameras,
+    raw_image_height, raw_image_width, img_aug_scale_x, img_aug_scale_y, roi_height, roi_width,
+    features_height, features_width, num_depth_features, num_proposals,
+    circle_nms_dist_threshold, yaw_norm_thresholds, score_threshold);
+
+  const auto allow_remapping_by_area_matrix = this->declare_parameter<std::vector<std::int64_t>>(
+    "allow_remapping_by_area_matrix", descriptor);
+  const auto min_area_matrix =
+    this->declare_parameter<std::vector<double>>("min_area_matrix", descriptor);
+  const auto max_area_matrix =
+    this->declare_parameter<std::vector<double>>("max_area_matrix", descriptor);
+  detection_class_remapper_.setParameters(
+    allow_remapping_by_area_matrix, min_area_matrix, max_area_matrix);
+
+  auto trt_config =
+    tensorrt_common::TrtCommonConfig(onnx_path, trt_precision, engine_path, 1ULL << 32U);
+  detector_ptr_ = std::make_unique<BEVFusionTRT>(trt_config, densification_param, config);
+
+  cloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>(
+    "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1),
+    std::bind(&LidarBEVFusionNode::cloudCallback, this, std::placeholders::_1));
+
+  objects_pub_ = this->create_publisher<autoware_perception_msgs::msg::DetectedObjects>(
+    "~/output/objects", rclcpp::QoS(1));
+
+  if (sensor_fusion_) {
+    image_subs_.resize(num_cameras);
+    camera_info_subs_.resize(num_cameras);
+    image_msgs_.resize(num_cameras);
+    camera_info_msgs_.resize(num_cameras);
+    lidar2camera_extrinsics_.resize(num_cameras);
+
+    for (std::int64_t camera_id = 0; camera_id < num_cameras; ++camera_id) {
+      image_subs_[camera_id] = this->create_subscription<sensor_msgs::msg::Image>(
+        "~/input/image" + std::to_string(camera_id), rclcpp::SensorDataQoS{}.keep_last(1),
+        [this, camera_id](const sensor_msgs::msg::Image::ConstSharedPtr msg) {
+          this->imageCallback(msg, camera_id);
+        });
+
+      camera_info_subs_[camera_id] = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+        "~/input/camera_info" + std::to_string(camera_id), rclcpp::SensorDataQoS{}.keep_last(1),
+        [this, camera_id](const sensor_msgs::msg::CameraInfo & msg) {
+          this->cameraInfoCallback(msg, camera_id);
+        });
+    }
+  }
+
+  published_time_pub_ = std::make_unique<autoware::universe_utils::PublishedTimePublisher>(this);
+
+  // initialize debug tool
+  {
+    using autoware::universe_utils::DebugPublisher;
+    using autoware::universe_utils::StopWatch;
+    stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ptr_ = std::make_unique<DebugPublisher>(this, this->get_name());
+    stop_watch_ptr_->tic("cyclic");
+    stop_watch_ptr_->tic("processing/total");
+  }
+
+  if (this->declare_parameter("build_only", false, descriptor)) {
+    RCLCPP_INFO(this->get_logger(), "TensorRT engine was built. Shutting down the node.");
+    rclcpp::shutdown();
+  }
+}
+
+void LidarBEVFusionNode::cloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr pc_msg)
+{
+  lidar_frame_ = pc_msg->header.frame_id;
+
+  if (sensor_fusion_ && (!extrinsics_available_ || !images_available_ || !intrinsics_available_)) {
+    return;
+  }
+
+  if (sensor_fusion_ && !intrinsics_extrinsics_precomputed_) {
+    std::vector<sensor_msgs::msg::CameraInfo> camera_info_msgs;
+    std::vector<Matrix4f> lidar2camera_extrinsics;
+
+    std::transform(
+      camera_info_msgs_.begin(), camera_info_msgs_.end(), std::back_inserter(camera_info_msgs),
+      [](const auto & opt) { return *opt; });
+
+    std::transform(
+      lidar2camera_extrinsics_.begin(), lidar2camera_extrinsics_.end(),
+      std::back_inserter(lidar2camera_extrinsics), [](const auto & opt) { return *opt; });
+
+    detector_ptr_->setIntrinsicsExtrinsics(camera_info_msgs, lidar2camera_extrinsics);
+    intrinsics_extrinsics_precomputed_ = true;
+  }
+
+  const auto objects_sub_count =
+    objects_pub_->get_subscription_count() + objects_pub_->get_intra_process_subscription_count();
+  if (objects_sub_count < 1) {
+    return;
+  }
+
+  if (stop_watch_ptr_) {
+    stop_watch_ptr_->toc("processing/total", true);
+  }
+
+  double lidar_stamp = rclcpp::Time(pc_msg->header.stamp).seconds();
+  camera_masks_.resize(camera_info_msgs_.size());
+  for (std::size_t i = 0; i < camera_masks_.size(); ++i) {
+    camera_masks_[i] =
+      (lidar_stamp - rclcpp::Time(image_msgs_[i]->header.stamp).seconds()) <
+          max_camera_lidar_delay_
+        ? 1.f
+        : 0.f;
+  }
+
+  std::vector<Box3D> det_boxes3d;
+  std::unordered_map<std::string, double> proc_timing;
+  bool is_success = detector_ptr_->detect(
+    pc_msg, image_msgs_, camera_masks_, tf_buffer_, det_boxes3d, proc_timing);
+  if (!is_success) {
+    return;
+  }
+
+  std::vector<autoware_perception_msgs::msg::DetectedObject> raw_objects;
+  raw_objects.reserve(det_boxes3d.size());
+  for (const auto & box3d : det_boxes3d) {
+    autoware_perception_msgs::msg::DetectedObject obj;
+    box3DToDetectedObject(box3d, class_names_, obj);
+    raw_objects.emplace_back(obj);
+  }
+
+  autoware_perception_msgs::msg::DetectedObjects output_msg;
+  output_msg.header = pc_msg->header;
+  output_msg.objects = iou_bev_nms_.apply(raw_objects);
+
+  detection_class_remapper_.mapClasses(output_msg);
+
+  if (objects_sub_count > 0) {
+    objects_pub_->publish(output_msg);
+    published_time_pub_->publish_if_subscribed(objects_pub_, output_msg.header.stamp);
+  }
+
+  // add processing time for debug
+  if (debug_publisher_ptr_ && stop_watch_ptr_) {
+    const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic", true);
+    const double processing_time_ms = stop_watch_ptr_->toc("processing/total", true);
+    const double pipeline_latency_ms =
+      std::chrono::duration<double, std::milli>(
+        std::chrono::nanoseconds(
+          (this->get_clock()->now() - output_msg.header.stamp).nanoseconds()))
+        .count();
+    debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+      "debug/cyclic_time_ms", cyclic_time_ms);
+    debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+      "debug/pipeline_latency_ms", pipeline_latency_ms);
+    debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+      "debug/processing_time/total_ms", processing_time_ms);
+    for (const auto & [topic, time_ms] : proc_timing) {
+      debug_publisher_ptr_->publish<autoware_internal_debug_msgs::msg::Float64Stamped>(
+        topic, time_ms);
+    }
+  }
+}
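Note: the per-camera mask loop above originally read `camera_masks_[i] = 1.f ? cond : 0.f;`, which always evaluates to the comparison result rather than a clean 0/1 mask; it is rewritten so the condition selects between `1.f` and `0.f`. The gating reduces to a small piece of logic that is easy to test in isolation; a standalone sketch with the same semantics (names and the seconds-based stamps are illustrative, not part of this PR):

```cpp
#include <cstddef>
#include <vector>

// Sketch of the gating in cloudCallback: mask[i] selects whether camera i
// participates in the fusion for the current lidar scan.
std::vector<float> make_camera_masks(
  double lidar_stamp_s, const std::vector<double> & image_stamps_s, double max_delay_s)
{
  std::vector<float> masks(image_stamps_s.size());
  for (std::size_t i = 0; i < masks.size(); ++i) {
    const double delay_s = lidar_stamp_s - image_stamps_s[i];
    masks[i] = delay_s < max_delay_s ? 1.f : 0.f;  // fresh enough -> use camera i
  }
  return masks;
}
```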
+
+void LidarBEVFusionNode::imageCallback(
+  const sensor_msgs::msg::Image::ConstSharedPtr msg, std::size_t camera_id)
+{
+  image_msgs_[camera_id] = msg;
+
+  std::size_t num_valid_images = std::count_if(
+    image_msgs_.begin(), image_msgs_.end(),
+    [](const auto & image_msg) { return image_msg != nullptr; });
+
+  images_available_ = num_valid_images == image_msgs_.size();
+}
+
+void LidarBEVFusionNode::cameraInfoCallback(
+  const sensor_msgs::msg::CameraInfo & msg, std::size_t camera_id)
+{
+  camera_info_msgs_[camera_id] = msg;
+
+  std::size_t num_valid_intrinsics = std::count_if(
+    camera_info_msgs_.begin(), camera_info_msgs_.end(),
+    [](const auto & opt) { return opt.has_value(); });
+
+  intrinsics_available_ = num_valid_intrinsics == camera_info_msgs_.size();
+
+  if (
+    lidar2camera_extrinsics_[camera_id].has_value() || !lidar_frame_.has_value() ||
+    extrinsics_available_) {
+    return;
+  }
+
+  try {
+    geometry_msgs::msg::TransformStamped transform_stamped;
+    transform_stamped =
+      tf_buffer_.lookupTransform(msg.header.frame_id, *lidar_frame_, msg.header.stamp);
+
+    Eigen::Matrix4f lidar2camera_transform =
+      tf2::transformToEigen(transform_stamped.transform).matrix().cast<float>();
+
+    Matrix4f lidar2camera_rowmajor_transform = lidar2camera_transform.eval();
+    lidar2camera_extrinsics_[camera_id] = lidar2camera_rowmajor_transform;
+  } catch (tf2::TransformException & ex) {
+    RCLCPP_WARN(this->get_logger(), "%s", ex.what());
+    return;
+  }
+
+  std::size_t num_valid_extrinsics = std::count_if(
+    lidar2camera_extrinsics_.begin(), lidar2camera_extrinsics_.end(),
+    [](const auto & opt) { return opt.has_value(); });
+
+  extrinsics_available_ = num_valid_extrinsics == lidar2camera_extrinsics_.size();
+}
+
+}  // namespace autoware::lidar_bevfusion
+
+#include "rclcpp_components/register_node_macro.hpp"
+
+RCLCPP_COMPONENTS_REGISTER_NODE(autoware::lidar_bevfusion::LidarBEVFusionNode)
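Note: the three callbacks above share one caching pattern: per-camera data lands in `std::optional` slots, and an availability flag flips once every slot is filled, after which the expensive intrinsics/extrinsics precomputation runs exactly once. A minimal generic sketch of that readiness check, assuming nothing beyond the standard library:

```cpp
#include <algorithm>
#include <optional>
#include <vector>

// True once every per-camera slot has received its first message.
template <typename T>
bool all_slots_filled(const std::vector<std::optional<T>> & slots)
{
  return std::all_of(
    slots.begin(), slots.end(), [](const auto & slot) { return slot.has_value(); });
}
```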
diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.cpp
new file mode 100644
index 0000000000000..8f1e0bd15ca1d
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.cpp
@@ -0,0 +1,421 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin.hpp"
+
+#include "autoware/tensorrt_plugins/plugin_utils.hpp"
+
+#include 
+#include 
+#include   // cSpell:ignore spconvlib
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <tuple>
+#include <vector>
+
+// GetIndicesPairsImplicitGemm
+
+namespace nvinfer1
+{
+namespace plugin
+{
+
+GetIndicesPairsImplicitGemmPlugin::GetIndicesPairsImplicitGemmPlugin(
+  const std::string & name, GetIndicesPairsImplicitGemmParameters const & params)
+: layer_name_{name}, params_{params}
+{
+  initFieldsToSerialize();
+}
+
+void GetIndicesPairsImplicitGemmPlugin::initFieldsToSerialize()
+{
+  data_to_serialize_.clear();
+  data_to_serialize_.emplace_back("batch_size", &params_.batch_size, PluginFieldType::kINT32, 1);
+  data_to_serialize_.emplace_back("algo", &params_.algo, PluginFieldType::kINT32, 1);
+  data_to_serialize_.emplace_back("is_train", &params_.is_train, PluginFieldType::kINT32, 1);
+  data_to_serialize_.emplace_back(
+    "dilation_dims", &params_.dilation_dims, PluginFieldType::kDIMS, 1);
+  data_to_serialize_.emplace_back("ksize_dims", &params_.ksize_dims, PluginFieldType::kDIMS, 1);
+  data_to_serialize_.emplace_back(
+    "out_padding_dims", &params_.out_padding_dims, PluginFieldType::kDIMS, 1);
+  data_to_serialize_.emplace_back(
+    "padding_dims", &params_.padding_dims, PluginFieldType::kDIMS, 1);
+  data_to_serialize_.emplace_back(
+    "spatial_shape_dims", &params_.spatial_shape_dims, PluginFieldType::kDIMS, 1);
+  data_to_serialize_.emplace_back("stride_dims", &params_.stride_dims, PluginFieldType::kDIMS, 1);
+  data_to_serialize_.emplace_back(
+    "subm", &params_.subm, PluginFieldType::kINT32, 1);  // cSpell:ignore subm
+  data_to_serialize_.emplace_back("transpose", &params_.transpose, PluginFieldType::kINT32, 1);
+
+  fc_to_serialize_.nbFields = data_to_serialize_.size();
+  fc_to_serialize_.fields = data_to_serialize_.data();
+}
+
+IPluginCapability * GetIndicesPairsImplicitGemmPlugin::getCapabilityInterface(
+  PluginCapabilityType type) noexcept
+{
+  try {
+    if (type == PluginCapabilityType::kBUILD) {
+      return static_cast<IPluginV3OneBuild *>(this);
+    }
+    if (type == PluginCapabilityType::kRUNTIME) {
+      return static_cast<IPluginV3OneRuntime *>(this);
+    }
+    PLUGIN_ASSERT(type == PluginCapabilityType::kCORE);
+    return static_cast<IPluginV3OneCore *>(this);
+  } catch (std::exception const & e) {
+    caughtError(e);
+  }
+  return nullptr;
+}
+
+IPluginV3 * GetIndicesPairsImplicitGemmPlugin::clone() noexcept
+{
+  try {
+    IPluginV3 * const plugin{new GetIndicesPairsImplicitGemmPlugin{layer_name_, params_}};
+    return plugin;
+  } catch (std::exception const & e) {
+    caughtError(e);
+  }
+  return nullptr;
+}
+
+char const * GetIndicesPairsImplicitGemmPlugin::getPluginName() const noexcept
+{
+  return kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_NAME;
+}
+
+char const * GetIndicesPairsImplicitGemmPlugin::getPluginVersion() const noexcept
+{
+  return kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_VERSION;
+}
+
+char const * GetIndicesPairsImplicitGemmPlugin::getPluginNamespace() const noexcept
+{
+  return kGET_INDICES_PAIRS_IMPLICIT_GEMM_PLUGIN_NAMESPACE;
+}
+
+std::int32_t GetIndicesPairsImplicitGemmPlugin::getNbOutputs() const noexcept
+{
+  return 5;
+}
+
+std::int32_t GetIndicesPairsImplicitGemmPlugin::configurePlugin(
+  DynamicPluginTensorDesc const * in, std::int32_t num_inputs, DynamicPluginTensorDesc const * out,
+  std::int32_t num_outputs) noexcept
+{
+  // Validate input arguments.
+ PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + PLUGIN_ASSERT(in[0].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[0].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[1].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[2].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[3].desc.dims.nbDims == 1); + PLUGIN_ASSERT(out[4].desc.dims.nbDims == 0); + + std::int64_t kernel_volume = 1; + for (const std::int64_t ksize : params_.ksize) { + kernel_volume *= ksize; + } + + PLUGIN_ASSERT(in[0].desc.dims.d[1] == 4); // coords + 1 + + PLUGIN_ASSERT(out[0].desc.dims.d[1] == 4); // coords + 1 + PLUGIN_ASSERT(out[0].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[1].desc.dims.d[0] == kernel_volume); + PLUGIN_ASSERT(out[1].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[2].desc.dims.d[1] == 1); + PLUGIN_ASSERT(out[2].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[3].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[4].desc.type == in[0].desc.type); + + return 0; +} + +bool GetIndicesPairsImplicitGemmPlugin::supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept +{ + PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + + return ( + in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR && + in_out[pos].desc.type == nvinfer1::DataType::kINT32); +} + +std::int32_t GetIndicesPairsImplicitGemmPlugin::getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept +{ + PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + + output_types[0] = input_types[0]; + output_types[1] = input_types[0]; + output_types[2] = input_types[0]; + output_types[3] = input_types[0]; + output_types[4] = input_types[0]; + + return 0; +} + +std::int32_t GetIndicesPairsImplicitGemmPlugin::getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, + [[maybe_unused]] DimsExprs const * shape_inputs, [[maybe_unused]] std::int32_t num_shape_inputs, + DimsExprs * outputs, std::int32_t num_outputs, IExprBuilder & expr_builder) noexcept +{ + PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + PLUGIN_ASSERT(inputs[0].nbDims == 2); + + std::int64_t kernel_volume = 1; + + for (std::size_t i = 0; i < params_.ksize.size(); ++i) { + kernel_volume *= params_.ksize[i]; + } + + if (params_.subm) { + outputs[0] = inputs[0]; + outputs[0].d[1] = inputs[0].d[1]; + + outputs[1].nbDims = 2; + outputs[1].d[0] = expr_builder.constant(kernel_volume); + outputs[1].d[1] = inputs[0].d[0]; + + outputs[2].nbDims = 2; + outputs[2].d[0] = inputs[0].d[0]; + outputs[2].d[1] = expr_builder.constant(1); + + outputs[3].nbDims = 1; + outputs[3].d[0] = inputs[0].d[0]; + + } else { + auto opt_value = expr_builder.operation( + DimensionOperation::kCEIL_DIV, *inputs[0].d[0], *expr_builder.constant(2)); + + outputs[0].nbDims = 2; + outputs[0].d[0] = + expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_indices_num_limit_)); + outputs[0].d[1] = inputs[0].d[1]; + + outputs[1].nbDims = 2; + outputs[1].d[0] = expr_builder.constant(kernel_volume); + outputs[1].d[1] = + expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_indices_num_limit_)); + + outputs[2].nbDims = 2; + outputs[2].d[0] = + expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_indices_num_limit_)); + outputs[2].d[1] = expr_builder.constant(1); + + outputs[3].nbDims = 1; + outputs[3].d[0] = + 
expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_indices_num_limit_)); + } + + // num_activate_out + outputs[4].nbDims = 0; + + return 0; +} + +std::int32_t GetIndicesPairsImplicitGemmPlugin::enqueue( + PluginTensorDesc const * input_desc, [[maybe_unused]] PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, [[maybe_unused]] void * workspace, + cudaStream_t stream) noexcept +{ + using SpconvOps = spconvlib::spconv::csrc::sparse::all::SpconvOps; + using StaticAllocator = spconvlib::spconv::csrc::sparse::alloc::StaticAllocator; + + const bool is_subm = params_.subm; + const bool direct_table = true; + const bool use_direct_table = direct_table && !is_subm; + const int static_num_act_in = out_indices_num_limit_; + + std::vector ksize(params_.ksize.begin(), params_.ksize.end()); + std::vector stride(params_.stride.begin(), params_.stride.end()); + std::vector padding(params_.padding.begin(), params_.padding.end()); + std::vector dilation(params_.dilation.begin(), params_.dilation.end()); + std::vector input_dims(params_.spatial_shape.begin(), params_.spatial_shape.end()); + + auto out_dims = SpconvOps::get_conv_output_size(input_dims, ksize, stride, padding, dilation); + std::vector output_dims_i64(out_dims.begin(), out_dims.end()); + std::int64_t out_spatial_volume = std::accumulate( + output_dims_i64.begin(), output_dims_i64.end(), static_cast(1), + std::multiplies()); + + bool use_int64_hash_k = + out_spatial_volume >= static_cast(std::numeric_limits::max()); + + int kernel_volume = + std::accumulate(params_.ksize.begin(), params_.ksize.end(), 1, std::multiplies()); + + auto max_act_out_theory = SpconvOps::get_handcrafted_max_act_out( + input_desc[0].dims.d[0], ksize, stride, padding, dilation); + + auto ws_tensors = SpconvOps::get_indice_gen_tensors_from_workspace( + reinterpret_cast(workspace), kernel_volume, out_indices_num_limit_, + out_indices_num_limit_, max_act_out_theory, is_subm, use_int64_hash_k, use_direct_table); + + int pair_fwd_size_padded = is_subm ? input_desc[0].dims.d[0] : out_indices_num_limit_; + tv::Tensor pair_fwd_padded = + tv::from_blob(outputs[1], {kernel_volume, pair_fwd_size_padded}, tv::int32, 0); + + bool is_split_mask = + params_.algo == static_cast(tv::gemm::SparseConvAlgo::kMaskSplitImplicitGemm); + int mask_count = is_split_mask ? 2 : 1; + + tv::Tensor pair_mask_fwd_padded = + tv::from_blob(outputs[2], {mask_count, pair_fwd_size_padded}, tv::int32, 0); + tv::Tensor mask_argsort_fwd_padded = + tv::from_blob(outputs[3], {mask_count, pair_fwd_size_padded}, tv::int32, 0); + tv::Tensor out_indices = tv::from_blob( + outputs[0], {is_subm ? 
input_desc[0].dims.d[0] : out_indices_num_limit_, 4}, tv::int32, 0); + tv::Tensor indices_kernel_num = tv::zeros({kernel_volume}, tv::int32, 0); + + tv::Tensor input_indices = tv::from_blob(inputs[0], {input_desc[0].dims.d[0], 4}, tv::int32, 0); + + std::tuple pair_res; + + tv::Context ctx; + ctx.set_cuda_stream_int(reinterpret_cast(stream)); + + if (is_subm) { + out_indices.copy_(input_indices, ctx); + + ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK, pair_mask_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_MASK_ARG_SORT, mask_argsort_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_indices}); + ws_tensors.insert( + {SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num}); // cSpell:ignore INDICE + StaticAllocator alloc(ws_tensors); + + // cSpell:ignore indice + pair_res = SpconvOps::get_indice_pairs_implicit_gemm( + alloc, input_indices, params_.batch_size, input_dims, static_cast(params_.algo), ksize, + stride, padding, dilation, {0, 0, 0}, params_.subm, params_.transpose, false /*is_train*/, + reinterpret_cast(stream), out_indices_num_limit_, tv::CUDAKernelTimer(false), + use_direct_table); + + } else { + tv::Tensor pair_bwd_padded = tv::empty({kernel_volume, static_num_act_in}, tv::int32, 0); + tv::Tensor pair_mask_bwd_padded = tv::empty({mask_count, static_num_act_in}, tv::int32, 0); + tv::Tensor mask_argsort_bwd_padded = tv::empty({mask_count, static_num_act_in}, tv::int32, 0); + + ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_PAIR_BWD, pair_bwd_padded}); + + ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK, pair_mask_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK_BWD, pair_mask_bwd_padded}); + + ws_tensors.insert({SPCONV_ALLOC_MASK_ARG_SORT, mask_argsort_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_MASK_ARG_SORT_BWD, mask_argsort_bwd_padded}); + + ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_indices}); + ws_tensors.insert( + {SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num}); // cSpell:ignore INDICE + + StaticAllocator alloc(ws_tensors); + + // cSpell:ignore indice + pair_res = SpconvOps::get_indice_pairs_implicit_gemm( + alloc, input_indices, params_.batch_size, input_dims, static_cast(params_.algo), ksize, + stride, padding, dilation, {0, 0, 0}, params_.subm, params_.transpose, false /*is_train*/, + reinterpret_cast(stream), out_indices_num_limit_, tv::CUDAKernelTimer(false), + use_direct_table); + } + + std::int32_t num_act_out_real = std::get<1>(pair_res); + std::int32_t * num_act_out_data = static_cast(outputs[4]); + + cudaError_t const status = cudaMemcpyAsync( + num_act_out_data, &num_act_out_real, sizeof(std::int32_t), cudaMemcpyHostToDevice, stream); + + return status; +} + +std::int32_t GetIndicesPairsImplicitGemmPlugin::onShapeChange( + [[maybe_unused]] PluginTensorDesc const * in, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] PluginTensorDesc const * out, [[maybe_unused]] std::int32_t num_outputs) noexcept +{ + return 0; +} + +IPluginV3 * GetIndicesPairsImplicitGemmPlugin::attachToContext( + [[maybe_unused]] IPluginResourceContext * context) noexcept +{ + return clone(); +} + +PluginFieldCollection const * GetIndicesPairsImplicitGemmPlugin::getFieldsToSerialize() noexcept +{ + return &fc_to_serialize_; +} + +std::size_t GetIndicesPairsImplicitGemmPlugin::getWorkspaceSize( + [[maybe_unused]] DynamicPluginTensorDesc const * inputs, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] DynamicPluginTensorDesc const * 
outputs, + [[maybe_unused]] std::int32_t num_outputs) const noexcept +{ + using SpconvOps = spconvlib::spconv::csrc::sparse::all::SpconvOps; + + bool is_subm = params_.subm; + const bool direct_table = true; + const bool use_direct_table = direct_table && !is_subm; + + std::vector ksize(params_.ksize.begin(), params_.ksize.end()); + std::vector stride(params_.stride.begin(), params_.stride.end()); + std::vector padding(params_.padding.begin(), params_.padding.end()); + std::vector dilation(params_.dilation.begin(), params_.dilation.end()); + std::vector input_dims(params_.spatial_shape.begin(), params_.spatial_shape.end()); + + auto out_dims = SpconvOps::get_conv_output_size(input_dims, ksize, stride, padding, dilation); + std::vector output_dims_i64(out_dims.begin(), out_dims.end()); + std::int64_t out_spatial_volume = std::accumulate( + output_dims_i64.begin(), output_dims_i64.end(), static_cast(1), + std::multiplies()); + bool use_int64_hash_k = + out_spatial_volume >= static_cast(std::numeric_limits::max()); + + int kernel_volume = + std::accumulate(params_.ksize.begin(), params_.ksize.end(), 1, std::multiplies()); + + // query workspace size. + int workspace_size = SpconvOps::get_indice_gen_workspace_size( + kernel_volume, out_indices_num_limit_, out_indices_num_limit_, out_indices_num_limit_, is_subm, + use_int64_hash_k, use_direct_table); + + return static_cast(workspace_size); +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.cpp new file mode 100644 index 0000000000000..e5396a10ab022 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.cpp @@ -0,0 +1,320 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.hpp" + +#include "autoware/tensorrt_plugins//get_indices_pairs_implicit_gemm_plugin.hpp" +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +REGISTER_TENSORRT_PLUGIN(GetIndicesPairsImplicitGemmPluginCreator); + +GetIndicesPairsImplicitGemmPluginCreator::GetIndicesPairsImplicitGemmPluginCreator() +{ + plugin_attributes_.clear(); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("algo", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("batch_size", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("dilation", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("is_train", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("ksize", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("out_padding", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("padding", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("spatial_shape", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("stride", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("subm", nullptr, PluginFieldType::kINT32, 1)); // cSpell:ignore subm + plugin_attributes_.emplace_back( + nvinfer1::PluginField("transpose", nullptr, PluginFieldType::kINT32, 1)); + + fc_.nbFields = plugin_attributes_.size(); + fc_.fields = plugin_attributes_.data(); +} + +nvinfer1::PluginFieldCollection const * +GetIndicesPairsImplicitGemmPluginCreator::getFieldNames() noexcept +{ + // This is only used in the build phase. + return &fc_; +} + +IPluginV3 * GetIndicesPairsImplicitGemmPluginCreator::createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept +{ + // The build phase and the deserialization phase are handled differently. + if (phase == TensorRTPhase::kBUILD || phase == TensorRTPhase::kRUNTIME) { + // The attributes from the ONNX node will be parsed and passed via fc. 
+ try { + nvinfer1::PluginField const * fields{fc->fields}; + std::int32_t num_fields{fc->nbFields}; + + PLUGIN_VALIDATE(num_fields == 11); + + GetIndicesPairsImplicitGemmParameters parameters; + + for (std::int32_t i{0}; i < num_fields; ++i) { + const std::string attr_name = fields[i].name; + const nvinfer1::PluginFieldType type = fields[i].type; + + if (attr_name == "batch_size") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.batch_size = static_cast(fields[i].data)[0]; + } + if (attr_name == "algo") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.algo = static_cast(fields[i].data)[0]; + } + if (attr_name == "is_train") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.is_train = static_cast(fields[i].data)[0]; + } + if (attr_name == "dilation") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const dilation_data{ + static_cast(fields[i].data)}; + + parameters.dilation_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.dilation.push_back(dilation_data[j]); + parameters.dilation_dims.d[j] = static_cast(dilation_data[j]); + } + } + if (attr_name == "dilation_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.dilation_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.dilation_dims.nbDims; ++j) { + parameters.dilation.push_back(static_cast(parameters.dilation_dims.d[j])); + } + } + if (attr_name == "ksize") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const ksize_data{static_cast(fields[i].data)}; + parameters.ksize_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.ksize.push_back(ksize_data[j]); + parameters.ksize_dims.d[j] = static_cast(ksize_data[j]); + } + } + if (attr_name == "ksize_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.ksize_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.ksize_dims.nbDims; ++j) { + parameters.ksize.push_back(static_cast(parameters.ksize_dims.d[j])); + } + } + if (attr_name == "out_padding") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const out_padding_data{ + static_cast(fields[i].data)}; + parameters.out_padding_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.out_padding.push_back(out_padding_data[j]); + parameters.out_padding_dims.d[j] = static_cast(out_padding_data[j]); + } + } + if (attr_name == "out_padding_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.out_padding_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.out_padding_dims.nbDims; ++j) { + parameters.out_padding.push_back( + static_cast(parameters.out_padding_dims.d[j])); + } + } + if (attr_name == "padding") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const padding_data{ + static_cast(fields[i].data)}; + parameters.padding_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.padding.push_back(padding_data[j]); + parameters.padding_dims.d[j] = static_cast(padding_data[j]); + } + } + if (attr_name == "padding_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.padding_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j 
< parameters.padding_dims.nbDims; ++j) { + parameters.padding.push_back(static_cast(parameters.padding_dims.d[j])); + } + } + if (attr_name == "spatial_shape") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const spatial_shape_data{ + static_cast(fields[i].data)}; + parameters.spatial_shape_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.spatial_shape.push_back(spatial_shape_data[j]); + parameters.spatial_shape_dims.d[j] = static_cast(spatial_shape_data[j]); + } + } + if (attr_name == "spatial_shape_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.spatial_shape_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.spatial_shape_dims.nbDims; ++j) { + parameters.spatial_shape.push_back( + static_cast(parameters.spatial_shape_dims.d[j])); + } + } + if (attr_name == "stride") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const stride_data{static_cast(fields[i].data)}; + parameters.stride_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.stride.push_back(stride_data[j]); + parameters.stride_dims.d[j] = static_cast(stride_data[j]); + } + } + if (attr_name == "stride_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.stride_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.stride_dims.nbDims; ++j) { + parameters.stride.push_back(static_cast(parameters.stride_dims.d[j])); + } + } + if (attr_name == "subm") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.subm = static_cast(fields[i].data)[0]; + } + if (attr_name == "transpose") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.transpose = static_cast(fields[i].data)[0]; + } + } + + // Log the attributes parsed from ONNX node. 
+ std::stringstream ss; + ss << name << " plugin Attributes:"; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "batch_size: " << parameters.batch_size; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "algo: " << parameters.algo; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "is_train: " << parameters.is_train; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "dilation: "; + for (auto const & val : parameters.dilation) { + ss << val << " "; + } + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "ksize: "; + for (auto const & val : parameters.ksize) { + ss << val << " "; + } + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "out_padding: "; + for (auto const & val : parameters.out_padding) { + ss << val << " "; + } + logDebug(ss.str().c_str()); + + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "padding: "; + for (auto const & val : parameters.padding) { + ss << val << " "; + } + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "spatial_shape: "; + for (auto const & val : parameters.spatial_shape) { + ss << val << " "; + } + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "stride: "; + for (auto const & val : parameters.stride) { + ss << val << " "; + } + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "subm: " << parameters.subm; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "transpose: " << parameters.transpose; + logDebug(ss.str().c_str()); + + GetIndicesPairsImplicitGemmPlugin * const plugin{ + new GetIndicesPairsImplicitGemmPlugin{std::string(name), parameters}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; + } else if (phase == TensorRTPhase::kRUNTIME) { + // The attributes from the serialized plugin will be passed via fc. + try { + nvinfer1::PluginField const * fields{fc->fields}; + std::int32_t num_fields{fc->nbFields}; + PLUGIN_VALIDATE(num_fields == 1); + + char const * attr_name = fields[0].name; + PLUGIN_VALIDATE(!strcmp(attr_name, "parameters")); + PLUGIN_VALIDATE(fields[0].type == nvinfer1::PluginFieldType::kUNKNOWN); + PLUGIN_VALIDATE(fields[0].length == sizeof(GetIndicesPairsImplicitGemmParameters)); + GetIndicesPairsImplicitGemmParameters params{ + *(static_cast(fields[0].data))}; + + GetIndicesPairsImplicitGemmPlugin * const plugin{ + new GetIndicesPairsImplicitGemmPlugin{std::string(name), params}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; + } else { + return nullptr; + } +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin.cpp new file mode 100644 index 0000000000000..4d0b7824c96c1 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin.cpp @@ -0,0 +1,331 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "autoware/tensorrt_plugins/implicit_gemm_plugin.hpp"
+
+#include "autoware/tensorrt_plugins/plugin_utils.hpp"
+
+#include 
+#include 
+#include   // cSpell:ignore spconvlib
+#include 
+#include 
+#include 
+#include 
+
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// ImplicitGemm
+
+namespace nvinfer1
+{
+namespace plugin
+{
+
+ImplicitGemmPlugin::ImplicitGemmPlugin(
+  const std::string & name, ImplicitGemmParameters const & params)
+: layer_name_{name}, params_{params}
+{
+  using ConvGemmOps = spconvlib::spconv::csrc::sparse::convops::spops::ConvGemmOps;
+  using ConvMain = spconvlib::cumm::conv::main::ConvMainUnitTest;
+
+  initFieldsToSerialize();
+
+  arch_ = ConvGemmOps::get_compute_capability();
+  tuner_fp16_ptr_ =
+    std::make_unique<ConvTunerSimple>(ConvMain::get_all_conv_algo_desp());  // cSpell:ignore desp
+  tuner_fp32_ptr_ = std::make_unique<ConvTunerSimple>(ConvMain::get_all_conv_algo_desp());
+}
+
+void ImplicitGemmPlugin::initFieldsToSerialize()
+{
+  data_to_serialize_.clear();
+  data_to_serialize_.emplace_back("act_alpha", &params_.act_alpha, PluginFieldType::kFLOAT32, 1);
+  data_to_serialize_.emplace_back("act_beta", &params_.act_beta, PluginFieldType::kFLOAT32, 1);
+
+  data_to_serialize_.emplace_back(
+    "is_subm", &params_.is_subm, PluginFieldType::kINT32, 1);  // cSpell:ignore subm
+  data_to_serialize_.emplace_back("is_train", &params_.is_train, PluginFieldType::kINT32, 1);
+
+  data_to_serialize_.emplace_back(
+    "output_add_scale", &params_.output_add_scale, PluginFieldType::kFLOAT32, 1);
+  data_to_serialize_.emplace_back(
+    "output_scale", &params_.output_scale, PluginFieldType::kFLOAT32, 1);
+
+  fc_to_serialize_.nbFields = data_to_serialize_.size();
+  fc_to_serialize_.fields = data_to_serialize_.data();
+}
+
+IPluginCapability * ImplicitGemmPlugin::getCapabilityInterface(PluginCapabilityType type) noexcept
+{
+  try {
+    if (type == PluginCapabilityType::kBUILD) {
+      return static_cast<IPluginV3OneBuild *>(this);
+    }
+    if (type == PluginCapabilityType::kRUNTIME) {
+      return static_cast<IPluginV3OneRuntime *>(this);
+    }
+    PLUGIN_ASSERT(type == PluginCapabilityType::kCORE);
+    return static_cast<IPluginV3OneCore *>(this);
+  } catch (std::exception const & e) {
+    caughtError(e);
+  }
+  return nullptr;
+}
+
+IPluginV3 * ImplicitGemmPlugin::clone() noexcept
+{
+  try {
+    IPluginV3 * const plugin{new ImplicitGemmPlugin{layer_name_, params_}};
+    return plugin;
+  } catch (std::exception const & e) {
+    caughtError(e);
+  }
+  return nullptr;
+}
+
+char const * ImplicitGemmPlugin::getPluginName() const noexcept
+{
+  return kIMPLICIT_GEMM_PLUGIN_NAME;
+}
+
+char const * ImplicitGemmPlugin::getPluginVersion() const noexcept
+{
+  return kIMPLICIT_GEMM_PLUGIN_VERSION;
+}
+
+char const * ImplicitGemmPlugin::getPluginNamespace() const noexcept
+{
+  return kIMPLICIT_GEMM_PLUGIN_NAMESPACE;
+}
+
+std::int32_t ImplicitGemmPlugin::getNbOutputs() const noexcept
+{
+  return 1;
+}
+
+std::int32_t ImplicitGemmPlugin::configurePlugin(
+  DynamicPluginTensorDesc const * in, std::int32_t num_inputs, DynamicPluginTensorDesc const * out,
+  std::int32_t num_outputs) noexcept
+{
+  // Validate input arguments.
+ PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + PLUGIN_ASSERT(in[INOUT_IN_FEATURES_INDEX].desc.dims.nbDims == 2); + PLUGIN_ASSERT(in[INOUT_FILTERS_INDEX].desc.dims.nbDims == 5); + PLUGIN_ASSERT(in[INOUT_PAIR_FWD_INDEX].desc.dims.nbDims == 2); + PLUGIN_ASSERT(in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.dims.nbDims == 2); + PLUGIN_ASSERT(in[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].desc.dims.nbDims == 1); + PLUGIN_ASSERT(out[0].desc.dims.nbDims == 2); + PLUGIN_ASSERT( + in[INOUT_FILTERS_INDEX].desc.dims.d[4] == in[INOUT_IN_FEATURES_INDEX].desc.dims.d[1]); + PLUGIN_ASSERT( + in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.dims.d[0] == in[INOUT_PAIR_FWD_INDEX].desc.dims.d[1]); + PLUGIN_ASSERT(in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.dims.d[1] == 1); + PLUGIN_ASSERT( + in[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].desc.dims.d[0] == + in[INOUT_PAIR_FWD_INDEX].desc.dims.d[1]); + PLUGIN_ASSERT(in[INOUT_IN_FEATURES_INDEX].desc.type == in[INOUT_FILTERS_INDEX].desc.type); + PLUGIN_ASSERT(in[INOUT_IN_FEATURES_INDEX].desc.type == out[0].desc.type); + PLUGIN_ASSERT( + in[INOUT_PAIR_FWD_INDEX].desc.type == in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.type); + PLUGIN_ASSERT( + in[INOUT_PAIR_FWD_INDEX].desc.type == in[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].desc.type); + + return 0; +} + +bool ImplicitGemmPlugin::supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept +{ + PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + + bool supported = in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR; + + switch (pos) { + case INOUT_IN_FEATURES_INDEX: + supported &= + (in_out[pos].desc.type == nvinfer1::DataType::kFLOAT || + in_out[pos].desc.type == nvinfer1::DataType::kHALF); + break; + case INOUT_FILTERS_INDEX: + case INOUT_OUT_FEATURES_INDEX: + supported &= in_out[pos].desc.type == in_out[INOUT_IN_FEATURES_INDEX].desc.type; + break; + case INOUT_PAIR_FWD_INDEX: + case INOUT_PAIR_MASK_FWD_SPLITS_INDEX: + case INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX: + supported &= in_out[pos].desc.type == nvinfer1::DataType::kINT32; + break; + default: + supported = false; + break; + } + + return supported; +} + +std::int32_t ImplicitGemmPlugin::getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept +{ + PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + + output_types[0] = input_types[INOUT_IN_FEATURES_INDEX]; + + return 0; +} + +std::int32_t ImplicitGemmPlugin::getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, + [[maybe_unused]] DimsExprs const * shape_inputs, [[maybe_unused]] std::int32_t num_shape_inputs, + DimsExprs * outputs, std::int32_t num_outputs, + [[maybe_unused]] IExprBuilder & expr_builder) noexcept +{ + PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + PLUGIN_ASSERT(inputs[0].nbDims == 2); + + outputs[0].nbDims = 2; + outputs[0].d[0] = inputs[3].d[0]; + outputs[0].d[1] = inputs[1].d[0]; + + return 0; +} + +std::int32_t ImplicitGemmPlugin::enqueue( + PluginTensorDesc const * input_desc, [[maybe_unused]] PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, [[maybe_unused]] void * workspace, + cudaStream_t stream) noexcept +{ + using StaticAllocator = spconvlib::spconv::csrc::sparse::alloc::StaticAllocator; + using ConvGemmOps = spconvlib::spconv::csrc::sparse::convops::spops::ConvGemmOps; + + std::int64_t num_act_in = 
input_desc[INOUT_IN_FEATURES_INDEX].dims.d[0]; + std::int64_t num_in_features = input_desc[INOUT_IN_FEATURES_INDEX].dims.d[1]; + // std::int64_t kernel_volume = input_desc[INOUT_PAIR_FWD_INDEX].dims.d[0]; + std::int64_t num_act_out = input_desc[INOUT_PAIR_FWD_INDEX].dims.d[1]; + std::int64_t num_out_features = input_desc[INOUT_FILTERS_INDEX].dims.d[0]; + + auto in_features_type = input_desc[INOUT_IN_FEATURES_INDEX].type; + [[maybe_unused]] auto filters_type = input_desc[INOUT_FILTERS_INDEX].type; + [[maybe_unused]] auto out_features_type = input_desc[INOUT_OUT_FEATURES_INDEX].type; + + assert(in_features_type == filters_type); + assert(in_features_type == out_features_type); + + auto dtype = in_features_type == DataType::kFLOAT ? tv::float32 : tv::float16; + + tv::Tensor input_features = + tv::from_blob(inputs[INOUT_IN_FEATURES_INDEX], {num_act_in, num_in_features}, dtype, 0); + + tv::Tensor weights = tv::from_blob( + inputs[INOUT_FILTERS_INDEX], + {input_desc[INOUT_FILTERS_INDEX].dims.d[0], input_desc[INOUT_FILTERS_INDEX].dims.d[1], + input_desc[INOUT_FILTERS_INDEX].dims.d[2], input_desc[INOUT_FILTERS_INDEX].dims.d[3], + input_desc[INOUT_FILTERS_INDEX].dims.d[4]}, + dtype, 0); + + tv::Tensor pair_fwd = tv::from_blob( + inputs[INOUT_PAIR_FWD_INDEX], + {input_desc[INOUT_PAIR_FWD_INDEX].dims.d[0], input_desc[INOUT_PAIR_FWD_INDEX].dims.d[1]}, + tv::int32, 0); + + tv::Tensor pair_mask_fwd_splits = tv::from_blob( + inputs[INOUT_PAIR_MASK_FWD_SPLITS_INDEX], + {1, input_desc[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].dims.d[0]}, tv::int32, 0); + + tv::Tensor mask_argsort_fwd_splits = tv::from_blob( + inputs[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX], + { + 1, + input_desc[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].dims.d[0], + }, + tv::int32, 0); + + PLUGIN_ASSERT( + input_desc[INOUT_PAIR_FWD_INDEX].dims.d[1] == + input_desc[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].dims.d[0]); + PLUGIN_ASSERT(input_desc[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].dims.nbDims == 1); + + tv::Tensor out_features = tv::from_blob(outputs[0], {num_act_out, num_out_features}, dtype, 0); + + tv::Tensor mask_tensor = tv::zeros({1}, tv::uint32, -1); + + auto mask_tensor_ptr = mask_tensor.data_ptr(); + mask_tensor_ptr[0] = 0xffffffff; + + std::vector pair_mask_splits; + std::vector mask_argsort_splits; + + pair_mask_splits.push_back(pair_mask_fwd_splits); + mask_argsort_splits.push_back(mask_argsort_fwd_splits); + + std::unordered_map tensor_dict{ + {SPCONV_ALLOC_FEATURES, input_features}, + {SPCONV_ALLOC_FILTERS, weights}, + {SPCONV_ALLOC_OUT_FEATURES, out_features}}; + StaticAllocator alloc2(tensor_dict); + + auto & tuner_ptr = dtype == tv::float32 ? 
tuner_fp32_ptr_ : tuner_fp16_ptr_; + + auto conv_run_status = ConvGemmOps::implicit_gemm( + alloc2, *tuner_ptr, input_features, weights, pair_fwd, pair_mask_splits, mask_argsort_splits, + num_act_out, mask_tensor, arch_, false, params_.is_subm, + reinterpret_cast(stream), tv::CUDAKernelTimer(false), true, false, tv::Tensor(), + 0.0, 0.0, tv::gemm::Activation::kNone, false, 1.0, tv::Tensor(), tv::Tensor(), 0.0, -1); + + return 0; +} + +std::int32_t ImplicitGemmPlugin::onShapeChange( + [[maybe_unused]] PluginTensorDesc const * in, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] PluginTensorDesc const * out, [[maybe_unused]] std::int32_t num_outputs) noexcept +{ + return 0; +} + +IPluginV3 * ImplicitGemmPlugin::attachToContext( + [[maybe_unused]] IPluginResourceContext * context) noexcept +{ + return clone(); +} + +PluginFieldCollection const * ImplicitGemmPlugin::getFieldsToSerialize() noexcept +{ + return &fc_to_serialize_; +} + +std::size_t ImplicitGemmPlugin::getWorkspaceSize( + [[maybe_unused]] DynamicPluginTensorDesc const * inputs, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] DynamicPluginTensorDesc const * outputs, + [[maybe_unused]] std::int32_t num_outputs) const noexcept +{ + return 0; +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp new file mode 100644 index 0000000000000..20e498da9c617 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp @@ -0,0 +1,174 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp" + +#include "autoware/tensorrt_plugins/implicit_gemm_plugin.hpp" +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +REGISTER_TENSORRT_PLUGIN(ImplicitGemmPluginCreator); + +ImplicitGemmPluginCreator::ImplicitGemmPluginCreator() +{ + std::cout << "ImplicitGemmPluginCreator::ImplicitGemmPluginCreator" << std::endl << std::flush; + + plugin_attributes_.clear(); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("act_alpha", nullptr, PluginFieldType::kFLOAT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("act_beta", nullptr, PluginFieldType::kFLOAT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("is_subm", nullptr, PluginFieldType::kINT32, 1)); // cSpell:ignore subm + plugin_attributes_.emplace_back( + nvinfer1::PluginField("is_train", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("output_add_scale", nullptr, PluginFieldType::kFLOAT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("output_scale", nullptr, PluginFieldType::kFLOAT32, 1)); + + fc_.nbFields = plugin_attributes_.size(); + fc_.fields = plugin_attributes_.data(); +} + +nvinfer1::PluginFieldCollection const * ImplicitGemmPluginCreator::getFieldNames() noexcept +{ + // This is only used in the build phase. + return &fc_; +} + +IPluginV3 * ImplicitGemmPluginCreator::createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept +{ + // The build phase and the deserialization phase are handled differently. + if (phase == TensorRTPhase::kBUILD || phase == TensorRTPhase::kRUNTIME) { + // The attributes from the ONNX node will be parsed and passed via fc. + try { + nvinfer1::PluginField const * fields{fc->fields}; + std::int32_t num_fields{fc->nbFields}; + + PLUGIN_VALIDATE(num_fields == 6); + + ImplicitGemmParameters parameters; + + for (std::int32_t i{0}; i < num_fields; ++i) { + const std::string attr_name = fields[i].name; + const nvinfer1::PluginFieldType type = fields[i].type; + + if (attr_name == "act_alpha") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.act_alpha = static_cast(fields[i].data)[0]; + } + + if (attr_name == "act_beta") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.act_beta = static_cast(fields[i].data)[0]; + } + + if (attr_name == "is_subm") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.is_subm = static_cast(fields[i].data)[0]; + } + + if (attr_name == "is_train") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.is_train = static_cast(fields[i].data)[0]; + } + + if (attr_name == "output_add_scale") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.output_add_scale = static_cast(fields[i].data)[0]; + } + + if (attr_name == "output_scale") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.output_scale = static_cast(fields[i].data)[0]; + } + } + + // Log the attributes parsed from ONNX node. 
+ std::stringstream ss; + ss << name << " plugin Attributes:"; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "act_alpha: " << parameters.act_alpha; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "act_beta: " << parameters.act_beta; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "is_subm: " << parameters.is_subm; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "is_train: " << parameters.is_train; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "output_add_scale: " << parameters.output_add_scale; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "output_scale: " << parameters.output_scale; + logDebug(ss.str().c_str()); + + ImplicitGemmPlugin * const plugin{new ImplicitGemmPlugin{std::string(name), parameters}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; + } else if (phase == TensorRTPhase::kRUNTIME) { + // The attributes from the serialized plugin will be passed via fc. + try { + nvinfer1::PluginField const * fields{fc->fields}; + std::int32_t num_fields{fc->nbFields}; + PLUGIN_VALIDATE(num_fields == 1); + + char const * attr_name = fields[0].name; + PLUGIN_VALIDATE(!strcmp(attr_name, "parameters")); + PLUGIN_VALIDATE(fields[0].type == nvinfer1::PluginFieldType::kUNKNOWN); + PLUGIN_VALIDATE(fields[0].length == sizeof(ImplicitGemmParameters)); + ImplicitGemmParameters params{*(static_cast(fields[0].data))}; + + ImplicitGemmPlugin * const plugin{new ImplicitGemmPlugin{std::string(name), params}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; + } else { + return nullptr; + } +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_registration.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_registration.cpp new file mode 100644 index 0000000000000..8cd5f5601476b --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_registration.cpp @@ -0,0 +1,73 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_plugins/get_indices_pairs_implicit_gemm_plugin_creator.hpp" +#include "autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp" +#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp" + +#include + +#include +#include +#include + +class ThreadSafeLoggerFinder +{ +public: + ThreadSafeLoggerFinder() = default; + + // Set the logger finder. + void setLoggerFinder(nvinfer1::ILoggerFinder * finder) + { + std::lock_guard lk(mutex_); + if (logger_finder_ == nullptr && finder != nullptr) { + logger_finder_ = finder; + } + } + + // Get the logger. 
+ nvinfer1::ILogger * getLogger() noexcept + { + std::lock_guard lk(mutex_); + if (logger_finder_ != nullptr) { + return logger_finder_->findLogger(); + } + return nullptr; + } + +private: + nvinfer1::ILoggerFinder * logger_finder_{nullptr}; + std::mutex mutex_; +}; + +ThreadSafeLoggerFinder logger_finder; + +extern "C" void setLoggerFinder(nvinfer1::ILoggerFinder * finder) +{ + logger_finder.setLoggerFinder(finder); +} + +extern "C" nvinfer1::IPluginCreatorInterface * const * getCreators(std::int32_t & num_creators) +{ + num_creators = 3; + static nvinfer1::plugin::QuickCumsumCudaPluginCreator quick_cumsum_cuda_plugin_creator{}; + static nvinfer1::plugin::GetIndicesPairsImplicitGemmPluginCreator + get_indices_pairs_implicit_gemm_plugin_creator{}; + static nvinfer1::plugin::ImplicitGemmPluginCreator implicit_gemm_plugin_creator{}; + + static nvinfer1::IPluginCreatorInterface * const plugin_creator_list[] = { + &quick_cumsum_cuda_plugin_creator, &get_indices_pairs_implicit_gemm_plugin_creator, + &implicit_gemm_plugin_creator}; + return plugin_creator_list; +} diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp new file mode 100644 index 0000000000000..cabc121e30910 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp @@ -0,0 +1,57 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include + +#include +#include +#include + +void caughtError(std::exception const & e) +{ + getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, e.what()); +} + +void logDebug(char const * msg) +{ + getLogger()->log(nvinfer1::ILogger::Severity::kVERBOSE, msg); +} + +void logInfo(char const * msg) +{ + getLogger()->log(nvinfer1::ILogger::Severity::kINFO, msg); +} + +void reportAssertion(bool success, char const * msg, char const * file, std::int32_t line) +{ + if (!success) { + std::ostringstream stream; + stream << "Assertion failed: " << msg << std::endl + << file << ':' << line << std::endl + << "Aborting..." << std::endl; + getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); + std::abort(); + } +} + +void reportValidation(bool success, char const * msg, char const * file, std::int32_t line) +{ + if (!success) { + std::ostringstream stream; + stream << "Validation failed: " << msg << std::endl << file << ':' << line << std::endl; + getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); + } +} diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp new file mode 100644 index 0000000000000..8fa0afb2795a1 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp @@ -0,0 +1,248 @@ +// Copyright 2025 (c) OpenMMLab. 
diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp
new file mode 100644
index 0000000000000..cabc121e30910
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp
@@ -0,0 +1,57 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/tensorrt_plugins/plugin_utils.hpp"
+
+#include <NvInferRuntime.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <sstream>
+
+void caughtError(std::exception const & e)
+{
+  getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, e.what());
+}
+
+void logDebug(char const * msg)
+{
+  getLogger()->log(nvinfer1::ILogger::Severity::kVERBOSE, msg);
+}
+
+void logInfo(char const * msg)
+{
+  getLogger()->log(nvinfer1::ILogger::Severity::kINFO, msg);
+}
+
+void reportAssertion(bool success, char const * msg, char const * file, std::int32_t line)
+{
+  if (!success) {
+    std::ostringstream stream;
+    stream << "Assertion failed: " << msg << std::endl
+           << file << ':' << line << std::endl
+           << "Aborting..." << std::endl;
+    getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str());
+    std::abort();
+  }
+}
+
+void reportValidation(bool success, char const * msg, char const * file, std::int32_t line)
+{
+  if (!success) {
+    std::ostringstream stream;
+    stream << "Validation failed: " << msg << std::endl << file << ':' << line << std::endl;
+    getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str());
+  }
+}
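Editor's note: plugin_utils.hpp is not part of this excerpt. Judging from reportAssertion and reportValidation above, the PLUGIN_ASSERT and PLUGIN_VALIDATE macros used throughout the plugins are presumably thin wrappers along these lines; PLUGIN_VALIDATE plausibly throws so the surrounding try/catch blocks in the creators can bail out with nullptr. A sketch only, not the actual header:

  #include <stdexcept>

  // Sketch: assert aborts via reportAssertion(); validate logs and throws so
  // createPlugin()'s catch block can convert the failure into a nullptr return.
  #define PLUGIN_ASSERT(cond) reportAssertion((cond), #cond, __FILE__, __LINE__)
  #define PLUGIN_VALIDATE(cond)                               \
    do {                                                      \
      if (!(cond)) {                                          \
        reportValidation(false, #cond, __FILE__, __LINE__);   \
        throw std::invalid_argument(#cond);                   \
      }                                                       \
    } while (0)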
diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp
new file mode 100644
index 0000000000000..8fa0afb2795a1
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp
@@ -0,0 +1,248 @@
+// Copyright 2025 (c) OpenMMLab. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Modified from
+// https://github.com/open-mmlab/mmdetection3d/blob/main/projects/BEVFusion/bevfusion/ops/bev_pool/src/bev_pool_cuda.cu
+// https://github.com/mit-han-lab/bevfusion/blob/main/mmdet3d/ops/bev_pool/src/bev_pool_cuda.cu
+
+#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp"
+
+#include "autoware/bev_ops/bev_pool_cuda.hpp"
+#include "autoware/tensorrt_plugins/plugin_utils.hpp"
+
+#include <NvInferRuntime.h>
+#include <cuda_runtime_api.h>
+
+#include <iostream>  // NOTE(knzo25): temporary
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace nvinfer1
+{
+namespace plugin
+{
+
+QuickCumsumCudaPlugin::QuickCumsumCudaPlugin(
+  const std::string & name, QuickCumsumCudaParameters const & params)
+: layer_name_{name}, params_{params}
+{
+  initFieldsToSerialize();
+}
+
+void QuickCumsumCudaPlugin::initFieldsToSerialize()
+{
+  data_to_serialize_.clear();
+  data_to_serialize_.emplace_back("batch_size", &params_.batch_size, PluginFieldType::kINT32, 1);
+  data_to_serialize_.emplace_back("dimension", &params_.dimension, PluginFieldType::kINT32, 1);
+  data_to_serialize_.emplace_back("height", &params_.height, PluginFieldType::kINT32, 1);
+  data_to_serialize_.emplace_back("width", &params_.width, PluginFieldType::kINT32, 1);
+
+  fc_to_serialize_.nbFields = data_to_serialize_.size();
+  fc_to_serialize_.fields = data_to_serialize_.data();
+}
+
+IPluginCapability * QuickCumsumCudaPlugin::getCapabilityInterface(
+  PluginCapabilityType type) noexcept
+{
+  try {
+    if (type == PluginCapabilityType::kBUILD) {
+      return static_cast<IPluginV3OneBuild *>(this);
+    }
+    if (type == PluginCapabilityType::kRUNTIME) {
+      return static_cast<IPluginV3OneRuntime *>(this);
+    }
+    PLUGIN_ASSERT(type == PluginCapabilityType::kCORE);
+    return static_cast<IPluginV3OneCore *>(this);
+  } catch (std::exception const & e) {
+    caughtError(e);
+  }
+  return nullptr;
+}
+
+IPluginV3 * QuickCumsumCudaPlugin::clone() noexcept
+{
+  try {
+    IPluginV3 * const plugin{new QuickCumsumCudaPlugin{layer_name_, params_}};
+    return plugin;
+  } catch (std::exception const & e) {
+    caughtError(e);
+  }
+  return nullptr;
+}
+
+char const * QuickCumsumCudaPlugin::getPluginName() const noexcept
+{
+  return kQUICK_CUMSUM_CUDA_PLUGIN_NAME;
+}
+
+char const * QuickCumsumCudaPlugin::getPluginVersion() const noexcept
+{
+  return kQUICK_CUMSUM_CUDA_PLUGIN_VERSION;
+}
+
+char const * QuickCumsumCudaPlugin::getPluginNamespace() const noexcept
+{
+  return kQUICK_CUMSUM_CUDA_PLUGIN_NAMESPACE;
+}
+
+std::int32_t QuickCumsumCudaPlugin::getNbOutputs() const noexcept
+{
+  return 1;
+}
+
+std::int32_t QuickCumsumCudaPlugin::configurePlugin(
+  DynamicPluginTensorDesc const * in, std::int32_t num_inputs, DynamicPluginTensorDesc const * out,
+  std::int32_t num_outputs) noexcept
+{
+  // Validate input arguments.
+  PLUGIN_ASSERT(num_inputs == 4);
+  PLUGIN_ASSERT(num_outputs == 1);
+  PLUGIN_ASSERT(in[0].desc.dims.nbDims == 2);
+  PLUGIN_ASSERT(in[1].desc.dims.nbDims == 2);
+  PLUGIN_ASSERT(in[2].desc.dims.nbDims == 1);
+  PLUGIN_ASSERT(in[3].desc.dims.nbDims == 1);
+
+  PLUGIN_ASSERT(out[0].desc.dims.nbDims == 5);
+
+  PLUGIN_ASSERT(in[0].desc.dims.d[0] == -1);
+  PLUGIN_ASSERT(in[1].desc.dims.d[0] == -1);
+  PLUGIN_ASSERT(in[1].desc.dims.d[1] == 4);
+
+  PLUGIN_ASSERT(in[2].desc.dims.d[0] == -1);
+  PLUGIN_ASSERT(in[3].desc.dims.d[0] == -1);
+
+  PLUGIN_ASSERT(out[0].desc.type == in[0].desc.type);
+
+  return 0;
+}
+
+bool QuickCumsumCudaPlugin::supportsFormatCombination(
+  std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs,
+  std::int32_t num_outputs) noexcept
+{
+  PLUGIN_ASSERT(num_inputs == 4 && num_outputs == 1 && pos < num_inputs + num_outputs);
+  bool valid = false;
+
+  const std::int32_t INPUT_FEATURES_INDEX = 0;
+  const std::int32_t INPUT_GEOM_FEATURES_INDEX = 1;
+  const std::int32_t INPUT_INTERVAL_LENGTH_INDEX = 2;
+  const std::int32_t INPUT_INTERVAL_START_INDEX = 3;  // cppcheck-suppress unreadVariable
+  const std::int32_t OUTPUT_INDEX = 4;                // cppcheck-suppress unreadVariable
+
+  switch (pos) {
+    case INPUT_FEATURES_INDEX:
+    case OUTPUT_INDEX:
+      valid = in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+              in_out[pos].desc.type == nvinfer1::DataType::kFLOAT;
+      break;
+    case INPUT_GEOM_FEATURES_INDEX:
+    case INPUT_INTERVAL_LENGTH_INDEX:
+    case INPUT_INTERVAL_START_INDEX:
+      valid = in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+              in_out[pos].desc.type == nvinfer1::DataType::kINT32;
+      break;
+    default:
+      valid = false;
+      break;
+  }
+
+  return valid;
+}
+
+std::int32_t QuickCumsumCudaPlugin::getOutputDataTypes(
+  DataType * output_types, std::int32_t num_outputs, DataType const * input_types,
+  std::int32_t num_inputs) const noexcept
+{
+  PLUGIN_ASSERT(num_inputs == 4);
+  PLUGIN_ASSERT(num_outputs == 1);
+  output_types[0] = input_types[0];
+  return 0;
+}
+
+std::int32_t QuickCumsumCudaPlugin::getOutputShapes(
+  DimsExprs const * inputs, std::int32_t num_inputs,
+  [[maybe_unused]] DimsExprs const * shape_inputs, [[maybe_unused]] std::int32_t num_shape_inputs,
+  DimsExprs * outputs, std::int32_t num_outputs, IExprBuilder & expr_builder) noexcept
+{
+  PLUGIN_ASSERT(num_inputs == 4);
+  PLUGIN_ASSERT(num_outputs == 1);
+  PLUGIN_ASSERT(inputs != nullptr);
+  PLUGIN_ASSERT(inputs[0].nbDims == 2);
+
+  outputs[0].nbDims = 5;
+  outputs[0].d[0] = expr_builder.constant(static_cast<std::int64_t>(params_.batch_size));
+  outputs[0].d[1] = expr_builder.constant(static_cast<std::int64_t>(params_.dimension));
+  outputs[0].d[2] = expr_builder.constant(static_cast<std::int64_t>(params_.height));
+  outputs[0].d[3] = expr_builder.constant(static_cast<std::int64_t>(params_.width));
+  outputs[0].d[4] = inputs[0].d[1];
+
+  return 0;
+}
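Editor's note on the shapes above: the plugin scatters N frustum features into a dense BEV grid, so with inputs features [N, C], geom_feats [N, 4], and I occupied-cell intervals, the output is [batch_size, dimension, height, width, C], with C inherited from the features input. A toy walk-through (all numbers illustrative; the geom_feats column order is an assumption based on the upstream BEVFusion bev_pool kernel):

  // features:         [N=120000, C=80]     per-point image features
  // geom_feats:       [N=120000, 4]        (h_idx, w_idx, d_idx, batch_idx) per point
  // interval_starts:  [I=45000]            first feature row of each occupied BEV cell
  // interval_lengths: [I=45000]            number of features pooled into that cell
  // output:           [1, 1, 360, 360, 80] dense BEV feature map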
+std::int32_t QuickCumsumCudaPlugin::enqueue(
+  PluginTensorDesc const * input_desc, [[maybe_unused]] PluginTensorDesc const * output_desc,
+  void const * const * inputs, void * const * outputs, [[maybe_unused]] void * workspace,
+  cudaStream_t stream) noexcept
+{
+  const auto output_features_num = input_desc[0].dims.d[0];
+  const auto output_features_dim = input_desc[0].dims.d[1];
+  const auto intervals_num = input_desc[2].dims.d[0];
+
+  cudaMemsetAsync(
+    outputs[0], 0,
+    params_.batch_size * params_.dimension * params_.height * params_.width * output_features_dim *
+      sizeof(float),
+    stream);
+
+  bev_pool(
+    static_cast<std::int32_t>(params_.batch_size), static_cast<std::int32_t>(params_.dimension),
+    static_cast<std::int32_t>(params_.height), static_cast<std::int32_t>(params_.width),
+    output_features_num, output_features_dim, intervals_num,
+    static_cast<const float *>(inputs[0]), static_cast<const std::int32_t *>(inputs[1]),
+    static_cast<const std::int32_t *>(inputs[3]), static_cast<const std::int32_t *>(inputs[2]),
+    static_cast<float *>(outputs[0]), stream);
+
+  return cudaSuccess;
+}
+
+std::int32_t QuickCumsumCudaPlugin::onShapeChange(
+  [[maybe_unused]] PluginTensorDesc const * in, [[maybe_unused]] std::int32_t num_inputs,
+  [[maybe_unused]] PluginTensorDesc const * out,
+  [[maybe_unused]] std::int32_t num_outputs) noexcept
+{
+  return 0;
+}
+
+IPluginV3 * QuickCumsumCudaPlugin::attachToContext(
+  [[maybe_unused]] IPluginResourceContext * context) noexcept
+{
+  return clone();
+}
+
+PluginFieldCollection const * QuickCumsumCudaPlugin::getFieldsToSerialize() noexcept
+{
+  return &fc_to_serialize_;
+}
+
+std::size_t QuickCumsumCudaPlugin::getWorkspaceSize(
+  [[maybe_unused]] DynamicPluginTensorDesc const * inputs,
+  [[maybe_unused]] std::int32_t num_inputs,
+  [[maybe_unused]] DynamicPluginTensorDesc const * outputs,
+  [[maybe_unused]] std::int32_t num_outputs) const noexcept
+{
+  return 0;
+}
+
+}  // namespace plugin
+}  // namespace nvinfer1
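Editor's note: the bev_pool kernel itself lives in bev_pool_cuda.cu, which is not part of this excerpt. As a mental model, a CPU reference of the pooling that enqueue() launches could look like the sketch below (layout and geom column order assumed from the upstream BEVFusion kernel; not the actual implementation):

  #include <cstdint>

  // CPU sketch of bev_pool: sum each interval of feature rows and scatter the
  // sum into its BEV cell. Assumed layouts:
  //   x:    [n, c] row-major; out: [b, d, h, w, c] row-major, zero-initialized;
  //   geom: [n, 4] with columns (h_idx, w_idx, d_idx, batch_idx).
  void bev_pool_cpu(
    int b, int d, int h, int w, int n, int c, int n_intervals, const float * x,
    const std::int32_t * geom, const std::int32_t * starts, const std::int32_t * lengths,
    float * out)
  {
    (void)b;
    (void)n;  // Only needed for bounds checking, which this sketch omits.
    for (int i = 0; i < n_intervals; ++i) {
      const std::int32_t s = starts[i];
      const std::int32_t * g = geom + 4 * s;
      // Linear index of the BEV cell this interval pools into.
      const std::int64_t cell =
        ((static_cast<std::int64_t>(g[3]) * d + g[2]) * h + g[0]) * w + g[1];
      for (std::int32_t j = 0; j < lengths[i]; ++j) {
        for (int ch = 0; ch < c; ++ch) {
          out[cell * c + ch] += x[static_cast<std::int64_t>(s + j) * c + ch];
        }
      }
    }
  }

All points that fall into the same BEV cell are stored contiguously in x, which is why a (start, length) interval list plus one scatter per interval suffices; this is the "quick cumsum" trick the plugin's name refers to.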
diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp
new file mode 100644
index 0000000000000..c74edc03b4a47
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp
@@ -0,0 +1,143 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp"
+
+#include "autoware/tensorrt_plugins/plugin_utils.hpp"
+#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp"
+
+#include <NvInferRuntime.h>
+
+#include <cstdint>
+#include <exception>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace nvinfer1
+{
+namespace plugin
+{
+
+REGISTER_TENSORRT_PLUGIN(QuickCumsumCudaPluginCreator);
+
+QuickCumsumCudaPluginCreator::QuickCumsumCudaPluginCreator()
+{
+  plugin_attributes_.clear();
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("batch_size", nullptr, PluginFieldType::kINT32, 1));
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("dimension", nullptr, PluginFieldType::kINT32, 1));
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("height", nullptr, PluginFieldType::kINT32, 1));
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("width", nullptr, PluginFieldType::kINT32, 1));
+
+  fc_.nbFields = plugin_attributes_.size();
+  fc_.fields = plugin_attributes_.data();
+}
+
+nvinfer1::PluginFieldCollection const * QuickCumsumCudaPluginCreator::getFieldNames() noexcept
+{
+  // This is only used in the build phase.
+  return &fc_;
+}
+
+IPluginV3 * QuickCumsumCudaPluginCreator::createPlugin(
+  char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept
+{
+  // Both phases share this path: the ONNX parser (build) and engine
+  // deserialization (runtime) pass the same four int32 fields, since
+  // getFieldsToSerialize() serializes the parameters field by field.
+  if (phase == TensorRTPhase::kBUILD || phase == TensorRTPhase::kRUNTIME) {
+    try {
+      nvinfer1::PluginField const * fields{fc->fields};
+      std::int32_t num_fields{fc->nbFields};
+
+      PLUGIN_VALIDATE(num_fields == 4);
+
+      QuickCumsumCudaParameters parameters;
+
+      for (std::int32_t i{0}; i < num_fields; ++i) {
+        const std::string attr_name = fields[i].name;
+        const nvinfer1::PluginFieldType type = fields[i].type;
+
+        if (attr_name == "batch_size") {
+          PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+          parameters.batch_size = *static_cast<std::int32_t const *>(fields[i].data);
+        } else if (attr_name == "dimension") {
+          PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+          parameters.dimension = *static_cast<std::int32_t const *>(fields[i].data);
+        } else if (attr_name == "height") {
+          PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+          parameters.height = *static_cast<std::int32_t const *>(fields[i].data);
+        } else if (attr_name == "width") {
+          PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+          parameters.width = *static_cast<std::int32_t const *>(fields[i].data);
+        }
+      }
+
+      // Log the attributes parsed from ONNX node.
+      std::stringstream ss;
+      ss << name << " plugin Attributes:";
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "batch_size: " << parameters.batch_size;
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "dimension: " << parameters.dimension;
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "height: " << parameters.height;
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "width: " << parameters.width;
+      logDebug(ss.str().c_str());
+
+      QuickCumsumCudaPlugin * const plugin{
+        new QuickCumsumCudaPlugin{std::string(name), parameters}};
+      return plugin;
+    } catch (std::exception const & e) {
+      caughtError(e);
+    }
+    return nullptr;
+  } else {
+    return nullptr;
+  }
+}
+
+}  // namespace plugin
+}  // namespace nvinfer1
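Editor's note: the original hunk also carried an `else if (phase == TensorRTPhase::kRUNTIME)` branch expecting a single opaque "parameters" field; it was unreachable because the first condition already captures kRUNTIME, and it did not match the field-by-field serialization above, so it has been dropped. With REGISTER_TENSORRT_PLUGIN plus the getCreators entry point, the creator is reachable through the global registry; a hedged sketch of a manual lookup in TensorRT 10's V3 API (the "QuickCumsumCuda"/"1" literals are assumptions about the kQUICK_CUMSUM_CUDA_PLUGIN_NAME/_VERSION constants):

  #include <NvInferRuntime.h>

  nvinfer1::IPluginV3 * makeQuickCumsumForTest(nvinfer1::PluginFieldCollection const & fc)
  {
    // Mirrors what the ONNX parser does when it meets the custom op.
    auto * creator = static_cast<nvinfer1::IPluginCreatorV3One *>(
      getPluginRegistry()->getCreator("QuickCumsumCuda", "1", ""));
    if (creator == nullptr) {
      return nullptr;
    }
    return creator->createPlugin("quick_cumsum", &fc, nvinfer1::TensorRTPhase::kBUILD);
  }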
diff --git a/sensing/autoware_cuda_utils/include/autoware/cuda_utils/cuda_utils.hpp b/sensing/autoware_cuda_utils/include/autoware/cuda_utils/cuda_utils.hpp
new file mode 100644
index 0000000000000..9d0a0f3e3e015
--- /dev/null
+++ b/sensing/autoware_cuda_utils/include/autoware/cuda_utils/cuda_utils.hpp
@@ -0,0 +1,36 @@
+// Copyright 2025 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.

+// This code is licensed under CC0 1.0 Universal (Public Domain).
+// You can use this without any limitation.
+// https://creativecommons.org/publicdomain/zero/1.0/deed.en
+// borrowed from https://proc-cpuinfo.fixstars.com/2019/02/cuda_smart_pointer/
+
+#ifndef AUTOWARE__CUDA_UTILS__CUDA_UTILS_HPP_
+#define AUTOWARE__CUDA_UTILS__CUDA_UTILS_HPP_
+
+#include "autoware/cuda_utils/cuda_check_error.hpp"
+
+#include <cuda_runtime_api.h>
+
+namespace autoware::cuda_utils
+{
+template <typename T>
+void clear_async(T * ptr, std::size_t num_elem, cudaStream_t stream)
+{
+  CHECK_CUDA_ERROR(::cudaMemsetAsync(ptr, 0, sizeof(T) * num_elem, stream));
+}
+}  // namespace autoware::cuda_utils
+
+#endif  // AUTOWARE__CUDA_UTILS__CUDA_UTILS_HPP_
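Editor's note: a minimal usage sketch for clear_async (the buffer name, size, and stream setup are illustrative only):

  #include "autoware/cuda_utils/cuda_utils.hpp"

  void example()
  {
    cudaStream_t stream{};
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream));

    float * d_scores{nullptr};
    CHECK_CUDA_ERROR(cudaMalloc(&d_scores, 1024 * sizeof(float)));

    // Zero the buffer asynchronously on the stream; the same pattern
    // QuickCumsumCudaPlugin::enqueue() writes out by hand with cudaMemsetAsync.
    autoware::cuda_utils::clear_async(d_scores, 1024, stream);

    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
    CHECK_CUDA_ERROR(cudaFree(d_scores));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
  }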