diff --git a/perception/autoware_mtr/.gitignore b/perception/autoware_mtr/.gitignore
new file mode 100644
index 0000000000000..42bd33fbaa013
--- /dev/null
+++ b/perception/autoware_mtr/.gitignore
@@ -0,0 +1,51 @@
+devel/
+logs/
+build/
+bin/
+msg_gen/
+srv_gen/
+data/*onnx
+msg/*Action.msg
+msg/*ActionFeedback.msg
+msg/*ActionGoal.msg
+msg/*ActionResult.msg
+msg/*Feedback.msg
+msg/*Goal.msg
+msg/*Result.msg
+msg/_*.py
+build_isolated/
+devel_isolated/
+
+# Generated by dynamic reconfigure
+*.cfgc
+/cfg/cpp/
+/cfg/*.py
+
+# Ignore generated docs
+*.dox
+*.wikidoc
+
+# eclipse stuff
+.project
+.cproject
+
+# qcreator stuff
+CMakeLists.txt.user
+
+srv/_*.py
+*.pcd
+*.pyc
+qtcreator-*
+*.user
+
+/planning/cfg
+/planning/docs
+/planning/src
+
+*~
+
+# Emacs
+.#*
+
+# Catkin custom files
+CATKIN_IGNORE
diff --git a/perception/autoware_mtr/.markdownlint.yaml b/perception/autoware_mtr/.markdownlint.yaml
new file mode 100644
index 0000000000000..584154b2009de
--- /dev/null
+++ b/perception/autoware_mtr/.markdownlint.yaml
@@ -0,0 +1,16 @@
+# This file is automatically synced from:
+# https://github.com/autowarefoundation/sync-file-templates
+# To make changes, update the source repository and follow the guidelines in its README.
+
+# See https://github.com/DavidAnson/markdownlint/blob/main/doc/Rules.md for all rules.
+default: true
+MD013: false
+MD024:
+  siblings_only: true
+MD029:
+  style: ordered
+MD033: false
+MD041: false
+MD045: false
+MD046: false
+MD049: false
diff --git a/perception/autoware_mtr/.pre-commit-config-optional.yaml b/perception/autoware_mtr/.pre-commit-config-optional.yaml
new file mode 100644
index 0000000000000..8c9345e15f064
--- /dev/null
+++ b/perception/autoware_mtr/.pre-commit-config-optional.yaml
@@ -0,0 +1,6 @@
+repos:
+  - repo: https://github.com/tcort/markdown-link-check
+    rev: v3.12.2
+    hooks:
+      - id: markdown-link-check
+        args: [--quiet, --config=.markdown-link-check.json]
diff --git a/perception/autoware_mtr/.pre-commit-config.yaml b/perception/autoware_mtr/.pre-commit-config.yaml
new file mode 100644
index 0000000000000..63dc504f61a2b
--- /dev/null
+++ b/perception/autoware_mtr/.pre-commit-config.yaml
@@ -0,0 +1,95 @@
+ci:
+  autofix_commit_msg: "style(pre-commit): autofix"
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-toml
+      - id: check-xml
+      - id: check-yaml
+        args: [--unsafe]
+      - id: detect-private-key
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+      - id: trailing-whitespace
+        args: [--markdown-linebreak-ext=md]
+
+  - repo: https://github.com/igorshubovych/markdownlint-cli
+    rev: v0.41.0
+    hooks:
+      - id: markdownlint
+        args: [-c, .markdownlint.yaml, --fix]
+
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v4.0.0-alpha.8
+    hooks:
+      - id: prettier
+
+  - repo: https://github.com/adrienverge/yamllint
+    rev: v1.35.1
+    hooks:
+      - id: yamllint
+
+  - repo: https://github.com/tier4/pre-commit-hooks-ros
+    rev: v0.10.0
+    hooks:
+      - id: flake8-ros
+      - id: prettier-xacro
+      - id: prettier-launch-xml
+      - id: prettier-package-xml
+      - id: ros-include-guard
+      - id: sort-package-xml
+
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.10.0.1
+    hooks:
+      - id: shellcheck
+
+  - repo: https://github.com/scop/pre-commit-shfmt
+    rev: v3.9.0-1
+    hooks:
+      - id: shfmt
+        args: [-w, -s, -i=4]
+
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+
+  - repo: https://github.com/psf/black
+    rev: 24.8.0
+    hooks:
+      - id: black
+        args: [--line-length=100]
+
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v18.1.8
+    hooks:
+      - id: clang-format
+        types_or: [c++, c, cuda]
+
+  - repo: https://github.com/cpplint/cpplint
+    rev: 1.6.1
+    hooks:
+      - id: cpplint
+        args: [--quiet]
+        exclude: .cu
+
+  - repo: https://github.com/python-jsonschema/check-jsonschema
+    rev: 0.29.2
+    hooks:
+      - id: check-metaschema
+        files: ^.+/schema/.*schema\.json$
+
+  - repo: local
+    hooks:
+      - id: prettier-svg
+        name: prettier svg
+        description: Apply Prettier with plugin-xml to svg.
+        entry: prettier --write --list-different --ignore-unknown --print-width 200 --xml-self-closing-space false --xml-whitespace-sensitivity ignore
+        language: node
+        files: .svg$
+        additional_dependencies: [prettier@2.7.1, "@prettier/plugin-xml@2.2.0"]
diff --git a/perception/autoware_mtr/CMakeLists.txt b/perception/autoware_mtr/CMakeLists.txt
new file mode 100644
index 0000000000000..5dc5c2fb25106
--- /dev/null
+++ b/perception/autoware_mtr/CMakeLists.txt
@@ -0,0 +1,104 @@
+cmake_minimum_required(VERSION 3.8)
+project(autoware_mtr)
+
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  add_compile_options(-Wall -Wextra -Wpedantic)
+endif()
+
+# find dependencies
+find_package(autoware_cmake REQUIRED)
+autoware_package()
+
+find_package(CUDA REQUIRED)
+if (CUDA_FOUND)
+  include_directories(${CUDA_INCLUDE_DIRS})
+  find_library(CUBLAS_LIBRARIES cublas HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+                                            ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+  find_library(
+    CUDNN_LIBRARIES
+    NAMES libcudnn.so${__cudnn_ver_suffix} libcudnn${__cudnn_ver_suffix}.dylib
+          ${__cudnn_lib_win_name}
+    PATHS $ENV{LD_LIBRARY_PATH} ${__libpath_cudart} ${CUDNN_ROOT_DIR}
+          ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX}
+    PATH_SUFFIXES lib lib64 bin
+    DOC "CUDNN library.")
+else()
+  message(FATAL_ERROR "Can not find CUDA")
+endif()
+
+list(APPEND TRT_PLUGINS "nvinfer")
+list(APPEND TRT_PLUGINS "nvonnxparser")
+list(APPEND TRT_PLUGINS "nvparsers")
+foreach(libName ${TRT_PLUGINS})
+  find_library(${libName}_lib NAMES ${libName} "/usr" PATH_SUFFIXES lib)
+  list(APPEND TRT_PLUGINS ${${libName}_lib})
+endforeach()
+
+# TRT plugins and cuda kernels
+cuda_add_library(${PROJECT_NAME}_cuda_lib SHARED
+  lib/src/attention/trt_attn_value_computation_kernel.cu
+  lib/src/attention/trt_attn_value_computation.cpp
+  lib/src/attention/trt_attn_weight_computation_kernel.cu
+  lib/src/attention/trt_attn_weight_computation.cpp
+  lib/src/knn/trt_knn_batch_kernel.cu
+  lib/src/knn/trt_knn_batch.cpp
+  lib/src/knn/trt_knn_batch_mlogk_kernel.cu
+  lib/src/knn/trt_knn_batch_mlogk.cpp
+  lib/src/preprocess/agent_preprocess_kernel.cu
+  lib/src/preprocess/polyline_preprocess_kernel.cu
+  lib/src/postprocess/postprocess_kernel.cu
+)
+target_include_directories(${PROJECT_NAME}_cuda_lib PUBLIC
+  lib/include
+)
+
+ament_auto_add_library(${PROJECT_NAME}_lib SHARED
+  src/builder.cpp
+  src/trt_mtr.cpp
+  src/conversions/lanelet.cpp
+  src/intention_point.cpp
+  src/conversions/history.cpp
+)
+target_link_libraries(${PROJECT_NAME}_lib
+  ${TRT_PLUGINS}
+  ${TRT_PLUGIN_LIBS}
+  ${CUDA_LIBRARIES}
+  ${CUBLAS_LIBRARIES}
+  ${CUDNN_LIBRARIES}
+  ${PROJECT_NAME}_cuda_lib
+)
+target_include_directories(${PROJECT_NAME}_lib PUBLIC
+  include
+  lib/include
+)
+
+target_compile_options(${PROJECT_NAME}_lib PRIVATE
+  -Wall -Wextra -Wpedantic -Werror -Wno-deprecated-declarations
+)
+
+# ROS 2 node
+ament_auto_add_library(${PROJECT_NAME}_node SHARED
+  src/node.cpp
+)
+
+target_link_libraries(${PROJECT_NAME}_node
+  ${PROJECT_NAME}_lib
+)
+
+rclcpp_components_register_node(${PROJECT_NAME}_node
+  PLUGIN "autoware::mtr::MTRNode"
+  EXECUTABLE ${PROJECT_NAME}
+)
+
+if(BUILD_TESTING)
+  find_package(ament_lint_auto REQUIRED)
+  ament_lint_auto_find_test_dependencies()
+  ament_auto_add_gtest(test_fixed_queue test/test_fixed_queue.cpp)
+endif()
+
+ament_auto_package(
+  INSTALL_TO_SHARE
+  launch
+  config
+  data
+)
diff --git a/perception/autoware_mtr/LICENSE b/perception/autoware_mtr/LICENSE
new file mode 100644
index 0000000000000..261eeb9e9f8b2
--- /dev/null
+++ b/perception/autoware_mtr/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/perception/autoware_mtr/README.md b/perception/autoware_mtr/README.md
new file mode 100644
index 0000000000000..010fa8505dbdb
--- /dev/null
+++ b/perception/autoware_mtr/README.md
@@ -0,0 +1,72 @@
+# autoware_mtr
+
+## Purpose
+
+The `autoware_mtr` package is used for 3D object motion prediction based on ML-based model called MTR.
+
+## Inner-workings / Algorithms
+
+The implementation bases on MTR [1] work. It uses TensorRT library for data process and network interface.
+
+## Inputs / Outputs
+
+### Input
+
+| Name                 | Type                                            | Description              |
+| -------------------- | ----------------------------------------------- | ------------------------ |
+| `~/input/objects`    | `autoware_perception_msgs::msg::TrackedObjects` | Input agent state.       |
+| `~/input/vector_map` | `autoware_map_msgs::msg::LeneletMapBin`         | Input vector map.        |
+| `~/input/ego`        | `sensor_msgs::msg::Odometry`                    | Input ego vehicle state. |
+
+### Output
+
+| Name               | Type                                              | Description                |
+| ------------------ | ------------------------------------------------- | -------------------------- |
+| `~/output/objects` | `autoware_perception_msgs::msg::PredictedObjects` | Predicted objects' motion. |
+
+## Parameters
+
+Following parameters can be specified in `.launch.xml` or command line.
+
+### `param_path`
+
+File path to the MTR configuration. (Default: `autoware_mtr/config/mtr.param.yaml`)
+
+### Configuration Parameters
+
+#### `model_params`
+
+| Name                          |  type   | Description                                                 |
+| :---------------------------- | :-----: | :---------------------------------------------------------- |
+| `model_path`                  |  `str`  | ONNX or engine file path.                                   |
+| `target_labels`               | `str[]` | An array of label names to be predicted.                    |
+| `num_past`                    |  `int`  | The number of history length.                               |
+| `num_mode`                    |  `int`  | The number of predicted modes.                              |
+| `num_future`                  |  `int`  | The number of predicted future length.                      |
+| `max_num_polyline`            |  `int`  | The maximum number of polylines to be contained in input.   |
+| `max_num_point`               |  `int`  | The maximum number of points included in a single polyline. |
+| `point_break_distance`        | `float` | Distance threshold to separate points into two polylines.   |
+| `intention_point_filepath`    |  `str`  | File path to intension points (.csv).                       |
+| `num_intention_point_cluster` |  `int`  | The number of clusters of intention points.                 |
+
+#### `build_params`
+
+| Name         |  type  | Description                                                 |
+| :----------- | :----: | :---------------------------------------------------------- |
+| `is_dynamic` | `bool` | Indicates whether the model allows dynamic shape inference. |
+| `precision`  | `str`  | Precision mode.                                             |
+| `MINMAX`     | `str`  | Calibration mode.                                           |
+
+### `data_path`
+
+Directory path to ONNX or TensorRT engine file. (Default: `autoware_mtr/data`)
+
+### `build_only`
+
+This option performs to build the TensorRT engine file from the ONNX file and exit after finishing it. (Default: `false`)
+
+You can execute with the following command:
+
+```bash
+ros2 launch autoware_mtr mtr.launch.xml build_only:=true
+```
diff --git a/perception/autoware_mtr/config/mtr.param.yaml b/perception/autoware_mtr/config/mtr.param.yaml
new file mode 100644
index 0000000000000..932d544495d5b
--- /dev/null
+++ b/perception/autoware_mtr/config/mtr.param.yaml
@@ -0,0 +1,17 @@
+/**:
+  ros__parameters:
+    model_params:
+      model_path: "$(var data_path)/mtr_static.onnx"
+      target_labels: ["VEHICLE", "PEDESTRIAN", "CYCLIST"]
+      num_past: 11
+      num_mode: 6
+      num_future: 80
+      max_num_polyline: 768
+      max_num_point: 20
+      point_break_distance: 1.0
+      intention_point_filepath: "$(var data_path)/intention_point.csv"
+      num_intention_point_cluster: 64
+    build_params:
+      is_dynamic: false
+      precision: "FP32"
+      calibration: "MINMAX"
diff --git a/perception/autoware_mtr/data/intention_point.csv b/perception/autoware_mtr/data/intention_point.csv
new file mode 100644
index 0000000000000..4a1dd40d774df
--- /dev/null
+++ b/perception/autoware_mtr/data/intention_point.csv
@@ -0,0 +1,192 @@
+83.31358111813775,-0.10072941747215425,VEHICLE
+9.63708093098419,-0.21903979647725835,VEHICLE
+177.02364184779515,0.14896461714506093,VEHICLE
+30.202525932768452,39.770351748996276,VEHICLE
+13.017453460143628,-47.65477812853313,VEHICLE
+37.96551406831798,0.35882891080252355,VEHICLE
+135.28288300240175,0.07318454776428718,VEHICLE
+22.581299759917737,-11.499114037699696,VEHICLE
+35.28419206368978,66.77646234434653,VEHICLE
+102.21595906561359,0.03479736135528899,VEHICLE
+53.018984885961935,-0.07070513548030011,VEHICLE
+48.14500443347089,-63.176686454128905,VEHICLE
+14.81040681866098,41.21025747789579,VEHICLE
+70.98121857460572,48.76864840754326,VEHICLE
+29.952205374359025,-24.491271384603404,VEHICLE
+240.1121603513413,0.40876524870669395,VEHICLE
+112.96998794807857,-0.6626782536634594,VEHICLE
+11.535102863478023,24.6087503284935,VEHICLE
+15.972999486515413,-22.402331618330425,VEHICLE
+-29.749150635040934,19.365306554458183,VEHICLE
+38.837490918969365,28.521719218554505,VEHICLE
+75.12534715421508,0.45524707149479826,VEHICLE
+22.1356009680794,-65.50042235168596,VEHICLE
+147.667334683977,-0.15420460547298664,VEHICLE
+32.92404725639907,13.425428408384349,VEHICLE
+20.821241263140003,11.866509426621738,VEHICLE
+16.421774854176704,61.40243322799172,VEHICLE
+16.20521970358857,-0.05240557684667735,VEHICLE
+30.39961023178619,0.02239890411088652,VEHICLE
+71.3732282468828,-13.197252636743817,VEHICLE
+160.46427534905007,-0.12450058691311172,VEHICLE
+26.184995977566512,-50.72960305193536,VEHICLE
+67.47509063983794,0.7088966461759214,VEHICLE
+55.59341134160711,65.92540112801592,VEHICLE
+110.19820491921735,13.36986405878499,VEHICLE
+9.946459640959318,10.639932128723443,VEHICLE
+96.45900409192926,38.745456793222075,VEHICLE
+11.150642903256283,-11.065153981266088,VEHICLE
+18.370803671188316,-35.208384113727355,VEHICLE
+58.12332918930498,28.407400941348005,VEHICLE
+122.46253938423963,-18.18162199541143,VEHICLE
+-10.372362664539338,47.15608107630011,VEHICLE
+123.88850325553135,0.8657285570358468,VEHICLE
+23.14906846590817,0.14013577245075504,VEHICLE
+36.72246632151757,-11.50791955554189,VEHICLE
+62.17961723750621,-38.16113529166156,VEHICLE
+60.255349648701284,0.3402476217172201,VEHICLE
+24.676850823734142,26.15851575291674,VEHICLE
+146.06185359624084,19.81777897597737,VEHICLE
+87.37642530385885,-49.37120849166174,VEHICLE
+29.217582632652448,52.466854930826635,VEHICLE
+92.24255344207724,1.0269363594524006,VEHICLE
+51.35773566968937,-16.858679079987603,VEHICLE
+202.88969500859582,-0.6135434826913582,VEHICLE
+44.54017657777002,47.644212807081004,VEHICLE
+3.553274456564374,-0.050630482129089494,VEHICLE
+48.341886681937126,14.266599993562439,VEHICLE
+149.5579662788205,-24.15545279805254,VEHICLE
+79.2199105091584,18.21919210874116,VEHICLE
+-6.625598315633162,12.675144082377521,VEHICLE
+94.47749147282539,-14.96227839803878,VEHICLE
+45.51725648445333,-0.20613185836059866,VEHICLE
+4.320320122401455,-25.861972448072912,VEHICLE
+37.67711139073701,-38.236581219014134,VEHICLE
+9.61012756292987,-1.626166760636742,PEDESTRIAN
+1.6012943859685747,-0.5222880919927995,PEDESTRIAN
+10.417423171793075,4.168765395082893,PEDESTRIAN
+30.165877791012033,0.9443346726543759,PEDESTRIAN
+11.275694770850642,0.602104658163488,PEDESTRIAN
+7.575045215991116,2.229632545811262,PEDESTRIAN
+5.440404657806666,-5.873988428286141,PEDESTRIAN
+21.74637271118164,-0.20824338209629067,PEDESTRIAN
+5.666046695576772,10.06275615427229,PEDESTRIAN
+3.5434901504148772,0.3566361072142261,PEDESTRIAN
+7.300691812980073,-1.4147541015839373,PEDESTRIAN
+-7.239123311536066,5.278028183969957,PEDESTRIAN
+12.838544084273805,0.27634052827289396,PEDESTRIAN
+49.23597074809827,-1.083151083243521,PEDESTRIAN
+10.232749005947733,-4.764095273689241,PEDESTRIAN
+10.029744060560203,0.27960282964346456,PEDESTRIAN
+20.097753611477934,12.677465005354446,PEDESTRIAN
+-1.7234067196647338,6.012964824835459,PEDESTRIAN
+-5.243654960241074,-4.54240223688957,PEDESTRIAN
+4.227448060010606,4.672668618904917,PEDESTRIAN
+10.924442376409258,-13.39580307688032,PEDESTRIAN
+8.060251735871837,-5.231071510622576,PEDESTRIAN
+9.363416959884319,6.717317995111996,PEDESTRIAN
+14.50409186103127,-0.6558042344608993,PEDESTRIAN
+9.079393733128594,3.082935659625415,PEDESTRIAN
+12.050464391708372,2.326085873493335,PEDESTRIAN
+18.95312432448069,-4.011247088511785,PEDESTRIAN
+23.27299641548319,4.935875710020675,PEDESTRIAN
+7.473063083456342,0.33913094917891407,PEDESTRIAN
+2.1331759617569723,-8.755232001958268,PEDESTRIAN
+1.6598805569020305,2.2264835146474247,PEDESTRIAN
+25.060731768608093,-2.7297780113294716,PEDESTRIAN
+6.24312299635352,-3.4591273970720238,PEDESTRIAN
+12.771120711344025,-1.907544178221929,PEDESTRIAN
+8.609554418619128,-7.480640052021412,PEDESTRIAN
+11.839472696628974,-0.508671370756991,PEDESTRIAN
+18.143608430845546,0.3659129395949102,PEDESTRIAN
+38.62964275905064,-2.878807255199977,PEDESTRIAN
+5.747641907439238,0.07273334688443972,PEDESTRIAN
+12.750787362456322,5.3989472184330225,PEDESTRIAN
+9.036868476563958,1.1392225010949337,PEDESTRIAN
+8.900814069313066,-0.3074934092596276,PEDESTRIAN
+5.77225670413436,-9.003250630102423,PEDESTRIAN
+4.644974783536821,-1.7992167588678774,PEDESTRIAN
+-2.734048414539984,1.3070609912082745,PEDESTRIAN
+10.402230190194171,1.9045561768006598,PEDESTRIAN
+7.672931251806371,4.772691267378191,PEDESTRIAN
+2.8842629428003317,-4.572883587257534,PEDESTRIAN
+34.02037658691406,13.169144248962402,PEDESTRIAN
+-6.847214352000844,-0.384562531655485,PEDESTRIAN
+8.649644048868026,-3.060604455724225,PEDESTRIAN
+12.05244591361598,12.773428766350996,PEDESTRIAN
+-0.9961213812009628,-6.185897613639262,PEDESTRIAN
+18.33287762535943,6.177935240003799,PEDESTRIAN
+2.5809229793501816,7.47769858790379,PEDESTRIAN
+13.21153731608954,-5.18699638486847,PEDESTRIAN
+14.148733378391636,1.7894855494592705,PEDESTRIAN
+6.273506949164649,6.948122468861667,PEDESTRIAN
+10.640743486095062,-0.6780669181702694,PEDESTRIAN
+19.00134840878573,-11.189485073089601,PEDESTRIAN
+11.03738529839831,-2.740356324669831,PEDESTRIAN
+25.126273824207818,0.7949037749820683,PEDESTRIAN
+-0.9026070001410034,-2.676681885078772,PEDESTRIAN
+5.5613501533385215,2.4639029791278224,PEDESTRIAN
+34.23404892454756,5.7000023380238956,CYCLIST
+11.101248473572209,0.1878690882552445,CYCLIST
+58.499493185397796,-0.12681177640910635,CYCLIST
+24.58085452251553,2.471349286922017,CYCLIST
+22.422780924829944,13.053710756630732,CYCLIST
+48.74841719002559,0.516642472661775,CYCLIST
+25.20883455276489,-20.044839859008786,CYCLIST
+36.03024959564209,-2.68259093122823,CYCLIST
+82.53688841599686,-0.20612470691020657,CYCLIST
+46.24678649902344,21.0171902179718,CYCLIST
+8.891835970037125,17.774530859554517,CYCLIST
+20.25145509209431,-0.05082348066036646,CYCLIST
+-10.578458561616781,0.3869281095616959,CYCLIST
+21.098386228084564,34.59569489955903,CYCLIST
+6.382585232908074,-16.506645029241387,CYCLIST
+19.716625179563252,-5.445932748771849,CYCLIST
+7.230897838592526,0.5130343590974809,CYCLIST
+32.55335546794691,31.67215136477822,CYCLIST
+72.18442005222127,1.009845858913357,CYCLIST
+49.38804136003766,-28.37724140712193,CYCLIST
+28.633797392177122,-0.3754431623335617,CYCLIST
+15.30891491454325,4.449968129028509,CYCLIST
+40.48208164932704,4.483412874807225,CYCLIST
+18.588481744130455,27.931559165318806,CYCLIST
+24.917308371407646,-9.43418445587158,CYCLIST
+47.191020900920286,-5.356957855871168,CYCLIST
+20.612224715096612,-39.57385090419225,CYCLIST
+37.02404185703823,12.3643497467041,CYCLIST
+26.742534319559734,21.288381364610455,CYCLIST
+36.54223831027162,1.3074939094995166,CYCLIST
+32.81819396373654,-0.6552566748020072,CYCLIST
+44.87005353096204,0.5966045727236915,CYCLIST
+8.368693745654564,27.36604765187139,CYCLIST
+30.52261005200838,-6.0198339562667025,CYCLIST
+57.868157704671226,32.555169423421226,CYCLIST
+5.196329174814995,-7.779524571186786,CYCLIST
+27.070191771895797,6.705634905232322,CYCLIST
+47.03257816242722,8.60380427342541,CYCLIST
+56.33857297897339,-7.308432310819625,CYCLIST
+3.3166103733038277,-0.2503264608792952,CYCLIST
+25.092793430426184,-1.8190701604653627,CYCLIST
+52.86965798549964,-0.8058032429181647,CYCLIST
+13.250703811645511,-26.973795970280964,CYCLIST
+2.2024684443193365,10.044839859008789,CYCLIST
+54.38298190081561,2.7558305506353027,CYCLIST
+34.70839107737822,20.00758409500122,CYCLIST
+64.3159403538966,-0.78287570637006,CYCLIST
+15.62673833790947,-1.2094380376477014,CYCLIST
+60.42839979088825,9.112664803214695,CYCLIST
+33.95394577026367,-15.673711471557617,CYCLIST
+12.455606278919038,-8.294502810826376,CYCLIST
+-5.831538856029507,23.902103185653687,CYCLIST
+39.58197693202807,-7.948247836983719,CYCLIST
+40.74147271136849,-0.5924087476213362,CYCLIST
+2.8944256986890515,-25.95755699702671,CYCLIST
+28.955958038568497,12.641273751854897,CYCLIST
+17.196987215677897,21.054426108466252,CYCLIST
+31.001477972861448,2.74578192921504,CYCLIST
+29.997188568115234,-29.97930075905539,CYCLIST
+16.37813091278076,14.201883527967663,CYCLIST
+20.38882696363661,6.6267309957080425,CYCLIST
+42.03949113325639,36.66604232788086,CYCLIST
+10.703137148981511,9.449819202008456,CYCLIST
+16.409427689342962,-16.209246635437015,CYCLIST
diff --git a/perception/autoware_mtr/include/autoware/mtr/agent.hpp b/perception/autoware_mtr/include/autoware/mtr/agent.hpp
new file mode 100644
index 0000000000000..797ed0d8b5e07
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/agent.hpp
@@ -0,0 +1,405 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__AGENT_HPP_
+#define AUTOWARE__MTR__AGENT_HPP_
+
+#include "autoware/mtr/fixed_queue.hpp"
+
+#include <geometry_msgs/msg/point.hpp>
+#include <geometry_msgs/msg/vector3.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <limits>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <vector>
+namespace autoware::mtr
+{
+constexpr size_t AgentStateDim = 12;
+
+enum AgentLabel { VEHICLE = 0, PEDESTRIAN = 1, CYCLIST = 2 };
+
+enum AgentDimLabels {
+  X = 0,
+  Y = 1,
+  Z = 2,
+  L = 3,
+  W = 4,
+  H = 5,
+  YAW = 6,
+  VX = 7,
+  VY = 8,
+  AX = 9,
+  AY = 10,
+  VALIDITY = 11
+};
+
+/**
+ * @brief A class to represent a single state of an agent.
+ */
+struct AgentState
+{
+  // Construct a new instance filling all elements by `0.0f`.
+  AgentState() {}
+
+  /**
+   * @brief Construct a new instance with specified values.
+   *
+   * @param position 3D position [m].
+   * @param dimension Box dimension [m].
+   * @param yaw Heading yaw angle [rad].
+   * @param velocity Velocity [m/s].
+   * @param acceleration Acceleration [m/s^2].
+   * @param is_valid `1.0f` if valid, otherwise `0.0f`.
+   */
+  AgentState(
+    const geometry_msgs::msg::Point & position, const geometry_msgs::msg::Vector3 & dimension,
+    float yaw, const geometry_msgs::msg::Vector3 & velocity,
+    const geometry_msgs::msg::Vector3 & acceleration, float is_valid)
+  : position_(position),
+    dimension_(dimension),
+    yaw_(yaw),
+    velocity_(velocity),
+    acceleration_(acceleration),
+    is_valid_(is_valid)
+  {
+  }
+
+  // Construct a new instance filling all elements by `0.0f`.
+  static AgentState empty() noexcept { return AgentState(); }
+
+  // Return the agent state dimensions `D`.
+  static size_t dim() { return AgentStateDim; }
+
+  // Return the x position.
+  float x() const { return position_.x; }
+
+  // Return the y position.
+  float y() const { return position_.y; }
+
+  // Return the z position.
+  float z() const { return position_.z; }
+
+  // Return the length of object size.
+  float length() const { return dimension_.x; }
+
+  // Return the width of object size.
+  float width() const { return dimension_.y; }
+
+  // Return the height of object size.
+  float height() const { return dimension_.z; }
+
+  // Return the yaw angle `[rad]`.
+  float yaw() const { return yaw_; }
+
+  // Return the x velocity.
+  float vx() const { return velocity_.x; }
+
+  // Return the y velocity.
+  float vy() const { return velocity_.y; }
+
+  // Return the x acceleration.
+  float ax() const { return acceleration_.x; }
+
+  // Return the y acceleration.
+  float ay() const { return acceleration_.y; }
+
+  // Return `true` if the value is `1.0`.
+  bool is_valid() const { return is_valid_ == 1.0; }
+
+  // Return the state attribute as an array.
+  std::array<float, AgentStateDim> as_array() const noexcept
+  {
+    return {x(), y(), z(), length(), width(), height(), yaw(), vx(), vy(), ax(), ay(), is_valid_};
+  }
+
+private:
+  geometry_msgs::msg::Point position_;
+  geometry_msgs::msg::Vector3 dimension_;
+  float yaw_{0.0f};
+  geometry_msgs::msg::Vector3 velocity_;
+  geometry_msgs::msg::Vector3 acceleration_;
+  float is_valid_{0.0f};
+};
+
+/**
+ * @brief A class to represent the state history of an agent.
+ */
+struct AgentHistory
+{
+  /**
+   * @brief Construct a new Agent History filling the latest state by input state.
+   *
+   * @param state Object current state.
+   * @param object_id Object ID.
+   * @param label_id Label ID.
+   * @param current_time Current timestamp.
+   * @param max_time_length History length.
+   */
+  AgentHistory(
+    const AgentState & state, const std::string & object_id, const size_t label_id,
+    const double current_time, const size_t max_time_length)
+  : queue_(max_time_length),
+    object_id_(object_id),
+    label_id_(label_id),
+    latest_time_(current_time),
+    max_time_length_(max_time_length)
+  {
+    queue_.push_back(state);
+  }
+
+  // Return the history time length `T`.
+  size_t length() const { return max_time_length_; }
+
+  // Return the number of agent state dimensions `D`.
+  static size_t state_dim() { return AgentStateDim; }
+
+  // Return the data size of history `T * D`.
+  size_t size() const { return max_time_length_ * state_dim(); }
+
+  // Return the shape of history matrix ordering in `(T, D)`.
+  std::tuple<size_t, size_t> shape() const { return {max_time_length_, state_dim()}; }
+
+  // Return the object id.
+  const std::string & object_id() const { return object_id_; }
+
+  // Return the label id.
+  size_t label_id() const { return label_id_; }
+
+  /**
+   * @brief Return the last timestamp when non-empty state was pushed.
+   *
+   * @return double
+   */
+  double latest_time() const { return latest_time_; }
+
+  /**
+   * @brief Update history with input state and latest time.
+   *
+   * @param current_time The current timestamp.
+   * @param state The current agent state.
+   */
+  void update(double current_time, const AgentState & state) noexcept
+  {
+    queue_.push_back(state);
+    latest_time_ = current_time;
+  }
+
+  // Update history with all-zeros state, but latest time is not updated.
+  void update_empty() noexcept
+  {
+    const auto state = AgentState::empty();
+    queue_.push_back(state);
+  }
+
+  // Return a history states as an array.
+  std::vector<float> as_array() const noexcept
+  {
+    std::vector<float> output;
+    for (const auto & state : queue_) {
+      for (const auto & v : state.as_array()) {
+        output.push_back(v);
+      }
+    }
+    return output;
+  }
+
+  /**
+   * @brief Check whether the latest valid state is too old or not.
+   *
+   * @param current_time Current timestamp.
+   * @param threshold Time difference threshold value.
+   * @return true If the difference is greater than threshold.
+   * @return false Otherwise
+   */
+  bool is_ancient(double current_time, double threshold) const
+  {
+    /* TODO: Raise error if the current time is smaller than latest */
+    return current_time - latest_time_ >= threshold;
+  }
+
+  /**
+   * @brief Check whether the latest state data is valid.
+   *
+   * @return true If the end of element is 1.0f.
+   * @return false Otherwise.
+   */
+  bool is_valid_latest() const { return get_latest_state().is_valid(); }
+
+  // Get the latest agent state at `T`.
+  const AgentState & get_latest_state() const { return queue_.back(); }
+
+  // Get the latest agent state at `T`.
+  std::optional<AgentState> get_latest_valid_state() const
+  {
+    auto latest_valid_state = std::find_if(
+      queue_.rbegin(), queue_.rend(), [](const auto & state) { return state.is_valid(); });
+    return (latest_valid_state != queue_.rend()) ? std::make_optional(*latest_valid_state)
+                                                 : std::nullopt;
+  }
+
+private:
+  FixedQueue<AgentState> queue_;
+  const std::string object_id_;
+  const size_t label_id_;
+  double latest_time_;
+  const size_t max_time_length_;
+};
+
+/**
+ * @brief A class containing whole state histories of all agent.
+ */
+struct AgentData
+{
+  /**
+   * @brief Construct a new instance.
+   *
+   * @param histories An array of histories for each object.
+   * @param ego_index An index of ego.
+   * @param target_indices Indices of target agents.
+   * @param label_ids An array of label indices for each object.
+   * @param timestamps An array of timestamps.
+   */
+  AgentData(
+    const std::vector<AgentHistory> & histories, const size_t ego_index,
+    const std::vector<size_t> & target_indices, const std::vector<size_t> & label_ids,
+    const std::vector<float> & timestamps)
+  : num_target_(target_indices.size()),
+    num_agent_(histories.size()),
+    time_length_(timestamps.size()),
+    ego_index_(ego_index),
+    target_indices_(target_indices),
+    label_ids_(label_ids),
+    timestamps_(timestamps)
+  {
+    data_.reserve(num_agent_ * time_length_ * state_dim());
+    for (auto & history : histories) {
+      for (const auto & v : history.as_array()) {
+        data_.push_back(v);
+      }
+    }
+
+    current_target_data_.reserve(num_target_ * state_dim());  // (B, D)
+    target_label_ids_.reserve(num_target_);
+    for (const auto & idx : target_indices) {
+      target_label_ids_.emplace_back(label_ids.at(idx));
+      for (const auto & v : histories.at(idx).get_latest_state().as_array()) {
+        current_target_data_.push_back(v);
+      }
+    }
+
+    ego_data_.reserve(time_length_ * state_dim());
+    for (const auto & v : histories.at(ego_index).as_array()) {
+      ego_data_.push_back(v);
+    }
+  }
+
+  // Return the number of classes `C`.
+  static size_t num_class() { return 3; }  // TODO(ktro2828): Do not use magic number.
+
+  // Return the number of target agents `B`.
+  size_t num_target() const { return num_target_; }
+
+  // Return the number of agents `N`.
+  size_t num_agent() const { return num_agent_; }
+
+  // Return the timestamp length `T`.
+  size_t time_length() const { return time_length_; }
+
+  // Return the number of agent state dimensions `D`.
+  static size_t state_dim() { return AgentStateDim; }
+
+  // Return the index of ego.
+  size_t ego_index() const { return ego_index_; }
+
+  // Return the vector of indices of target agents, in shape `[B]`.
+  const std::vector<size_t> & target_indices() const { return target_indices_; }
+
+  // Return the vector of label ids of all agents, in shape `[N]`.
+  const std::vector<size_t> & label_ids() const { return label_ids_; }
+
+  // Return the vector of label ids of target agents, in shape `[B]`.
+  const std::vector<size_t> & target_label_ids() const { return target_label_ids_; }
+
+  // Return the vector of timestamps in shape `[T]`.
+  const std::vector<float> & timestamps() const { return timestamps_; }
+
+  // Return the number of all elements `N*T*D`.
+  size_t size() const { return num_agent_ * time_length_ * state_dim(); }
+
+  // Return the number of state dimensions of MTR input `T+C+D+3`.
+  size_t input_dim() const { return time_length_ + state_dim() + num_class() + 3; }
+
+  // Return the data shape ordering in (N, T, D).
+  std::tuple<size_t, size_t, size_t> shape() const
+  {
+    return {num_agent_, time_length_, state_dim()};
+  }
+
+  // Return the address pointer of data array.
+  const float * data_ptr() const noexcept { return data_.data(); }
+
+  // Return the address pointer of data array for target agents.
+  const float * current_target_data_ptr() const noexcept { return current_target_data_.data(); }
+
+  // Return the address pointer of data array for ego vehicle.
+  const float * ego_data_ptr() const noexcept { return ego_data_.data(); }
+
+private:
+  size_t num_target_;
+  size_t num_agent_;
+  size_t time_length_;
+  size_t ego_index_;
+  std::vector<size_t> target_indices_;
+  std::vector<size_t> label_ids_;
+  std::vector<size_t> target_label_ids_;
+  std::vector<float> timestamps_;
+  std::vector<float> data_;
+  std::vector<float> current_target_data_;
+  std::vector<float> ego_data_;
+};
+
+// Get label names from label indices.
+inline std::vector<std::string> getLabelNames(const std::vector<size_t> & label_index)
+{
+  std::vector<std::string> label_names;
+  label_names.reserve(label_index.size());
+  for (const auto & idx : label_index) {
+    switch (idx) {
+      case 0:
+        label_names.emplace_back("VEHICLE");
+        break;
+      case 1:
+        label_names.emplace_back("PEDESTRIAN");
+        break;
+      case 2:
+        label_names.emplace_back("CYCLIST");
+        break;
+      default:
+        std::ostringstream msg;
+        msg << "Error invalid label index: " << idx;
+        throw std::runtime_error(msg.str());
+        break;
+    }
+  }
+  return label_names;
+}
+
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__AGENT_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/builder.hpp b/perception/autoware_mtr/include/autoware/mtr/builder.hpp
new file mode 100644
index 0000000000000..7dbe0895caa8d
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/builder.hpp
@@ -0,0 +1,243 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__BUILDER_HPP_
+#define AUTOWARE__MTR__BUILDER_HPP_
+
+#include <NvInfer.h>
+#include <NvOnnxParser.h>
+
+#include <filesystem>
+namespace fs = ::std::filesystem;
+
+#include <autoware/tensorrt_common/logger.hpp>
+
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+
+namespace autoware::mtr
+{
+
+template <typename T>
+struct TrtDeleter
+{
+  void operator()(T * obj) const
+  {
+    if (obj) {
+#if NV_TENSORRT_MAJOR >= 8
+      delete obj;
+#else
+      obj->destroy();
+#endif
+    }
+  }
+};  // struct TrtDeleter
+
+template <typename T>
+using TrtUniquePtr = std::unique_ptr<T, TrtDeleter<T>>;
+
+// Type names of precisions.
+enum PrecisionType { FP32 = 0, FP16 = 1, INT8 = 2 };
+
+// Return corresponding PrecisionType from string.
+inline PrecisionType toPrecisionType(const std::string & name)
+{
+  if (name == "FP32") {
+    return PrecisionType::FP32;
+  }
+  if (name == "FP16") {
+    return PrecisionType::FP16;
+  }
+  if (name == "INT8") {
+    return PrecisionType::INT8;
+  }
+  throw std::invalid_argument("Invalid precision name.");
+}
+
+// Type names of calibrations.
+enum CalibrationType { ENTROPY = 0, LEGACY = 1, PERCENTILE = 2, MINMAX = 3 };
+
+// Return corresponding CalibrationType from string.
+inline CalibrationType toCalibrationType(const std::string & name)
+{
+  if (name == "ENTROPY") {
+    return CalibrationType::ENTROPY;
+  }
+  if (name == "LEGACY") {
+    return CalibrationType::LEGACY;
+  }
+  if (name == "PERCENTILE") {
+    return CalibrationType::PERCENTILE;
+  }
+  if (name == "MINMAX") {
+    return CalibrationType::MINMAX;
+  }
+  throw std::invalid_argument("Invalid calibration name.");
+}
+
+struct BatchOptConfig
+{
+  /**
+   * @brief Construct a new OptimizationConfig for a static shape inference.
+   *
+   * @param value
+   */
+  explicit BatchOptConfig(const int32_t value) : k_min(value), k_opt(value), k_max(value) {}
+
+  /**
+   * @brief Construct a new OptimizationConfig for a dynamic shape inference.
+   *
+   * @param k_min
+   * @param k_opt
+   * @param k_max
+   */
+  BatchOptConfig(const int32_t k_min, const int32_t k_opt, const int32_t k_max)
+  : k_min(k_min), k_opt(k_opt), k_max(k_max)
+  {
+  }
+
+  int32_t k_min, k_opt, k_max;
+};  // struct BatchOptConfig
+
+struct BuildConfig
+{
+  // type of precision
+  PrecisionType precision;
+
+  // type for calibration
+  CalibrationType calibration;
+
+  BatchOptConfig batch_target;
+  BatchOptConfig batch_agent;
+
+  /**
+   * @brief Construct a new instance with default configurations.
+   */
+  BuildConfig()
+  : precision(PrecisionType::FP32),
+    calibration(CalibrationType::MINMAX),
+    batch_target(1, 10, 20),
+    batch_agent(1, 30, 50),
+    is_dynamic_(false)
+  {
+  }
+
+  /**
+   * @brief Construct a new build config.
+   *
+   * @param is_dynamic
+   * @param precision
+   * @param calibration
+   */
+  BuildConfig(
+    const bool is_dynamic, const std::string & precision, const std::string & calibration,
+    const BatchOptConfig & batch_target = BatchOptConfig(1, 10, 20),
+    const BatchOptConfig & batch_agent = BatchOptConfig(1, 30, 50))
+  : precision(toPrecisionType(precision)),
+    calibration(toCalibrationType(calibration)),
+    batch_target(batch_target),
+    batch_agent(batch_agent),
+    is_dynamic_(is_dynamic)
+  {
+  }
+
+  bool is_dynamic() const { return is_dynamic_; }
+
+private:
+  bool is_dynamic_;
+};  // struct BuildConfig
+
+class MTRBuilder
+{
+public:
+  /**
+   * @brief Construct a new instance.
+   *
+   * @param model_path Path to engine or onnx file.
+   * @param build_config The configuration of build.
+   * @param max_workspace_size The max workspace size.
+   */
+  MTRBuilder(
+    const std::string & model_path, const BuildConfig & build_config = BuildConfig(),
+    const size_t max_workspace_size = (1ULL << 63));
+
+  /**
+   * @brief Setup engine for inference. After finishing setup successfully, `isInitialized` must
+   * return `true`.
+   */
+  void setup();
+
+  /**
+   * @brief Check whether engine was initialized successfully.
+   *
+   * @return True if plugins were initialized successfully.
+   */
+  [[nodiscard]] bool isInitialized() const;
+
+  // Return true if the model supports dynamic shape inference.
+  [[nodiscard]] bool isDynamic() const;
+
+  // Set binding dimensions for specified for dynamic shape inference.
+  bool setBindingDimensions(int index, nvinfer1::Dims dimensions);
+
+  /**
+   * @brief A wrapper of `nvinfer1::IExecuteContext::enqueueV2`.
+   *
+   * @param bindings An array of pointers to input and output buffers for the network.
+   * @param stream A cuda stream on which the inference kernels will be enqueued.
+   * @param inputConsumed An optional event which will be signaled when the input buffers can be
+   * refilled with new data.
+   * @return True If the kernels were enqueued successfully.
+   */
+  bool enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * inputConsumed);
+
+private:
+  /**
+   * @brief Load engin file.
+   *
+   * @param filepath Engine file path.
+   * @return True if the engine were loaded successfully.
+   */
+  bool loadEngine(const std::string & filepath);
+
+  // Create a cache path of engine file.
+  [[nodiscard]] fs::path createEngineCachePath() const;
+
+  /**
+   * @brief Build engine from onnx file.
+   *
+   * @param filepath Onnx file path.
+   * @param output_engine_filepath Output engine file path.
+   * @return True if the engine were built successfully.
+   */
+  bool buildEngineFromOnnx(
+    const std::string & filepath, const std::string & output_engine_filepath);
+
+  autoware::tensorrt_common::Logger logger_;
+  TrtUniquePtr<nvinfer1::IRuntime> runtime_;
+  TrtUniquePtr<nvinfer1::ICudaEngine> engine_;
+  TrtUniquePtr<nvinfer1::IExecutionContext> context_;
+
+  fs::path model_filepath_;
+  size_t max_workspace_size_;
+  std::unique_ptr<const BuildConfig> build_config_;
+
+  bool is_initialized_{false};
+};  // class MTRBuilder
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__BUILDER_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/conversions/history.hpp b/perception/autoware_mtr/include/autoware/mtr/conversions/history.hpp
new file mode 100644
index 0000000000000..56d6bcb06a712
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/conversions/history.hpp
@@ -0,0 +1,62 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__CONVERSIONS__HISTORY_HPP_
+#define AUTOWARE__MTR__CONVERSIONS__HISTORY_HPP_
+
+#include "autoware/mtr/agent.hpp"
+
+#include <cmath>
+#include <cstddef>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace autoware::mtr
+{
+
+/**
+ * @brief Transform a 2D point with a reference point and rotation
+ *
+ * @param history history to modify.
+ * @param reference_state reference state used as reference frame.
+ * @return  vector of histories, each in the reference frame of the last state.
+ */
+[[nodiscard]] std::pair<float, float> transform2D(
+  const std::pair<float, float> input_xy, const std::pair<float, float> reference_xy,
+  const std::pair<float, float> rotation_cos_sin);
+
+/**
+ * @brief Get histories with agent states in the reference frame of the last state
+ *
+ * @param history history to modify.
+ * @param reference_state reference state used as reference frame.
+ * @return  vector of histories, each in the reference frame of the last state.
+ */
+[[nodiscard]] AgentHistory getAgentCentricHistory(
+  const AgentHistory & history, const AgentState & reference_state);
+
+/**
+ * @brief Get histories with agent states in the reference frame of the last state
+ *
+ * @param histories vector of histories to modify.
+ * @return  vector of histories, each in the reference frame of the last state.
+ */
+[[nodiscard]] std::vector<AgentHistory> getAgentCentricHistories(
+  const std::vector<AgentHistory> & histories);
+
+}  // namespace autoware::mtr
+
+#endif  // AUTOWARE__MTR__CONVERSIONS__HISTORY_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/conversions/lanelet.hpp b/perception/autoware_mtr/include/autoware/mtr/conversions/lanelet.hpp
new file mode 100644
index 0000000000000..17742d333acca
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/conversions/lanelet.hpp
@@ -0,0 +1,234 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__CONVERSIONS__LANELET_HPP_
+#define AUTOWARE__MTR__CONVERSIONS__LANELET_HPP_
+
+#include "autoware/mtr/polyline.hpp"
+
+#include <geometry_msgs/msg/detail/point__struct.hpp>
+
+#include <lanelet2_core/LaneletMap.h>
+#include <lanelet2_core/primitives/CompoundPolygon.h>
+#include <lanelet2_core/primitives/Lanelet.h>
+#include <lanelet2_core/primitives/LineString.h>
+#include <lanelet2_core/utility/Optional.h>
+
+#include <cstddef>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace autoware::mtr
+{
+/**
+ * @brief Insert lane points into the container from the end of it.
+ *
+ * @param points Sequence of points to be inserted.
+ * @param container Points container.
+ */
+inline void insertLanePoints(
+  const std::vector<LanePoint> & points, std::vector<LanePoint> & container)
+{
+  container.reserve(container.size() * 2);
+  container.insert(container.end(), points.begin(), points.end());
+}
+
+inline lanelet::Optional<std::string> toTypeName(const lanelet::ConstLanelet & lanelet)
+{
+  return lanelet.hasAttribute("type") ? lanelet.attribute("type").as<std::string>()
+                                      : lanelet::Optional<std::string>();
+}
+
+inline lanelet::Optional<std::string> toTypeName(const lanelet::ConstLineString3d & linestring)
+{
+  return linestring.hasAttribute("type") ? linestring.attribute("type").as<std::string>()
+                                         : lanelet::Optional<std::string>();
+}
+
+/**
+ * @brief Extract the subtype name from a lanelet.
+ *
+ * @param lanelet Lanelet instance.
+ * @return std::optional<string>
+ */
+inline lanelet::Optional<std::string> toSubtypeName(const lanelet::ConstLanelet & lanelet) noexcept
+{
+  return lanelet.hasAttribute("subtype") ? lanelet.attribute("subtype").as<std::string>()
+                                         : lanelet::Optional<std::string>();
+}
+
+/**
+ * @brief Extract the subtype name from a 3D linestring.
+ *
+ * @param linestring 3D linestring instance.
+ * @return lanelet::Optional<std::string>
+ */
+inline lanelet::Optional<std::string> toSubtypeName(
+  const lanelet::ConstLineString3d & linestring) noexcept
+{
+  return linestring.hasAttribute("subtype") ? linestring.attribute("subtype").as<std::string>()
+                                            : lanelet::Optional<std::string>();
+}
+
+/**
+ * @brief Check if the specified lanelet is the turnable intersection.
+ *
+ * @param lanelet Lanelet instance.
+ * @return true if the lanelet has the attribute named turn_direction.
+ */
+inline bool isTurnableIntersection(const lanelet::ConstLanelet & lanelet) noexcept
+{
+  return lanelet.hasAttribute("turn_direction");
+}
+
+/**
+ * @brief Check if the specified lanelet subtype is kind of lane.
+ *
+ * @param subtype
+ * @return True if the lanelet subtype is the one of the (road, highway, road_shoulder,
+ * pedestrian_lane, bicycle_lane, walkway).
+ */
+inline bool isLaneLike(const lanelet::Optional<std::string> & subtype)
+{
+  if (!subtype) {
+    return false;
+  }
+  const auto & subtype_str = subtype.value();
+  return (
+    subtype_str == "road" || subtype_str == "highway" || subtype_str == "road_shoulder" ||
+    subtype_str == "pedestrian_lane" || subtype_str == "bicycle_lane" || subtype_str == "walkway");
+}
+
+/**
+ * @brief Check if the specified lanelet subtype is kind of the roadway.
+ *
+ * @param subtype Subtype of the corresponding lanelet.
+ * @return True if the subtype is the one of the (road, highway, road_shoulder).
+ */
+inline bool isRoadwayLike(const lanelet::Optional<std::string> & subtype)
+{
+  if (!subtype) {
+    return false;
+  }
+  const auto & subtype_str = subtype.value();
+  return subtype_str == "road" || subtype_str == "highway" || subtype_str == "road_shoulder";
+}
+
+/**
+ * @brief Check if the specified linestring is kind of the boundary.
+ *
+ * @param linestring 3D linestring.
+ * @return True if the type is the one of the (line_thin, line_thick, road_boarder) and the subtype
+ * is not virtual.
+ */
+inline bool isBoundaryLike(const lanelet::ConstLineString3d & linestring)
+{
+  const auto type = toTypeName(linestring);
+  const auto subtype = toSubtypeName(linestring);
+  if (!type || !subtype) {
+    return false;
+  }
+
+  const auto & type_str = type.value();
+  const auto & subtype_str = subtype.value();
+  return (type_str == "line_thin" || type_str == "line_thick" || type_str == "road_boarder") &&
+         subtype_str != "virtual";
+}
+
+/**
+ * @brief Check if the specified linestring is the kind of crosswalk.
+ *
+ * @param subtype Subtype of the corresponding polygon.
+ * @return True if the lanelet subtype is the one of the (crosswalk,).
+ */
+inline bool isCrosswalkLike(const lanelet::Optional<std::string> & subtype)
+{
+  if (!subtype) {
+    return false;
+  }
+
+  const auto & subtype_str = subtype.value();
+  return subtype_str == "crosswalk";
+}
+
+/**
+ * @brief A class to convert lanelet map to polyline.
+ */
+class LaneletConverter
+{
+public:
+  /**
+   * @brief Construct a new Lanelet Converter object
+   *
+   * @param lanelet_map_ptr Pointer of loaded lanelet map.
+   * @param max_num_polyline The max number of polylines to be contained in the tensor. If the total
+   * number of polylines are less than this value, zero-filled polylines will be padded.
+   * @param max_num_point The max number of points to be contained in a single polyline.
+   * @param point_break_distance Distance threshold to separate two polylines.
+   */
+  explicit LaneletConverter(
+    lanelet::LaneletMapConstPtr lanelet_map_ptr, size_t max_num_polyline, size_t max_num_point,
+    float point_break_distance)
+  : lanelet_map_ptr_(lanelet_map_ptr),
+    max_num_polyline_(max_num_polyline),
+    max_num_point_(max_num_point),
+    point_break_distance_(point_break_distance)
+  {
+  }
+
+  /**
+   * @brief Convert a lanelet map to the polyline data except of points whose distance from the
+   * specified position is farther than the threshold.
+   *
+   * @param position Origin to check the distance from this.
+   * @param distance_threshold Distance threshold
+   * @return std::optional<PolylineData>
+   */
+  std::optional<PolylineData> convert(
+    const geometry_msgs::msg::Point & position, double distance_threshold) const;
+
+private:
+  /**
+   * @brief Convert a linestring to the set of polylines.
+   *
+   * @param linestring Linestring instance.
+   * @param position Origin to check the distance from this.
+   * @param distance_threshold Distance threshold from the specified position.
+   * @return std::vector<LanePoint>
+   */
+  std::vector<LanePoint> fromLinestring(
+    const lanelet::ConstLineString3d & linestring, const geometry_msgs::msg::Point & position,
+    double distance_threshold) const noexcept;
+
+  /**
+   * @brief Convert a polygon to the set of polylines.
+   *
+   * @param polygon Polygon instance.
+   * @param position Origin to check the distance from this.
+   * @param distance_threshold Distance threshold from the specified position.
+   * @return std::vector<LanePoint>
+   */
+  std::vector<LanePoint> fromPolygon(
+    const lanelet::CompoundPolygon3d & polygon, const geometry_msgs::msg::Point & position,
+    double distance_threshold) const noexcept;
+
+  lanelet::LaneletMapConstPtr lanelet_map_ptr_;  //!< Pointer of lanelet map.
+  size_t max_num_polyline_;                      //!< The max number of polylines.
+  size_t max_num_point_;                         //!< The max number of points.
+  float point_break_distance_;                   //!< Distance threshold to separate two polylines.
+};
+}  // namespace autoware::mtr
+
+#endif  // AUTOWARE__MTR__CONVERSIONS__LANELET_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/cuda_helper.hpp b/perception/autoware_mtr/include/autoware/mtr/cuda_helper.hpp
new file mode 100644
index 0000000000000..468b6b61d3352
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/cuda_helper.hpp
@@ -0,0 +1,138 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This code is licensed under CC0 1.0 Universal (Public Domain).
+ * You can use this without any limitation.
+ * https://creativecommons.org/publicdomain/zero/1.0/deed.en
+ */
+
+#ifndef AUTOWARE__MTR__CUDA_HELPER_HPP_
+#define AUTOWARE__MTR__CUDA_HELPER_HPP_
+
+#include <cuda_runtime_api.h>
+
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <type_traits>
+
+#define CHECK_CUDA_ERROR(e) (cuda::check_error(e, __FILE__, __LINE__))
+
+namespace cuda
+{
+/**
+ * @brief Throw if the input cuda error is not `cudaSuccess`.
+ *
+ * @param e The cuda error types.
+ * @param f The file name.
+ * @param n The line number.
+ */
+inline void check_error(const ::cudaError_t e, const char * f, int n)
+{
+  if (e != ::cudaSuccess) {
+    ::std::stringstream s;
+    s << ::cudaGetErrorName(e) << " (" << e << ")@" << f << "#L" << n << ": "
+      << ::cudaGetErrorString(e);
+    throw ::std::runtime_error{s.str()};
+  }
+}
+
+struct deleter
+{
+  void operator()(void * p) const { CHECK_CUDA_ERROR(::cudaFree(p)); }
+};
+
+template <typename T>
+using unique_ptr = ::std::unique_ptr<T, deleter>;
+
+template <typename T>
+typename ::std::enable_if<::std::is_array<T>::value, cuda::unique_ptr<T>>::type make_unique(
+  const ::std::size_t n)
+{
+  using U = typename ::std::remove_extent<T>::type;
+  U * p;
+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(U) * n));
+  return cuda::unique_ptr<T>{p};
+}
+
+template <typename T>
+cuda::unique_ptr<T> make_unique()
+{
+  T * p;
+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(T)));
+  return cuda::unique_ptr<T>{p};
+}
+
+constexpr size_t CUDA_ALIGN = 256;
+
+template <typename T>
+inline size_t get_size_aligned(size_t num_elem)
+{
+  size_t size = num_elem * sizeof(T);
+  size_t extra_align = 0;
+  if (size % CUDA_ALIGN != 0) {
+    extra_align = CUDA_ALIGN - size % CUDA_ALIGN;
+  }
+  return size + extra_align;
+}
+
+template <typename T>
+inline T * get_next_ptr(size_t num_elem, void *& workspace, size_t & workspace_size)
+{
+  size_t size = get_size_aligned<T>(num_elem);
+  if (size > workspace_size) {
+    throw ::std::runtime_error("Workspace is too small!");
+  }
+  workspace_size -= size;
+  T * ptr = reinterpret_cast<T *>(workspace);
+  workspace = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(workspace) + size);
+  return ptr;
+}
+
+class EventDebugger
+{
+public:
+  void createEvent(cudaStream_t stream = 0)
+  {
+    CHECK_CUDA_ERROR(cudaEventCreate(&start_));
+    CHECK_CUDA_ERROR(cudaEventCreate(&stop_));
+    CHECK_CUDA_ERROR(cudaEventRecord(start_, stream));
+    has_event_ = true;
+  }
+
+  void printElapsedTime(cudaStream_t stream = 0)
+  {
+    if (!has_event_) {
+      std::cerr << "No event was found" << std::endl;
+    } else {
+      CHECK_CUDA_ERROR(cudaEventRecord(stop_, stream));
+      CHECK_CUDA_ERROR(cudaEventSynchronize(stop_));
+      float elapsed_time;
+      CHECK_CUDA_ERROR(cudaEventElapsedTime(&elapsed_time, start_, stop_));
+      std::cout << "Processing time: " << elapsed_time << "ms" << std::endl;
+      cudaEventDestroy(start_);
+      cudaEventDestroy(stop_);
+      has_event_ = false;
+    }
+  }
+
+private:
+  cudaEvent_t start_, stop_;
+  bool has_event_{false};
+};
+
+}  // namespace cuda
+
+#endif  // AUTOWARE__MTR__CUDA_HELPER_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/fixed_queue.hpp b/perception/autoware_mtr/include/autoware/mtr/fixed_queue.hpp
new file mode 100644
index 0000000000000..8645628848792
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/fixed_queue.hpp
@@ -0,0 +1,87 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__FIXED_QUEUE_HPP_
+#define AUTOWARE__MTR__FIXED_QUEUE_HPP_
+
+#include <cstddef>
+#include <deque>
+#include <iterator>
+
+namespace autoware::mtr
+{
+
+template <typename T>
+class FixedQueue
+{
+public:
+  using size_type = typename std::deque<T>::size_type;
+  using reference = typename std::deque<T>::reference;
+  using const_reference = typename std::deque<T>::const_reference;
+  using iterator = typename std::deque<T>::iterator;
+  using riterator = typename std::reverse_iterator<iterator>;
+  using const_iterator = typename std::deque<T>::const_iterator;
+  using rconst_iterator = typename std::reverse_iterator<const_iterator>;
+
+  explicit FixedQueue(size_type size) : queue_(size) {}
+
+  void push_back(const T && t) noexcept
+  {
+    queue_.pop_front();
+    queue_.push_back(t);
+  }
+
+  void push_back(const T & t) noexcept
+  {
+    queue_.pop_front();
+    queue_.push_back(t);
+  }
+
+  void push_front(const T && t) noexcept
+  {
+    queue_.pop_back();
+    queue_.push_front(t);
+  }
+
+  void push_front(const T & t) noexcept
+  {
+    queue_.pop_back();
+    queue_.push_front(t);
+  }
+
+  reference front() noexcept { return queue_.front(); }
+  const_reference front() const noexcept { return queue_.front(); }
+
+  reference back() noexcept { return queue_.back(); }
+  const_reference back() const noexcept { return queue_.back(); }
+
+  iterator begin() noexcept { return queue_.begin(); }
+  const_iterator begin() const noexcept { return queue_.begin(); }
+
+  iterator end() noexcept { return queue_.end(); }
+  const_iterator end() const noexcept { return queue_.end(); }
+
+  riterator rbegin() noexcept { return queue_.rbegin(); }
+  rconst_iterator rbegin() const noexcept { return queue_.rbegin(); }
+
+  riterator rend() noexcept { return queue_.rend(); }
+  rconst_iterator rend() const noexcept { return queue_.rend(); }
+
+  size_type size() const noexcept { return queue_.size(); }
+
+private:
+  std::deque<T> queue_;
+};
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__FIXED_QUEUE_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/intention_point.hpp b/perception/autoware_mtr/include/autoware/mtr/intention_point.hpp
new file mode 100644
index 0000000000000..0cdf1c739dc3d
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/intention_point.hpp
@@ -0,0 +1,108 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__INTENTION_POINT_HPP_
+#define AUTOWARE__MTR__INTENTION_POINT_HPP_
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+namespace autoware::mtr
+{
+/**
+ * @brief A class to load and store intention points.
+ */
+struct IntentionPoint
+{
+public:
+  /**
+   * @brief Construct a new instance from a csv file.
+   *
+   * @param csv_filepath Path to csv file.
+   * @param num_cluster The number of clusters.
+   */
+  static IntentionPoint from_file(size_t num_cluster, const std::string csv_filepath)
+  {
+    std::ifstream file(csv_filepath);
+    if (!file.is_open()) {
+      std::ostringstream err_msg;
+      err_msg << "Error opening file: " << csv_filepath << ". Please check if the file exists.";
+      throw std::runtime_error(err_msg.str());
+    }
+
+    std::vector<std::tuple<float, float, std::string>> buffer;
+    std::string line;
+    while (std::getline(file, line)) {
+      std::istringstream ss(line);
+      float x, y;
+      std::string label;
+
+      ss >> x;
+      ss.ignore();
+      ss >> y;
+      ss.ignore();
+      std::getline(ss, label, ',');
+
+      buffer.emplace_back(x, y, label);
+    }
+    file.close();
+
+    std::unordered_map<std::string, std::vector<float>> data_map;
+    for (const auto & [x, y, label] : buffer) {
+      data_map[label].emplace_back(x);
+      data_map[label].emplace_back(y);
+    }
+
+    return {num_cluster, data_map};
+  }
+
+  // Returns the point state dimension, which is 2 (x, y).
+  static size_t state_dim() noexcept { return 2; }
+
+  /**
+   * @brief Load intention points for specified label names.
+   *
+   * @param label_names Array of label names for all agents, in shape [N].
+   * @return std::vector<float> Array of all points in shape, [N * num_cluster * 2].
+   */
+  std::vector<float> as_array(const std::vector<std::string> & label_names) const;
+
+  // Return the size of intension point `K*D`.
+  size_t size() const noexcept;
+
+  size_t num_cluster;  //!< The number of point clusters, which is K.
+
+private:
+  /**
+   * @brief Construct a new Intention.
+   *
+   * @param data_map Map of intention points hashed by label names.
+   * @param num_cluster The number of clusters.
+   */
+  IntentionPoint(
+    size_t num_cluster, const std::unordered_map<std::string, std::vector<float>> data_map)
+  : num_cluster(num_cluster), data_map_(data_map)
+  {
+  }
+
+  std::unordered_map<std::string, std::vector<float>> data_map_;  //!< Map of label name and points.
+};
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__INTENTION_POINT_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/node.hpp b/perception/autoware_mtr/include/autoware/mtr/node.hpp
new file mode 100644
index 0000000000000..4feaf0fe534a0
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/node.hpp
@@ -0,0 +1,142 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__NODE_HPP_
+#define AUTOWARE__MTR__NODE_HPP_
+
+#include "autoware/mtr/agent.hpp"
+#include "autoware/mtr/conversions/lanelet.hpp"
+#include "autoware/mtr/fixed_queue.hpp"
+#include "autoware/mtr/polyline.hpp"
+#include "autoware/mtr/trajectory.hpp"
+#include "autoware/mtr/trt_mtr.hpp"
+
+#include <autoware/object_recognition_utils/object_classification.hpp>
+#include <autoware/universe_utils/geometry/geometry.hpp>
+#include <autoware/universe_utils/ros/polling_subscriber.hpp>
+#include <autoware/universe_utils/ros/transform_listener.hpp>
+#include <autoware/universe_utils/ros/uuid_helper.hpp>
+#include <autoware_vehicle_info_utils/vehicle_info_utils.hpp>
+#include <rclcpp/rclcpp.hpp>
+
+#include <autoware_map_msgs/msg/detail/lanelet_map_bin__struct.hpp>
+#include <autoware_map_msgs/msg/lanelet_map_bin.hpp>
+#include <autoware_perception_msgs/msg/detail/tracked_object__struct.hpp>
+#include <autoware_perception_msgs/msg/object_classification.hpp>
+#include <autoware_perception_msgs/msg/predicted_object_kinematics.hpp>
+#include <autoware_perception_msgs/msg/predicted_objects.hpp>
+#include <autoware_perception_msgs/msg/predicted_path.hpp>
+#include <autoware_perception_msgs/msg/tracked_objects.hpp>
+#include <nav_msgs/msg/odometry.hpp>
+
+#include <lanelet2_core/LaneletMap.h>
+#include <lanelet2_routing/RoutingGraph.h>
+#include <lanelet2_traffic_rules/TrafficRules.h>
+
+#include <fstream>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace autoware::mtr
+{
+using HADMapBin = autoware_map_msgs::msg::LaneletMapBin;
+using autoware::vehicle_info_utils::VehicleInfo;
+using autoware_perception_msgs::msg::ObjectClassification;
+using autoware_perception_msgs::msg::PredictedObject;
+using autoware_perception_msgs::msg::PredictedObjectKinematics;
+using autoware_perception_msgs::msg::PredictedObjects;
+using autoware_perception_msgs::msg::PredictedPath;
+using autoware_perception_msgs::msg::TrackedObject;
+using autoware_perception_msgs::msg::TrackedObjects;
+using nav_msgs::msg::Odometry;
+
+class MTRNode : public rclcpp::Node
+{
+public:
+  explicit MTRNode(const rclcpp::NodeOptions & node_options);
+
+  // Object ID of the ego vehicle
+  inline static const std::string EGO_ID{"EGO"};
+
+private:
+  // Main callback being invoked when the tracked objects topic is subscribed.
+  void callback(const TrackedObjects::ConstSharedPtr object_msg);
+
+  // Callback being invoked when the HD map topic is subscribed.
+  void onMap(const HADMapBin::ConstSharedPtr map_msg);
+
+  // Fetch data of Ego's odometry topic.
+  std::optional<TrackedObject> getLatestEgo();
+
+  // Convert Lanelet to `PolylineData`.
+  bool convertLaneletToPolyline();
+
+  // Remove ancient agent histories.
+  void removeAncientAgentHistory(
+    const double current_time, const TrackedObjects::ConstSharedPtr objects_msg);
+
+  // Appends new states to history.
+  void updateAgentHistory(
+    const double current_time, const TrackedObjects::ConstSharedPtr objects_msg,
+    const TrackedObject & ego_msg);
+
+  // Extract target agents and return corresponding indices.
+  // NOTE: Extract targets in order of proximity, closest first.
+  std::vector<size_t> extractTargetAgent(const std::vector<AgentHistory> & histories);
+
+  // Return the timestamps relative from the first element.Return the timestamps relative from the
+  // first element.
+  std::vector<float> getRelativeTimestamps() const;
+
+  // Generate `PredictedObject` from `PredictedTrajectory`.
+  PredictedObject createPredictedObject(
+    const TrackedObject & object, const PredictedTrajectory & trajectory);
+
+  // ROS Publisher and Subscriber
+  // TODO(ktro2828): add debug publisher
+  rclcpp::Publisher<PredictedObjects>::SharedPtr pub_objects_;
+  rclcpp::Subscription<TrackedObjects>::SharedPtr sub_objects_;
+  rclcpp::Subscription<HADMapBin>::SharedPtr sub_map_;
+  // polling subscriber
+  autoware::universe_utils::InterProcessPollingSubscriber<Odometry> sub_ego_{
+    this, "/localization/kinematic_state"};
+
+  // Lanelet map pointers
+  std::shared_ptr<lanelet::LaneletMap> lanelet_map_ptr_;
+  std::shared_ptr<lanelet::routing::RoutingGraph> routing_graph_ptr_;
+  std::shared_ptr<lanelet::traffic_rules::TrafficRules> traffic_rules_ptr_;
+  std::unique_ptr<LaneletConverter> lanelet_converter_ptr_;
+
+  // Agent history
+  std::map<std::string, AgentHistory> agent_history_map_;
+  std::map<std::string, TrackedObject> object_msg_map_;
+  VehicleInfo vehicle_info_;
+
+  // Pose transform listener
+  autoware::universe_utils::TransformListener transform_listener_;
+
+  // MTR parameters
+  std::unique_ptr<MTRConfig> config_ptr_;
+  std::unique_ptr<BuildConfig> build_config_ptr_;
+  std::unique_ptr<TrtMTR> model_ptr_;
+
+  std::unique_ptr<FixedQueue<std::pair<float, AgentState>>> ego_states_;
+  std::unique_ptr<FixedQueue<double>> timestamps_;
+};  // class MTRNode
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__NODE_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/polyline.hpp b/perception/autoware_mtr/include/autoware/mtr/polyline.hpp
new file mode 100644
index 0000000000000..3b729cf617522
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/polyline.hpp
@@ -0,0 +1,235 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__POLYLINE_HPP_
+#define AUTOWARE__MTR__POLYLINE_HPP_
+
+#include <array>
+#include <cmath>
+#include <cstddef>
+#include <tuple>
+#include <vector>
+
+namespace autoware::mtr
+{
+constexpr size_t PointStateDim = 7;
+
+enum PolylineLabel { LANE = 0, ROAD_LINE = 1, ROAD_EDGE = 2, CROSSWALK = 3 };
+
+struct LanePoint
+{
+  // Construct a new instance filling all elements by `0.0f`.
+  LanePoint() : data_({0.0f}) {}
+
+  /**
+   * @brief Construct a new instance with specified values.
+   *
+   * @param x X position.
+   * @param y Y position.
+   * @param z Z position.
+   * @param dx Normalized delta x.
+   * @param dy Normalized delta y.
+   * @param dz Normalized delta z.
+   * @param label Label.
+   */
+  LanePoint(
+    const float x, const float y, const float z, const float dx, const float dy, const float dz,
+    const float label)
+  : data_({x, y, z, dx, dy, dz, label}), x_(x), y_(y), z_(z), label_(label)
+  {
+  }
+
+  // Construct a new instance filling all elements by `0.0f`.
+  static LanePoint empty() noexcept { return LanePoint(); }
+
+  // Return the point state dimensions `D`.
+  static size_t dim() { return PointStateDim; }
+
+  // Return the x position of the point.
+  float x() const { return x_; }
+
+  // Return the y position of the point.
+  float y() const { return y_; }
+
+  // Return the z position of the point.
+  float z() const { return z_; }
+
+  // Return the label of the point.
+  float label() const { return label_; }
+
+  /**
+   * @brief Return the distance between myself and another one.
+   *
+   * @param other Another point.
+   * @return float Distance between myself and another one.
+   */
+  float distance(const LanePoint & other) const
+  {
+    return std::hypot(x_ - other.x(), y_ - other.y(), z_ - other.z());
+  }
+
+  // Return the address pointer of data array.
+  const float * data_ptr() const noexcept { return data_.data(); }
+
+private:
+  std::array<float, PointStateDim> data_;
+  float x_{0.0f}, y_{0.0f}, z_{0.0f}, label_{0.0f};
+};
+
+struct PolylineData
+{
+  /**
+   * @brief Construct a new PolylineData instance.
+   *
+   * @param points Source points vector.
+   * @param min_num_polyline The minimum number of polylines should be generated. If the number of
+   * polylines, resulting in separating input points, is less than this value, empty polylines will
+   * be added.
+   * @param max_num_point The maximum number of points that each polyline can include. If the
+   * polyline contains fewer points than this value, empty points will be added.
+   * @param distance_threshold The distance threshold to separate polylines.
+   */
+  PolylineData(
+    const std::vector<LanePoint> & points, const size_t min_num_polyline,
+    const size_t max_num_point, const float distance_threshold)
+  : num_polyline_(0), num_point_(max_num_point), distance_threshold_(distance_threshold)
+  {
+    std::size_t point_cnt = 0;
+
+    // point_cnt > PointNum at a to a new polyline group
+    // distance > threshold -> add to a new polyline group
+    for (std::size_t i = 0; i < points.size(); ++i) {
+      auto & cur_point = points.at(i);
+
+      if (i == 0) {
+        addNewPolyline(cur_point, point_cnt);
+        continue;
+      }
+
+      if (point_cnt >= num_point_) {
+        addNewPolyline(cur_point, point_cnt);
+      } else if (const auto & prev_point = points.at(i - 1);
+                 cur_point.distance(prev_point) >= distance_threshold_ ||
+                 cur_point.label() != prev_point.label()) {
+        if (point_cnt < num_point_) {
+          addEmptyPoints(point_cnt);
+        }
+        addNewPolyline(cur_point, point_cnt);
+      } else {
+        addPoint(cur_point, point_cnt);
+      }
+    }
+    addEmptyPoints(point_cnt);
+
+    if (num_polyline_ < min_num_polyline) {
+      addEmptyPolyline(min_num_polyline - num_polyline_);
+    }
+  }
+
+  // Return the number of polylines `K`.
+  size_t num_polyline() const { return num_polyline_; }
+
+  // Return the number of points contained in each polyline `P`.
+  size_t num_point() const { return num_point_; }
+
+  // Return the number of point dimensions `D`.
+  static size_t state_dim() { return PointStateDim; }
+
+  // Return the number of all elements `K*P*D`.
+  size_t size() const { return num_polyline_ * num_point_ * state_dim(); }
+
+  // Return the number of state dimensions of MTR input `D+2`.
+  size_t input_dim() const { return state_dim() + 2; }
+
+  // Return the data shape ordering in `(K, P, D)`.
+  std::tuple<size_t, size_t, size_t> shape() const
+  {
+    return {num_polyline_, num_point_, state_dim()};
+  }
+
+  // Return the address pointer of data array.
+  const float * data_ptr() const noexcept { return data_.data(); }
+
+private:
+  /**
+   * @brief Add a new polyline group filled by empty points. This member function increments
+   * `PolylineNum` by `num_polyline` internally.
+   *
+   * @param num_polyline The number of polylines to add.
+   */
+  void addEmptyPolyline(size_t num_polyline)
+  {
+    for (size_t i = 0; i < num_polyline; ++i) {
+      size_t point_cnt = 0;
+      auto empty_point = LanePoint::empty();
+      addNewPolyline(empty_point, point_cnt);
+      addEmptyPoints(point_cnt);
+    }
+  }
+
+  /**
+   * @brief Add a new polyline group with the specified point. This member function increments
+   * `PolylineNum` by `1` internally.
+   *
+   * @param point LanePoint instance.
+   * @param point_cnt The current count of points, which will be reset to `1`.
+   */
+  void addNewPolyline(const LanePoint & point, size_t & point_cnt)
+  {
+    const auto s = point.data_ptr();
+    for (size_t d = 0; d < state_dim(); ++d) {
+      data_.push_back(*(s + d));
+    }
+    ++num_polyline_;
+    point_cnt = 1;
+  }
+
+  /**
+   * @brief Add `(PointNum - point_cnt)` empty points filled by `0.0`.
+   *
+   * @param point_cnt The number of current count of points, which will be reset to `PointNum`.
+   */
+  void addEmptyPoints(size_t & point_cnt)
+  {
+    const auto s = LanePoint::empty().data_ptr();
+    for (std::size_t n = point_cnt; n < num_point_; ++n) {
+      for (std::size_t d = 0; d < state_dim(); ++d) {
+        data_.push_back(*(s + d));
+      }
+    }
+    point_cnt = num_point_;
+  }
+
+  /**
+   * @brief Add the specified point and increment `point_cnt` by `1`.
+   *
+   * @param point
+   * @param point_cnt
+   */
+  void addPoint(const LanePoint & point, std::size_t & point_cnt)
+  {
+    const auto s = point.data_ptr();
+    for (size_t d = 0; d < state_dim(); ++d) {
+      data_.push_back(*(s + d));
+    }
+    ++point_cnt;
+  }
+
+  size_t num_polyline_;
+  size_t num_point_;
+  std::vector<float> data_;
+  const float distance_threshold_;
+};
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__POLYLINE_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/trajectory.hpp b/perception/autoware_mtr/include/autoware/mtr/trajectory.hpp
new file mode 100644
index 0000000000000..24641d8d4999d
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/trajectory.hpp
@@ -0,0 +1,170 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__TRAJECTORY_HPP_
+#define AUTOWARE__MTR__TRAJECTORY_HPP_
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <vector>
+
+namespace autoware::mtr
+{
+
+constexpr size_t PredictedStateDim = 7;
+
+/**
+ * @brief A class to represent a predicted state. Note that output elements are (x, y, stdX,
+ * stdY, rho, vx, vy).
+ */
+struct PredictedState
+{
+  explicit PredictedState(const std::array<double, PredictedStateDim> & state)
+  : x_(state.at(0)),
+    y_(state.at(1)),
+    std_x_(state.at(2)),
+    std_y_(state.at(3)),
+    rho_(state.at(4)),
+    vx_(state.at(5)),
+    vy_(state.at(6))
+  {
+  }
+
+  PredictedState(
+    const double x, const double y, const double std_x, const double std_y, const double rho,
+    const double vx, const double vy)
+  : x_(x), y_(y), std_x_(std_x), std_y_(std_y), rho_(rho), vx_(vx), vy_(vy)
+  {
+  }
+
+  // Return the predicted state dimensions `D`.
+  static size_t dim() { return PredictedStateDim; }
+
+  // Return the predicted x position.
+  double x() const { return x_; }
+
+  // Return the predicted y position.
+  double y() const { return y_; }
+
+  // Return the predicted std x.
+  double std_x() const { return std_x_; }
+
+  // Return the predicted mean y.
+  double std_y() const { return std_y_; }
+
+  // Return the predicted rho.
+  double rho() const { return rho_; }
+
+  // Return the predicted x velocity.
+  double vx() const { return vx_; }
+
+  // Return the predicted y velocity.
+  double vy() const { return vy_; }
+
+private:
+  double x_, y_, std_x_, std_y_, rho_, vx_, vy_;
+};  // struct PredictedState
+
+/**
+ * @brief A class to represent waypoints of a single mode.
+ */
+struct PredictedMode
+{
+  PredictedMode(const double score, const std::vector<double> & waypoints, const size_t num_future)
+  : score_(score), num_future_(num_future)
+  {
+    for (size_t t = 0; t < num_future_; ++t) {
+      const auto start_itr = waypoints.cbegin() + t * state_dim();
+      std::array<double, PredictedStateDim> state;
+      std::copy_n(start_itr, PredictedStateDim, state.begin());
+      waypoints_.emplace_back(state);
+    }
+  }
+
+  // Return the number of predicted future timestamps `T`.
+  size_t num_future() const { return num_future_; }
+
+  // Return the number of predicted state dimensions `D`.
+  static size_t state_dim() { return PredictedStateDim; }
+
+  // Return the predicted score.
+  double score() const { return score_; }
+
+  // Return the vector of waypoints.
+  const std::vector<PredictedState> & get_waypoints() const { return waypoints_; }
+
+private:
+  double score_;
+  size_t num_future_;
+  std::vector<PredictedState> waypoints_;
+};  // struct PredictedMode
+
+/**
+ * @brief A class to represent predicted trajectory for a single target agent.
+ */
+struct PredictedTrajectory
+{
+  /**
+   * @brief Construct a new instance.
+   *
+   * @note Predicted trajectories are sorted with the smallest scores.
+   *
+   * @param scores Predicted cores for each target, in shape `[B*M]`.
+   * @param trajectories Predicted trajectories for each target. `[B*M*T*D]`.
+   * @param num_mode The number of predicted modes.
+   * @param num_future The number of predicted timestamps.
+   */
+  PredictedTrajectory(
+    const std::vector<double> & scores, const std::vector<double> & modes, const size_t num_mode,
+    const size_t num_future)
+  : num_mode_(num_mode), num_future_(num_future)
+  {
+    for (size_t m = 0; m < num_mode_; ++m) {
+      const auto score = scores.at(m);
+      const auto wp_itr = modes.cbegin() + m * num_future_ * state_dim();
+      std::vector<double> waypoints(wp_itr, wp_itr + num_future_ * state_dim());
+      modes_.emplace_back(score, waypoints, num_future_);
+    }
+    // sort by score
+    sort_by_score();
+  }
+
+  // Return the number of predicted modes `M`.
+  size_t num_mode() const { return num_mode_; }
+
+  // Return the number of predicted future timestamps `T`.
+  size_t num_future() const { return num_future_; }
+
+  // Return the number of predicted state dimensions `D`.
+  static size_t state_dim() { return PredictedStateDim; }
+
+  // Return predicted modes. Modes are sorted in descending order based on their scores.
+  const std::vector<PredictedMode> & get_modes() const { return modes_; }
+
+private:
+  // Sort modes in descending order based on their scores.
+  void sort_by_score()
+  {
+    std::sort(modes_.begin(), modes_.end(), [](const auto & mode1, const auto & mode2) {
+      return mode1.score() > mode2.score();
+    });
+  }
+
+  size_t num_mode_;
+  size_t num_future_;
+  std::vector<PredictedMode> modes_;
+};  // struct PredictedTrajectory
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__TRAJECTORY_HPP_
diff --git a/perception/autoware_mtr/include/autoware/mtr/trt_mtr.hpp b/perception/autoware_mtr/include/autoware/mtr/trt_mtr.hpp
new file mode 100644
index 0000000000000..a22bf806fe152
--- /dev/null
+++ b/perception/autoware_mtr/include/autoware/mtr/trt_mtr.hpp
@@ -0,0 +1,195 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AUTOWARE__MTR__TRT_MTR_HPP_
+#define AUTOWARE__MTR__TRT_MTR_HPP_
+
+#include "attention/trt_attn_value_computation.hpp"
+#include "attention/trt_attn_weight_computation.hpp"
+#include "autoware/mtr/agent.hpp"
+#include "autoware/mtr/builder.hpp"
+#include "autoware/mtr/cuda_helper.hpp"
+#include "autoware/mtr/intention_point.hpp"
+#include "autoware/mtr/polyline.hpp"
+#include "autoware/mtr/trajectory.hpp"
+#include "knn/trt_knn_batch.hpp"
+#include "knn/trt_knn_batch_mlogk_kernel.hpp"
+
+#include <array>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace autoware::mtr
+{
+/**
+ * @brief A configuration of MTR.
+ */
+struct MTRConfig
+{
+  /**
+   * @brief Construct a new instance.
+   *
+   * @param target_labels An array of target label names.
+   * @param num_past The number of past timestamps.
+   * @param num_mode The number of modes.
+   * @param num_future The number of future time step length predicted by MTR.
+   * @param max_num_polyline The max number of polylines which can be contained in a single input.
+   * @param max_num_point The max number of points contained in each polyline.
+   * @param intention_point_filepath The path to intention points file.
+   * @param num_intention_point_cluster The number of clusters for intension points.
+   */
+  explicit MTRConfig(
+    const std::vector<std::string> & target_labels = {"VEHICLE", "PEDESTRIAN", "CYCLIST"},
+    const size_t num_past = 11, const size_t num_mode = 6, const size_t num_future = 80,
+    const size_t max_num_polyline = 768, const size_t max_num_point = 20,
+    const float point_break_distance = 1.0f,
+    std::string intention_point_filepath = "./data/intention_point.csv",
+    const size_t num_intention_point_cluster = 64)
+  : target_labels(target_labels),
+    num_class(target_labels.size()),
+    num_past(num_past),
+    num_mode(num_mode),
+    num_future(num_future),
+    max_num_polyline(max_num_polyline),
+    max_num_point(max_num_point),
+    point_break_distance(point_break_distance),
+    intention_point_filepath(std::move(intention_point_filepath)),
+    num_intention_point_cluster(num_intention_point_cluster)
+  {
+  }
+
+  std::vector<std::string> target_labels;
+  size_t num_class;
+  size_t num_past;
+  size_t num_mode;
+  size_t num_future;
+  size_t max_num_polyline;
+  size_t max_num_point;
+  float point_break_distance;
+  std::string intention_point_filepath;
+  size_t num_intention_point_cluster;
+};  // struct MTRConfig
+
+/**
+ * @brief A class to inference with MTR.
+ */
+class TrtMTR
+{
+public:
+  /**
+   * @brief Construct a new instance.
+   *
+   * @param model_path The path to engine or onnx file.
+   * @param config The configuration of model.
+   * @param build_config The configuration of build.
+   * @param max_workspace_size The max size of workspace.
+   * @param build_config The configuration of build.
+   */
+  explicit TrtMTR(
+    const std::string & model_path, const MTRConfig & config = MTRConfig(),
+    const BuildConfig & build_config = BuildConfig(),
+    const size_t max_workspace_size = (1ULL << 30));
+
+  /**
+   * @brief Execute inference.
+   *
+   * @param agent_data The agent data to input.
+   * @param polyline_data The polyline data to input.
+   * @param trajectories A container to store predicted trajectories.
+   * @return True if the inference finishes successfully.
+   */
+  bool doInference(
+    const AgentData & agent_data, const PolylineData & polyline_data,
+    std::vector<PredictedTrajectory> & trajectories);
+
+  /**
+   * @brief Return the model configuration.
+   *
+   * @return const MtrConfig& The model configuration which can not be updated.
+   */
+  const MTRConfig & config() const { return config_; }
+
+private:
+  /**
+   * @brief Initialize pointers of cuda containers.
+   *
+   * @param agent_data The input agent data.
+   * @param polyline_data The input polyline data.
+   */
+  void initCudaPtr(const AgentData & agent_data, const PolylineData & polyline_data);
+
+  /**
+   * @brief Execute pre-process.
+   *
+   * @param agent_data The input agent data.
+   * @param polyline_data The input polyline data.
+   * @return True if the pre-process finishes successfully.
+   */
+  bool preProcess(const AgentData & agent_data, const PolylineData & polyline_data);
+
+  /**
+   * @brief Execute post-process.
+   *
+   * @param agent_data The input agent data.
+   * @param trajectories A container to store predicted trajectories.
+   * @return True if the post-process finishes successfully.
+   */
+  bool postProcess(const AgentData & agent_data, std::vector<PredictedTrajectory> & trajectories);
+
+  // model parameters
+  MTRConfig config_;
+
+  std::unique_ptr<MTRBuilder> builder_;
+  cudaStream_t stream_{nullptr};
+
+  IntentionPoint intention_point_;
+
+  // data size
+  // load from input data
+  int32_t num_target_, num_agent_, num_timestamp_, num_agent_attr_;
+  int32_t num_polyline_, num_point_, num_point_dim_, num_point_attr_;
+  // load from config
+  int32_t max_num_polyline_, num_mode_, num_future_, num_intention_point_;
+
+  // source data
+  cuda::unique_ptr<int[]> d_target_index_{nullptr};
+  cuda::unique_ptr<int[]> d_label_index_{nullptr};
+  cuda::unique_ptr<float[]> d_timestamp_{nullptr};
+  cuda::unique_ptr<float[]> d_trajectory_{nullptr};
+  cuda::unique_ptr<float[]> d_target_state_{nullptr};
+  cuda::unique_ptr<float[]> d_intention_point_{nullptr};
+  cuda::unique_ptr<float[]> d_polyline_{nullptr};
+
+  // preprocessed inputs
+  cuda::unique_ptr<float[]> d_in_trajectory_{nullptr};
+  cuda::unique_ptr<bool[]> d_in_trajectory_mask_{nullptr};
+  cuda::unique_ptr<float[]> d_in_last_pos_{nullptr};
+  cuda::unique_ptr<float[]> d_in_polyline_{nullptr};
+  cuda::unique_ptr<bool[]> d_in_polyline_mask_{nullptr};
+  cuda::unique_ptr<float[]> d_in_polyline_center_{nullptr};
+  // only used for topk extraction
+  cuda::unique_ptr<float[]> d_tmp_polyline_{nullptr};
+  cuda::unique_ptr<bool[]> d_tmp_polyline_mask_{nullptr};
+  cuda::unique_ptr<float[]> d_tmp_distance_{nullptr};
+
+  // outputs
+  cuda::unique_ptr<float[]> d_out_score_{nullptr};
+  cuda::unique_ptr<float[]> d_out_trajectory_{nullptr};
+  std::vector<float> h_out_score_;
+  std::vector<float> h_out_trajectory_;
+};  // class TrtMTR
+}  // namespace autoware::mtr
+#endif  // AUTOWARE__MTR__TRT_MTR_HPP_
diff --git a/perception/autoware_mtr/launch/mtr.launch.xml b/perception/autoware_mtr/launch/mtr.launch.xml
new file mode 100644
index 0000000000000..11fca98298923
--- /dev/null
+++ b/perception/autoware_mtr/launch/mtr.launch.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<launch>
+  <arg name="param_path" default="$(find-pkg-share autoware_mtr)/config/mtr.param.yaml"/>
+  <arg name="data_path" default="$(find-pkg-share autoware_mtr)/data"/>
+
+  <arg name="input/objects" default="/perception/object_recognition/tracking/objects"/>
+  <arg name="input/vector_map" default="/map/vector_map"/>
+  <arg name="input/ego" default="/localization/kinematic_state"/>
+  <arg name="output/objects" default="mtr/objects"/>
+  <arg name="build_only" default="false"/>
+
+  <node pkg="autoware_mtr" exec="autoware_mtr" name="mtr" output="screen">
+    <param from="$(var param_path)" allow_substs="true"/>
+    <remap from="~/input/objects" to="$(var input/objects)"/>
+    <remap from="~/input/vector_map" to="$(var input/vector_map)"/>
+    <remap from="~/input/ego" to="$(var input/ego)"/>
+    <remap from="~/output/objects" to="$(var output/objects)"/>
+    <param name="build_only" value="$(var build_only)"/>
+  </node>
+</launch>
diff --git a/perception/autoware_mtr/lib/include/attention/trt_attn_value_computation.hpp b/perception/autoware_mtr/lib/include/attention/trt_attn_value_computation.hpp
new file mode 100644
index 0000000000000..70c958c0575ed
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/attention/trt_attn_value_computation.hpp
@@ -0,0 +1,101 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ATTENTION__TRT_ATTN_VALUE_COMPUTATION_HPP_
+#define ATTENTION__TRT_ATTN_VALUE_COMPUTATION_HPP_
+
+#include "common/trt_plugin_base.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+/**
+ * @brief Attention value computation plugin.
+ *
+ * @param queryBatchCnt <int; [B]>
+ * @param keyBatchCnt <int; [B]>
+ * @param indexPairBatch <int; [Q]>
+ * @param indexPair <int; [Q, L]>
+ * @param attnWeight <float; [Q, L, numHead]>
+ * @param valueFeature <float; [K, numHead, headDim]>
+ * @return outputs <float; [Q, numHead, headDim]>
+ */
+class AttentionValueComputation : public TRTPluginBase
+{
+public:
+  explicit AttentionValueComputation(const std::string & name);
+  AttentionValueComputation(const std::string & name, const void * data, size_t length);
+  ~AttentionValueComputation() TRT_NOEXCEPT override;
+
+  /* IPluginV2DynamicExt methods */
+  nvinfer1::IPluginV2DynamicExt * clone() const TRT_NOEXCEPT override;
+
+  nvinfer1::DimsExprs getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs * inputs, int nbInputs,
+    nvinfer1::IExprBuilder & exprBuilder) TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc * ioDesc, int nbInputs,
+    int nbOutputs) TRT_NOEXCEPT override;
+
+  void configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT override;
+
+  size_t getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::PluginTensorDesc * outDesc, int nbOutputs) const TRT_NOEXCEPT override;
+
+  int enqueue(
+    const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) TRT_NOEXCEPT override;
+
+  void attachToContext(
+    cudnnContext * cudnnCtx, cublasContext * cublasCtx,
+    nvinfer1::IGpuAllocator * gpuAllocator) TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+  /* IPluginV2Ext methods */
+  nvinfer1::DataType getOutputDataType(
+    int index, const nvinfer1::DataType * inTypes, int nbInputs) const TRT_NOEXCEPT override;
+
+  /* IPluginV2 methods */
+  const char * getPluginType() const TRT_NOEXCEPT override;
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+  int getNbOutputs() const TRT_NOEXCEPT override;
+  size_t getSerializationSize() const TRT_NOEXCEPT override;
+  void serialize(void * buffer) const TRT_NOEXCEPT override;
+};  // class AttentionValueComputation
+
+class AttentionValueComputationCreator : public TRTPluginCreatorBase
+{
+public:
+  AttentionValueComputationCreator();
+
+  const char * getPluginName() const TRT_NOEXCEPT override;
+
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * createPlugin(
+    const char * name, const nvinfer1::PluginFieldCollection * fc) TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT override;
+};  // class AttentionValueComputationCreator
+REGISTER_TENSORRT_PLUGIN(AttentionValueComputationCreator);
+}  // namespace autoware::trt_mtr
+#endif  // ATTENTION__TRT_ATTN_VALUE_COMPUTATION_HPP_
diff --git a/perception/autoware_mtr/lib/include/attention/trt_attn_value_computation_kernel.hpp b/perception/autoware_mtr/lib/include/attention/trt_attn_value_computation_kernel.hpp
new file mode 100644
index 0000000000000..fdcb20a4b448e
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/attention/trt_attn_value_computation_kernel.hpp
@@ -0,0 +1,48 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ATTENTION__TRT_ATTN_VALUE_COMPUTATION_KERNEL_HPP_
+#define ATTENTION__TRT_ATTN_VALUE_COMPUTATION_KERNEL_HPP_
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+/**
+ * @brief The launcher to invoke attention value computation kernel.
+ *
+ * @param B The size of batch.
+ * @param Q The size of query.
+ * @param L The size of local.
+ * @param K The size of key.
+ * @param numHead The number of heads.
+ * @param headDim The number of head dimensions.
+ * @param queryBatchCnt The number of queries for each batch, in shape [B].
+ * @param keyBatchCnt The number of keys for each batch, in shape [B].
+ * @param indexPairBatch The indices of batch for corresponding query, in shape [Q].
+ * @param indexPair The indices of key for corresponding query, in shape [Q*L].
+ * @param attnWeight Source attention weights, in shape [Q*L*numHead].
+ * @param valueFeature Source value features, in shape [K*numHead*headDim].
+ * @param output Output container, in shape [Q*numHead*headDim].
+ * @param stream CUDA stream.
+ *
+ * @return cudaError_t CUDA error type.
+ */
+cudaError_t AttentionValueComputationLauncher(
+  const int32_t B, const int32_t Q, const int32_t L, const int32_t K, const int32_t numHead,
+  const int32_t headDim, const int * queryBatchCnt, const int * keyBatchCnt,
+  const int * indexPairBatch, const int * indexPair, const float * attnWeight,
+  const float * valueFeature, float * output, cudaStream_t stream);
+
+#endif  // ATTENTION__TRT_ATTN_VALUE_COMPUTATION_KERNEL_HPP_
diff --git a/perception/autoware_mtr/lib/include/attention/trt_attn_weight_computation.hpp b/perception/autoware_mtr/lib/include/attention/trt_attn_weight_computation.hpp
new file mode 100644
index 0000000000000..42aaf2ecfeba7
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/attention/trt_attn_weight_computation.hpp
@@ -0,0 +1,101 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ATTENTION__TRT_ATTN_WEIGHT_COMPUTATION_HPP_
+#define ATTENTION__TRT_ATTN_WEIGHT_COMPUTATION_HPP_
+
+#include "common/trt_plugin_base.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+/**
+ * @brief The function to compute attention weight.
+ *
+ * @param queryBatchCnt <int; [B]>
+ * @param keyBatchCnt <int; [B]>
+ * @param indexPairBatch <int; [Q]>
+ * @param indexPair <int; [Q, L]>
+ * @param queryFeature <float; [Q, numHead, headDim]>
+ * @param keyFeature <float; [Q, numHead, headDim]>
+ * @param output <float; [Q, L, numHead]>
+ */
+class AttentionWeightComputation : public TRTPluginBase
+{
+public:
+  explicit AttentionWeightComputation(const std::string & name);
+  AttentionWeightComputation(const std::string & name, const void * data, size_t length);
+  ~AttentionWeightComputation() TRT_NOEXCEPT override;
+
+  /* IPluginV2DynamicExt methods */
+  nvinfer1::IPluginV2DynamicExt * clone() const TRT_NOEXCEPT override;
+
+  nvinfer1::DimsExprs getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs * inputs, int nbInputs,
+    nvinfer1::IExprBuilder & exprBuilder) TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc * ioDesc, int nbInputs,
+    int nbOutputs) TRT_NOEXCEPT override;
+
+  void configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT override;
+
+  size_t getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::PluginTensorDesc * outDesc, int nbOutputs) const TRT_NOEXCEPT override;
+
+  int enqueue(
+    const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) TRT_NOEXCEPT override;
+
+  void attachToContext(
+    cudnnContext * cudnnCtx, cublasContext * cublasCtx,
+    nvinfer1::IGpuAllocator * gpuAllocator) TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+  /* IPluginV2Ext methods */
+  nvinfer1::DataType getOutputDataType(
+    int index, const nvinfer1::DataType * inTypes, int nbInputs) const TRT_NOEXCEPT override;
+
+  /* IPluginV2 methods */
+  const char * getPluginType() const TRT_NOEXCEPT override;
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+  int getNbOutputs() const TRT_NOEXCEPT override;
+  size_t getSerializationSize() const TRT_NOEXCEPT override;
+  void serialize(void * buffer) const TRT_NOEXCEPT override;
+};  // class AttentionWeightComputation
+
+class AttentionWeightComputationCreator : public TRTPluginCreatorBase
+{
+public:
+  AttentionWeightComputationCreator();
+
+  const char * getPluginName() const TRT_NOEXCEPT override;
+
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * createPlugin(
+    const char * name, const nvinfer1::PluginFieldCollection * fc) TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT override;
+};  // class AttentionWeightComputationCreator
+REGISTER_TENSORRT_PLUGIN(AttentionWeightComputationCreator);
+}  // namespace autoware::trt_mtr
+#endif  // ATTENTION__TRT_ATTN_WEIGHT_COMPUTATION_HPP_
diff --git a/perception/autoware_mtr/lib/include/attention/trt_attn_weight_computation_kernel.hpp b/perception/autoware_mtr/lib/include/attention/trt_attn_weight_computation_kernel.hpp
new file mode 100644
index 0000000000000..5ff72f4637efb
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/attention/trt_attn_weight_computation_kernel.hpp
@@ -0,0 +1,48 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ATTENTION__TRT_ATTN_WEIGHT_COMPUTATION_KERNEL_HPP_
+#define ATTENTION__TRT_ATTN_WEIGHT_COMPUTATION_KERNEL_HPP_
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+/**
+ * @brief The launcher to invoke attention weight computation kernel.
+ *
+ * @param B The size of batch.
+ * @param Q The size of query.
+ * @param L The size of local.
+ * @param K The size of key.
+ * @param numHead The number of heads.
+ * @param headDim The number of head dimensions.
+ * @param queryBatchCnt The number of queries for each batch, in shape [B].
+ * @param keyBatchCnt The number of keys for each batch, in shape [B].
+ * @param indexPairBatch The indices of batch for corresponding query, in shape [Q].
+ * @param indexPair The indices of key for corresponding query, in shape [Q*L].
+ * @param queryFeature Source query features, in shape [Q*numHead*headDim].
+ * @param keyFeature Source key features, in shape [K*numHead*headDim].
+ * @param output Output container, in shape [Q*L*numHead].
+ * @param stream CUDA stream.
+ *
+ * @return cudaError_t CUDA error type.
+ */
+cudaError_t AttentionWeightComputationLauncher(
+  const int32_t B, const int32_t Q, const int32_t L, const int32_t K, const int32_t numHead,
+  const int32_t headDim, const int * queryBatchCnt, const int * keyBatchCnt,
+  const int * indexPairBatch, const int * indexPair, const float * queryFeature,
+  const float * keyFeature, float * output, cudaStream_t stream);
+
+#endif  // ATTENTION__TRT_ATTN_WEIGHT_COMPUTATION_KERNEL_HPP_
diff --git a/perception/autoware_mtr/lib/include/common/trt_plugin_base.hpp b/perception/autoware_mtr/lib/include/common/trt_plugin_base.hpp
new file mode 100644
index 0000000000000..685cf075c1e43
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/common/trt_plugin_base.hpp
@@ -0,0 +1,173 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef COMMON__TRT_PLUGIN_BASE_HPP_
+#define COMMON__TRT_PLUGIN_BASE_HPP_
+
+#include "trt_plugin_helper.hpp"
+
+#include <NvInferRuntime.h>
+#include <NvInferVersion.h>
+#include <cuda_runtime.h>
+#include <cudnn.h>
+
+#include <string>
+#include <vector>
+
+namespace autoware::trt_mtr
+{
+#if NV_TENSORRT_MAJOR > 7
+#define TRT_NOEXCEPT noexcept
+#else
+#define TRT_NOEXCEPT
+#endif
+
+class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt
+{
+public:
+  explicit TRTPluginBase(const std::string & name) : mLayerName(name) {}
+
+  /* IPluginV2 methods */
+
+  /**
+   * @brief Return the plugin version. Should match the plugin version returned by the corresponding
+   * plugin creator.
+   *
+   * @return const char*
+   */
+  const char * getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
+
+  /**
+   * @brief Initialize the layer for execution. This is called when the engine is created.
+   *
+   * @return int 0 for success, else non-zero (which will cause engine termination).
+   */
+  int initialize() TRT_NOEXCEPT override { return STATUS_SUCCESS; }
+
+  /**
+   * @brief Release resources acquired during plugin layer initialization. This is called when the
+   * engine is destroyed.
+   *
+   */
+  void terminate() TRT_NOEXCEPT override {}
+
+  /**
+   * @brief Destroy the plugin objects. This will be called when the network, builder or engine is
+   * destroyed.
+   *
+   */
+  void destroy() TRT_NOEXCEPT override { delete this; }
+
+  /**
+   * @brief Set the Plugin Namespace object
+   *
+   * @param pluginNamespace
+   */
+  void setPluginNamespace(const char * pluginNamespace) TRT_NOEXCEPT override
+  {
+    mNamespace = pluginNamespace;
+  }
+
+  /**
+   * @brief Get the Plugin Namespace object
+   *
+   * @return const char*
+   */
+  const char * getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }
+
+  /**
+   * @brief
+   *
+   * @param inDesc
+   * @param nbInputs
+   * @param outDesc
+   * @param nbOutputs
+   */
+  void configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc *, int, const nvinfer1::DynamicPluginTensorDesc *,
+    int) TRT_NOEXCEPT override
+  {
+  }
+
+  /**
+   * @brief Get the Workspace Size object
+   *
+   * @param inDesc
+   * @param nbInputs
+   * @param outDesc
+   * @param nbOutputs
+   * @return size_t
+   */
+  size_t getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc *, int, const nvinfer1::PluginTensorDesc *,
+    int) const TRT_NOEXCEPT override
+  {
+    return 0;
+  }
+
+  /**
+   * @brief
+   *
+   * @param cudnnCtx
+   * @param cublasCtx
+   * @param gpuAllocator
+   */
+  void attachToContext(cudnnContext *, cublasContext *, nvinfer1::IGpuAllocator *)
+    TRT_NOEXCEPT override
+  {
+  }
+
+  /**
+   * @brief
+   *
+   */
+  void detachFromContext() TRT_NOEXCEPT override {}
+
+protected:
+  const std::string mLayerName;
+  std::string mNamespace;
+
+#if NV_TENSORRT_MAJOR < 8
+protected:
+  // To prevent compiler warnings.
+  using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch;
+  using nvinfer1::IPluginV2DynamicExt::enqueue;
+  using nvinfer1::IPluginV2DynamicExt::getOutputDimensions;
+  using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch;
+  using nvinfer1::IPluginV2DynamicExt::supportsFormat;
+#endif
+};
+
+class TRTPluginCreatorBase : public nvinfer1::IPluginCreator
+{
+public:
+  const char * getPluginVersion() const TRT_NOEXCEPT override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection * getFieldNames() TRT_NOEXCEPT override { return &mFC; }
+
+  void setPluginNamespace(const char * pluginNamespace) TRT_NOEXCEPT override
+  {
+    mNamespace = pluginNamespace;
+  }
+
+  const char * getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); }
+
+protected:
+  nvinfer1::PluginFieldCollection mFC;
+  std::vector<nvinfer1::PluginField> mPluginAttributes;
+  std::string mNamespace;
+};
+}  // namespace autoware::trt_mtr
+
+#endif  // COMMON__TRT_PLUGIN_BASE_HPP_
diff --git a/perception/autoware_mtr/lib/include/common/trt_plugin_helper.hpp b/perception/autoware_mtr/lib/include/common/trt_plugin_helper.hpp
new file mode 100644
index 0000000000000..5fcb1c571ec75
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/common/trt_plugin_helper.hpp
@@ -0,0 +1,41 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef COMMON__TRT_PLUGIN_HELPER_HPP_
+#define COMMON__TRT_PLUGIN_HELPER_HPP_
+
+#include <NvInferRuntime.h>
+
+#include <iostream>
+
+#define THREADS_PER_BLOCK 256
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+enum pluginStatus_t {
+  STATUS_SUCCESS = 0,
+  STATUS_FAILURE = 1,
+  STATUS_BAD_PARAM = 2,
+  STATUS_NOT_SUPPORTED = 3,
+  STATUS_NOT_INITIALIZED = 4
+};  // enum pluginStatus_t
+
+#define PLUGIN_ASSERT(assertion)                                             \
+  {                                                                          \
+    if (!(assertion)) {                                                      \
+      std::cerr << "#assertion" << __FILE__ << "," << __LINE__ << std::endl; \
+      abort();                                                               \
+    }                                                                        \
+  }
+
+#endif  // COMMON__TRT_PLUGIN_HELPER_HPP_
diff --git a/perception/autoware_mtr/lib/include/common/trt_serialize.hpp b/perception/autoware_mtr/lib/include/common/trt_serialize.hpp
new file mode 100644
index 0000000000000..279dc9a9695df
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/common/trt_serialize.hpp
@@ -0,0 +1,127 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef COMMON__TRT_SERIALIZE_HPP_
+#define COMMON__TRT_SERIALIZE_HPP_
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <type_traits>
+#include <vector>
+
+template <typename T>
+inline void serialize_value(void ** buffer, T const & value);
+
+template <typename T>
+inline void deserialize_value(void const ** buffer, size_t * buffer_size, T * value);
+
+namespace
+{
+template <typename T, class Enable = void>
+struct Serializer
+{
+};
+
+template <typename T>
+struct Serializer<
+  T, typename std::enable_if<
+       std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
+{
+  static size_t serialized_size(T const & value) { return sizeof(T); }
+  static void serialize(void ** buffer, T const & value)
+  {
+    ::memcpy(*buffer, &value, sizeof(T));
+    reinterpret_cast<char *&>(*buffer) += sizeof(T);
+  }
+  static void deserialize(void const ** buffer, size_t * buffer_size, T * value)
+  {
+    assert(*buffer_size >= sizeof(T));
+    ::memcpy(value, *buffer, sizeof(T));
+    reinterpret_cast<char const *&>(*buffer) += sizeof(T);
+    *buffer_size -= sizeof(T);
+  }
+};  // struct struct Serializer<T, ...>
+
+template <>
+struct Serializer<const char *>
+{
+  static size_t serialized_size(const char * value) { return strlen(value) + 1; }
+  static void serialize(void ** buffer, const char * value)
+  {
+    constexpr size_t max_len = 10000;
+    auto value_len = ::snprintf(static_cast<char *>(*buffer), max_len, "%s", value);
+    reinterpret_cast<char *&>(*buffer) += value_len + 1;
+  }
+  static void deserialize(void const ** buffer, size_t * buffer_size, const char ** value)
+  {
+    *value = static_cast<char const *>(*buffer);
+    size_t data_size = strnlen(*value, *buffer_size) + 1;
+    assert(*buffer_size >= data_size);
+    reinterpret_cast<char const *&>(*buffer) += data_size;
+    *buffer_size -= data_size;
+  }
+};  // struct Serializer<const char *>
+
+template <typename T>
+struct Serializer<
+  std::vector<T>,
+  typename std::enable_if<
+    std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
+{
+  static size_t serialized_size(std::vector<T> const & value)
+  {
+    return sizeof(value.size()) + value.size() * sizeof(T);
+  }
+
+  static void serialize(void ** buffer, std::vector<T> const & value)
+  {
+    serialize_value(buffer, value.size());
+    size_t nbyte = value.size() * sizeof(T);
+    ::memcpy(*buffer, value.data(), nbyte);
+    reinterpret_cast<char *&>(*buffer) += nbyte;
+  }
+
+  static void deserialize(void const ** buffer, size_t * buffer_size, std::vector<T> * value)
+  {
+    size_t size;
+    deserialize_value(buffer, buffer_size, &size);
+    value->resize(size);
+    size_t nbyte = value->size() * sizeof(T);
+    assert(*buffer_size >= nbyte);
+    ::memcpy(value->data(), *buffer, nbyte);
+    reinterpret_cast<char const *&>(*buffer) += nbyte;
+    *buffer_size -= nbyte;
+  }
+};  // struct Serializer<std::vector<T>, ...>
+}  // namespace
+
+template <typename T>
+inline size_t serialized_size(T const & value)
+{
+  return Serializer<T>::serialized_size(value);
+}
+
+template <typename T>
+inline void serialize_value(void ** buffer, T const & value)
+{
+  return Serializer<T>::serialize(buffer, value);
+}
+
+template <typename T>
+inline void deserialize_value(void const ** buffer, size_t * buffer_size, T * value)
+{
+  return Serializer<T>::deserialize(buffer, buffer_size, value);
+}
+#endif  // COMMON__TRT_SERIALIZE_HPP_
diff --git a/perception/autoware_mtr/lib/include/knn/trt_knn_batch.hpp b/perception/autoware_mtr/lib/include/knn/trt_knn_batch.hpp
new file mode 100644
index 0000000000000..28077ee6c7b3d
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/knn/trt_knn_batch.hpp
@@ -0,0 +1,103 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KNN__TRT_KNN_BATCH_HPP_
+#define KNN__TRT_KNN_BATCH_HPP_
+
+#include "common/trt_plugin_base.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+/**
+ * @brief The function to compute KNN batch.
+ *
+ * @param xyz <float; [N, 3]>
+ * @param query_xyz <float; [N, 3]>
+ * @param batch_idxs <int; [N]>
+ * @param query_batch_offsets <int; [M]>
+ * @param top_k <int>
+ * @return outputs <int; [N, top_k]>
+ */
+class KnnBatch : public TRTPluginBase
+{
+public:
+  KnnBatch(const std::string & name, const int32_t top_k);
+  KnnBatch(const std::string & name, const void * data, size_t length);
+  ~KnnBatch() TRT_NOEXCEPT override;
+
+  /* IPluginV2DynamicExt methods */
+  nvinfer1::IPluginV2DynamicExt * clone() const TRT_NOEXCEPT override;
+
+  nvinfer1::DimsExprs getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs * inputs, int nbInputs,
+    nvinfer1::IExprBuilder & exprBuilder) TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc * ioDesc, int nbInputs,
+    int nbOutputs) TRT_NOEXCEPT override;
+
+  void configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT override;
+
+  size_t getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::PluginTensorDesc * outDesc, int nbOutputs) const TRT_NOEXCEPT override;
+
+  int enqueue(
+    const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) TRT_NOEXCEPT override;
+
+  void attachToContext(
+    cudnnContext * cudnnCtx, cublasContext * cublasCtx,
+    nvinfer1::IGpuAllocator * gpuAllocator) TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+  /* IPluginV2Ext methods */
+  nvinfer1::DataType getOutputDataType(
+    int index, const nvinfer1::DataType * inTypes, int nbInputs) const TRT_NOEXCEPT override;
+
+  /* IPluginV2 methods */
+  const char * getPluginType() const TRT_NOEXCEPT override;
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+  int getNbOutputs() const TRT_NOEXCEPT override;
+  size_t getSerializationSize() const TRT_NOEXCEPT override;
+  void serialize(void * buffer) const TRT_NOEXCEPT override;
+
+private:
+  int32_t mTopK;
+};  // class KnnBatch
+
+class KnnBatchCreator : public TRTPluginCreatorBase
+{
+public:
+  KnnBatchCreator();
+
+  const char * getPluginName() const TRT_NOEXCEPT override;
+
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * createPlugin(
+    const char * name, const nvinfer1::PluginFieldCollection * fc) TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT override;
+};  // class KnnBatchCreator
+REGISTER_TENSORRT_PLUGIN(KnnBatchCreator);
+}  // namespace autoware::trt_mtr
+#endif  // KNN__TRT_KNN_BATCH_HPP_
diff --git a/perception/autoware_mtr/lib/include/knn/trt_knn_batch_kernel.hpp b/perception/autoware_mtr/lib/include/knn/trt_knn_batch_kernel.hpp
new file mode 100644
index 0000000000000..7997c196bf00e
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/knn/trt_knn_batch_kernel.hpp
@@ -0,0 +1,24 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KNN__TRT_KNN_BATCH_KERNEL_HPP_
+#define KNN__TRT_KNN_BATCH_KERNEL_HPP_
+
+#include <cuda_runtime.h>
+
+cudaError_t KnnBatchLauncher(
+  const int32_t n, const int32_t m, const int32_t k, const float * xyz, const float * query_xyz,
+  const int * batch_idx, const int * query_batch_offsets, int * output, cudaStream_t stream);
+
+#endif  // KNN__TRT_KNN_BATCH_KERNEL_HPP_
diff --git a/perception/autoware_mtr/lib/include/knn/trt_knn_batch_mlogk.hpp b/perception/autoware_mtr/lib/include/knn/trt_knn_batch_mlogk.hpp
new file mode 100644
index 0000000000000..71725d79764be
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/knn/trt_knn_batch_mlogk.hpp
@@ -0,0 +1,103 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KNN__TRT_KNN_BATCH_MLOGK_HPP_
+#define KNN__TRT_KNN_BATCH_MLOGK_HPP_
+
+#include "common/trt_plugin_base.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+/**
+ * @brief The function to compute KNN batch with MLogK.
+ *
+ * @param xyz <float; [N, 3]>
+ * @param query_xyz <float; [N, 3]>
+ * @param batch_idxs <int; [N]>
+ * @param query_batch_offsets <int; [M]>
+ * @param top_k <int>
+ * @return outputs <int; [N, top_k]>
+ */
+class KnnBatchMlogK : public TRTPluginBase
+{
+public:
+  KnnBatchMlogK(const std::string & name, const int32_t top_k);
+  KnnBatchMlogK(const std::string & name, const void * data, size_t length);
+  ~KnnBatchMlogK() TRT_NOEXCEPT override;
+
+  /* IPluginV2DynamicExt methods */
+  nvinfer1::IPluginV2DynamicExt * clone() const TRT_NOEXCEPT override;
+
+  nvinfer1::DimsExprs getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs * inputs, int nbInputs,
+    nvinfer1::IExprBuilder & exprBuilder) TRT_NOEXCEPT override;
+
+  bool supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc * ioDesc, int nbInputs,
+    int nbOutputs) TRT_NOEXCEPT override;
+
+  void configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT override;
+
+  size_t getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc * inDesc, int nbInputs,
+    const nvinfer1::PluginTensorDesc * outDesc, int nbOutputs) const TRT_NOEXCEPT override;
+
+  int enqueue(
+    const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+    const void * const * inputs, void * const * outputs, void * workspace,
+    cudaStream_t stream) TRT_NOEXCEPT override;
+
+  void attachToContext(
+    cudnnContext * cudnnCtx, cublasContext * cublasCtx,
+    nvinfer1::IGpuAllocator * gpuAllocator) TRT_NOEXCEPT override;
+
+  void detachFromContext() TRT_NOEXCEPT override;
+
+  /* IPluginV2Ext methods */
+  nvinfer1::DataType getOutputDataType(
+    int index, const nvinfer1::DataType * inTypes, int nbInputs) const TRT_NOEXCEPT override;
+
+  /* IPluginV2 methods */
+  const char * getPluginType() const TRT_NOEXCEPT override;
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+  int getNbOutputs() const TRT_NOEXCEPT override;
+  size_t getSerializationSize() const TRT_NOEXCEPT override;
+  void serialize(void * buffer) const TRT_NOEXCEPT override;
+
+private:
+  int32_t mTopK;
+};  // class KnnBatchMlogK
+
+class KnnBatchMlogKCreator : public TRTPluginCreatorBase
+{
+public:
+  KnnBatchMlogKCreator();
+
+  const char * getPluginName() const TRT_NOEXCEPT override;
+
+  const char * getPluginVersion() const TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * createPlugin(
+    const char * name, const nvinfer1::PluginFieldCollection * fc) TRT_NOEXCEPT override;
+
+  nvinfer1::IPluginV2DynamicExt * deserializePlugin(
+    const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT override;
+};  // class KnnBatchMlogKCreator
+REGISTER_TENSORRT_PLUGIN(KnnBatchMlogKCreator);
+}  // namespace autoware::trt_mtr
+#endif  // KNN__TRT_KNN_BATCH_MLOGK_HPP_
diff --git a/perception/autoware_mtr/lib/include/knn/trt_knn_batch_mlogk_kernel.hpp b/perception/autoware_mtr/lib/include/knn/trt_knn_batch_mlogk_kernel.hpp
new file mode 100644
index 0000000000000..d5bbffcc69d9f
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/knn/trt_knn_batch_mlogk_kernel.hpp
@@ -0,0 +1,24 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KNN__TRT_KNN_BATCH_MLOGK_KERNEL_HPP_
+#define KNN__TRT_KNN_BATCH_MLOGK_KERNEL_HPP_
+
+#include <cuda_runtime.h>
+
+cudaError_t KnnBatchMlogKLauncher(
+  const int32_t n, const int32_t m, const int32_t k, const float * xyz, const float * query_xyz,
+  const int * batch_idx, const int * query_batch_offsets, int * output, cudaStream_t stream);
+
+#endif  // KNN__TRT_KNN_BATCH_MLOGK_KERNEL_HPP_
diff --git a/perception/autoware_mtr/lib/include/postprocess/postprocess_kernel.cuh b/perception/autoware_mtr/lib/include/postprocess/postprocess_kernel.cuh
new file mode 100644
index 0000000000000..949eacc6abeca
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/postprocess/postprocess_kernel.cuh
@@ -0,0 +1,54 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef POSTPROCESS__POSTPROCESS_KERNEL_CUH__
+#define POSTPROCESS__POSTPROCESS_KERNEL_CUH__
+
+#include <cuda_runtime.h>
+
+/**
+ * @brief Transform predicted trajectory from target agent coords to world coords.
+ *
+ * @param B The number of target agents.
+ * @param M The number of modes.
+ * @param T The number of future timestamps.
+ * @param AgentDim The number of target agent state dimensions, expecting (x, y, z, length, width,
+ * height, yaw, vx, vy, ax, ay).
+ * @param targetState Source target agent state, in shape [B*AgentDim].
+ * @param PredDim The number of predicted state dimension, expecting (x, y, ?, ?, ?, vx, vy).
+ * @param predTrajectory Predicted trajectory, in shape [B*M*T*PredDim].
+ */
+__global__ void transformTrajectoryKernel(
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory);
+
+/**
+ * @brief Execute postprocess to predicted score and trajectory.
+ *
+ * @param B The number of target agents.
+ * @param M The number of modes.
+ * @param T The number of future timestamps.
+ * @param AgentDim The number of target agent state dimensions, expecting (x, y, z, length, width,
+ * height, yaw, vx, vy, ax, ay).
+ * @param targetState Target agent states at the latest timestamp, in shape [B*inDim].
+ * @param PredDim The number predicted state dimensions, expecting (x, y, ?, ?, ?, vx, vy).
+ * @param predTrajectory Predicted trajectory, in shape [B*M*T*PredDim].
+ * @param stream CUDA stream.
+ * @return cudaError_t CUDA error type.
+ */
+cudaError_t postprocessLauncher(
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory, cudaStream_t stream);
+
+#endif  // POSTPROCESS__POSTPROCESS_KERNEL_CUH__
diff --git a/perception/autoware_mtr/lib/include/preprocess/agent_preprocess_kernel.cuh b/perception/autoware_mtr/lib/include/preprocess/agent_preprocess_kernel.cuh
new file mode 100644
index 0000000000000..9dd59ce2331a8
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/preprocess/agent_preprocess_kernel.cuh
@@ -0,0 +1,69 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PREPROCESS__AGENT_PREPROCESS_KERNEL_CUH_
+#define PREPROCESS__AGENT_PREPROCESS_KERNEL_CUH_
+
+#include <cuda_runtime.h>
+
+/**
+ * @brief Preprocess kernel for agent data.
+ *
+ * @param B The number of target agents.
+ * @param N The number of all agents.
+ * @param T The number of timestamps.
+ * @param D The number of agent state dimensions.
+ * @param C The number of agent classes.
+ * @param sdc_index The index of ego.
+ * @param target_index The array of target indices, in shape [B].
+ * @param object_type_index The array of agent class indices, in shape [N].
+ *  (e.g. 0: VEHICLE, 1:PEDESTRIAN, 2: CYCLIST).
+ * @param timestamps The array of timestamps, in shape [T].
+ * @param in_trajectory The array of trajectory, in shape [N*T*D].
+ * @param out_data Output trajectory data, in shape [B*N*T*(D+C+T+5)].
+ * @param out_mask Output mask trajectory data, in shape [B*N*T].
+ * @param out_last_pos Output last position, in shape [B*N*3].
+ */
+__global__ void agentPreprocessKernel(
+  const int B, const int N, const int T, const int D, const int C, const int sdc_index,
+  const int * target_index, const int * object_type_index, const float * timestamps,
+  const float * in_trajectory, float * out_data, bool * out_mask, float * out_last_pos);
+
+/**
+ * @brief Preprocess kernel for agent data.
+ *
+ * @param B The number of target agents.
+ * @param N The number of all agents.
+ * @param T The number of timestamps.
+ * @param D The number of agent state dimensions.
+ * @param C The number of agent classes.
+ * @param sdc_index The index of ego.
+ * @param target_index The array of target indices, in shape [B].
+ * @param object_type_index The array of agent class indices, in shape [N].
+ *  (e.g. 0: VEHICLE, 1:PEDESTRIAN, 2: CYCLIST).
+ * @param timestamps The array of timestamps, in shape [T].
+ * @param in_trajectory The array of trajectory, in shape [N*T*D].
+ * @param out_data Output trajectory data, in shape [B*N*T*(D+C+T+5)].
+ * @param out_mask Output mask trajectory data, in shape [B*N*T].
+ * @param out_last_pos Output last position, in shape [B*N*3].
+ * @param stream CUDA stream.
+ * @return cudaError_t CUDA error code.
+ */
+cudaError_t agentPreprocessLauncher(
+  const int B, const int N, const int T, const int D, const int C, const int sdc_index,
+  const int * target_index, const int * object_type_index, const float * timestamps,
+  const float * in_trajectory, float * out_data, bool * out_mask, float * out_last_pos,
+  cudaStream_t stream);
+
+#endif  // PREPROCESS__AGENT_PREPROCESS_KERNEL_HPP_
diff --git a/perception/autoware_mtr/lib/include/preprocess/polyline_preprocess_kernel.cuh b/perception/autoware_mtr/lib/include/preprocess/polyline_preprocess_kernel.cuh
new file mode 100644
index 0000000000000..0ffec0e01daf6
--- /dev/null
+++ b/perception/autoware_mtr/lib/include/preprocess/polyline_preprocess_kernel.cuh
@@ -0,0 +1,162 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_
+#define PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_
+
+/**
+ * @brief Transform polylines to target agent coordinate system and extend its feature with previous
+ * x, y`(D->D+2)`.
+ *
+ * Points which all elements are 0.0 except of typeID are filled by 0.0 and the corresponding
+ * mask value is 0.0 too.
+ *
+ * @param K The number of polylines.
+ * @param P The number of points contained in each polyline.
+ * @param PointDim The number of point dimensions, expecting (x, y, z, dx, dy, dz, typeID).
+ * @param inPolyline Source polyline, in shape [K*P*D].
+ * @param B The number of target agents.
+ * @param AgentDim The number of agent state dimensions, expecting (x, y, z, length, width, height,
+ * yaw, vx, vy, ax, ay).
+ * @param targetState Source target agent states, in shape [B*AgentDim].
+ * @param outPolyline Output polyline, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline mask, in shape [B*K*P].
+ */
+__global__ void transformPolylineKernel(
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask);
+
+/**
+ * @brief Set the Previous Position Kernel object
+ *
+ * @param B The number of target agents.
+ * @param K The number of polylines.
+ * @param P The number of points contained in each polyline.
+ * @param D The number of point dimensions, expecting (x, y, ..., preX, preY).
+ * @param polyline Source polyline, in shape [B*K*P*D].
+ */
+__global__ void setPreviousPositionKernel(
+  const int B, const int K, const int P, const int D, float * polyline);
+
+/**
+ * @brief Calculate center distance from target agent to each polyline.
+ *
+ * @note Polyline must have been transformed to target agent coordinates system.
+ *
+ * @param B The number of target agents.
+ * @param K The number of polylines.
+ * @param P The number of points contained in each polyline.
+ * @param D The number of point dimensions, expecting [x, y, ...].
+ * @param polyline Source polyline, in shape [B*K*P*D].
+ * @param polylineMask Source polyline mask, in shape [B*K*P].
+ * @param distance Output calculated distances, in shape [B*K].
+ */
+__global__ void calculateCenterDistanceKernel(
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * distance);
+
+/**
+ * @brief Extract K polylines with the smallest distances.
+ *
+ * @note Because this kernel supposes to allocate shared memory dynamically it is necessary to
+ * specify `sizeof(float) * L` in the kernel execution configuration.
+ *
+ * @param K The number of polylines to be extracted.
+ * @param B The number of target agents.
+ * @param L The number of original polylines.
+ * @param P The number of points contained in each polyline.
+ * @param D The number of point dimensions.
+ * @param inPolyline Source polyline, in shape [B*L*P*D].
+ * @param inPolylineMask Source polyline mask, in shape [B*L*P].
+ * @param inDistance Source distances from target agents to the centers of each polyline, in shape
+ * [B*L].
+ * @param outPolyline Output polyline, in shape [B*K*P*D].
+ * @param outPolylineMask Output polyline mask, in shape [B*K*P].
+ */
+__global__ void extractTopKPolylineKernel(
+  const int K, const int B, const int L, const int P, const int D, const float * inPolyline,
+  const bool * inPolylineMask, const float * inDistance, float * outPolyline,
+  bool * outPolylineMask);
+
+/**
+ * @brief Calculate center positions of each polyline with respect to target agent coordinates
+ * system.
+ *
+ * @note Polyline must have been transformed to target agent coordinates system.
+ *
+ * @param B The number of target agents.
+ * @param K The number of polylines.
+ * @param P The number of points contained in each polyline.
+ * @param D The number of point dimensions, expecting (x, y, z, ...).
+ * @param polyline Source polyline, in shape [B*K*P*D].
+ * @param polylineMask Source polyline mask, in shape [B*K*P].
+ * @param center Output centers, in shape [B*K*3].
+ */
+__global__ void calculatePolylineCenterKernel(
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * center);
+
+/**
+ * @brief In cases of the number of batch polylines (L) is greater than K,
+ *  extracts the topK elements.
+ *
+ * @param K The number of polylines to be extracted.
+ * @param L The number of original polylines.
+ * @param P The number of points contained in each polyline.
+ * @param PointDim The number of point state dimensions.
+ * @param inPolyline Source polylines, in shape [L*P*PointDim].
+ * @param B The number of target agents.
+ * @param AgentDim The number of agent state dimensions.
+ * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim].
+ * @param tmpPolyline A container to store transformed polyline temporary, in shape
+ * [B*L*P*(PointDim+2)].
+ * @param tmpPolylineMask A container to store transformed polyline mask temporary, in shape
+ * [B*L*P].
+ * @param tmpDistance A container to store distances temporary, in shape [B*L].
+ * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline masks, in shape [B*K*P].
+ * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords,
+ *  in shape [B*K*3].
+ * @param stream CUDA stream.
+ * @return cudaError_t
+ */
+cudaError_t polylinePreprocessWithTopkLauncher(
+  const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * tmpPolyline, bool * tmpPolylineMask,
+  float * tmpDistance, float * outPolyline, bool * outPolylineMask, float * outPolylineCenter,
+  cudaStream_t stream);
+
+/**
+ * @brief Do preprocess for polyline if the number of batched polylines is K.
+ *
+ * @param K The number of polylines.
+ * @param P The number of points contained in each polyline.
+ * @param PointDim The number of point state dimensions.
+ * @param inPolyline Source polylines, in shape [K*P*PointDim].
+ * @param B The number of target agents.
+ * @param AgentDim The number of agent state dimensions.
+ * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim].
+ * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline masks, in shape [B*K*P].
+ * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords,
+ *  in shape [B*K*3].
+ * @param stream CUDA stream.
+ * @return cudaError_t
+ */
+cudaError_t polylinePreprocessLauncher(
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream);
+
+#endif  // PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_
diff --git a/perception/autoware_mtr/lib/src/attention/trt_attn_value_computation.cpp b/perception/autoware_mtr/lib/src/attention/trt_attn_value_computation.cpp
new file mode 100644
index 0000000000000..ff8b81e0854d6
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/attention/trt_attn_value_computation.cpp
@@ -0,0 +1,207 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "attention/trt_attn_value_computation.hpp"
+
+#include "attention/trt_attn_value_computation_kernel.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+namespace
+{
+static const char * PLUGIN_VERSION{"1"};
+static const char * PLUGIN_NAME{"TRTAttentionValueComputation"};
+}  // namespace
+
+AttentionValueComputation::AttentionValueComputation(const std::string & name) : TRTPluginBase(name)
+{
+}
+
+AttentionValueComputation::AttentionValueComputation(const std::string & name, const void *, size_t)
+: TRTPluginBase(name)
+{
+}
+
+AttentionValueComputation::~AttentionValueComputation() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::IPluginV2DynamicExt * AttentionValueComputation::clone() const TRT_NOEXCEPT
+{
+  auto * plugin = new AttentionValueComputation(mLayerName);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::DimsExprs AttentionValueComputation::getOutputDimensions(
+  int, const nvinfer1::DimsExprs * inputs, int, nvinfer1::IExprBuilder &) TRT_NOEXCEPT
+{
+  nvinfer1::DimsExprs ret;
+  ret.nbDims = 3;
+  ret.d[0] = inputs[3].d[0];
+  ret.d[1] = inputs[5].d[1];
+  ret.d[2] = inputs[5].d[2];
+
+  return ret;
+}
+
+bool AttentionValueComputation::supportsFormatCombination(
+  int pos, const nvinfer1::PluginTensorDesc * ioDesc, int, int) TRT_NOEXCEPT
+{
+  return ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR &&
+         ioDesc[pos].type == (pos < 4 ? nvinfer1::DataType::kINT32 : nvinfer1::DataType::kFLOAT);
+}
+
+void AttentionValueComputation::configurePlugin(
+  const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+  const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT
+{
+  // Validate input arguments
+  PLUGIN_ASSERT(nbInputs == 6);
+  for (int pos = 0; pos < 6; ++pos) {
+    PLUGIN_ASSERT(
+      inDesc[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+      inDesc[pos].desc.type == (pos < 4 ? nvinfer1::DataType::kINT32 : nvinfer1::DataType::kFLOAT));
+  }
+
+  PLUGIN_ASSERT(nbOutputs == 1);
+  PLUGIN_ASSERT(
+    outDesc[0].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+    outDesc[0].desc.type == nvinfer1::DataType::kFLOAT);
+}
+
+size_t AttentionValueComputation::getWorkspaceSize(
+  const nvinfer1::PluginTensorDesc *, int, const nvinfer1::PluginTensorDesc *,
+  int) const TRT_NOEXCEPT
+{
+  return 0;
+}
+
+int AttentionValueComputation::enqueue(
+  const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+  const void * const * inputs, void * const * outputs, void *, cudaStream_t stream) TRT_NOEXCEPT
+{
+  // parse query_batch_cnt description
+  const int32_t B = inDesc[0].dims.d[0];
+
+  // parse index_pair description
+  const int32_t Q = inDesc[3].dims.d[0];
+  const int32_t L = inDesc[3].dims.d[1];
+
+  // parse value_features description
+  const int32_t K = inDesc[5].dims.d[0];
+  const int32_t numHead = inDesc[5].dims.d[1];
+  const int32_t headDim = inDesc[5].dims.d[2];
+
+  const void * queryBatchCnt = inputs[0];
+  const void * keyBatchCnt = inputs[1];
+  const void * indexPairBatch = inputs[2];
+  const void * indexPair = inputs[3];
+  const void * attnWeight = inputs[4];
+  const void * valueFeature = inputs[5];
+
+  void * output = outputs[0];
+
+  switch (outDesc[0].type) {
+    case nvinfer1::DataType::kFLOAT:
+      AttentionValueComputationLauncher(
+        B, Q, L, K, numHead, headDim, reinterpret_cast<const int *>(queryBatchCnt),
+        reinterpret_cast<const int *>(keyBatchCnt), reinterpret_cast<const int *>(indexPairBatch),
+        reinterpret_cast<const int *>(indexPair), reinterpret_cast<const float *>(attnWeight),
+        reinterpret_cast<const float *>(valueFeature), reinterpret_cast<float *>(output), stream);
+      break;
+
+    default:
+      break;
+  }
+
+  return 0;
+}
+
+void AttentionValueComputation::attachToContext(
+  cudnnContext *, cublasContext *, nvinfer1::IGpuAllocator *) TRT_NOEXCEPT
+{
+}
+
+void AttentionValueComputation::detachFromContext() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::DataType AttentionValueComputation::getOutputDataType(
+  int, const nvinfer1::DataType * inTypes, int) const TRT_NOEXCEPT
+{
+  return inTypes[4];
+}
+
+const char * AttentionValueComputation::getPluginType() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * AttentionValueComputation::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+int AttentionValueComputation::getNbOutputs() const TRT_NOEXCEPT
+{
+  return 1;
+}
+
+size_t AttentionValueComputation::getSerializationSize() const TRT_NOEXCEPT
+{
+  return 0;
+}
+
+void AttentionValueComputation::serialize(void *) const TRT_NOEXCEPT
+{
+}
+
+/* ====================== creator ====================== */
+AttentionValueComputationCreator::AttentionValueComputationCreator()
+{
+  mPluginAttributes.clear();
+  mFC.nbFields = mPluginAttributes.size();
+  mFC.fields = mPluginAttributes.data();
+}
+
+const char * AttentionValueComputationCreator::getPluginName() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * AttentionValueComputationCreator::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+nvinfer1::IPluginV2DynamicExt * AttentionValueComputationCreator::createPlugin(
+  const char * name, const nvinfer1::PluginFieldCollection *) TRT_NOEXCEPT
+{
+  auto plugin = new AttentionValueComputation(name);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::IPluginV2DynamicExt * AttentionValueComputationCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT
+{
+  auto plugin = new AttentionValueComputation(name, serialData, serialLength);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+}  // namespace autoware::trt_mtr
diff --git a/perception/autoware_mtr/lib/src/attention/trt_attn_value_computation_kernel.cu b/perception/autoware_mtr/lib/src/attention/trt_attn_value_computation_kernel.cu
new file mode 100644
index 0000000000000..03871474d9f18
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/attention/trt_attn_value_computation_kernel.cu
@@ -0,0 +1,139 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "attention/trt_attn_value_computation_kernel.hpp"
+
+#include <iostream>
+
+/**
+ * @brief Attention value computation kernel.
+ *
+ * @tparam d The size of shared memory, which should be equal to `L`.
+ * @param B The size of batch.
+ * @param Q The size of query.
+ * @param L The size of local.
+ * @param K The size of key.
+ * @param numHead The number of heads.
+ * @param headDim The number of head dimensions.
+ * @param queryBatchCnt The number of queries for each batch, in shape [B].
+ * @param keyBatchCnt The number of keys for each batch, in shape [B].
+ * @param indexPairBatch The indices of batch for corresponding query, in shape [Q].
+ * @param indexPair The indices of key for corresponding query, in shape [Q*L].
+ * @param attnWeight Source attention weights, in shape [Q*L*numHead].
+ * @param valueFeature Source value features, in shape [K*numHead*headDim].
+ * @param output Output container, in shape [Q*numHead*headDim].
+ */
+template <unsigned int d>
+__global__ void attentionValueComputationKernel(
+  const int32_t B, const int32_t Q, const int32_t L, const int32_t K, const int32_t numHead,
+  const int32_t headDim, const int * queryBatchCnt, const int * keyBatchCnt,
+  const int * indexPairBatch, const int * indexPair, const float * attnWeight,
+  const float * valueFeature, float * output)
+{
+  const int query_idx = blockIdx.x;
+  const int head_idx = blockIdx.y;
+  const int hdim_idx = threadIdx.x;
+
+  if (query_idx >= Q || head_idx >= numHead || hdim_idx >= headDim) {
+    return;
+  }
+
+  // get key_start_idx
+  const int batch_idx = indexPairBatch[query_idx];
+  if (batch_idx < 0) {
+    return;
+  }
+
+  int key_start_idx = 0;
+  for (int i = 0; i < batch_idx; ++i) {
+    key_start_idx += keyBatchCnt[i];
+  }
+  // get shared variables
+  __shared__ float sharedAttnWeight[d];
+  __shared__ int sharedValueIdx[d];
+  for (int i = 0; i < L; i += blockDim.x) {
+    sharedAttnWeight[i] = attnWeight[query_idx * L * numHead + i * numHead + head_idx];
+
+    const int cur_key_idx = indexPair[query_idx * L + i];
+    sharedValueIdx[i] = cur_key_idx == -1 ? -1 : cur_key_idx + key_start_idx;
+  }
+  __syncthreads();
+
+  output += query_idx * numHead * headDim + head_idx * headDim + hdim_idx;
+
+  float attn_result = 0.0f;
+  for (int i = 0; i < L; ++i) {
+    // TODO: fix bug (an illegal memory access was encountered)
+    // value_idx need to guard with value_idx >= 0 && value_idx < K
+    if (const int value_idx = sharedValueIdx[i]; value_idx >= 0 && value_idx < K) {
+      attn_result += sharedAttnWeight[i] *
+                     valueFeature[value_idx * numHead * headDim + head_idx * headDim + hdim_idx];
+    }
+  }
+  output[0] = attn_result;
+}
+
+cudaError_t AttentionValueComputationLauncher(
+  const int32_t B, const int32_t Q, const int32_t L, const int32_t K, const int32_t numHead,
+  const int32_t headDim, const int * queryBatchCnt, const int * keyBatchCnt,
+  const int * indexPairBatch, const int * indexPair, const float * attnWeight,
+  const float * valueFeature, float * output, cudaStream_t stream)
+{
+  if (L > 512) {
+    return cudaError::cudaErrorInvalidValue;
+  }
+
+  dim3 blocks(Q, numHead);
+  dim3 threads(headDim);
+
+  switch (L) {
+    case 16:
+      attentionValueComputationKernel<16><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        attnWeight, valueFeature, output);
+      break;
+    case 32:
+      attentionValueComputationKernel<32><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        attnWeight, valueFeature, output);
+      break;
+    case 64:
+      attentionValueComputationKernel<64><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        attnWeight, valueFeature, output);
+      break;
+    case 128:
+      attentionValueComputationKernel<128><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        attnWeight, valueFeature, output);
+      break;
+    case 320:
+      attentionValueComputationKernel<320><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        attnWeight, valueFeature, output);
+      break;
+    case 384:
+      attentionValueComputationKernel<384><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        attnWeight, valueFeature, output);
+      break;
+    default:
+      attentionValueComputationKernel<512><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        attnWeight, valueFeature, output);
+      break;
+  }
+
+  return cudaGetLastError();
+}
diff --git a/perception/autoware_mtr/lib/src/attention/trt_attn_weight_computation.cpp b/perception/autoware_mtr/lib/src/attention/trt_attn_weight_computation.cpp
new file mode 100644
index 0000000000000..39641f021c1f1
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/attention/trt_attn_weight_computation.cpp
@@ -0,0 +1,209 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "attention/trt_attn_weight_computation.hpp"
+
+#include "attention/trt_attn_weight_computation_kernel.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+namespace
+{
+static const char * PLUGIN_VERSION{"1"};
+static const char * PLUGIN_NAME{"TRTAttentionWeightComputation"};
+}  // namespace
+
+AttentionWeightComputation::AttentionWeightComputation(const std::string & name)
+: TRTPluginBase(name)
+{
+}
+
+AttentionWeightComputation::AttentionWeightComputation(
+  const std::string & name, const void *, size_t)
+: TRTPluginBase(name)
+{
+}
+
+AttentionWeightComputation::~AttentionWeightComputation() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::IPluginV2DynamicExt * AttentionWeightComputation::clone() const TRT_NOEXCEPT
+{
+  auto * plugin = new AttentionWeightComputation(mLayerName);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::DimsExprs AttentionWeightComputation::getOutputDimensions(
+  int, const nvinfer1::DimsExprs * inputs, int, nvinfer1::IExprBuilder &) TRT_NOEXCEPT
+{
+  nvinfer1::DimsExprs ret;
+  ret.nbDims = 3;
+  ret.d[0] = inputs[3].d[0];
+  ret.d[1] = inputs[3].d[1];
+  ret.d[2] = inputs[5].d[1];
+
+  return ret;
+}
+
+bool AttentionWeightComputation::supportsFormatCombination(
+  int pos, const nvinfer1::PluginTensorDesc * ioDesc, int, int) TRT_NOEXCEPT
+{
+  return ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR &&
+         ioDesc[pos].type == (pos < 4 ? nvinfer1::DataType::kINT32 : nvinfer1::DataType::kFLOAT);
+}
+
+void AttentionWeightComputation::configurePlugin(
+  const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+  const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT
+{
+  // Validate input arguments
+  PLUGIN_ASSERT(nbInputs == 6);
+  for (int pos = 0; pos < 6; ++pos) {
+    PLUGIN_ASSERT(
+      inDesc[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+      inDesc[pos].desc.type == (pos < 4 ? nvinfer1::DataType::kINT32 : nvinfer1::DataType::kFLOAT));
+  }
+
+  PLUGIN_ASSERT(nbOutputs == 1);
+  PLUGIN_ASSERT(
+    outDesc[0].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+    outDesc[0].desc.type == nvinfer1::DataType::kFLOAT);
+}
+
+size_t AttentionWeightComputation::getWorkspaceSize(
+  const nvinfer1::PluginTensorDesc *, int, const nvinfer1::PluginTensorDesc *,
+  int) const TRT_NOEXCEPT
+{
+  return 0;
+}
+
+int AttentionWeightComputation::enqueue(
+  const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+  const void * const * inputs, void * const * outputs, void *, cudaStream_t stream) TRT_NOEXCEPT
+{
+  // parse query_batch_cnt description
+  const int32_t B = inDesc[0].dims.d[0];
+
+  // parse index_pair description
+  const int32_t Q = inDesc[3].dims.d[0];
+  const int32_t L = inDesc[3].dims.d[1];
+
+  // parse key_features description
+  const int32_t K = inDesc[5].dims.d[0];
+  const int32_t numHead = inDesc[5].dims.d[1];
+  const int32_t headDim = inDesc[5].dims.d[2];
+
+  const void * queryBatchCnt = inputs[0];
+  const void * keyBatchCnt = inputs[1];
+  const void * indexPairBatch = inputs[2];
+  const void * indexPair = inputs[3];
+  const void * queryFeature = inputs[4];
+  const void * keyFeature = inputs[5];
+
+  void * output = outputs[0];
+
+  switch (outDesc[0].type) {
+    case nvinfer1::DataType::kFLOAT:
+      AttentionWeightComputationLauncher(
+        B, Q, L, K, numHead, headDim, reinterpret_cast<const int *>(queryBatchCnt),
+        reinterpret_cast<const int *>(keyBatchCnt), reinterpret_cast<const int *>(indexPairBatch),
+        reinterpret_cast<const int *>(indexPair), reinterpret_cast<const float *>(queryFeature),
+        reinterpret_cast<const float *>(keyFeature), reinterpret_cast<float *>(output), stream);
+      break;
+
+    default:
+      break;
+  }
+
+  return 0;
+}
+
+void AttentionWeightComputation::attachToContext(
+  cudnnContext *, cublasContext *, nvinfer1::IGpuAllocator *) TRT_NOEXCEPT
+{
+}
+
+void AttentionWeightComputation::detachFromContext() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::DataType AttentionWeightComputation::getOutputDataType(
+  int, const nvinfer1::DataType * inTypes, int) const TRT_NOEXCEPT
+{
+  return inTypes[4];
+}
+
+const char * AttentionWeightComputation::getPluginType() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * AttentionWeightComputation::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+int AttentionWeightComputation::getNbOutputs() const TRT_NOEXCEPT
+{
+  return 1;
+}
+
+size_t AttentionWeightComputation::getSerializationSize() const TRT_NOEXCEPT
+{
+  return 0;
+}
+
+void AttentionWeightComputation::serialize(void *) const TRT_NOEXCEPT
+{
+}
+
+/* ====================== creator ====================== */
+AttentionWeightComputationCreator::AttentionWeightComputationCreator()
+{
+  mPluginAttributes.clear();
+  mFC.nbFields = mPluginAttributes.size();
+  mFC.fields = mPluginAttributes.data();
+}
+
+const char * AttentionWeightComputationCreator::getPluginName() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * AttentionWeightComputationCreator::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+nvinfer1::IPluginV2DynamicExt * AttentionWeightComputationCreator::createPlugin(
+  const char * name, const nvinfer1::PluginFieldCollection *) TRT_NOEXCEPT
+{
+  auto plugin = new AttentionWeightComputation(name);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::IPluginV2DynamicExt * AttentionWeightComputationCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT
+{
+  auto plugin = new AttentionWeightComputation(name, serialData, serialLength);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+}  // namespace autoware::trt_mtr
diff --git a/perception/autoware_mtr/lib/src/attention/trt_attn_weight_computation_kernel.cu b/perception/autoware_mtr/lib/src/attention/trt_attn_weight_computation_kernel.cu
new file mode 100644
index 0000000000000..f0cfd57dbd5a5
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/attention/trt_attn_weight_computation_kernel.cu
@@ -0,0 +1,139 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "attention/trt_attn_weight_computation_kernel.hpp"
+
+/**
+ * @brief Attention weight computation kernel.
+ *
+ * @tparam d The size of shared memory, which should be equal to `L`.
+ * @param B The size of batch.
+ * @param Q The size of query.
+ * @param L The size of local.
+ * @param K The size of key.
+ * @param numHead The number of heads.
+ * @param headDim The number of head dimensions.
+ * @param queryBatchCnt The number of queries for each batch, in shape [B].
+ * @param keyBatchCnt The number of keys for each batch, in shape [B].
+ * @param indexPairBatch The indices of batch for corresponding query, in shape [Q].
+ * @param indexPair The indices of key for corresponding query, in shape [Q*L].
+ * @param queryFeature Source query features, in shape [Q*numHead*headDim].
+ * @param keyFeature Source key features, in shape [K*numHead*headDim].
+ * @param output Output container, in shape [Q*L*numHead].
+ */
+template <unsigned int d>
+__global__ void attentionWeightComputationKernel(
+  const int32_t B, const int32_t Q, const int32_t L, const int32_t K, const int32_t numHead,
+  const int32_t headDim, const int * queryBatchCnt, const int * keyBatchCnt,
+  const int * indexPairBatch, const int * indexPair, const float * queryFeature,
+  const float * keyFeature, float * output)
+{
+  const int query_idx = blockIdx.x;
+  const int head_idx = blockIdx.y;
+  const int local_key_idx = threadIdx.x;
+
+  const int index = query_idx * L + local_key_idx;
+
+  if (query_idx >= Q || head_idx >= numHead || local_key_idx >= L) {
+    return;
+  }
+
+  // build shared query features
+  __shared__ float sharedQueryFeature[d];
+  for (int i = local_key_idx; i < headDim; i += blockDim.x) {
+    sharedQueryFeature[i] = queryFeature[query_idx * numHead * headDim + head_idx * headDim + i];
+  }
+  __syncthreads();
+
+  if (indexPair[index] < 0) {
+    // ignore index
+    return;
+  }
+
+  // get real key index
+  const int batch_idx = indexPairBatch[query_idx];
+  if (batch_idx < 0) {
+    return;
+  }
+
+  int key_start_idx = 0;
+  for (int i = 0; i < batch_idx; ++i) {
+    key_start_idx += keyBatchCnt[i];
+  }
+  key_start_idx += indexPair[index];
+
+  // get key features
+  keyFeature += key_start_idx * numHead * headDim + head_idx * headDim;
+  output += index * numHead + head_idx;
+
+  float attn_weight = 0.0f;
+  for (int i = 0; i < headDim; ++i) {
+    attn_weight += keyFeature[i] * sharedQueryFeature[i];
+  }
+  output[0] = attn_weight;
+}
+
+cudaError_t AttentionWeightComputationLauncher(
+  const int32_t B, const int32_t Q, const int32_t L, const int32_t K, const int32_t numHead,
+  const int32_t headDim, const int * queryBatchCnt, const int * keyBatchCnt,
+  const int * indexPairBatch, const int * indexPair, const float * queryFeature,
+  const float * keyFeature, float * output, cudaStream_t stream)
+{
+  if (headDim > 150) {
+    return cudaError::cudaErrorInvalidValue;
+  }
+
+  dim3 blocks(Q, numHead);
+  dim3 threads(L);
+
+  switch (headDim) {
+    case 16:
+      attentionWeightComputationKernel<16><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        queryFeature, keyFeature, output);
+      break;
+    case 24:
+      attentionWeightComputationKernel<24><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        queryFeature, keyFeature, output);
+      break;
+    case 32:
+      attentionWeightComputationKernel<32><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        queryFeature, keyFeature, output);
+      break;
+    case 48:
+      attentionWeightComputationKernel<48><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        queryFeature, keyFeature, output);
+      break;
+    case 64:
+      attentionWeightComputationKernel<64><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        queryFeature, keyFeature, output);
+      break;
+    case 128:
+      attentionWeightComputationKernel<128><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        queryFeature, keyFeature, output);
+      break;
+    default:
+      attentionWeightComputationKernel<150><<<blocks, threads, 0, stream>>>(
+        B, Q, L, K, numHead, headDim, queryBatchCnt, keyBatchCnt, indexPairBatch, indexPair,
+        queryFeature, keyFeature, output);
+      break;
+  }
+
+  return cudaGetLastError();
+}
diff --git a/perception/autoware_mtr/lib/src/knn/trt_knn_batch.cpp b/perception/autoware_mtr/lib/src/knn/trt_knn_batch.cpp
new file mode 100644
index 0000000000000..86f6438ba9aa1
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/knn/trt_knn_batch.cpp
@@ -0,0 +1,208 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "knn/trt_knn_batch.hpp"
+
+#include "common/trt_serialize.hpp"
+#include "knn/trt_knn_batch_kernel.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+namespace
+{
+static const char * PLUGIN_VERSION{"1"};
+static const char * PLUGIN_NAME{"TRTKnnBatch"};
+}  // namespace
+
+KnnBatch::KnnBatch(const std::string & name, const int32_t top_k)
+: TRTPluginBase(name), mTopK(top_k)
+{
+}
+
+KnnBatch::KnnBatch(const std::string & name, const void * data, size_t length) : TRTPluginBase(name)
+{
+  deserialize_value(&data, &length, &mTopK);
+}
+
+KnnBatch::~KnnBatch() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::IPluginV2DynamicExt * KnnBatch::clone() const TRT_NOEXCEPT
+{
+  auto * plugin = new KnnBatch(mLayerName, mTopK);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::DimsExprs KnnBatch::getOutputDimensions(
+  int, const nvinfer1::DimsExprs * inputs, int, nvinfer1::IExprBuilder & exprBuilder) TRT_NOEXCEPT
+{
+  nvinfer1::DimsExprs ret;
+  ret.nbDims = 2;
+  ret.d[0] = inputs[0].d[0];
+  ret.d[1] = exprBuilder.constant(mTopK);
+
+  return ret;
+}
+
+bool KnnBatch::supportsFormatCombination(
+  int pos, const nvinfer1::PluginTensorDesc * ioDesc, int, int) TRT_NOEXCEPT
+{
+  return ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR &&
+         ioDesc[pos].type == (pos < 2 ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kINT32);
+}
+
+void KnnBatch::configurePlugin(
+  const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+  const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT
+{
+  // Validate input arguments
+  PLUGIN_ASSERT(nbInputs == 4);
+  for (int pos = 0; pos < 4; ++pos) {
+    PLUGIN_ASSERT(
+      inDesc[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+      inDesc[pos].desc.type == (pos < 2 ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kINT32));
+  }
+
+  PLUGIN_ASSERT(nbOutputs == 1);
+  PLUGIN_ASSERT(
+    outDesc[0].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+    outDesc[0].desc.type == nvinfer1::DataType::kINT32);
+}
+
+size_t KnnBatch::getWorkspaceSize(
+  const nvinfer1::PluginTensorDesc *, int, const nvinfer1::PluginTensorDesc *,
+  int) const TRT_NOEXCEPT
+{
+  return 0;
+}
+
+int KnnBatch::enqueue(
+  const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+  const void * const * inputs, void * const * outputs, void *, cudaStream_t stream) TRT_NOEXCEPT
+{
+  const int32_t n = inDesc[0].dims.d[0];
+  const int32_t m = inDesc[1].dims.d[0];
+
+  const void * xyz = inputs[0];
+  const void * query_xyz = inputs[1];
+  const void * batch_idxs = inputs[2];
+  const void * query_batch_offsets = inputs[3];
+
+  void * output = outputs[0];
+
+  switch (outDesc[0].type) {
+    case nvinfer1::DataType::kINT32:
+      KnnBatchLauncher(
+        n, m, mTopK, reinterpret_cast<const float *>(xyz),
+        reinterpret_cast<const float *>(query_xyz), reinterpret_cast<const int *>(batch_idxs),
+        reinterpret_cast<const int *>(query_batch_offsets), reinterpret_cast<int *>(output),
+        stream);
+      break;
+
+    default:
+      break;
+  }
+
+  return 0;
+}
+
+void KnnBatch::attachToContext(cudnnContext *, cublasContext *, nvinfer1::IGpuAllocator *)
+  TRT_NOEXCEPT
+{
+}
+
+void KnnBatch::detachFromContext() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::DataType KnnBatch::getOutputDataType(int, const nvinfer1::DataType *, int) const
+  TRT_NOEXCEPT
+{
+  return nvinfer1::DataType::kINT32;
+}
+
+const char * KnnBatch::getPluginType() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * KnnBatch::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+int KnnBatch::getNbOutputs() const TRT_NOEXCEPT
+{
+  return 1;
+}
+
+size_t KnnBatch::getSerializationSize() const TRT_NOEXCEPT
+{
+  return sizeof(mTopK);
+}
+
+void KnnBatch::serialize(void * buffer) const TRT_NOEXCEPT
+{
+  serialize_value(&buffer, mTopK);
+}
+
+/* ====================== creator ====================== */
+KnnBatchCreator::KnnBatchCreator()
+{
+  mPluginAttributes.emplace_back(
+    nvinfer1::PluginField("top_k", nullptr, nvinfer1::PluginFieldType::kINT32, 1));
+  mFC.nbFields = mPluginAttributes.size();
+  mFC.fields = mPluginAttributes.data();
+}
+
+const char * KnnBatchCreator::getPluginName() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * KnnBatchCreator::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+nvinfer1::IPluginV2DynamicExt * KnnBatchCreator::createPlugin(
+  const char * name, const nvinfer1::PluginFieldCollection * fc) TRT_NOEXCEPT
+{
+  const nvinfer1::PluginField * fields = fc->fields;
+  int32_t top_k = 0;
+  for (int i = 0; i < fc->nbFields; ++i) {
+    const char * attrName = fields[i].name;
+    if (!strcmp(attrName, "top_k")) {
+      PLUGIN_ASSERT(fields[i].type == nvinfer1::PluginFieldType::kINT32);
+      top_k = *static_cast<const int32_t *>(fields[i].data);
+    }
+  }
+  auto plugin = new KnnBatch(name, top_k);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::IPluginV2DynamicExt * KnnBatchCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT
+{
+  auto plugin = new KnnBatch(name, serialData, serialLength);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+}  // namespace autoware::trt_mtr
diff --git a/perception/autoware_mtr/lib/src/knn/trt_knn_batch_kernel.cu b/perception/autoware_mtr/lib/src/knn/trt_knn_batch_kernel.cu
new file mode 100644
index 0000000000000..870e8d538a178
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/knn/trt_knn_batch_kernel.cu
@@ -0,0 +1,82 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "common/trt_plugin_helper.hpp"
+#include "knn/trt_knn_batch_kernel.hpp"
+
+__global__ void knn_batch_kernel(
+  const int32_t n, const int32_t m, const int32_t k, const float * xyz, const float * query_xyz,
+  const int * batch_idxs, const int * query_batch_offsets, int * output)
+{
+  const int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (pt_idx >= n) {
+    return;
+  }
+
+  xyz += pt_idx * 3;
+  output += pt_idx * k;
+
+  float ox = xyz[0];
+  float oy = xyz[1];
+  float oz = xyz[2];
+
+  float best[100];
+  int best_idx[100];
+  for (int i = 0; i < k; ++i) {
+    best[i] = 1e20;
+    best_idx[i] = -1;
+  }
+
+  int batch_idx = batch_idxs[pt_idx];
+  if (batch_idx < 0) {
+    return;
+  }
+
+  int start_idx = query_batch_offsets[batch_idx];
+  int end_idx = query_batch_offsets[batch_idx + 1];
+  for (int i = start_idx; i < end_idx; ++i) {
+    float x = query_xyz[i * 3 + 0];
+    float y = query_xyz[i * 3 + 1];
+    float z = query_xyz[i * 3 + 2];
+    float d2 = (ox - x) * (ox - x) + (oy - y) * (oy - y) + (oz - z) * (oz - z);
+    for (int32_t p = 0; p < k; ++p) {
+      if (d2 < best[p]) {
+        for (int32_t q = k - 1; q > p; --q) {
+          best[q] = best[q - 1];
+          best_idx[q] = best_idx[q - 1];
+        }
+        best[p] = d2;
+        best_idx[p] = i - start_idx;
+        break;
+      }
+    }
+  }
+
+  for (int i = 0; i < k; ++i) {
+    output[i] = best_idx[i];
+  }
+}
+
+cudaError_t KnnBatchLauncher(
+  const int32_t n, const int32_t m, const int32_t k, const float * xyz, const float * query_xyz,
+  const int * batch_idxs, const int * query_batch_offsets, int * output, cudaStream_t stream)
+{
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK));
+  dim3 threads(THREADS_PER_BLOCK);
+
+  knn_batch_kernel<<<blocks, threads, 0, stream>>>(
+    n, m, k, xyz, query_xyz, batch_idxs, query_batch_offsets, output);
+
+  return cudaGetLastError();
+}
diff --git a/perception/autoware_mtr/lib/src/knn/trt_knn_batch_mlogk.cpp b/perception/autoware_mtr/lib/src/knn/trt_knn_batch_mlogk.cpp
new file mode 100644
index 0000000000000..618772b03262d
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/knn/trt_knn_batch_mlogk.cpp
@@ -0,0 +1,208 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "knn/trt_knn_batch_mlogk.hpp"
+
+#include "common/trt_serialize.hpp"
+#include "knn/trt_knn_batch_mlogk_kernel.hpp"
+
+#include <string>
+
+namespace autoware::trt_mtr
+{
+namespace
+{
+static const char * PLUGIN_VERSION{"1"};
+static const char * PLUGIN_NAME{"TRTKnnBatchMlogK"};
+}  // namespace
+
+KnnBatchMlogK::KnnBatchMlogK(const std::string & name, const int32_t top_k)
+: TRTPluginBase(name), mTopK(top_k)
+{
+}
+
+KnnBatchMlogK::KnnBatchMlogK(const std::string & name, const void * data, size_t length)
+: TRTPluginBase(name)
+{
+  deserialize_value(&data, &length, &mTopK);
+}
+
+KnnBatchMlogK::~KnnBatchMlogK() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::IPluginV2DynamicExt * KnnBatchMlogK::clone() const TRT_NOEXCEPT
+{
+  auto * plugin = new KnnBatchMlogK(mLayerName, mTopK);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::DimsExprs KnnBatchMlogK::getOutputDimensions(
+  int, const nvinfer1::DimsExprs * inputs, int, nvinfer1::IExprBuilder & exprBuilder) TRT_NOEXCEPT
+{
+  nvinfer1::DimsExprs ret;
+  ret.nbDims = 2;
+  ret.d[0] = inputs[0].d[0];
+  ret.d[1] = exprBuilder.constant(mTopK);
+
+  return ret;
+}
+
+bool KnnBatchMlogK::supportsFormatCombination(
+  int pos, const nvinfer1::PluginTensorDesc * ioDesc, int, int) TRT_NOEXCEPT
+{
+  return ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR &&
+         ioDesc[pos].type == (pos < 2 ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kINT32);
+}
+
+void KnnBatchMlogK::configurePlugin(
+  const nvinfer1::DynamicPluginTensorDesc * inDesc, int nbInputs,
+  const nvinfer1::DynamicPluginTensorDesc * outDesc, int nbOutputs) TRT_NOEXCEPT
+{
+  // Validate input arguments
+  PLUGIN_ASSERT(nbInputs == 4);
+  for (int pos = 0; pos < 4; ++pos) {
+    PLUGIN_ASSERT(
+      inDesc[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+      inDesc[pos].desc.type == (pos < 2 ? nvinfer1::DataType::kFLOAT : nvinfer1::DataType::kINT32));
+  }
+
+  PLUGIN_ASSERT(nbOutputs == 1);
+  PLUGIN_ASSERT(
+    outDesc[0].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+    outDesc[0].desc.type == nvinfer1::DataType::kINT32);
+}
+
+size_t KnnBatchMlogK::getWorkspaceSize(
+  const nvinfer1::PluginTensorDesc *, int, const nvinfer1::PluginTensorDesc *,
+  int) const TRT_NOEXCEPT
+{
+  return 0;
+}
+
+int KnnBatchMlogK::enqueue(
+  const nvinfer1::PluginTensorDesc * inDesc, const nvinfer1::PluginTensorDesc * outDesc,
+  const void * const * inputs, void * const * outputs, void *, cudaStream_t stream) TRT_NOEXCEPT
+{
+  const int32_t n = inDesc[0].dims.d[0];
+  const int32_t m = inDesc[1].dims.d[0];
+
+  const void * xyz = inputs[0];
+  const void * query_xyz = inputs[1];
+  const void * batch_idxs = inputs[2];
+  const void * query_batch_offsets = inputs[3];
+
+  void * output = outputs[0];
+
+  switch (outDesc[0].type) {
+    case nvinfer1::DataType::kINT32:
+      KnnBatchMlogKLauncher(
+        n, m, mTopK, reinterpret_cast<const float *>(xyz),
+        reinterpret_cast<const float *>(query_xyz), reinterpret_cast<const int *>(batch_idxs),
+        reinterpret_cast<const int *>(query_batch_offsets), reinterpret_cast<int *>(output),
+        stream);
+      break;
+
+    default:
+      break;
+  }
+
+  return 0;
+}
+
+void KnnBatchMlogK::attachToContext(cudnnContext *, cublasContext *, nvinfer1::IGpuAllocator *)
+  TRT_NOEXCEPT
+{
+}
+
+void KnnBatchMlogK::detachFromContext() TRT_NOEXCEPT
+{
+}
+
+nvinfer1::DataType KnnBatchMlogK::getOutputDataType(int, const nvinfer1::DataType *, int) const
+  TRT_NOEXCEPT
+{
+  return nvinfer1::DataType::kINT32;
+}
+
+const char * KnnBatchMlogK::getPluginType() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * KnnBatchMlogK::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+int KnnBatchMlogK::getNbOutputs() const TRT_NOEXCEPT
+{
+  return 1;
+}
+
+size_t KnnBatchMlogK::getSerializationSize() const TRT_NOEXCEPT
+{
+  return sizeof(mTopK);
+}
+
+void KnnBatchMlogK::serialize(void * buffer) const TRT_NOEXCEPT
+{
+  serialize_value(&buffer, mTopK);
+}
+
+/* ====================== creator ====================== */
+KnnBatchMlogKCreator::KnnBatchMlogKCreator()
+{
+  mPluginAttributes.emplace_back(
+    nvinfer1::PluginField("top_k", nullptr, nvinfer1::PluginFieldType::kINT32, 1));
+  mFC.nbFields = mPluginAttributes.size();
+  mFC.fields = mPluginAttributes.data();
+}
+
+const char * KnnBatchMlogKCreator::getPluginName() const TRT_NOEXCEPT
+{
+  return PLUGIN_NAME;
+}
+
+const char * KnnBatchMlogKCreator::getPluginVersion() const TRT_NOEXCEPT
+{
+  return PLUGIN_VERSION;
+}
+
+nvinfer1::IPluginV2DynamicExt * KnnBatchMlogKCreator::createPlugin(
+  const char * name, const nvinfer1::PluginFieldCollection * fc) TRT_NOEXCEPT
+{
+  const nvinfer1::PluginField * fields = fc->fields;
+  int32_t top_k = 0;
+  for (int i = 0; i < fc->nbFields; ++i) {
+    const char * attrName = fields[i].name;
+    if (!strcmp(attrName, "top_k")) {
+      PLUGIN_ASSERT(fields[i].type == nvinfer1::PluginFieldType::kINT32);
+      top_k = *static_cast<const int32_t *>(fields[i].data);
+    }
+  }
+  auto plugin = new KnnBatchMlogK(name, top_k);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+
+nvinfer1::IPluginV2DynamicExt * KnnBatchMlogKCreator::deserializePlugin(
+  const char * name, const void * serialData, size_t serialLength) TRT_NOEXCEPT
+{
+  auto plugin = new KnnBatchMlogK(name, serialData, serialLength);
+  plugin->setPluginNamespace(getPluginNamespace());
+  return plugin;
+}
+}  // namespace autoware::trt_mtr
diff --git a/perception/autoware_mtr/lib/src/knn/trt_knn_batch_mlogk_kernel.cu b/perception/autoware_mtr/lib/src/knn/trt_knn_batch_mlogk_kernel.cu
new file mode 100644
index 0000000000000..c0ccdc33cc1eb
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/knn/trt_knn_batch_mlogk_kernel.cu
@@ -0,0 +1,126 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "common/trt_plugin_helper.hpp"
+#include "knn/trt_knn_batch_mlogk_kernel.hpp"
+
+__global__ void knn_batch_mlogk_kernel(
+  const int32_t n, const int32_t m, const int32_t k, const float * xyz, const float * query_xyz,
+  const int * batch_idxs, const int * query_batch_offsets, int * output)
+{
+  const int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (pt_idx >= n) {
+    return;
+  }
+
+  xyz += pt_idx * 3;
+  output += pt_idx * k;
+
+  float ox = xyz[0];
+  float oy = xyz[1];
+  float oz = xyz[2];
+
+  float best[150];
+  int best_idx[150];
+
+  int32_t heap_len = 0;
+
+  for (int i = 0; i <= k; ++i) {
+    best[i] = 1e20;
+    best_idx[i] = -1;
+  }
+
+  int batch_idx = batch_idxs[pt_idx];
+  if (batch_idx < 0) {
+    return;
+  }
+
+  int start_idx = query_batch_offsets[batch_idx];
+  int end_idx = query_batch_offsets[batch_idx + 1];
+
+  int tmp_idx;
+  float tmp_val;
+  for (int i = start_idx; i < end_idx; ++i) {
+    float x = query_xyz[i * 3 + 0];
+    float y = query_xyz[i * 3 + 1];
+    float z = query_xyz[i * 3 + 2];
+    float d2 = (ox - x) * (ox - x) + (oy - y) * (oy - y) + (oz - z) * (oz - z);
+
+    if (heap_len < k) {
+      ++heap_len;
+      best[heap_len] = d2;
+      best_idx[heap_len] = i - start_idx;
+      int cur_idx = heap_len, fa_idx = cur_idx >> 1;
+      while (fa_idx > 0) {
+        if (best[cur_idx] < best[fa_idx]) {
+          break;
+        }
+        tmp_idx = best_idx[cur_idx];
+        best_idx[cur_idx] = best_idx[fa_idx];
+        best_idx[fa_idx] = tmp_idx;
+        tmp_val = best[cur_idx];
+        best[cur_idx] = best[fa_idx];
+        best[fa_idx] = tmp_val;
+        cur_idx = fa_idx;
+        fa_idx = cur_idx >> 1;
+      }
+    } else {
+      if (d2 > best[1]) {
+        continue;
+      }
+      best[1] = d2;
+      best_idx[1] = i - start_idx;
+
+      int32_t cur_idx = 1, son_idx;
+      while (cur_idx <= k) {
+        son_idx = cur_idx << 1;
+        if (son_idx > k) {
+          break;
+        }
+        if (son_idx + 1 <= k && best[son_idx] < best[son_idx + 1]) {
+          ++son_idx;
+        }
+
+        if (son_idx <= k && best[cur_idx] < best[son_idx]) {
+          tmp_idx = best_idx[cur_idx];
+          best_idx[cur_idx] = best_idx[son_idx];
+          best_idx[son_idx] = tmp_idx;
+          tmp_val = best[cur_idx];
+          best[cur_idx] = best[son_idx];
+          best[son_idx] = tmp_val;
+        } else {
+          break;
+        }
+        cur_idx = son_idx;
+      }
+    }
+  }
+
+  for (int i = 1; i <= k; ++i) {
+    output[i - 1] = best_idx[i];
+  }
+}
+
+cudaError_t KnnBatchMlogKLauncher(
+  const int32_t n, const int32_t m, const int32_t k, const float * xyz, const float * query_xyz,
+  const int * batch_idxs, const int * query_batch_offsets, int * output, cudaStream_t stream)
+{
+  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK));
+  dim3 threads(THREADS_PER_BLOCK);
+
+  knn_batch_mlogk_kernel<<<blocks, threads, 0, stream>>>(
+    n, m, k, xyz, query_xyz, batch_idxs, query_batch_offsets, output);
+
+  return cudaGetLastError();
+}
diff --git a/perception/autoware_mtr/lib/src/postprocess/postprocess_kernel.cu b/perception/autoware_mtr/lib/src/postprocess/postprocess_kernel.cu
new file mode 100644
index 0000000000000..8a6a8ddc96d19
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/postprocess/postprocess_kernel.cu
@@ -0,0 +1,54 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+__global__ void transformTrajectoryKernel(
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory)
+{
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int m = blockIdx.y * blockDim.y + threadIdx.y;
+  int t = blockIdx.z * blockDim.z + threadIdx.z;
+
+  if (b >= B || m >= M || t >= T) {
+    return;
+  }
+
+  const int predIdx = (b * M * T + m * T + t) * PredDim;
+  const float predX = predTrajectory[predIdx];
+  const float predY = predTrajectory[predIdx + 1];
+
+  const int targetIdx = b * AgentDim;
+  const float targetX = targetState[targetIdx];
+  const float targetY = targetState[targetIdx + 1];
+  const float targetYaw = targetState[targetIdx + 6];
+  const float targetCos = cosf(targetYaw);
+  const float targetSin = sinf(targetYaw);
+
+  predTrajectory[predIdx] = targetCos * predX - targetSin * predY + targetX;
+  predTrajectory[predIdx + 1] = targetSin * predX + targetCos * predY + targetY;
+}
+
+cudaError_t postprocessLauncher(
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory, cudaStream_t stream)
+{
+  // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
+  constexpr int threadsPerBlock = 256;
+  dim3 blocks(B, M, T);
+
+  transformTrajectoryKernel<<<blocks, threadsPerBlock, 0, stream>>>(
+    B, M, T, AgentDim, targetState, PredDim, predTrajectory);
+
+  return cudaGetLastError();
+}
diff --git a/perception/autoware_mtr/lib/src/preprocess/agent_preprocess_kernel.cu b/perception/autoware_mtr/lib/src/preprocess/agent_preprocess_kernel.cu
new file mode 100644
index 0000000000000..608dd812bd247
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/preprocess/agent_preprocess_kernel.cu
@@ -0,0 +1,135 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "preprocess/agent_preprocess_kernel.cuh"
+
+#include <iostream>
+
+__global__ void agentPreprocessKernel(
+  const int B, const int N, const int T, const int D, const int C, const int sdc_index,
+  const int * target_index, const int * object_type_index, const float * timestamps,
+  const float * in_trajectory, float * out_data, bool * out_mask, float * out_last_pos)
+{
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int n = blockIdx.y * blockDim.y + threadIdx.y;
+  int t = blockIdx.z * blockDim.z + threadIdx.z;
+
+  if (b >= B || n >= N || t >= T) {
+    return;
+  }
+
+  const int out_idx = (b * N * T + n * T + t) * (D - 2 + C + 2 + T + 1 + 2);
+
+  // === out data ===
+  // --- transform trajectory to target centric coords ---
+  const int src_trajectory_idx = (n * T + t) * D;
+  const float x = in_trajectory[src_trajectory_idx];
+  const float y = in_trajectory[src_trajectory_idx + 1];
+  const float z = in_trajectory[src_trajectory_idx + 2];
+  const float length = in_trajectory[src_trajectory_idx + 3];
+  const float width = in_trajectory[src_trajectory_idx + 4];
+  const float height = in_trajectory[src_trajectory_idx + 5];
+  const float yaw = in_trajectory[src_trajectory_idx + 6];
+  const float vx = in_trajectory[src_trajectory_idx + 7];
+  const float vy = in_trajectory[src_trajectory_idx + 8];
+  const float ax = in_trajectory[src_trajectory_idx + 9];
+  const float ay = in_trajectory[src_trajectory_idx + 10];
+  const float is_valid = in_trajectory[src_trajectory_idx + 11];
+
+  // extract targets trajectories
+  const int center_idx = (target_index[b] * T + T - 1) * D;
+  const float center_x = in_trajectory[center_idx];
+  const float center_y = in_trajectory[center_idx + 1];
+  const float center_z = in_trajectory[center_idx + 2];
+  const float center_yaw = in_trajectory[center_idx + 6];
+  const float center_cos = cos(center_yaw);
+  const float center_sin = sin(center_yaw);
+
+  // do transform
+  const float trans_x = center_cos * (x - center_x) - center_sin * (y - center_y);
+  const float trans_y = center_sin * (x - center_x) + center_cos * (y - center_y);
+  const float trans_z = z - center_z;
+  const float trans_yaw = yaw - center_yaw;
+  const float trans_vx = center_cos * vx - center_sin * vy;
+  const float trans_vy = center_sin * vx + center_cos * vy;
+  const float trans_ax = center_cos * ax - center_sin * ay;
+  const float trans_ay = center_sin * ax + center_cos * ay;
+
+  out_data[out_idx] = trans_x;
+  out_data[out_idx + 1] = trans_y;
+  out_data[out_idx + 2] = trans_z;
+  out_data[out_idx + 3] = length;
+  out_data[out_idx + 4] = width;
+  out_data[out_idx + 5] = height;
+
+  // --- onehot ---
+  const int onehot_idx = out_idx + 6;
+  out_data[onehot_idx + object_type_index[n]] = 1.0f;
+
+  if (target_index[b] == n) {
+    out_data[onehot_idx + C] = 1.0f;
+  }
+
+  if (sdc_index == n) {
+    out_data[onehot_idx + C + 1] = 1.0f;
+  }
+
+  // --- embedding ---
+  const int embed_idx = onehot_idx + C + 2;
+  // time embedding
+  out_data[embed_idx + t] = 1.0f;
+  out_data[embed_idx + T] = timestamps[t];
+  // heading embedding
+  out_data[embed_idx + T + 1] = sin(trans_yaw);
+  out_data[embed_idx + T + 2] = cos(trans_yaw);
+
+  const int other_idx = embed_idx + T + 3;
+  out_data[other_idx] = trans_vx;
+  out_data[other_idx + 1] = trans_vy;
+  out_data[other_idx + 2] = trans_ax;
+  out_data[other_idx + 3] = trans_ay;
+
+  // === mask ===
+  const int mask_idx = b * N * T + n * T + t;
+  out_mask[mask_idx] = is_valid == 1.0f;
+
+  // === last pos ===
+  if (t == T - 1) {
+    const int pos_idx = (b * N + n) * 3;
+    out_last_pos[pos_idx] = trans_x;
+    out_last_pos[pos_idx + 1] = trans_y;
+    out_last_pos[pos_idx + 2] = trans_z;
+  }
+}
+
+cudaError_t agentPreprocessLauncher(
+  const int B, const int N, const int T, const int D, const int C, const int sdc_index,
+  const int * target_index, const int * object_type_index, const float * timestamps,
+  const float * in_trajectory, float * out_data, bool * out_mask, float * out_last_pos,
+  cudaStream_t stream)
+{
+  if (D != 12) {
+    std::cerr << "D must be 12, but got " << D << std::endl;
+    return cudaError::cudaErrorInvalidValue;
+  }
+
+  // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
+  constexpr int threadsPerBlock = 256;
+  dim3 blocks(B, N, T);
+  agentPreprocessKernel<<<blocks, threadsPerBlock, 0, stream>>>(
+    B, N, T, D, C, sdc_index, target_index, object_type_index, timestamps, in_trajectory, out_data,
+    out_mask, out_last_pos);
+
+  return cudaGetLastError();
+}
diff --git a/perception/autoware_mtr/lib/src/preprocess/polyline_preprocess_kernel.cu b/perception/autoware_mtr/lib/src/preprocess/polyline_preprocess_kernel.cu
new file mode 100644
index 0000000000000..0ffc9d3ae6f96
--- /dev/null
+++ b/perception/autoware_mtr/lib/src/preprocess/polyline_preprocess_kernel.cu
@@ -0,0 +1,284 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "preprocess/polyline_preprocess_kernel.cuh"
+
+#include <cub/cub.cuh>
+
+#include <float.h>
+
+#include <iostream>
+
+__global__ void transformPolylineKernel(
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask)
+{
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int k = blockIdx.y * blockDim.y + threadIdx.y;
+  int p = blockIdx.z * blockDim.z + threadIdx.z;
+
+  if (b >= B || k >= K || p >= P) {
+    return;
+  }
+
+  const int inIdx = (k * P + p) * PointDim;
+  const int outIdx = b * K * P + k * P + p;
+  bool isValid = false;
+  for (int d = 0; d < PointDim - 1; ++d) {
+    if (inPolyline[inIdx + d] != 0.0f) {
+      isValid = true;
+    }
+  }
+  outPolylineMask[outIdx] = isValid;
+
+  // initialize output polyline with 0.0
+  for (int d = 0; d < PointDim + 2; ++d) {
+    outPolyline[outIdx * (PointDim + 2) + d] = 0.0f;
+  }
+
+  // set transformed values if valid, otherwise all 0.0.
+  if (isValid) {
+    const float x = inPolyline[inIdx];
+    const float y = inPolyline[inIdx + 1];
+    const float z = inPolyline[inIdx + 2];
+    const float dx = inPolyline[inIdx + 3];
+    const float dy = inPolyline[inIdx + 4];
+    const float dz = inPolyline[inIdx + 5];
+    const float typeID = inPolyline[inIdx + 6];
+
+    const int centerIdx = b * AgentDim;
+    const float centerX = targetState[centerIdx];
+    const float centerY = targetState[centerIdx + 1];
+    const float centerZ = targetState[centerIdx + 2];
+    const float centerYaw = targetState[centerIdx + 6];
+    const float centerCos = cosf(centerYaw);
+    const float centerSin = sinf(centerYaw);
+
+    // do transform
+    const float transX = centerCos * (x - centerX) - centerSin * (y - centerY);
+    const float transY = centerSin * (x - centerX) + centerCos * (y - centerY);
+    const float transZ = z - centerZ;
+    const float transDx = centerCos * dx - centerSin * dy;
+    const float transDy = centerSin * dx + centerCos * dy;
+    const float transDz = dz;
+
+    outPolyline[outIdx * (PointDim + 2)] = transX;
+    outPolyline[outIdx * (PointDim + 2) + 1] = transY;
+    outPolyline[outIdx * (PointDim + 2) + 2] = transZ;
+    outPolyline[outIdx * (PointDim + 2) + 3] = transDx;
+    outPolyline[outIdx * (PointDim + 2) + 4] = transDy;
+    outPolyline[outIdx * (PointDim + 2) + 5] = transDz;
+    outPolyline[outIdx * (PointDim + 2) + 6] = typeID;
+  }
+}
+
+__global__ void setPreviousPositionKernel(
+  const int B, const int K, const int P, const int D, float * polyline)
+{
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int k = blockIdx.y * blockDim.y + threadIdx.y;
+  int p = blockIdx.z * blockDim.z + threadIdx.z;
+
+  if (b >= B || k >= K || p >= P) {
+    return;
+  }
+
+  const int curIdx = (b * K * P + k * P + p) * D;
+  const int preIdx = p == 0 ? curIdx : (b * K * P + k * P + p - 1) * D;
+
+  polyline[curIdx + D - 2] = polyline[preIdx];      // x
+  polyline[curIdx + D - 1] = polyline[preIdx + 1];  // y
+}
+
+__global__ void calculateCenterDistanceKernel(
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * distance)
+{
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int k = blockIdx.y * blockDim.y + threadIdx.y;
+  if (b >= B || k >= K) {
+    return;
+  }
+
+  // calculate polyline center
+  double sumX = 0.0, sumY = 0.0;
+  double numValid = 0.0;
+  for (int p = 0; p < P; ++p) {
+    int idx = b * K * P + k * P + p;
+    if (polylineMask[idx]) {
+      sumX += polyline[idx * D];
+      sumY += polyline[idx * D + 1];
+      ++numValid;
+    }
+  }
+  float centerX = static_cast<float>(sumX / max(1.0, numValid));
+  float centerY = static_cast<float>(sumY / max(1.0, numValid));
+
+  distance[b * K + k] = hypot(centerX, centerY);
+}
+
+template <unsigned int BLOCK_THREADS, unsigned int ITEMS_PER_THREAD>
+__global__ void extractTopKPolylineKernel(
+  const int K, const int B, const int L, const int P, const int D, const float * inPolyline,
+  const bool * inPolylineMask, const float * inDistance, float * outPolyline,
+  bool * outPolylineMask)
+{
+  int b = blockIdx.x;                             // Batch index
+  int tid = threadIdx.x;                          // Thread local index in this CUDA block
+  int p = blockIdx.y * blockDim.y + threadIdx.y;  // Point index
+  int d = blockIdx.z * blockDim.z + threadIdx.z;  // Dim index
+  // Since all threads in a block expected to work wiht CUB, `return` shold not be called here
+  // if (b >= B || tid >= L || p >= P || d >= D) {
+  //   return;
+  // }
+
+  // Specialize BlockRadixSort type
+  using BlockRadixSortT = cub::BlockRadixSort<float, BLOCK_THREADS, ITEMS_PER_THREAD, unsigned int>;
+  using TempStorageT = typename BlockRadixSortT::TempStorage;
+
+  __shared__ TempStorageT temp_storage;
+
+  float distances[ITEMS_PER_THREAD] = {0};
+  unsigned int distance_indices[ITEMS_PER_THREAD] = {0};
+  for (unsigned int i = 0; i < ITEMS_PER_THREAD; i++) {
+    int polyline_idx = BLOCK_THREADS * i + tid;  // index order don't need to care.
+    int distance_idx = b * L + polyline_idx;
+    distance_indices[i] = polyline_idx;
+    distances[i] = (polyline_idx < L && distance_idx < B * L) ? inDistance[distance_idx] : FLT_MAX;
+  }
+
+  BlockRadixSortT(temp_storage).Sort(distances, distance_indices);
+  // Block-wide sync barrier necessary to refer the sort result
+  __syncthreads();
+
+  if (b >= B || tid >= L || p >= P || d >= D) {
+    return;
+  }
+
+  for (unsigned int i = 0; i < ITEMS_PER_THREAD; i++) {
+    int consective_polyline_idx =
+      tid * ITEMS_PER_THREAD + i;  // To keep sorted order, theads have to write consective region
+    int inIdx = b * L * P + distance_indices[i] * P + p;
+    int outIdx = b * K * P + consective_polyline_idx * P + p;
+    if (consective_polyline_idx >= K || fabsf(FLT_MAX - distances[i]) < FLT_EPSILON) {
+      continue;
+    }
+    outPolyline[outIdx * D + d] = inPolyline[inIdx * D + d];
+    outPolylineMask[outIdx] = inPolylineMask[inIdx];
+  }
+}
+
+__global__ void calculatePolylineCenterKernel(
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * center)
+{
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int k = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (b >= B || k >= K) {
+    return;
+  }
+
+  // initialize with 0.0
+  int centerIdx = (b * K + k) * 3;
+  for (int d = 0; d < 3; ++d) {
+    center[centerIdx + d] = 0.0f;
+  }
+
+  // calculate polyline center
+  double sumX = 0.0, sumY = 0.0, sumZ = 0.0;
+  double numValid = 0.0;
+  for (int p = 0; p < P; ++p) {
+    int idx = b * K * P + k * P + p;
+    if (polylineMask[idx]) {
+      sumX += polyline[idx * D];
+      sumY += polyline[idx * D + 1];
+      sumZ += polyline[idx * D + 2];
+      ++numValid;
+    }
+  }
+
+  center[centerIdx] = static_cast<float>(sumX / max(1.0, numValid));
+  center[centerIdx + 1] = static_cast<float>(sumY / max(1.0, numValid));
+  center[centerIdx + 2] = static_cast<float>(sumZ / max(1.0, numValid));
+}
+
+cudaError_t polylinePreprocessWithTopkLauncher(
+  const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * tmpPolyline, bool * tmpPolylineMask,
+  float * tmpDistance, float * outPolyline, bool * outPolylineMask, float * outPolylineCenter,
+  cudaStream_t stream)
+{
+  if (L < K) {
+    std::cerr << "L must be greater than K, but got L: " << L << ", K: " << K << std::endl;
+    return cudaError_t::cudaErrorInvalidValue;
+  }
+
+  const int outPointDim = PointDim + 2;
+
+  // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
+  constexpr int threadsPerBlock = 256;
+  const dim3 blocks1(B, L, P);
+  transformPolylineKernel<<<blocks1, threadsPerBlock, 0, stream>>>(
+    L, P, PointDim, inPolyline, B, AgentDim, targetState, tmpPolyline, tmpPolylineMask);
+
+  const dim3 blocks2(B, L);
+  calculateCenterDistanceKernel<<<blocks2, threadsPerBlock, 0, stream>>>(
+    B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance);
+
+  const dim3 blocks3(B, P, outPointDim);
+  constexpr unsigned int itemsPerThread = 24;
+  if (threadsPerBlock * itemsPerThread < L) {
+    std::cerr << "Larger L (" << L << ") than acceptable range (< "
+              << threadsPerBlock * itemsPerThread << ") detected." << std::endl;
+    return cudaError_t::cudaErrorInvalidValue;
+  }
+  extractTopKPolylineKernel<threadsPerBlock, itemsPerThread>
+    <<<blocks3, threadsPerBlock, 0, stream>>>(
+      K, B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance, outPolyline,
+      outPolylineMask);
+
+  const dim3 blocks4(B, K, P);
+  setPreviousPositionKernel<<<blocks4, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline);
+
+  const dim3 blocks5(B, K);
+  calculatePolylineCenterKernel<<<blocks5, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter);
+
+  return cudaGetLastError();
+}
+
+cudaError_t polylinePreprocessLauncher(
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream)
+{
+  const int outPointDim = PointDim + 2;
+
+  // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
+  constexpr int threadsPerBlock = 256;
+  const dim3 block3d(B, K, P);
+  transformPolylineKernel<<<block3d, threadsPerBlock, 0, stream>>>(
+    K, P, PointDim, inPolyline, B, AgentDim, targetState, outPolyline, outPolylineMask);
+
+  setPreviousPositionKernel<<<block3d, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline);
+
+  const dim3 block2d(B, K);
+  calculatePolylineCenterKernel<<<block2d, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter);
+
+  return cudaGetLastError();
+}
diff --git a/perception/autoware_mtr/package.xml b/perception/autoware_mtr/package.xml
new file mode 100644
index 0000000000000..bcdb22fab2713
--- /dev/null
+++ b/perception/autoware_mtr/package.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>autoware_mtr</name>
+  <version>0.1.0</version>
+  <description>ROS 2 Node of Motion Transfromer(a.k.a MTR).</description>
+  <maintainer email="kotaro.uetake@tier4.jp">kotarouetake</maintainer>
+  <license>Apache-2.0</license>
+
+  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+  <buildtool_depend>autoware_cmake</buildtool_depend>
+
+  <depend>autoware_map_msgs</depend>
+  <depend>autoware_object_recognition_utils</depend>
+  <depend>autoware_perception_msgs</depend>
+  <depend>autoware_tensorrt_common</depend>
+  <depend>autoware_universe_utils</depend>
+  <depend>autoware_vehicle_info_utils</depend>
+  <depend>lanelet2_core</depend>
+  <depend>lanelet2_extension</depend>
+  <depend>lanelet2_routing</depend>
+  <depend>lanelet2_traffic_rules</depend>
+  <depend>nav_msgs</depend>
+  <depend>rclcpp</depend>
+  <depend>rclcpp_components</depend>
+
+  <test_depend>ament_lint_auto</test_depend>
+  <test_depend>autoware_lint_common</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/perception/autoware_mtr/schema/tensorrt_mtr.schema.json b/perception/autoware_mtr/schema/tensorrt_mtr.schema.json
new file mode 100644
index 0000000000000..93dbf94ff72ce
--- /dev/null
+++ b/perception/autoware_mtr/schema/tensorrt_mtr.schema.json
@@ -0,0 +1,18 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Parameters for Motion Transformer Node",
+  "type": "object",
+  "definitions": {},
+  "properties": {
+    "/**": {
+      "type": "object",
+      "properties": {
+        "ros__parameters": {
+          "$ref": "#/definitions/"
+        }
+      },
+      "required": ["ros__parameters"]
+    }
+  },
+  "required": ["/**"]
+}
diff --git a/perception/autoware_mtr/src/builder.cpp b/perception/autoware_mtr/src/builder.cpp
new file mode 100644
index 0000000000000..d5aedd4a6134d
--- /dev/null
+++ b/perception/autoware_mtr/src/builder.cpp
@@ -0,0 +1,392 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/mtr/builder.hpp"
+
+#include <fstream>
+#include <sstream>
+
+namespace autoware::mtr
+{
+namespace
+{
+/**
+ * @brief Get the name of precision in string.
+ *
+ * @param type
+ * @return std::string
+ */
+std::string getPrecisionName(const PrecisionType & type)
+{
+  switch (type) {
+    case PrecisionType::FP32:
+      return "FP32";
+    case PrecisionType::FP16:
+      return "FP16";
+    case PrecisionType::INT8:
+      return "INT8";
+    default:
+      throw std::runtime_error("Unsupported precision type.");
+  }
+}
+
+/**
+ * @brief Get the name of calibration in string.
+ *
+ * @param type
+ * @return std::string
+ */
+std::string getCalibrationName(const CalibrationType & type)
+{
+  switch (type) {
+    case CalibrationType::ENTROPY:
+      return "ENTROPY";
+    case CalibrationType::LEGACY:
+      return "LEGACY";
+    case CalibrationType::PERCENTILE:
+      return "PERCENTILE";
+    case CalibrationType::MINMAX:
+      return "MINMAX";
+    default:
+      throw std::runtime_error("Unsupported calibration type.");
+  }
+}
+}  // namespace
+
+MTRBuilder::MTRBuilder(
+  const std::string & model_filepath, const BuildConfig & build_config,
+  const size_t max_workspace_size)
+: model_filepath_(model_filepath), max_workspace_size_(max_workspace_size)
+{
+  build_config_ = std::make_unique<const BuildConfig>(build_config);
+  runtime_ = TrtUniquePtr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(logger_));
+}
+
+void MTRBuilder::setup()
+{
+  if (!fs::exists(model_filepath_)) {
+    is_initialized_ = false;
+    return;
+  }
+  std::string engine_path = model_filepath_;
+  if (model_filepath_.extension() == ".engine") {
+    std::cout << "Loading... " << model_filepath_ << std::endl;
+    loadEngine(model_filepath_);
+  } else if (model_filepath_.extension() == ".onnx") {
+    const auto engine_cache_path = createEngineCachePath();
+    if (fs::exists(engine_cache_path)) {
+      std::cout << "Loading cached engine... " << engine_cache_path << std::endl;
+      if (!loadEngine(engine_cache_path)) {
+        std::cerr << "Fail to load engine" << std::endl;
+        is_initialized_ = false;
+        return;
+      }
+    } else {
+      std::cout << "Building... " << engine_cache_path << std::endl;
+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "start build engine");
+      if (!buildEngineFromOnnx(model_filepath_, engine_cache_path)) {
+        std::cerr << "Fail to build engine from onnx" << std::endl;
+        is_initialized_ = false;
+        return;
+      }
+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "End build engine");
+    }
+    engine_path = engine_cache_path;
+  } else {
+    is_initialized_ = false;
+    return;
+  }
+
+  context_ = TrtUniquePtr<nvinfer1::IExecutionContext>(engine_->createExecutionContext());
+  if (!context_) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create context");
+    is_initialized_ = false;
+    return;
+  }
+
+  is_initialized_ = true;
+}
+
+bool MTRBuilder::loadEngine(const std::string & filepath)
+{
+  try {
+    std::ifstream engine_file(filepath);
+    std::stringstream buffer;
+    buffer << engine_file.rdbuf();
+    std::string engine_str = buffer.str();
+    engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(
+      reinterpret_cast<const void *>(engine_str.data()), engine_str.size()));
+    return true;
+  } catch (std::exception & e) {
+    std::cerr << e.what() << std::endl;
+    return false;
+  }
+}
+
+fs::path MTRBuilder::createEngineCachePath() const
+{
+  fs::path cache_engine_path{model_filepath_};
+  auto precision_name = getPrecisionName(build_config_->precision);
+  auto calibration_name = build_config_->precision == PrecisionType::INT8
+                            ? getCalibrationName(build_config_->calibration)
+                            : "";
+  cache_engine_path.replace_extension(calibration_name + precision_name + ".engine");
+  return cache_engine_path;
+}
+
+bool MTRBuilder::buildEngineFromOnnx(
+  const std::string & filepath, const std::string & output_engine_filepath)
+{
+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
+  if (!builder) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder");
+    return false;
+  }
+
+  const auto explicit_batch =
+    1U << static_cast<int32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  auto network =
+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicit_batch));
+  if (!network) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create network");
+    return false;
+  }
+
+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
+  if (!config) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create builder config");
+    return false;
+  }
+
+  if (build_config_->precision == PrecisionType::FP16) {
+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
+  } else if (build_config_->precision == PrecisionType::INT8) {
+    config->setFlag(nvinfer1::BuilderFlag::kINT8);
+  }
+
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8400
+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
+#else
+  config->setMaxWorkspaceSize(max_workspace_size_);
+#endif
+
+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
+  if (!parser->parseFromFile(
+        filepath.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to parse onnx file");
+    return false;
+  }
+
+  if (isDynamic()) {
+    auto profile = builder->createOptimizationProfile();
+    const auto input0 = network->getInput(0);
+    const auto input0_dims = input0->getDimensions();
+    const auto num_past_frames = input0_dims.d[2];
+    const auto num_agent_dims = input0_dims.d[3];
+
+    const auto input2 = network->getInput(2);
+    const auto input2_dims = input2->getDimensions();
+    const auto num_polylines = input2_dims.d[1];
+    const auto num_points = input2_dims.d[2];
+    const auto num_polyline_dims = input2_dims.d[3];
+
+    const auto & batch_target = build_config_->batch_target;
+    const auto & batch_agent = build_config_->batch_agent;
+    {  // trajectory
+      auto name = network->getInput(0)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN,
+        nvinfer1::Dims4{batch_target.k_min, batch_agent.k_min, num_past_frames, num_agent_dims});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT,
+        nvinfer1::Dims4{batch_target.k_opt, batch_agent.k_opt, num_past_frames, num_agent_dims});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX,
+        nvinfer1::Dims4{batch_target.k_max, batch_agent.k_max, num_past_frames, num_agent_dims});
+    }
+    {  // trajectory mask
+      auto name = network->getInput(1)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN,
+        nvinfer1::Dims3{batch_target.k_min, batch_agent.k_min, num_past_frames});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT,
+        nvinfer1::Dims3{batch_target.k_opt, batch_agent.k_opt, num_past_frames});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX,
+        nvinfer1::Dims3{batch_target.k_max, batch_agent.k_max, num_past_frames});
+    }
+    {  // polyline
+      auto name = network->getInput(2)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN,
+        nvinfer1::Dims4{batch_target.k_min, num_polylines, num_points, num_polyline_dims});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT,
+        nvinfer1::Dims4{batch_target.k_opt, num_polylines, num_points, num_polyline_dims});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX,
+        nvinfer1::Dims4{batch_target.k_max, num_polylines, num_points, num_polyline_dims});
+    }
+    {  // polyline mask
+      auto name = network->getInput(3)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN,
+        nvinfer1::Dims3{batch_target.k_min, num_polylines, num_points});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT,
+        nvinfer1::Dims3{batch_target.k_opt, num_polylines, num_points});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX,
+        nvinfer1::Dims3{batch_target.k_max, num_polylines, num_points});
+    }
+    {  // polyline center
+      auto name = network->getInput(4)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN,
+        nvinfer1::Dims3{batch_target.k_min, num_polylines, 3});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT,
+        nvinfer1::Dims3{batch_target.k_opt, num_polylines, 3});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX,
+        nvinfer1::Dims3{batch_target.k_max, num_polylines, 3});
+    }
+    {  // last pos
+      auto name = network->getInput(5)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN,
+        nvinfer1::Dims3{batch_target.k_min, batch_agent.k_min, 3});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT,
+        nvinfer1::Dims3{batch_target.k_opt, batch_agent.k_opt, 3});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX,
+        nvinfer1::Dims3{batch_target.k_max, batch_agent.k_max, 3});
+    }
+    {  // track index
+      auto name = network->getInput(6)->getName();
+      nvinfer1::Dims minDim, optDim, maxDim;
+      minDim.nbDims = 1;
+      minDim.d[0] = batch_target.k_min;
+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, minDim);
+      optDim.nbDims = 1;
+      optDim.d[0] = batch_target.k_opt;
+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, optDim);
+      maxDim.nbDims = 1;
+      maxDim.d[0] = batch_target.k_max;
+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, maxDim);
+    }
+    {
+      // intention points
+      auto name = network->getInput(7)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims3{batch_target.k_min, 64, 2});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims3{batch_target.k_opt, 64, 2});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims3{batch_target.k_max, 64, 2});
+    }
+    {  // pred scores
+      auto name = network->getOutput(0)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims2{batch_target.k_min, 6});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims2{batch_target.k_opt, 6});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims2{batch_target.k_max, 6});
+    }
+    {  // pred trajs
+      auto name = network->getOutput(1)->getName();
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{batch_target.k_min, 6, 80, 7});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{batch_target.k_opt, 6, 80, 7});
+      profile->setDimensions(
+        name, nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{batch_target.k_max, 6, 80, 7});
+    }
+    config->addOptimizationProfile(profile);
+  }
+
+  if (isDynamic()) {
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSOR_PATCH >= 8200
+    config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kDETAILED);
+#else
+    config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kVERBOSE);
+#endif
+  }
+
+#if NV_TENSORRT_MAJOR >= 8
+  auto plan =
+    TrtUniquePtr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
+  if (!plan) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create host memory");
+    return false;
+  }
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(
+    runtime_->deserializeCudaEngine(plan->data(), plan->size()));
+#else
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(builder->buildEngineWithConfig(*network, *config));
+#endif
+
+  if (!engine_) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Fail to create engine");
+    return false;
+  }
+
+// save engine
+#if NV_TENSORRT_MAJOR < 8
+  auto data = TrtUniquePtr<nvinfer1::IHostMemory>(engine_->serialize());
+#endif
+  std::ofstream file;
+  file.open(output_engine_filepath, std::ios::binary | std::ios::out);
+  if (!file.is_open()) {
+    return false;
+  }
+#if NV_TENSORRT_MAJOR < 8
+  file.write(reinterpret_cast<const char *>(data->data()), data->size());
+#else
+  file.write(reinterpret_cast<const char *>(plan->data()), plan->size());
+#endif
+
+  file.close();
+
+  return true;
+}
+
+bool MTRBuilder::isInitialized() const
+{
+  return is_initialized_;
+}
+
+bool MTRBuilder::isDynamic() const
+{
+  return build_config_->is_dynamic();
+}
+
+bool MTRBuilder::setBindingDimensions(int index, nvinfer1::Dims dimensions)
+{
+  if (isDynamic()) {
+    return context_->setBindingDimensions(index, dimensions);
+  } else {
+    return true;
+  }
+}
+
+bool MTRBuilder::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * inputConsumed)
+{
+  return context_->enqueueV2(bindings, stream, inputConsumed);
+}
+}  // namespace autoware::mtr
diff --git a/perception/autoware_mtr/src/conversions/history.cpp b/perception/autoware_mtr/src/conversions/history.cpp
new file mode 100644
index 0000000000000..8175af8ba493e
--- /dev/null
+++ b/perception/autoware_mtr/src/conversions/history.cpp
@@ -0,0 +1,111 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/mtr/conversions/history.hpp"
+
+#include <utility>
+#include <vector>
+
+namespace autoware::mtr
+{
+using adl = autoware::mtr::AgentDimLabels;
+
+/**
+ * @brief Transform a 2D point with a reference point and rotation
+ *
+ * @param history history to modify.
+ * @param reference_state reference state used as reference frame.
+ * @return  vector of histories, each in the reference frame of the last state.
+ */
+[[nodiscard]] std::pair<float, float> transform2D(
+  const std::pair<float, float> input_xy, const std::pair<float, float> reference_xy,
+  const std::pair<float, float> rotation_cos_sin)
+{
+  auto [x, y] = input_xy;
+  auto [ref_x, ref_y] = reference_xy;
+  auto [cos, sin] = rotation_cos_sin;
+  x -= ref_x;
+  y -= ref_y;
+  return {x * cos - y * sin, x * sin + y * cos};
+}
+
+/**
+ * @brief Get histories with agent states in the reference frame of the last state
+ *
+ * @param history history to modify.
+ * @param reference_state reference state used as reference frame.
+ * @return  vector of histories, each in the reference frame of the last state.
+ */
+[[nodiscard]] AgentHistory getAgentCentricHistory(
+  const AgentHistory & history, const AgentState & reference_state)
+{
+  const auto latest_valid_state = history.get_latest_valid_state();
+  const auto & reference_state_dim = autoware::mtr::AgentState::dim();
+  const auto & history_state_dim = autoware::mtr::AgentHistory::state_dim();
+
+  if (!latest_valid_state.has_value() || reference_state_dim != history_state_dim) {
+    return history;
+  }
+  AgentHistory agent_centric_history = history;
+  const auto reference_array = reference_state.as_array();
+  auto agent_centric_states_array = agent_centric_history.as_array();
+  const auto ref_x = reference_array.at(adl::X);
+  const auto ref_y = reference_array.at(adl::Y);
+  const auto ref_z = reference_array.at(adl::Z);
+  const auto ref_yaw = reference_array.at(adl::YAW);
+  const auto cos = std::cos(-ref_yaw);
+  const auto sin = std::sin(-ref_yaw);
+
+  const auto & d = reference_state_dim;
+  // Use i as the index for each new state
+  for (size_t i = 0; i < agent_centric_states_array.size(); i += d) {
+    auto & x = agent_centric_states_array.at(i + adl::X);
+    auto & y = agent_centric_states_array.at(i + adl::Y);
+    auto & vx = agent_centric_states_array.at(i + adl::VX);
+    auto & vy = agent_centric_states_array.at(i + adl::VY);
+    auto & ax = agent_centric_states_array.at(i + adl::AX);
+    auto & ay = agent_centric_states_array.at(i + adl::AY);
+    auto & z = agent_centric_states_array.at(i + adl::Z);
+    auto & yaw = agent_centric_states_array.at(i + adl::YAW);
+
+    std::tie(x, y) = transform2D({x, y}, {ref_x, ref_y}, {cos, sin});
+    std::tie(vx, vy) = transform2D({vx, vy}, {0.0, 0.0}, {cos, sin});
+    std::tie(ax, ay) = transform2D({x, y}, {0.0, 0.0}, {cos, sin});
+
+    z -= ref_z;
+    yaw -= ref_yaw;
+  }
+  return agent_centric_history;
+};
+
+/**
+ * @brief Get histories with agent states in the reference frame of the last state
+ *
+ * @param histories vector of histories to modify.
+ * @return  vector of histories, each in the reference frame of the last state.
+ */
+[[nodiscard]] std::vector<AgentHistory> getAgentCentricHistories(
+  const std::vector<AgentHistory> & histories)
+{
+  std::vector<AgentHistory> agent_centric_histories;
+  agent_centric_histories.reserve(histories.size());
+  for (const auto & history : histories) {
+    const auto & reference_state =
+      history.get_latest_state();  // Todo(Daniel): should it be the latest VALID state?
+    agent_centric_histories.push_back(getAgentCentricHistory(history, reference_state));
+  }
+  return agent_centric_histories;
+};
+
+}  // namespace autoware::mtr
diff --git a/perception/autoware_mtr/src/conversions/lanelet.cpp b/perception/autoware_mtr/src/conversions/lanelet.cpp
new file mode 100644
index 0000000000000..3512c4283f83b
--- /dev/null
+++ b/perception/autoware_mtr/src/conversions/lanelet.cpp
@@ -0,0 +1,142 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/mtr/conversions/lanelet.hpp"
+
+#include "autoware/mtr/polyline.hpp"
+
+#include <geometry_msgs/msg/detail/point__struct.hpp>
+
+#include <cmath>
+#include <optional>
+#include <vector>
+
+namespace autoware::mtr
+{
+std::optional<PolylineData> LaneletConverter::convert(
+  const geometry_msgs::msg::Point & position, double distance_threshold) const
+{
+  std::vector<LanePoint> container;
+  // parse lanelet layers
+  for (const auto & lanelet : lanelet_map_ptr_->laneletLayer) {
+    const auto lanelet_subtype = toSubtypeName(lanelet);
+    if (isLaneLike(lanelet_subtype)) {
+      // convert centerlines
+      if (isRoadwayLike(lanelet_subtype)) {
+        auto points = fromLinestring(lanelet.centerline3d(), position, distance_threshold);
+        insertLanePoints(points, container);
+      }
+      // convert boundaries except of virtual lines
+      if (!isTurnableIntersection(lanelet)) {
+        const auto left_bound = lanelet.leftBound3d();
+        if (isBoundaryLike(left_bound)) {
+          auto points = fromLinestring(left_bound, position, distance_threshold);
+          insertLanePoints(points, container);
+        }
+        const auto right_bound = lanelet.rightBound3d();
+        if (isBoundaryLike(right_bound)) {
+          auto points = fromLinestring(right_bound, position, distance_threshold);
+          insertLanePoints(points, container);
+        }
+      }
+    } else if (isCrosswalkLike(lanelet_subtype)) {
+      auto points = fromPolygon(lanelet.polygon3d(), position, distance_threshold);
+      insertLanePoints(points, container);
+    }
+  }
+
+  // parse linestring layers
+  for (const auto & linestring : lanelet_map_ptr_->lineStringLayer) {
+    if (isBoundaryLike(linestring)) {
+      auto points = fromLinestring(linestring, position, distance_threshold);
+      insertLanePoints(points, container);
+    }
+  }
+
+  return container.size() == 0
+           ? std::nullopt
+           : std::make_optional<PolylineData>(
+               container, max_num_polyline_, max_num_point_, point_break_distance_);
+}
+
+std::vector<LanePoint> LaneletConverter::fromLinestring(
+  const lanelet::ConstLineString3d & linestring, const geometry_msgs::msg::Point & position,
+  double distance_threshold) const noexcept
+{
+  if (linestring.size() == 0) {
+    return {};
+  }
+
+  std::vector<LanePoint> output;
+  for (auto itr = linestring.begin(); itr != linestring.end(); ++itr) {
+    if (auto distance =
+          std::hypot(itr->x() - position.x, itr->y() - position.y, itr->z() - position.z);
+        distance > distance_threshold) {
+      continue;
+    }
+    double dx, dy, dz;
+    constexpr double epsilon = 1e-6;
+    if (itr == linestring.begin()) {
+      dx = 0.0;
+      dy = 0.0;
+      dz = 0.0;
+    } else {
+      dx = itr->x() - (itr - 1)->x();
+      dy = itr->y() - (itr - 1)->y();
+      dz = itr->z() - (itr - 1)->z();
+      const auto norm = std::hypot(dx, dy, dz);
+      dx /= (norm + epsilon);
+      dy /= (norm + epsilon);
+      dz /= (norm + epsilon);
+    }
+    output.emplace_back(itr->x(), itr->y(), itr->z(), dx, dy, dz, 0.0);  // TODO(ktro2828): Label ID
+  }
+  return output;
+}
+
+std::vector<LanePoint> LaneletConverter::fromPolygon(
+  const lanelet::CompoundPolygon3d & polygon, const geometry_msgs::msg::Point & position,
+  double distance_threshold) const noexcept
+{
+  if (polygon.size() == 0) {
+    return {};
+  }
+
+  std::vector<LanePoint> output;
+  for (auto itr = polygon.begin(); itr != polygon.end(); ++itr) {
+    if (auto distance =
+          std::hypot(itr->x() - position.x, itr->y() - position.y, itr->z() - position.z);
+        distance > distance_threshold) {
+      continue;
+    }
+    double dx, dy, dz;
+    constexpr double epsilon = 1e-6;
+    if (itr == polygon.begin()) {
+      dx = 0.0;
+      dy = 0.0;
+      dz = 0.0;
+    } else {
+      dx = itr->x() - (itr - 1)->x();
+      dy = itr->y() - (itr - 1)->y();
+      dz = itr->z() - (itr - 1)->z();
+      const auto norm = std::hypot(dx, dy, dz);
+      dx /= (norm + epsilon);
+      dy /= (norm + epsilon);
+      dz /= (norm + epsilon);
+    }
+    output.emplace_back(itr->x(), itr->y(), itr->z(), dx, dy, dz, 0.0);  // TODO(ktro2828): Label ID
+  }
+  return output;
+}
+}  // namespace autoware::mtr
diff --git a/perception/autoware_mtr/src/intention_point.cpp b/perception/autoware_mtr/src/intention_point.cpp
new file mode 100644
index 0000000000000..9b4036c99e143
--- /dev/null
+++ b/perception/autoware_mtr/src/intention_point.cpp
@@ -0,0 +1,41 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/mtr/intention_point.hpp"
+
+#include <cstddef>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace autoware::mtr
+{
+std::vector<float> IntentionPoint::as_array(const std::vector<std::string> & label_names) const
+{
+  std::vector<float> points;
+  points.reserve(label_names.size() * num_cluster * state_dim());
+  for (const auto & name : label_names) {
+    const auto & label_points = data_map_.at(name);
+    for (const auto & p : label_points) {
+      points.emplace_back(p);
+    }
+  }
+  return points;
+}
+
+size_t IntentionPoint::size() const noexcept
+{
+  return num_cluster * state_dim();
+}
+}  // namespace autoware::mtr
diff --git a/perception/autoware_mtr/src/node.cpp b/perception/autoware_mtr/src/node.cpp
new file mode 100644
index 0000000000000..57ea7042e2d2c
--- /dev/null
+++ b/perception/autoware_mtr/src/node.cpp
@@ -0,0 +1,450 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/mtr/node.hpp"
+
+#include "autoware/mtr/agent.hpp"
+#include "autoware/mtr/conversions/history.hpp"
+#include "autoware/mtr/conversions/lanelet.hpp"
+#include "autoware/mtr/fixed_queue.hpp"
+
+#include <lanelet2_extension/utility/message_conversion.hpp>
+#include <rclcpp/logging.hpp>
+
+#include <autoware_perception_msgs/msg/detail/object_classification__builder.hpp>
+#include <autoware_perception_msgs/msg/detail/tracked_object__struct.hpp>
+#include <geometry_msgs/msg/pose.hpp>
+#include <geometry_msgs/msg/pose_stamped.hpp>
+#include <geometry_msgs/msg/vector3.hpp>
+
+#include <tf2/utils.h>
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace autoware::mtr
+{
+// TODO(ktro2828): use a parameter
+constexpr double TIME_THRESHOLD = 1.0;
+constexpr size_t MAX_NUM_TARGET = 1;
+constexpr double POLYLINE_DISTANCE_THRESHOLD = 100.0;
+namespace
+{
+// Convert `TrackedObject` to `AgentState`.
+AgentState createAgentState(const TrackedObject & object, const bool is_valid)
+{
+  const auto & pose = object.kinematics.pose_with_covariance.pose;
+  const auto & twist = object.kinematics.twist_with_covariance.twist;
+  const auto & accel = object.kinematics.acceleration_with_covariance.accel;
+  const auto & dimension = object.shape.dimensions;
+
+  const float yaw = tf2::getYaw(pose.orientation);
+  const float valid = is_valid ? 1.0f : 0.0f;
+
+  return {pose.position, dimension, yaw, twist.linear, accel.linear, valid};
+}
+
+// Get the label index corresponding to AgentLabel. If the label of tracked object is not * defined
+// in AgentLabel returns `-1`.
+int getLabelIndex(const TrackedObject & object)
+{
+  const auto classification =
+    autoware::object_recognition_utils::getHighestProbLabel(object.classification);
+  if (autoware::object_recognition_utils::isCarLikeVehicle(classification)) {
+    return AgentLabel::VEHICLE;
+  }
+  if (classification == ObjectClassification::PEDESTRIAN) {
+    return AgentLabel::PEDESTRIAN;
+  }
+  if (
+    classification == ObjectClassification::MOTORCYCLE ||
+    classification == ObjectClassification::BICYCLE) {
+    return AgentLabel::CYCLIST;
+  }
+  return -1;  // other labels
+}
+}  // namespace
+
+MTRNode::MTRNode(const rclcpp::NodeOptions & node_options)
+: rclcpp::Node("mtr", node_options), transform_listener_(this)
+{
+  {
+    // Build MTR
+    // Model config
+    const auto model_path = declare_parameter<std::string>("model_params.model_path");
+    const auto target_labels =
+      declare_parameter<std::vector<std::string>>("model_params.target_labels");
+    const auto num_past = static_cast<size_t>(declare_parameter<int>("model_params.num_past"));
+    const auto num_mode = static_cast<size_t>(declare_parameter<int>("model_params.num_mode"));
+    const auto num_future = static_cast<size_t>(declare_parameter<int>("model_params.num_future"));
+    const auto max_num_polyline =
+      static_cast<size_t>(declare_parameter<int>("model_params.max_num_polyline"));
+    const auto max_num_point =
+      static_cast<size_t>(declare_parameter<int>("model_params.max_num_point"));
+    const auto point_break_distance =
+      static_cast<float>(declare_parameter<double>("model_params.point_break_distance"));
+    const auto intention_point_filepath =
+      declare_parameter<std::string>("model_params.intention_point_filepath");
+    const auto num_intention_point_cluster =
+      static_cast<size_t>(declare_parameter<int>("model_params.num_intention_point_cluster"));
+    config_ptr_ = std::make_unique<MTRConfig>(
+      target_labels, num_past, num_mode, num_future, max_num_polyline, max_num_point,
+      point_break_distance, intention_point_filepath, num_intention_point_cluster);
+    // Build config
+    const auto is_dynamic = declare_parameter<bool>("build_params.is_dynamic");
+    const auto precision = declare_parameter<std::string>("build_params.precision");
+    const auto calibration = declare_parameter<std::string>("build_params.calibration");
+    build_config_ptr_ = std::make_unique<BuildConfig>(is_dynamic, precision, calibration);
+    model_ptr_ = std::make_unique<TrtMTR>(model_path, *config_ptr_, *build_config_ptr_);
+  }
+
+  {
+    // Ego states and timestamps buffer
+    int ego_buffer_size = declare_parameter<int>("ego_buffer_size", 100);
+    ego_states_ = std::make_unique<FixedQueue<std::pair<float, AgentState>>>(ego_buffer_size);
+    timestamps_ = std::make_unique<FixedQueue<double>>(config_ptr_->num_past);
+  }
+
+  sub_objects_ = create_subscription<TrackedObjects>(
+    "~/input/objects", rclcpp::QoS{1}, std::bind(&MTRNode::callback, this, std::placeholders::_1));
+  sub_map_ = create_subscription<HADMapBin>(
+    "~/input/vector_map", rclcpp::QoS{1}.transient_local(),
+    std::bind(&MTRNode::onMap, this, std::placeholders::_1));
+
+  pub_objects_ = create_publisher<PredictedObjects>("~/output/objects", rclcpp::QoS{1});
+
+  if (declare_parameter<bool>("build_only")) {
+    RCLCPP_INFO(get_logger(), "TensorRT engine file is built and exit.");
+    rclcpp::shutdown();
+  }
+}
+
+void MTRNode::callback(const TrackedObjects::ConstSharedPtr object_msg)
+{
+  const auto ego_msg = getLatestEgo();
+
+  if (!ego_msg) {
+    RCLCPP_WARN(get_logger(), "No ego data");
+    return;
+  }
+
+  const auto polyline_data = lanelet_converter_ptr_->convert(
+    ego_msg->kinematics.pose_with_covariance.pose.position, POLYLINE_DISTANCE_THRESHOLD);
+
+  if (!polyline_data) {
+    RCLCPP_WARN(get_logger(), "No polyline");
+    return;
+  }
+
+  const auto current_time = rclcpp::Time(object_msg->header.stamp).seconds();
+  timestamps_->push_back(current_time);
+
+  removeAncientAgentHistory(current_time, object_msg);
+  updateAgentHistory(current_time, object_msg, ego_msg.value());
+
+  std::vector<std::string> object_ids;
+  std::vector<AgentHistory> histories;
+  std::vector<size_t> label_ids;
+  histories.reserve(agent_history_map_.size());
+  object_ids.reserve(agent_history_map_.size());
+  label_ids.reserve(agent_history_map_.size());
+  int tmp_ego_index = -1;
+  for (const auto & [object_id, history] : agent_history_map_) {
+    object_ids.emplace_back(object_id);
+    histories.emplace_back(history);
+    label_ids.emplace_back(history.label_id());
+    if (object_id == EGO_ID) {
+      tmp_ego_index = histories.size() - 1;
+    }
+  }
+
+  if (tmp_ego_index == -1) {
+    RCLCPP_WARN(get_logger(), "No EGO");
+    return;
+  }
+
+  const auto target_indices = extractTargetAgent(histories);
+  if (target_indices.empty()) {
+    RCLCPP_WARN(get_logger(), "No target agents");
+    return;
+  }
+
+  const auto ego_index = static_cast<size_t>(tmp_ego_index);
+  const auto relative_timestamps = getRelativeTimestamps();
+  // For testing purposes, normally pre-processing is done in the cuda side.
+  // const auto agent_centric_histories = getAgentCentricHistories(histories);
+  AgentData agent_data(histories, ego_index, target_indices, label_ids, relative_timestamps);
+
+  std::vector<PredictedTrajectory> trajectories;
+  if (!model_ptr_->doInference(agent_data, *polyline_data, trajectories)) {
+    RCLCPP_WARN(get_logger(), "Inference failed");
+    return;
+  }
+
+  PredictedObjects output;
+  output.header = object_msg->header;
+  output.objects.reserve(target_indices.size());
+  for (size_t i = 0; i < target_indices.size(); ++i) {
+    const auto & target_idx = target_indices.at(i);
+    const auto & object_id = object_ids.at(target_idx);
+    const auto & object = object_msg_map_.at(object_id);
+    const auto & trajectory = trajectories.at(i);
+
+    auto predicted_object = createPredictedObject(object, trajectory);
+    output.objects.emplace_back(predicted_object);
+  }
+
+  // Publish results
+  pub_objects_->publish(output);
+}
+
+void MTRNode::onMap(const HADMapBin::ConstSharedPtr map_msg)
+{
+  lanelet_map_ptr_ = std::make_shared<lanelet::LaneletMap>();
+  lanelet::utils::conversion::fromBinMsg(
+    *map_msg, lanelet_map_ptr_, &traffic_rules_ptr_, &routing_graph_ptr_);
+
+  lanelet_converter_ptr_ = std::make_unique<LaneletConverter>(
+    lanelet_map_ptr_, config_ptr_->max_num_polyline, config_ptr_->max_num_point,
+    config_ptr_->point_break_distance);
+}
+
+std::optional<TrackedObject> MTRNode::getLatestEgo()
+{
+  const Odometry::ConstSharedPtr odometry_msg = sub_ego_.takeData();
+  if (!odometry_msg) {
+    return std::nullopt;
+  }
+
+  auto createPoint32 =
+    [](const double x, const double y, const double z) -> geometry_msgs::msg::Point32 {
+    geometry_msgs::msg::Point32 p;
+    p.x = x;
+    p.y = y;
+    p.z = z;
+    return p;
+  };
+
+  TrackedObject output;
+  // Classification and probability
+  {
+    output.existence_probability = 1.0;
+    ObjectClassification classification;
+    classification.label = ObjectClassification::CAR;
+    output.classification = {classification};
+  }
+
+  // Kinematics
+  {
+    output.kinematics.pose_with_covariance = odometry_msg->pose;
+    output.kinematics.twist_with_covariance = odometry_msg->twist;
+  }
+
+  // Shape
+  {
+    const auto & ego_max_long_offset = vehicle_info_.max_longitudinal_offset_m;
+    const auto & ego_rear_overhang = vehicle_info_.vehicle_height_m;
+    const auto & ego_length = vehicle_info_.vehicle_length_m;
+    const auto & ego_width = vehicle_info_.vehicle_width_m;
+    const auto & ego_height = vehicle_info_.vehicle_height_m;
+
+    autoware_perception_msgs::msg::Shape shape;
+    shape.type = autoware_perception_msgs::msg::Shape::BOUNDING_BOX;
+    shape.dimensions.x = ego_length;
+    shape.dimensions.y = ego_width;
+    shape.dimensions.z = ego_height;
+
+    // TODO(Daniel): Should use overhang and ego info utils
+    shape.footprint.points.emplace_back(
+      createPoint32(-ego_rear_overhang, -0.5 * ego_width, ego_height));
+    shape.footprint.points.emplace_back(
+      createPoint32(-ego_rear_overhang, 0.5 * ego_width, ego_height));
+    shape.footprint.points.emplace_back(
+      createPoint32(ego_max_long_offset, 0.5 * ego_width, ego_height));
+    shape.footprint.points.emplace_back(
+      createPoint32(ego_max_long_offset, -0.5 * ego_width, ego_height));
+    output.shape = shape;
+  }
+  return output;
+}
+
+void MTRNode::removeAncientAgentHistory(
+  const double current_time, const TrackedObjects::ConstSharedPtr objects_msg)
+{
+  for (const auto & object : objects_msg->objects) {
+    const auto & object_id = autoware::universe_utils::toHexString(object.object_id);
+    if (agent_history_map_.count(object_id) == 0) {
+      continue;
+    }
+
+    const auto & history = agent_history_map_.at(object_id);
+    if (history.is_ancient(current_time, TIME_THRESHOLD)) {
+      agent_history_map_.erase(object_id);
+    }
+  }
+
+  if (
+    agent_history_map_.count(EGO_ID) != 0 &&
+    agent_history_map_.at(EGO_ID).is_ancient(current_time, TIME_THRESHOLD)) {
+    agent_history_map_.erase(EGO_ID);
+  }
+}
+
+void MTRNode::updateAgentHistory(
+  const double current_time, const TrackedObjects::ConstSharedPtr objects_msg,
+  const TrackedObject & ego_msg)
+{
+  std::vector<std::string> observed_ids;
+  // other agents
+  for (const auto & object : objects_msg->objects) {
+    auto label_index = getLabelIndex(object);
+    if (label_index == -1) {
+      continue;
+    }
+
+    const auto object_id = autoware::universe_utils::toHexString(object.object_id);
+    observed_ids.emplace_back(object_id);
+    object_msg_map_.insert_or_assign(object_id, object);
+
+    const auto state = createAgentState(object, true);
+    if (agent_history_map_.count(object_id) == 0) {
+      AgentHistory history(state, object_id, label_index, current_time, config_ptr_->num_past);
+      agent_history_map_.emplace(object_id, history);
+    } else {
+      agent_history_map_.at(object_id).update(current_time, state);
+    }
+  }
+
+  // ego vehicle
+  observed_ids.emplace_back(EGO_ID);
+  object_msg_map_.insert_or_assign(EGO_ID, ego_msg);
+
+  const auto ego = createAgentState(ego_msg, true);
+  if (agent_history_map_.count(EGO_ID) == 0) {
+    AgentHistory history(ego, EGO_ID, AgentLabel::VEHICLE, current_time, config_ptr_->num_past);
+    agent_history_map_.emplace(EGO_ID, history);
+  } else {
+    agent_history_map_.at(EGO_ID).update(current_time, ego);
+  }
+
+  // update unobserved histories with empty
+  for (auto & [object_id, history] : agent_history_map_) {
+    if (std::find(observed_ids.cbegin(), observed_ids.cend(), object_id) != observed_ids.cend()) {
+      continue;
+    }
+    history.update_empty();
+  }
+}
+
+std::vector<size_t> MTRNode::extractTargetAgent(const std::vector<AgentHistory> & histories)
+{
+  auto map2ego = transform_listener_.getTransform(
+    "base_link", "map", rclcpp::Time(), rclcpp::Duration::from_seconds(0.1));
+
+  if (!map2ego) {
+    RCLCPP_WARN(get_logger(), "Failed to get transform from map to base_link.");
+    return {};
+  }
+
+  std::vector<std::pair<size_t, double>> index_distances;
+  for (size_t idx = 0; idx < histories.size(); ++idx) {
+    const auto & history = histories.at(idx);
+    if (history.is_valid_latest() && history.object_id() != EGO_ID) {
+      const auto & state = history.get_latest_state();
+      geometry_msgs::msg::PoseStamped pose_in_map;
+      pose_in_map.pose.position.x = state.x();
+      pose_in_map.pose.position.y = state.y();
+      pose_in_map.pose.position.z = state.z();
+      pose_in_map.pose.orientation = autoware::universe_utils::createQuaternionFromYaw(state.yaw());
+
+      geometry_msgs::msg::PoseStamped pose_in_ego;
+      tf2::doTransform(pose_in_map, pose_in_ego, *map2ego);
+
+      const auto distance = std::hypot(pose_in_ego.pose.position.x, pose_in_ego.pose.position.y);
+      index_distances.emplace_back(idx, distance);
+    }
+  }
+
+  // sort by distance
+  std::sort(
+    index_distances.begin(), index_distances.end(),
+    [](const auto & item1, const auto & item2) { return item1.second < item2.second; });
+
+  std::vector<size_t> target_indices;
+  for (const auto & [idx, _] : index_distances) {
+    target_indices.emplace_back(idx);
+    if (MAX_NUM_TARGET <= target_indices.size()) {
+      break;
+    }
+  }
+
+  return target_indices;
+}
+
+std::vector<float> MTRNode::getRelativeTimestamps() const
+{
+  std::vector<float> output;
+  output.reserve(timestamps_->size());
+  for (const auto & t : *timestamps_) {
+    output.push_back(t - timestamps_->front());
+  }
+  return output;
+}
+
+PredictedObject MTRNode::createPredictedObject(
+  const TrackedObject & object, const PredictedTrajectory & trajectory)
+{
+  const auto & init_pose_with_cov = object.kinematics.pose_with_covariance;
+  const auto & init_twist_with_cov = object.kinematics.twist_with_covariance;
+  const auto & init_accel_with_cov = object.kinematics.acceleration_with_covariance;
+
+  PredictedObject predicted_object;
+  predicted_object.kinematics.initial_pose_with_covariance = init_pose_with_cov;
+  predicted_object.kinematics.initial_twist_with_covariance = init_twist_with_cov;
+  predicted_object.kinematics.initial_acceleration_with_covariance = init_accel_with_cov;
+  predicted_object.classification = object.classification;
+  predicted_object.shape = object.shape;
+  predicted_object.object_id = object.object_id;
+  predicted_object.existence_probability = object.existence_probability;
+
+  for (const auto & mode : trajectory.get_modes()) {
+    PredictedPath waypoints;
+    waypoints.confidence = mode.score();
+    waypoints.time_step = rclcpp::Duration::from_seconds(0.1);  // TODO(ktro282): use a parameter
+    waypoints.path.reserve(mode.num_future());
+
+    for (const auto & state : mode.get_waypoints()) {
+      geometry_msgs::msg::Pose predicted_pose;
+      predicted_pose.position.x = state.x();
+      predicted_pose.position.y = state.y();
+      predicted_pose.position.z = init_pose_with_cov.pose.position.z;
+      predicted_pose.orientation = init_pose_with_cov.pose.orientation;
+      waypoints.path.emplace_back(predicted_pose);
+      if (waypoints.path.size() >= waypoints.path.max_size()) {
+        break;
+      }
+    }
+    predicted_object.kinematics.predicted_paths.emplace_back(waypoints);
+  }
+
+  return predicted_object;
+}
+}  // namespace autoware::mtr
+
+#include <rclcpp_components/register_node_macro.hpp>
+RCLCPP_COMPONENTS_REGISTER_NODE(autoware::mtr::MTRNode);
diff --git a/perception/autoware_mtr/src/trt_mtr.cpp b/perception/autoware_mtr/src/trt_mtr.cpp
new file mode 100644
index 0000000000000..1fd47e017ed8d
--- /dev/null
+++ b/perception/autoware_mtr/src/trt_mtr.cpp
@@ -0,0 +1,233 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/mtr/trt_mtr.hpp"
+
+#include "autoware/mtr/cuda_helper.hpp"
+#include "autoware/mtr/intention_point.hpp"
+#include "autoware/mtr/trajectory.hpp"
+#include "postprocess/postprocess_kernel.cuh"
+#include "preprocess/agent_preprocess_kernel.cuh"
+#include "preprocess/polyline_preprocess_kernel.cuh"
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace autoware::mtr
+{
+TrtMTR::TrtMTR(
+  const std::string & model_path, const MTRConfig & config, const BuildConfig & build_config,
+  const size_t max_workspace_size)
+: config_(config),
+  intention_point_(IntentionPoint::from_file(
+    config_.num_intention_point_cluster, config_.intention_point_filepath))
+{
+  max_num_polyline_ = config_.max_num_polyline;
+  num_mode_ = config_.num_mode;
+  num_future_ = config_.num_future;
+  num_intention_point_ = config_.num_intention_point_cluster;
+
+  // build engine
+  builder_ = std::make_unique<MTRBuilder>(model_path, build_config, max_workspace_size);
+  builder_->setup();
+
+  if (!builder_->isInitialized()) {
+    return;
+  }
+
+  CHECK_CUDA_ERROR(cudaStreamCreate(&stream_));
+}
+
+bool TrtMTR::doInference(
+  const AgentData & agent_data, const PolylineData & polyline_data,
+  std::vector<PredictedTrajectory> & trajectories)
+{
+  initCudaPtr(agent_data, polyline_data);
+
+  if (!preProcess(agent_data, polyline_data)) {
+    std::cerr << "Fail to preprocess" << std::endl;
+    return false;
+  }
+
+  std::vector<void *> buffer = {d_in_trajectory_.get(),      d_in_trajectory_mask_.get(),
+                                d_in_polyline_.get(),        d_in_polyline_mask_.get(),
+                                d_in_polyline_center_.get(), d_in_last_pos_.get(),
+                                d_target_index_.get(),       d_intention_point_.get(),
+                                d_out_trajectory_.get(),     d_out_score_.get()};
+
+  if (!builder_->enqueueV2(buffer.data(), stream_, nullptr)) {
+    std::cerr << "Fail to do inference" << std::endl;
+    return false;
+  }
+
+  if (!postProcess(agent_data, trajectories)) {
+    std::cerr << "Fail to postprocess" << std::endl;
+    return false;
+  }
+
+  return true;
+}
+
+void TrtMTR::initCudaPtr(const AgentData & agent_data, const PolylineData & polyline_data)
+{
+  num_target_ = agent_data.num_target();
+  num_agent_ = agent_data.num_agent();
+  num_timestamp_ = agent_data.time_length();
+  num_agent_attr_ = agent_data.input_dim();
+  num_polyline_ = polyline_data.num_polyline();
+  num_point_ = polyline_data.num_point();
+  num_point_attr_ = polyline_data.input_dim();
+
+  // source data
+  d_target_index_ = cuda::make_unique<int[]>(num_target_);
+  d_label_index_ = cuda::make_unique<int[]>(num_agent_);
+  d_timestamp_ = cuda::make_unique<float[]>(num_timestamp_);
+  d_trajectory_ = cuda::make_unique<float[]>(agent_data.size());
+  d_target_state_ = cuda::make_unique<float[]>(num_target_ * agent_data.state_dim());
+  d_intention_point_ = cuda::make_unique<float[]>(num_target_ * intention_point_.size());
+  d_polyline_ = cuda::make_unique<float[]>(polyline_data.size());
+
+  // preprocessed input
+  d_in_trajectory_ =
+    cuda::make_unique<float[]>(num_target_ * num_agent_ * num_timestamp_ * num_agent_attr_);
+  d_in_trajectory_mask_ = cuda::make_unique<bool[]>(num_target_ * num_agent_ * num_timestamp_);
+  d_in_last_pos_ = cuda::make_unique<float[]>(num_target_ * num_agent_ * 3);
+  d_in_polyline_ =
+    cuda::make_unique<float[]>(num_target_ * max_num_polyline_ * num_point_ * num_point_attr_);
+  d_in_polyline_mask_ = cuda::make_unique<bool[]>(num_target_ * max_num_polyline_ * num_point_);
+  d_in_polyline_center_ = cuda::make_unique<float[]>(num_target_ * max_num_polyline_ * 3);
+
+  if (max_num_polyline_ < num_polyline_) {
+    d_tmp_polyline_ =
+      cuda::make_unique<float[]>(num_target_ * num_polyline_ * num_point_ * num_point_attr_);
+    d_tmp_polyline_mask_ = cuda::make_unique<bool[]>(num_target_ * num_polyline_ * num_point_);
+    d_tmp_distance_ = cuda::make_unique<float[]>(num_target_ * num_polyline_);
+  }
+
+  // outputs
+  d_out_score_ = cuda::make_unique<float[]>(num_target_ * num_mode_);
+  d_out_trajectory_ =
+    cuda::make_unique<float[]>(num_target_ * num_mode_ * num_future_ * PredictedStateDim);
+
+  if (builder_->isDynamic()) {
+    // trajectory: (B, N, T, Da)
+    builder_->setBindingDimensions(
+      0, nvinfer1::Dims4{num_target_, num_agent_, num_timestamp_, num_agent_attr_});
+    // trajectory mask: (B, N, T)
+    builder_->setBindingDimensions(1, nvinfer1::Dims3{num_target_, num_agent_, num_timestamp_});
+    // polyline: (B, K, P, Dp)
+    builder_->setBindingDimensions(
+      2, nvinfer1::Dims4{num_target_, max_num_polyline_, num_point_, num_point_attr_});
+    // polyline mask: (B, K, P)
+    builder_->setBindingDimensions(3, nvinfer1::Dims3{num_target_, max_num_polyline_, num_point_});
+    // polyline center: (B, K, 3)
+    builder_->setBindingDimensions(4, nvinfer1::Dims3{num_target_, max_num_polyline_, 3});
+    // agent last position: (B, N, 3)
+    builder_->setBindingDimensions(5, nvinfer1::Dims3{num_target_, num_agent_, 3});
+    // target indices: (B,)
+    nvinfer1::Dims targetIdxDim;
+    targetIdxDim.nbDims = 1;
+    targetIdxDim.d[0] = num_target_;
+    builder_->setBindingDimensions(6, targetIdxDim);
+    // intention points: (B, I, 2)
+    builder_->setBindingDimensions(7, nvinfer1::Dims3{num_target_, num_intention_point_, 2});
+  }
+}
+
+bool TrtMTR::preProcess(const AgentData & agent_data, const PolylineData & polyline_data)
+{
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    d_target_index_.get(), agent_data.target_indices().data(), sizeof(int) * num_target_,
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    d_label_index_.get(), agent_data.label_ids().data(), sizeof(int) * num_agent_,
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    d_timestamp_.get(), agent_data.timestamps().data(), sizeof(float) * num_timestamp_,
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    d_trajectory_.get(), agent_data.data_ptr(), sizeof(float) * agent_data.size(),
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    d_target_state_.get(), agent_data.current_target_data_ptr(),
+    sizeof(float) * num_target_ * agent_data.state_dim(), cudaMemcpyHostToDevice, stream_));
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    d_polyline_.get(), polyline_data.data_ptr(), sizeof(float) * polyline_data.size(),
+    cudaMemcpyHostToDevice, stream_));
+
+  const auto target_label_names = getLabelNames(agent_data.target_label_ids());
+  const auto intention_point = intention_point_.as_array(target_label_names);
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    d_intention_point_.get(), intention_point.data(),
+    sizeof(float) * num_target_ * intention_point_.size(), cudaMemcpyHostToDevice, stream_));
+
+  CHECK_CUDA_ERROR(agentPreprocessLauncher(
+    num_target_, num_agent_, num_timestamp_, agent_data.state_dim(), agent_data.num_class(),
+    agent_data.ego_index(), d_target_index_.get(), d_label_index_.get(), d_timestamp_.get(),
+    d_trajectory_.get(), d_in_trajectory_.get(), d_in_trajectory_mask_.get(), d_in_last_pos_.get(),
+    stream_));
+
+  if (max_num_polyline_ < num_polyline_) {
+    CHECK_CUDA_ERROR(polylinePreprocessWithTopkLauncher(
+      max_num_polyline_, num_polyline_, num_point_, polyline_data.state_dim(), d_polyline_.get(),
+      num_target_, agent_data.state_dim(), d_target_state_.get(), d_tmp_polyline_.get(),
+      d_tmp_polyline_mask_.get(), d_tmp_distance_.get(), d_in_polyline_.get(),
+      d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));
+  } else {
+    CHECK_CUDA_ERROR(polylinePreprocessLauncher(
+      num_polyline_, num_point_, polyline_data.state_dim(), d_polyline_.get(), num_target_,
+      agent_data.state_dim(), d_target_state_.get(), d_in_polyline_.get(),
+      d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));
+  }
+  return true;
+}
+
+bool TrtMTR::postProcess(
+  const AgentData & agent_data, std::vector<PredictedTrajectory> & trajectories)
+{
+  CHECK_CUDA_ERROR(postprocessLauncher(
+    num_target_, num_mode_, num_future_, agent_data.state_dim(), d_target_state_.get(),
+    PredictedStateDim, d_out_trajectory_.get(), stream_));
+
+  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+
+  h_out_score_.clear();
+  h_out_trajectory_.clear();
+  h_out_score_.resize(num_target_ * num_mode_);
+  h_out_trajectory_.resize(num_target_ * num_mode_ * num_future_ * PredictedStateDim);
+
+  CHECK_CUDA_ERROR(cudaMemcpy(
+    h_out_score_.data(), d_out_score_.get(), sizeof(float) * num_target_ * num_mode_,
+    cudaMemcpyDeviceToHost));
+  CHECK_CUDA_ERROR(cudaMemcpy(
+    h_out_trajectory_.data(), d_out_trajectory_.get(),
+    sizeof(float) * num_target_ * num_mode_ * num_future_ * PredictedStateDim,
+    cudaMemcpyDeviceToHost));
+
+  trajectories.clear();
+  trajectories.reserve(num_target_);
+  for (auto b = 0; b < num_target_; ++b) {
+    const auto score_itr = h_out_score_.cbegin() + b * num_mode_;
+    const std::vector<double> scores(score_itr, score_itr + num_mode_);
+    const auto mode_itr =
+      h_out_trajectory_.cbegin() + b * num_mode_ * num_future_ * PredictedStateDim;
+    std::vector<double> modes(mode_itr, mode_itr + num_mode_ * num_future_ * PredictedStateDim);
+    trajectories.emplace_back(scores, modes, num_mode_, num_future_);
+  }
+  return true;
+}
+}  // namespace autoware::mtr
diff --git a/perception/autoware_mtr/test/test_fixed_queue.cpp b/perception/autoware_mtr/test/test_fixed_queue.cpp
new file mode 100644
index 0000000000000..5ed4ebf5b2a28
--- /dev/null
+++ b/perception/autoware_mtr/test/test_fixed_queue.cpp
@@ -0,0 +1,42 @@
+// Copyright 2024 TIER IV, inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/mtr/fixed_queue.hpp"
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+
+TEST(testFixedQueue, test_name)
+{
+  constexpr size_t n_size = 5;
+  autoware::mtr::FixedQueue<double> queue(n_size);
+  EXPECT_DOUBLE_EQ(queue.front(), 0.0);
+  EXPECT_DOUBLE_EQ(queue.back(), 0.0);
+  EXPECT_EQ(queue.size(), n_size);
+
+  for (size_t n = 0; n < n_size; ++n) {
+    queue.push_back(n);
+  }
+
+  EXPECT_DOUBLE_EQ(queue.front(), 0.0);
+  EXPECT_DOUBLE_EQ(queue.back(), 4.0);
+  EXPECT_EQ(queue.size(), n_size);
+}
+
+int main(int argc, char ** argv)
+{
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}