From 860141272e3689b99fdbf38bccdb6b528f7faf7a Mon Sep 17 00:00:00 2001
From: Matthew Kotila
Date: Mon, 26 Feb 2024 18:13:00 -0800
Subject: [PATCH] Add response outputs to profile export

---
 src/c++/library/common.h                       |  6 ++-
 src/c++/library/grpc_client.cc                 | 22 +++++++++-
 src/c++/library/http_client.cc                 | 22 +++++++++-
 .../client_backend/client_backend.h            |  9 +++-
 .../triton/triton_client_backend.cc            |  9 +++-
 .../triton/triton_client_backend.h             |  4 +-
 src/c++/perf_analyzer/infer_context.cc         | 19 +++++---
 src/c++/perf_analyzer/inference_profiler.cc    | 13 +++---
 .../perf_analyzer/profile_data_exporter.cc     | 40 ++++++++++++-----
 src/c++/perf_analyzer/profile_data_exporter.h  | 10 +++--
 src/c++/perf_analyzer/request_record.h         | 25 ++++++-----
 .../perf_analyzer/test_inference_profiler.cc   | 38 ++++++++--------
 src/c++/perf_analyzer/test_load_manager.cc     | 36 ++++++++-------
 .../test_profile_data_collector.cc             | 44 ++++++++++++-------
 .../test_profile_data_exporter.cc              | 14 +++++-
 15 files changed, 212 insertions(+), 99 deletions(-)

diff --git a/src/c++/library/common.h b/src/c++/library/common.h
index 9cf99c478..9d89f7df7 100644
--- a/src/c++/library/common.h
+++ b/src/c++/library/common.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -530,6 +530,10 @@ class InferResult {
   /// \return Error object indicating the success or failure.
   virtual Error IsNullResponse(bool* is_null_response) const = 0;
 
+  /// Get output for this response.
+  /// \return Error object indicating the success or failure.
+  virtual Error Output(std::string& output) const = 0;
+
   /// Get the result data as a vector of strings. The vector will
   /// receive a copy of result data. An error will be generated if
   /// the datatype of output is not 'BYTES'.
diff --git a/src/c++/library/grpc_client.cc b/src/c++/library/grpc_client.cc
index fe91f5c17..a19e268ae 100644
--- a/src/c++/library/grpc_client.cc
+++ b/src/c++/library/grpc_client.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -206,6 +206,7 @@ class InferResultGrpc : public InferResult {
       size_t* byte_size) const override;
   Error IsFinalResponse(bool* is_final_response) const override;
   Error IsNullResponse(bool* is_null_response) const override;
+  Error Output(std::string& output) const override;
   Error StringData(
       const std::string& output_name,
       std::vector<std::string>* string_result) const override;
@@ -349,6 +350,25 @@ InferResultGrpc::IsNullResponse(bool* is_null_response) const
   return Error::Success;
 }
 
+Error
+InferResultGrpc::Output(std::string& output) const
+{
+  // Only supports LLM outputs with name 'text_output' currently
+  if (output_name_to_buffer_map_.find("text_output") ==
+      output_name_to_buffer_map_.end()) {
+    return Error::Success;
+  }
+
+  const uint8_t* buf{nullptr};
+  size_t byte_size{0};
+  Error e{RawData("text_output", &buf, &byte_size)};
+  if (e.IsOk() == false) {
+    return e;
+  }
+  output.assign(reinterpret_cast<const char*>(buf), byte_size);
+  return Error::Success;
+}
+
 Error
 InferResultGrpc::StringData(
     const std::string& output_name,
diff --git a/src/c++/library/http_client.cc b/src/c++/library/http_client.cc
index 9f2f5ab5e..677fdc659 100644
--- a/src/c++/library/http_client.cc
+++ b/src/c++/library/http_client.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -758,6 +758,7 @@ class InferResultHttp : public InferResult {
       size_t* byte_size) const override;
   Error IsFinalResponse(bool* is_final_response) const override;
   Error IsNullResponse(bool* is_null_response) const override;
+  Error Output(std::string& output) const override;
   Error StringData(
       const std::string& output_name,
       std::vector<std::string>* string_result) const override;
@@ -980,6 +981,25 @@ InferResultHttp::IsNullResponse(bool* is_null_response) const
   return Error::Success;
 }
 
+Error
+InferResultHttp::Output(std::string& output) const
+{
+  // Only supports LLM outputs with name 'text_output' currently
+  if (output_name_to_buffer_map_.find("text_output") ==
+      output_name_to_buffer_map_.end()) {
+    return Error::Success;
+  }
+
+  const uint8_t* buf{nullptr};
+  size_t byte_size{0};
+  Error e{RawData("text_output", &buf, &byte_size)};
+  if (e.IsOk() == false) {
+    return e;
+  }
+  output.assign(reinterpret_cast<const char*>(buf), byte_size);
+  return Error::Success;
+}
+
 Error
 InferResultHttp::StringData(
     const std::string& output_name,
diff --git a/src/c++/perf_analyzer/client_backend/client_backend.h b/src/c++/perf_analyzer/client_backend/client_backend.h
index 870ea3dd5..99e0d0e97 100644
--- a/src/c++/perf_analyzer/client_backend/client_backend.h
+++ b/src/c++/perf_analyzer/client_backend/client_backend.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -642,6 +642,13 @@ class InferResult {
   {
     return Error("InferResult::IsNullResponse() not implemented");
   };
+
+  /// Get output for this response.
+  /// \return Error object indicating the success or failure.
+  virtual Error Output(std::string& output) const
+  {
+    return Error("InferResult::Output() not implemented");
+  }
 };
 
 }}}  // namespace triton::perfanalyzer::clientbackend
diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
index 70de5f52b..12e74f7b1 100644
--- a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
+++ b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -842,6 +842,13 @@ TritonInferResult::IsNullResponse(bool* is_null_response) const
   return Error::Success;
 }
 
+Error
+TritonInferResult::Output(std::string& output) const
+{
+  RETURN_IF_TRITON_ERROR(result_->Output(output));
+  return Error::Success;
+}
+
 //==============================================================================
 
 }}}}  // namespace triton::perfanalyzer::clientbackend::tritonremote
diff --git a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
index aab3c8028..9d13a9cb2 100644
--- a/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
+++ b/src/c++/perf_analyzer/client_backend/triton/triton_client_backend.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -333,6 +333,8 @@ class TritonInferResult : public InferResult {
   Error IsFinalResponse(bool* is_final_response) const override;
   /// See InferResult::IsNullResponse()
   Error IsNullResponse(bool* is_null_response) const override;
+  /// See InferResult::Output()
+  Error Output(std::string& output) const override;
 
  private:
   std::unique_ptr<tc::InferResult> result_;
diff --git a/src/c++/perf_analyzer/infer_context.cc b/src/c++/perf_analyzer/infer_context.cc
index 588d4057a..d2177182f 100644
--- a/src/c++/perf_analyzer/infer_context.cc
+++ b/src/c++/perf_analyzer/infer_context.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -149,8 +149,10 @@ InferContext::SendRequest(
       &results, *(infer_data_.options_), infer_data_.valid_inputs_,
       infer_data_.outputs_);
   thread_stat_->idle_timer.Stop();
+  std::string output{""};
   if (results != nullptr) {
     if (thread_stat_->status_.IsOk()) {
+      results->Output(output);
       thread_stat_->status_ = ValidateOutputs(results);
     }
     delete results;
@@ -167,7 +169,7 @@ InferContext::SendRequest(
     std::lock_guard<std::mutex> lock(thread_stat_->mu_);
     auto total = end_time_sync - start_time_sync;
     thread_stat_->request_records_.emplace_back(RequestRecord(
-        start_time_sync, std::move(end_time_syncs),
+        start_time_sync, std::move(end_time_syncs), {output},
        infer_data_.options_->sequence_end_, delayed, sequence_id, false));
     thread_stat_->status_ =
         infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
@@ -258,7 +260,11 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
       if (thread_stat_->cb_status_.IsOk() == false) {
         return;
       }
-      it->second.response_times_.push_back(std::chrono::system_clock::now());
+      it->second.response_timestamps_.push_back(
+          std::chrono::system_clock::now());
+      std::string response_output{""};
+      result->Output(response_output);
+      it->second.response_outputs_.push_back(response_output);
       num_responses_++;
       if (is_null_response == true) {
         it->second.has_null_last_response_ = true;
@@ -271,9 +277,10 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
       if (is_final_response) {
         has_received_final_response_ = is_final_response;
         thread_stat_->request_records_.emplace_back(
-            it->second.start_time_, it->second.response_times_,
-            it->second.sequence_end_, it->second.delayed_,
-            it->second.sequence_id_, it->second.has_null_last_response_);
+            it->second.start_time_, it->second.response_timestamps_,
+            it->second.response_outputs_, it->second.sequence_end_,
+            it->second.delayed_, it->second.sequence_id_,
+            it->second.has_null_last_response_);
         infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
         thread_stat_->cb_status_ = ValidateOutputs(result);
         async_req_map_.erase(request_id);
diff --git a/src/c++/perf_analyzer/inference_profiler.cc b/src/c++/perf_analyzer/inference_profiler.cc
index da5acf255..46e2bcb52 100644
--- a/src/c++/perf_analyzer/inference_profiler.cc
+++ b/src/c++/perf_analyzer/inference_profiler.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1282,11 +1282,12 @@ InferenceProfiler::ValidLatencyMeasurement(
     uint64_t request_end_ns;
     if (request_record.has_null_last_response_ == false) {
-      request_end_ns = CHRONO_TO_NANOS(request_record.response_times_.back());
-    } else if (request_record.response_times_.size() > 1) {
-      size_t last_response_idx{request_record.response_times_.size() - 2};
       request_end_ns =
-          CHRONO_TO_NANOS(request_record.response_times_[last_response_idx]);
+          CHRONO_TO_NANOS(request_record.response_timestamps_.back());
+    } else if (request_record.response_timestamps_.size() > 1) {
+      size_t last_response_idx{request_record.response_timestamps_.size() - 2};
+      request_end_ns = CHRONO_TO_NANOS(
+          request_record.response_timestamps_[last_response_idx]);
     } else {
       erase_indices.push_back(i);
       continue;
     }
@@ -1297,7 +1298,7 @@
     if ((request_end_ns >= valid_range.first) &&
         (request_end_ns <= valid_range.second)) {
       valid_latencies->push_back(request_end_ns - request_start_ns);
-      response_count += request_record.response_times_.size();
+      response_count += request_record.response_timestamps_.size();
       if (request_record.has_null_last_response_) {
         response_count--;
       }
diff --git a/src/c++/perf_analyzer/profile_data_exporter.cc b/src/c++/perf_analyzer/profile_data_exporter.cc
index 2ca002ea1..fd63f5d36 100644
--- a/src/c++/perf_analyzer/profile_data_exporter.cc
+++ b/src/c++/perf_analyzer/profile_data_exporter.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -122,25 +122,43 @@ ProfileDataExporter::AddRequests(
       request.AddMember("sequence_id", sequence_id, document_.GetAllocator());
     }
 
-    rapidjson::Value responses(rapidjson::kArrayType);
-    AddResponses(responses, raw_request.response_times_);
+    rapidjson::Value response_timestamps(rapidjson::kArrayType);
+    AddResponseTimestamps(
+        response_timestamps, raw_request.response_timestamps_);
     request.AddMember(
-        "response_timestamps", responses, document_.GetAllocator());
+        "response_timestamps", response_timestamps, document_.GetAllocator());
+
+    rapidjson::Value response_outputs(rapidjson::kArrayType);
+    AddResponseOutputs(response_outputs, raw_request.response_outputs_);
+    request.AddMember(
+        "response_outputs", response_outputs, document_.GetAllocator());
+
     requests.PushBack(request, document_.GetAllocator());
   }
   entry.AddMember("requests", requests, document_.GetAllocator());
 }
 
 void
-ProfileDataExporter::AddResponses(
-    rapidjson::Value& responses,
+ProfileDataExporter::AddResponseTimestamps(
+    rapidjson::Value& timestamps_json,
     const std::vector<std::chrono::time_point<std::chrono::system_clock>>&
-        response_times)
+        timestamps)
+{
+  for (auto& timestamp : timestamps) {
+    rapidjson::Value timestamp_json;
+    timestamp_json.SetUint64(timestamp.time_since_epoch().count());
+    timestamps_json.PushBack(timestamp_json, document_.GetAllocator());
+  }
+}
+
+void
+ProfileDataExporter::AddResponseOutputs(
+    rapidjson::Value& outputs_json, const std::vector<std::string>& outputs)
 {
-  for (auto& response : response_times) {
-    rapidjson::Value time;
-    time.SetUint64(response.time_since_epoch().count());
-    responses.PushBack(time, document_.GetAllocator());
+  for (auto& output : outputs) {
+    rapidjson::Value output_json;
+    output_json.SetString(output.c_str(), output.size());
+    outputs_json.PushBack(output_json, document_.GetAllocator());
   }
 }
diff --git a/src/c++/perf_analyzer/profile_data_exporter.h b/src/c++/perf_analyzer/profile_data_exporter.h
index db6e63366..421c7c1ee 100644
--- a/src/c++/perf_analyzer/profile_data_exporter.h
+++ b/src/c++/perf_analyzer/profile_data_exporter.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -69,10 +69,12 @@ class ProfileDataExporter {
   void AddRequests(
       rapidjson::Value& entry, rapidjson::Value& requests,
       const Experiment& raw_experiment);
-  void AddResponses(
-      rapidjson::Value& responses,
+  void AddResponseTimestamps(
+      rapidjson::Value& timestamps_json,
       const std::vector<std::chrono::time_point<std::chrono::system_clock>>&
-          response_times);
+          timestamps);
+  void AddResponseOutputs(
+      rapidjson::Value& outputs_json, const std::vector<std::string>& outputs);
   void AddWindowBoundaries(
       rapidjson::Value& entry, rapidjson::Value& window_boundaries,
       const Experiment& raw_experiment);
diff --git a/src/c++/perf_analyzer/request_record.h b/src/c++/perf_analyzer/request_record.h
index 0c9ca9263..1f119ad60 100644
--- a/src/c++/perf_analyzer/request_record.h
+++ b/src/c++/perf_analyzer/request_record.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -34,24 +34,27 @@ namespace triton { namespace perfanalyzer {
 
 /// A record of an individual request
 struct RequestRecord {
-  RequestRecord() = default;
   RequestRecord(
-      std::chrono::time_point<std::chrono::system_clock> start_time,
+      std::chrono::time_point<std::chrono::system_clock> start_time =
+          std::chrono::time_point<std::chrono::system_clock>(),
       std::vector<std::chrono::time_point<std::chrono::system_clock>>
-          response_times,
-      bool sequence_end, bool delayed, uint64_t sequence_id,
-      bool has_null_last_response)
-      : start_time_(start_time), response_times_(response_times),
-        sequence_end_(sequence_end), delayed_(delayed),
-        sequence_id_(sequence_id),
+          response_timestamps = {},
+      std::vector<std::string> response_outputs = {}, bool sequence_end = true,
+      bool delayed = false, uint64_t sequence_id = 0,
+      bool has_null_last_response = false)
+      : start_time_(start_time), response_timestamps_(response_timestamps),
+        response_outputs_(response_outputs), sequence_end_(sequence_end),
+        delayed_(delayed), sequence_id_(sequence_id),
         has_null_last_response_(has_null_last_response)
   {
   }
 
   // The timestamp of when the request was started.
   std::chrono::time_point<std::chrono::system_clock> start_time_;
-  // Collection of response times
+  // Collection of response timestamps
   std::vector<std::chrono::time_point<std::chrono::system_clock>>
-      response_times_;
+      response_timestamps_;
+  // Collection of response outputs
+  std::vector<std::string> response_outputs_;
   // Whether or not the request is at the end of a sequence.
   bool sequence_end_;
   // Whether or not the request is delayed as per schedule.
diff --git a/src/c++/perf_analyzer/test_inference_profiler.cc b/src/c++/perf_analyzer/test_inference_profiler.cc
index 5a41319bd..1c07c1906 100644
--- a/src/c++/perf_analyzer/test_inference_profiler.cc
+++ b/src/c++/perf_analyzer/test_inference_profiler.cc
@@ -1,4 +1,4 @@
-// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -178,40 +178,40 @@ TEST_CASE("testing the ValidLatencyMeasurement function")
       // in the vector of requests, but if it is, we exclude it: not included in
       // current window
       RequestRecord(
-          time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0,
+          time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, {}, 0,
           false, 0, false),
       // request starts before window starts and ends inside window: included in
       // current window
       RequestRecord(
-          time_point(ns(3)), std::vector<time_point>{time_point(ns(5))}, 0,
+          time_point(ns(3)), std::vector<time_point>{time_point(ns(5))}, {}, 0,
           false, 0, false),
       // requests start and end inside window: included in current window
       RequestRecord(
-          time_point(ns(6)), std::vector<time_point>{time_point(ns(9))}, 0,
+          time_point(ns(6)), std::vector<time_point>{time_point(ns(9))}, {}, 0,
           false, 0, false),
       RequestRecord(
-          time_point(ns(10)), std::vector<time_point>{time_point(ns(14))}, 0,
-          false, 0, false),
+          time_point(ns(10)), std::vector<time_point>{time_point(ns(14))}, {},
+          0, false, 0, false),
       // request starts before window ends and ends after window ends: not
       // included in current window
       RequestRecord(
-          time_point(ns(15)), std::vector<time_point>{time_point(ns(20))}, 0,
-          false, 0, false),
+          time_point(ns(15)), std::vector<time_point>{time_point(ns(20))}, {},
+          0, false, 0, false),
       // request starts after window ends: not included in current window
       RequestRecord(
-          time_point(ns(21)), std::vector<time_point>{time_point(ns(27))}, 0,
-          false, 0, false)};
+          time_point(ns(21)), std::vector<time_point>{time_point(ns(27))}, {},
+          0, false, 0, false)};
 
   TestInferenceProfiler::ValidLatencyMeasurement(
       window, valid_sequence_count, delayed_request_count, &latencies,
       response_count, valid_requests, all_request_records);
 
   const auto& convert_request_record_to_latency{[](RequestRecord t) {
-    return CHRONO_TO_NANOS(t.response_times_.back()) -
+    return CHRONO_TO_NANOS(t.response_timestamps_.back()) -
            CHRONO_TO_NANOS(t.start_time_);
   }};
@@ -882,7 +882,7 @@ TEST_CASE(
           request1_timestamp,
           std::vector<std::chrono::time_point<std::chrono::system_clock>>{
               response1_timestamp, response2_timestamp},
-          0, false, 0, false)};
+          {}, 0, false, 0, false)};
 
   auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
   RequestRecord request_record2{};
@@ -897,7 +897,7 @@ TEST_CASE(
         request2_timestamp,
         std::vector<std::chrono::time_point<std::chrono::system_clock>>{
             response3_timestamp, response4_timestamp, response5_timestamp},
-        0, false, 0, false);
+        {}, 0, false, 0, false);
     expected_response_count = 5;
   }
   SUBCASE("second request has two data responses and one null response")
   {
     request_record2 = RequestRecord(
         request2_timestamp,
         std::vector<std::chrono::time_point<std::chrono::system_clock>>{
             response3_timestamp, response4_timestamp, response5_timestamp},
-        0, false, 0, true);
+        {}, 0, false, 0, true);
     expected_response_count = 4;
   }
   SUBCASE("second request has one null response")
   {
     request_record2 = RequestRecord(
         request2_timestamp,
-        std::vector<std::chrono::time_point<std::chrono::system_clock>>{}, 0,
-        false, 0, true);
+        std::vector<std::chrono::time_point<std::chrono::system_clock>>{}, {},
+        0, false, 0, true);
     expected_response_count = 2;
   }
@@ -948,7 +948,7 @@ TEST_CASE(
           request1_timestamp,
           std::vector<std::chrono::time_point<std::chrono::system_clock>>{
              response1_timestamp},
-          0, false, 0, false)};
+          {}, 0, false, 0, false)};
 
   auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(3)};
   auto response2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
   RequestRecord request_record2{
       request2_timestamp,
       std::vector<std::chrono::time_point<std::chrono::system_clock>>{
           response2_timestamp},
-      0, false, 0, false)};
+      {}, 0, false, 0, false)};
 
   auto request3_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
   auto response3_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
@@ -964,7 +964,7 @@ TEST_CASE(
           request3_timestamp,
           std::vector<std::chrono::time_point<std::chrono::system_clock>>{
               response3_timestamp},
-          0, false, 0, false)};
+          {}, 0, false, 0, false)};
 
   mock_inference_profiler.all_request_records_ = {
       request_record1, request_record2, request_record3};
diff --git a/src/c++/perf_analyzer/test_load_manager.cc b/src/c++/perf_analyzer/test_load_manager.cc
index c057516f0..acd8f9592 100644
--- a/src/c++/perf_analyzer/test_load_manager.cc
+++ b/src/c++/perf_analyzer/test_load_manager.cc
@@ -1,4 +1,4 @@
-// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -39,11 +39,13 @@ bool
 operator==(const RequestRecord& lhs, const RequestRecord& rhs)
 {
   return std::tie(
-             lhs.start_time_, lhs.response_times_, lhs.sequence_end_,
-             lhs.delayed_, lhs.sequence_id_, lhs.has_null_last_response_) ==
+             lhs.start_time_, lhs.response_timestamps_, lhs.response_outputs_,
+             lhs.sequence_end_, lhs.delayed_, lhs.sequence_id_,
+             lhs.has_null_last_response_) ==
          std::tie(
-             rhs.start_time_, rhs.response_times_, rhs.sequence_end_,
-             rhs.delayed_, rhs.sequence_id_, rhs.has_null_last_response_);
+             rhs.start_time_, rhs.response_timestamps_, rhs.response_outputs_,
+             rhs.sequence_end_, rhs.delayed_, rhs.sequence_id_,
+             rhs.has_null_last_response_);
 }
 }  // namespace
 
@@ -134,14 +136,14 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
     using ns = std::chrono::nanoseconds;
     auto request_record1 = RequestRecord(
-        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0, false,
-        0, false);
+        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))},
+        {"my_output"}, 0, false, 0, false);
     auto request_record2 = RequestRecord(
-        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0, false,
-        0, false);
+        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))},
+        {"my_output"}, 0, false, 0, false);
     auto request_record3 = RequestRecord(
-        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0, false,
-        0, false);
+        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))},
+        {"my_output"}, 0, false, 0, false);
 
     std::vector<RequestRecord> source_request_records;
@@ -295,14 +297,14 @@ class TestLoadManager : public TestLoadManagerBase, public LoadManager {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
     using ns = std::chrono::nanoseconds;
     auto request_record1 = RequestRecord(
-        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, 0, false,
-        0, false);
+        time_point(ns(1)), std::vector<time_point>{time_point(ns(2))}, {}, 0,
+        false, 0, false);
     auto request_record2 = RequestRecord(
-        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, 0, false,
-        0, false);
+        time_point(ns(3)), std::vector<time_point>{time_point(ns(4))}, {}, 0,
+        false, 0, false);
     auto request_record3 = RequestRecord(
-        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, 0, false,
-        0, false);
+        time_point(ns(5)), std::vector<time_point>{time_point(ns(6))}, {}, 0,
+        false, 0, false);
 
     SUBCASE("No threads")
     {
diff --git a/src/c++/perf_analyzer/test_profile_data_collector.cc b/src/c++/perf_analyzer/test_profile_data_collector.cc
index 1428b6309..775a11b2f 100644
--- a/src/c++/perf_analyzer/test_profile_data_collector.cc
+++ b/src/c++/perf_analyzer/test_profile_data_collector.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -60,28 +60,34 @@ TEST_CASE("profile_data_collector: AddData")
   auto clock_epoch{std::chrono::time_point<std::chrono::system_clock>()};
 
   uint64_t sequence_id1{123};
-  auto request_timestamp1{clock_epoch + std::chrono::nanoseconds(1)};
-  auto response_timestamp1{clock_epoch + std::chrono::nanoseconds(2)};
-  auto response_timestamp2{clock_epoch + std::chrono::nanoseconds(3)};
+  auto request1_timestamp{clock_epoch + std::chrono::nanoseconds(1)};
+  auto request1_response1_timestamp{clock_epoch + std::chrono::nanoseconds(2)};
+  auto request1_response2_timestamp{clock_epoch + std::chrono::nanoseconds(3)};
+  auto request1_response1_output{"request1_response1_output"};
+  auto request1_response2_output{"request1_response2_output"};
 
   RequestRecord request_record1{
-      request_timestamp1,
+      request1_timestamp,
       std::vector<std::chrono::time_point<std::chrono::system_clock>>{
-          response_timestamp1, response_timestamp2},
+          request1_response1_timestamp, request1_response2_timestamp},
+      {request1_response1_output, request1_response2_output},
       0,
       false,
       sequence_id1,
       false};
 
   uint64_t sequence_id2{456};
-  auto request_timestamp2{clock_epoch + std::chrono::nanoseconds(4)};
-  auto response_timestamp3{clock_epoch + std::chrono::nanoseconds(5)};
-  auto response_timestamp4{clock_epoch + std::chrono::nanoseconds(6)};
+  auto request2_timestamp{clock_epoch + std::chrono::nanoseconds(4)};
+  auto request2_response1_timestamp{clock_epoch + std::chrono::nanoseconds(5)};
+  auto request2_response2_timestamp{clock_epoch + std::chrono::nanoseconds(6)};
+  auto request2_response1_output{"request2_response1_output"};
+  auto request2_response2_output{"request2_response2_output"};
 
   RequestRecord request_record2{
-      request_timestamp2,
+      request2_timestamp,
       std::vector<std::chrono::time_point<std::chrono::system_clock>>{
-          response_timestamp3, response_timestamp4},
+          request2_response1_timestamp, request2_response2_timestamp},
+      {request2_response1_output, request2_response2_output},
       0,
       false,
       sequence_id2,
       false};
@@ -94,13 +100,17 @@ TEST_CASE("profile_data_collector: AddData")
 
   std::vector<RequestRecord> rr{collector.experiments_[0].requests};
   CHECK(rr[0].sequence_id_ == sequence_id1);
-  CHECK(rr[0].start_time_ == request_timestamp1);
-  CHECK(rr[0].response_times_[0] == response_timestamp1);
-  CHECK(rr[0].response_times_[1] == response_timestamp2);
+  CHECK(rr[0].start_time_ == request1_timestamp);
+  CHECK(rr[0].response_timestamps_[0] == request1_response1_timestamp);
+  CHECK(rr[0].response_timestamps_[1] == request1_response2_timestamp);
+  CHECK(rr[0].response_outputs_[0] == request1_response1_output);
+  CHECK(rr[0].response_outputs_[1] == request1_response2_output);
   CHECK(rr[1].sequence_id_ == sequence_id2);
-  CHECK(rr[1].start_time_ == request_timestamp2);
-  CHECK(rr[1].response_times_[0] == response_timestamp3);
-  CHECK(rr[1].response_times_[1] == response_timestamp4);
+  CHECK(rr[1].start_time_ == request2_timestamp);
+  CHECK(rr[1].response_timestamps_[0] == request2_response1_timestamp);
+  CHECK(rr[1].response_timestamps_[1] == request2_response2_timestamp);
+  CHECK(rr[1].response_outputs_[0] == request2_response1_output);
+  CHECK(rr[1].response_outputs_[1] == request2_response2_output);
 }
 
 TEST_CASE("profile_data_collector: AddWindow")
diff --git a/src/c++/perf_analyzer/test_profile_data_exporter.cc b/src/c++/perf_analyzer/test_profile_data_exporter.cc
index 669d8ee3f..08a89b5a2 100644
--- a/src/c++/perf_analyzer/test_profile_data_exporter.cc
+++ b/src/c++/perf_analyzer/test_profile_data_exporter.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -41,11 +41,14 @@ TEST_CASE("profile_data_exporter: ConvertToJson")
   auto request_timestamp{clock_epoch + std::chrono::nanoseconds(1)};
   auto response_timestamp1{clock_epoch + std::chrono::nanoseconds(2)};
   auto response_timestamp2{clock_epoch + std::chrono::nanoseconds(3)};
+  auto response_output1{"response_output1"};
+  auto response_output2{"response_output2"};
 
   RequestRecord request_record{
       request_timestamp,
       std::vector<std::chrono::time_point<std::chrono::system_clock>>{
           response_timestamp1, response_timestamp2},
+      {response_output1, response_output2},
       0,
       false,
       sequence_id,
@@ -75,7 +78,8 @@ TEST_CASE("profile_data_exporter: ConvertToJson")
             {
               "timestamp" : 1,
               "sequence_id" : 1,
-              "response_timestamps" : [ 2, 3 ]
+              "response_timestamps" : [ 2, 3 ],
+              "response_outputs" : [ "response_output1", "response_output2" ]
             }
           ],
           "window_boundaries" : [ 1, 5, 6 ]
@@ -116,6 +120,12 @@ TEST_CASE("profile_data_exporter: ConvertToJson")
   CHECK(
       actual_request["response_timestamps"][1] ==
       expected_request["response_timestamps"][1]);
+  CHECK(
+      actual_request["response_outputs"][0] ==
+      expected_request["response_outputs"][0]);
+  CHECK(
+      actual_request["response_outputs"][1] ==
+      expected_request["response_outputs"][1]);
 
   CHECK(actual_windows[0] == expected_windows[0]);
   CHECK(actual_windows[1] == expected_windows[1]);
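
Illustration for reviewers (not part of the patch itself): after this change, each request entry in the exported profile JSON gains a "response_outputs" array parallel to "response_timestamps". The shape below is lifted directly from the updated test_profile_data_exporter.cc expectation above; the values are that test's dummy data:

    {
      "timestamp" : 1,
      "sequence_id" : 1,
      "response_timestamps" : [ 2, 3 ],
      "response_outputs" : [ "response_output1", "response_output2" ]
    }

A minimal caller sketch for the new API, assuming `result` points to a completed cb::InferResult (the surrounding setup is hypothetical and shown only to indicate intended use):

    std::string output;
    cb::Error err = result->Output(output);
    if (err.IsOk()) {
      // `output` stays empty unless the model exposes an output tensor named
      // 'text_output', the LLM naming convention this patch currently assumes.
      std::cout << output << std::endl;
    }

Design note: the gRPC and HTTP implementations of Output() deliberately return Error::Success with an empty string when no 'text_output' tensor exists, so non-LLM models still export one (empty) "response_outputs" entry per response instead of failing the run.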