Add response outputs to profile export
matthewkotila committed Feb 28, 2024
1 parent 820539b commit 8601412
Showing 15 changed files with 212 additions and 99 deletions.
6 changes: 5 additions & 1 deletion src/c++/library/common.h
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -530,6 +530,10 @@ class InferResult {
/// \return Error object indicating the success or failure.
virtual Error IsNullResponse(bool* is_null_response) const = 0;

/// Get output for this response.
/// \return Error object indicating the success or failure.
virtual Error Output(std::string& output) const = 0;

/// Get the result data as a vector of strings. The vector will
/// receive a copy of result data. An error will be generated if
/// the datatype of output is not 'BYTES'.
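For context, here is a minimal usage sketch, not part of this commit, showing how client code might call the new accessor once a response has been received. It assumes the gRPC client headers and a model whose output tensor is named "text_output", which is the only name Output() currently recognizes; PrintTextOutput() is a hypothetical helper.

// Sketch only: PrintTextOutput() is illustrative and not part of the library.
#include <iostream>
#include <string>

#include "grpc_client.h"

namespace tc = triton::client;

void
PrintTextOutput(tc::InferResult* result)
{
  std::string output;
  tc::Error err = result->Output(output);
  if (!err.IsOk()) {
    std::cerr << "Failed to read output: " << err.Message() << std::endl;
    return;
  }
  // 'output' stays empty when the response has no "text_output" tensor.
  std::cout << output << std::endl;
}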
22 changes: 21 additions & 1 deletion src/c++/library/grpc_client.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -206,6 +206,7 @@ class InferResultGrpc : public InferResult {
size_t* byte_size) const override;
Error IsFinalResponse(bool* is_final_response) const override;
Error IsNullResponse(bool* is_null_response) const override;
Error Output(std::string& output) const override;
Error StringData(
const std::string& output_name,
std::vector<std::string>* string_result) const override;
@@ -349,6 +350,25 @@ InferResultGrpc::IsNullResponse(bool* is_null_response) const
return Error::Success;
}

Error
InferResultGrpc::Output(std::string& output) const
{
// Only supports LLM outputs with name 'text_output' currently
if (output_name_to_buffer_map_.find("text_output") ==
output_name_to_buffer_map_.end()) {
return Error::Success;
}

const uint8_t* buf{nullptr};
size_t byte_size{0};
Error e{RawData("text_output", &buf, &byte_size)};
if (e.IsOk() == false) {
return e;
}
output.assign(reinterpret_cast<const char*>(buf), byte_size);
return Error::Success;
}

Error
InferResultGrpc::StringData(
const std::string& output_name,
22 changes: 21 additions & 1 deletion src/c++/library/http_client.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -758,6 +758,7 @@ class InferResultHttp : public InferResult {
size_t* byte_size) const override;
Error IsFinalResponse(bool* is_final_response) const override;
Error IsNullResponse(bool* is_null_response) const override;
Error Output(std::string& output) const override;
Error StringData(
const std::string& output_name,
std::vector<std::string>* string_result) const override;
@@ -980,6 +981,25 @@ InferResultHttp::IsNullResponse(bool* is_null_response) const
return Error::Success;
}

Error
InferResultHttp::Output(std::string& output) const
{
// Only supports LLM outputs with name 'text_output' currently
if (output_name_to_buffer_map_.find("text_output") ==
output_name_to_buffer_map_.end()) {
return Error::Success;
}

const uint8_t* buf{nullptr};
size_t byte_size{0};
Error e{RawData("text_output", &buf, &byte_size)};
if (e.IsOk() == false) {
return e;
}
output.assign(reinterpret_cast<const char*>(buf), byte_size);
return Error::Success;
}

Error
InferResultHttp::StringData(
const std::string& output_name,
9 changes: 8 additions & 1 deletion src/c++/perf_analyzer/client_backend/client_backend.h
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -642,6 +642,13 @@ class InferResult {
{
return Error("InferResult::IsNullResponse() not implemented");
};

/// Get output for this response.
/// \return Error object indicating the success or failure.
virtual Error Output(std::string& output) const
{
return Error("InferResult::Output() not implemented");
}
};

}}} // namespace triton::perfanalyzer::clientbackend
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -842,6 +842,13 @@ TritonInferResult::IsNullResponse(bool* is_null_response) const
return Error::Success;
}

Error
TritonInferResult::Output(std::string& output) const
{
RETURN_IF_TRITON_ERROR(result_->Output(output));
return Error::Success;
}

//==============================================================================

}}}} // namespace triton::perfanalyzer::clientbackend::tritonremote
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -333,6 +333,8 @@ class TritonInferResult : public InferResult {
Error IsFinalResponse(bool* is_final_response) const override;
/// See InferResult::IsNullResponse()
Error IsNullResponse(bool* is_null_response) const override;
/// See InferResult::Output()
Error Output(std::string& output) const override;

private:
std::unique_ptr<tc::InferResult> result_;
19 changes: 13 additions & 6 deletions src/c++/perf_analyzer/infer_context.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -149,8 +149,10 @@ InferContext::SendRequest(
&results, *(infer_data_.options_), infer_data_.valid_inputs_,
infer_data_.outputs_);
thread_stat_->idle_timer.Stop();
std::string output{""};
if (results != nullptr) {
if (thread_stat_->status_.IsOk()) {
results->Output(output);
thread_stat_->status_ = ValidateOutputs(results);
}
delete results;
@@ -167,7 +169,7 @@ InferContext::SendRequest(
std::lock_guard<std::mutex> lock(thread_stat_->mu_);
auto total = end_time_sync - start_time_sync;
thread_stat_->request_records_.emplace_back(RequestRecord(
start_time_sync, std::move(end_time_syncs),
start_time_sync, std::move(end_time_syncs), {output},
infer_data_.options_->sequence_end_, delayed, sequence_id, false));
thread_stat_->status_ =
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
@@ -258,7 +260,11 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
if (thread_stat_->cb_status_.IsOk() == false) {
return;
}
it->second.response_times_.push_back(std::chrono::system_clock::now());
it->second.response_timestamps_.push_back(
std::chrono::system_clock::now());
std::string response_output{""};
result->Output(response_output);
it->second.response_outputs_.push_back(response_output);
num_responses_++;
if (is_null_response == true) {
it->second.has_null_last_response_ = true;
@@ -271,9 +277,10 @@ InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
if (is_final_response) {
has_received_final_response_ = is_final_response;
thread_stat_->request_records_.emplace_back(
it->second.start_time_, it->second.response_times_,
it->second.sequence_end_, it->second.delayed_,
it->second.sequence_id_, it->second.has_null_last_response_);
it->second.start_time_, it->second.response_timestamps_,
it->second.response_outputs_, it->second.sequence_end_,
it->second.delayed_, it->second.sequence_id_,
it->second.has_null_last_response_);
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
thread_stat_->cb_status_ = ValidateOutputs(result);
async_req_map_.erase(request_id);
13 changes: 7 additions & 6 deletions src/c++/perf_analyzer/inference_profiler.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -1282,11 +1282,12 @@ InferenceProfiler::ValidLatencyMeasurement(
uint64_t request_end_ns;

if (request_record.has_null_last_response_ == false) {
request_end_ns = CHRONO_TO_NANOS(request_record.response_times_.back());
} else if (request_record.response_times_.size() > 1) {
size_t last_response_idx{request_record.response_times_.size() - 2};
request_end_ns =
CHRONO_TO_NANOS(request_record.response_times_[last_response_idx]);
CHRONO_TO_NANOS(request_record.response_timestamps_.back());
} else if (request_record.response_timestamps_.size() > 1) {
size_t last_response_idx{request_record.response_timestamps_.size() - 2};
request_end_ns = CHRONO_TO_NANOS(
request_record.response_timestamps_[last_response_idx]);
} else {
erase_indices.push_back(i);
continue;
@@ -1297,7 +1298,7 @@
if ((request_end_ns >= valid_range.first) &&
(request_end_ns <= valid_range.second)) {
valid_latencies->push_back(request_end_ns - request_start_ns);
response_count += request_record.response_times_.size();
response_count += request_record.response_timestamps_.size();
if (request_record.has_null_last_response_) {
response_count--;
}
40 changes: 29 additions & 11 deletions src/c++/perf_analyzer/profile_data_exporter.cc
@@ -1,4 +1,4 @@
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -122,25 +122,43 @@ ProfileDataExporter::AddRequests(
request.AddMember("sequence_id", sequence_id, document_.GetAllocator());
}

rapidjson::Value responses(rapidjson::kArrayType);
AddResponses(responses, raw_request.response_times_);
rapidjson::Value response_timestamps(rapidjson::kArrayType);
AddResponseTimestamps(
response_timestamps, raw_request.response_timestamps_);
request.AddMember(
"response_timestamps", responses, document_.GetAllocator());
"response_timestamps", response_timestamps, document_.GetAllocator());

rapidjson::Value response_outputs(rapidjson::kArrayType);
AddResponseOutputs(response_outputs, raw_request.response_outputs_);
request.AddMember(
"response_outputs", response_outputs, document_.GetAllocator());

requests.PushBack(request, document_.GetAllocator());
}
entry.AddMember("requests", requests, document_.GetAllocator());
}

void
ProfileDataExporter::AddResponses(
rapidjson::Value& responses,
ProfileDataExporter::AddResponseTimestamps(
rapidjson::Value& timestamps_json,
const std::vector<std::chrono::time_point<std::chrono::system_clock>>&
response_times)
timestamps)
{
for (auto& timestamp : timestamps) {
rapidjson::Value timestamp_json;
timestamp_json.SetUint64(timestamp.time_since_epoch().count());
timestamps_json.PushBack(timestamp_json, document_.GetAllocator());
}
}

void
ProfileDataExporter::AddResponseOutputs(
rapidjson::Value& outputs_json, const std::vector<std::string>& outputs)
{
for (auto& response : response_times) {
rapidjson::Value time;
time.SetUint64(response.time_since_epoch().count());
responses.PushBack(time, document_.GetAllocator());
for (auto& output : outputs) {
rapidjson::Value output_json;
output_json.SetString(output.c_str(), output.size());
outputs_json.PushBack(output_json, document_.GetAllocator());
}
}

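As a point of reference, a minimal sketch, not from this commit, of reading the two per-request arrays back out of an exported profile with rapidjson. The top-level "experiments" and "requests" layout is assumed from the surrounding exporter code, and ParseProfile() is a hypothetical helper; timestamps are system_clock ticks since the epoch (nanoseconds on most platforms) and outputs are the captured "text_output" strings.

// Sketch only: assumes the exported JSON has the shape
// { "experiments": [ { "requests": [ { "response_timestamps": [...],
//   "response_outputs": [...] }, ... ] }, ... ], ... }.
#include <iostream>
#include <string>

#include <rapidjson/document.h>

void
ParseProfile(const std::string& json_text)
{
  rapidjson::Document doc;
  doc.Parse(json_text.c_str());
  for (const auto& experiment : doc["experiments"].GetArray()) {
    for (const auto& request : experiment["requests"].GetArray()) {
      const auto& timestamps = request["response_timestamps"].GetArray();
      const auto& outputs = request["response_outputs"].GetArray();
      // The two arrays are written pairwise, one entry per response.
      for (rapidjson::SizeType i = 0; i < outputs.Size(); i++) {
        std::cout << timestamps[i].GetUint64() << ": "
                  << outputs[i].GetString() << std::endl;
      }
    }
  }
}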
10 changes: 6 additions & 4 deletions src/c++/perf_analyzer/profile_data_exporter.h
@@ -1,4 +1,4 @@
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -69,10 +69,12 @@ class ProfileDataExporter {
void AddRequests(
rapidjson::Value& entry, rapidjson::Value& requests,
const Experiment& raw_experiment);
void AddResponses(
rapidjson::Value& responses,
void AddResponseTimestamps(
rapidjson::Value& timestamps_json,
const std::vector<std::chrono::time_point<std::chrono::system_clock>>&
response_times);
timestamps);
void AddResponseOutputs(
rapidjson::Value& outputs_json, const std::vector<std::string>& outputs);
void AddWindowBoundaries(
rapidjson::Value& entry, rapidjson::Value& window_boundaries,
const Experiment& raw_experiment);
25 changes: 14 additions & 11 deletions src/c++/perf_analyzer/request_record.h
@@ -1,4 +1,4 @@
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -34,24 +34,27 @@ namespace triton { namespace perfanalyzer {

/// A record of an individual request
struct RequestRecord {
RequestRecord() = default;
RequestRecord(
std::chrono::time_point<std::chrono::system_clock> start_time,
std::chrono::time_point<std::chrono::system_clock> start_time =
std::chrono::time_point<std::chrono::system_clock>(),
std::vector<std::chrono::time_point<std::chrono::system_clock>>
response_times,
bool sequence_end, bool delayed, uint64_t sequence_id,
bool has_null_last_response)
: start_time_(start_time), response_times_(response_times),
sequence_end_(sequence_end), delayed_(delayed),
sequence_id_(sequence_id),
response_timestamps = {},
std::vector<std::string> response_outputs = {}, bool sequence_end = true,
bool delayed = false, uint64_t sequence_id = 0,
bool has_null_last_response = false)
: start_time_(start_time), response_timestamps_(response_timestamps),
response_outputs_(response_outputs), sequence_end_(sequence_end),
delayed_(delayed), sequence_id_(sequence_id),
has_null_last_response_(has_null_last_response)
{
}
// The timestamp of when the request was started.
std::chrono::time_point<std::chrono::system_clock> start_time_;
// Collection of response times
// Collection of response timestamps
std::vector<std::chrono::time_point<std::chrono::system_clock>>
response_times_;
response_timestamps_;
// Collection of response outputs
std::vector<std::string> response_outputs_;
// Whether or not the request is at the end of a sequence.
bool sequence_end_;
// Whether or not the request is delayed as per schedule.
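For reference, a hedged sketch, not from this commit, of constructing a RequestRecord with the new response_outputs_ member populated; MakeExampleRecord() and the literal values are illustrative only. Because every constructor parameter now has a default, callers such as tests can supply only the leading fields they care about.

// Sketch only: values are illustrative.
#include <chrono>
#include <string>
#include <vector>

#include "request_record.h"

triton::perfanalyzer::RequestRecord
MakeExampleRecord()
{
  using std::chrono::system_clock;
  auto start{system_clock::now()};
  std::vector<std::chrono::time_point<system_clock>> response_timestamps{
      system_clock::now(), system_clock::now()};
  std::vector<std::string> response_outputs{"Hello", " world!"};
  return triton::perfanalyzer::RequestRecord(
      start, response_timestamps, response_outputs,
      /*sequence_end=*/true, /*delayed=*/false,
      /*sequence_id=*/0, /*has_null_last_response=*/false);
}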