Commit 233df6f

[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent 18016a5 commit 233df6f

7 files changed: +66 -27 lines changed

tests/entrypoints/openai/test_metrics.py

Lines changed: 1 addition & 0 deletions
@@ -205,6 +205,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:request_success_total",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",
     "vllm:request_prompt_tokens_count",

vllm/v1/engine/__init__.py

Lines changed: 19 additions & 2 deletions
@@ -15,6 +15,23 @@
 from vllm.sampling_params import SamplingParams
 
 
+class RequestFinishedReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, or abort.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+
+    def __str__(self):
+        return self.name.lower()
+
+
 @dataclass
 class EngineCoreRequest:
 
@@ -45,7 +62,7 @@ class EngineCoreOutput(
     request_id: str
     new_token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -56,7 +73,7 @@ class EngineCoreOutputs(
         gc=False):  # type: ignore[call-arg]
 
     #NOTE(Nick): We could consider ways to make this more compact,
-    # e.g. columnwise layout and using an int enum for finish/stop reason
+    # e.g. columnwise layout
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
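
A short sketch of how the new enum behaves: the IntEnum values keep the serialized payload compact, while __str__ yields the lowercase names used for the OpenAI-style finish reason and the metric label.

# Sketch: the enum stringifies to the lowercase reason names.
from vllm.v1.engine import RequestFinishedReason

assert str(RequestFinishedReason.STOP) == "stop"
assert str(RequestFinishedReason.LENGTH) == "length"
assert str(RequestFinishedReason.ABORT) == "abort"
# As an IntEnum, each member serializes as a small integer (STOP == 0).
assert int(RequestFinishedReason.STOP) == 0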

vllm/v1/engine/detokenizer.py

Lines changed: 5 additions & 4 deletions
@@ -8,7 +8,8 @@
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
+                            RequestFinishedReason)
 
 logger = init_logger(__name__)
 
@@ -18,7 +19,7 @@ class DetokenizerOutput:
     output_text: str
     token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -147,13 +148,13 @@ def update_from_output(
                 stop_str, truncate_to = stop
                 if truncate_to != -1:
                     self.output_text = self.output_text[:truncate_to]
-                finish_reason = "stop"  # TODO: use constant
+                finish_reason = RequestFinishedReason.STOP
                 stop_reason = stop_str
 
         # TODO: handle stop_token_ids here too?
 
         # 3) Update the RequestOutput object with the new text.
-        finished = bool(finish_reason)
+        finished = finish_reason is not None
         if self.output_kind == RequestOutputKind.FINAL_ONLY \
             and not finished:
             return None
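
The switch from bool(finish_reason) to an explicit None check matters because STOP has the value 0: the IntEnum member is falsy, so the old truthiness test would have treated a stop-finished request as unfinished. A minimal illustration:

# Why "is not None" instead of bool(): STOP's underlying int is 0.
from vllm.v1.engine import RequestFinishedReason

finish_reason = RequestFinishedReason.STOP

assert bool(finish_reason) is False         # old check: looks unfinished
assert (finish_reason is not None) is True  # new check: correctly finished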

vllm/v1/engine/output_processor.py

Lines changed: 12 additions & 10 deletions
@@ -161,8 +161,10 @@ def process_outputs(
                     engine_core_output)
 
             # 3) Create and handle RequestOutput objects.
-            if request_output := self._make_request_output(
-                    req_state, detokenizer_output):
+            if detokenizer_output is not None:
+                request_output = self._make_request_output(
+                    req_state, detokenizer_output)
+
                 if req_state.queue is not None:
                     # AsyncLLM: put into queue for handling by generate().
                     req_state.queue.put_nowait(request_output)
@@ -172,6 +174,8 @@ def process_outputs(
 
                 # Free completed requests.
                 if request_output.finished:
+                    assert detokenizer_output.finish_reason is not None
+
                     self.request_states.pop(req_id)
                     if not engine_core_output.finished:
                         # If req not finished in EngineCore, but Detokenizer
@@ -180,7 +184,8 @@ def process_outputs(
 
                     # Track per-request stats
                     iteration_stats.update_from_finished_request(
-                        request_output, req_state.stats)
+                        detokenizer_output.finish_reason, request_output,
+                        req_state.stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
@@ -191,12 +196,8 @@ def process_outputs(
     @staticmethod
     def _make_request_output(
         request_state: RequestState,
-        detokenizer_output: Optional[DetokenizerOutput],
-    ) -> Optional[RequestOutput]:
-
-        if detokenizer_output is None:
-            return None
-
+        detokenizer_output: DetokenizerOutput,
+    ) -> RequestOutput:
         request_output = RequestOutput.new(
             request_state.request_id,
             request_state.prompt,
@@ -207,7 +208,8 @@ def _make_request_output(
         )
         if detokenizer_output.finished:
             completion_output = request_output.outputs[0]
-            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.finish_reason = str(
+                detokenizer_output.finish_reason)
             completion_output.stop_reason = detokenizer_output.stop_reason
 
         return request_output

vllm/v1/metrics/loggers.py

Lines changed: 14 additions & 1 deletion
@@ -2,13 +2,14 @@
 
 import time
 from abc import ABC, abstractmethod
-from typing import List
+from typing import Dict, List
 
 import numpy as np
 import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.engine import RequestFinishedReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -116,6 +117,17 @@ def __init__(self, model_config: ModelConfig):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_request_success: Dict[RequestFinishedReason,
+                                           prometheus_client.Counter] = {}
+        counter_request_success_base = prometheus_client.Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + ["finished_reason"])
+        for reason in RequestFinishedReason:
+            self.counter_request_success[
+                reason] = counter_request_success_base.labels(*(labelvalues +
+                                                                [str(reason)]))
+
         self.histogram_num_prompt_tokens_request = \
             prometheus_client.Histogram(
                 name="vllm:request_prompt_tokens",
@@ -163,6 +175,7 @@ def log(self, scheduler_stats: SchedulerStats,
             iteration_stats.num_generation_tokens)
 
         for finished_request in iteration_stats.finished_requests:
+            self.counter_request_success[finished_request.finish_reason].inc()
            self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
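
The labelled counters are created eagerly, one child per finish reason, so every finished_reason series is exported (at zero) from the first scrape. Below is a standalone sketch of the same prometheus_client pattern, using an illustrative metric name and model label rather than vLLM's own wiring:

# Standalone sketch of the labelled-counter pattern above.
# The metric name and label values here are illustrative, not vLLM's.
import prometheus_client

labelnames = ["model_name"]
labelvalues = ["example-model"]

base = prometheus_client.Counter(
    name="demo_request_success_total",
    documentation="Count of successfully processed requests.",
    labelnames=labelnames + ["finished_reason"])

# Pre-create one child per reason so every series shows up at zero.
counters = {
    reason: base.labels(*(labelvalues + [reason]))
    for reason in ("stop", "length", "abort")
}

counters["stop"].inc()  # a request finished on a stop string

print(prometheus_client.generate_latest().decode())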

vllm/v1/metrics/stats.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,7 @@
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput
+    from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
 
 
 @dataclass
@@ -32,6 +32,7 @@ class RequestStateStats:
 class FinishedRequestStats:
     """Stats associated with a finished request."""
 
+    finish_reason: "RequestFinishedReason"
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
 
@@ -73,8 +74,11 @@ def update_from_output(self, output: "EngineCoreOutput",
         request_state_stats.num_generation_tokens += num_new_generation_tokens
         request_state_stats.last_token_time = now
 
-    def update_from_finished_request(self, request_output: "RequestOutput",
+    def update_from_finished_request(self,
+                                     finish_reason: "RequestFinishedReason",
+                                     request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
         self.finished_requests.append(
-            FinishedRequestStats(len(request_output.prompt_token_ids),
+            FinishedRequestStats(finish_reason,
+                                 len(request_output.prompt_token_ids),
                                  request_state_stats.num_generation_tokens))

vllm/v1/request.py

Lines changed: 8 additions & 7 deletions
@@ -6,7 +6,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
@@ -109,7 +109,7 @@ def num_output_tokens(self) -> int:
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[str, None]:
+    def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
@@ -149,7 +149,8 @@ def is_finished(status: "RequestStatus") -> bool:
         return status > RequestStatus.PREEMPTED
 
     @staticmethod
-    def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
+    def get_finished_reason(
+            status: "RequestStatus") -> Union[RequestFinishedReason, None]:
         return _FINISHED_REASON_MAP.get(status)
 
 
@@ -158,8 +159,8 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
 # are longer than the model's length cap. Therefore, the stop
 # reason should also be "length" as in OpenAI API.
 _FINISHED_REASON_MAP = {
-    RequestStatus.FINISHED_STOPPED: "stop",
-    RequestStatus.FINISHED_LENGTH_CAPPED: "length",
-    RequestStatus.FINISHED_ABORTED: "abort",
-    RequestStatus.FINISHED_IGNORED: "length",
+    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
+    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
+    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
+    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
 }
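
With the mapping returning enum members, RequestStatus.get_finished_reason() now yields a RequestFinishedReason for finished statuses and still falls back to None for everything else via dict.get(). A small sketch:

# Sketch: finished statuses map to enum members, everything else to None.
from vllm.v1.engine import RequestFinishedReason
from vllm.v1.request import RequestStatus

# FINISHED_IGNORED is reported as LENGTH, matching the OpenAI finish reason.
assert (RequestStatus.get_finished_reason(RequestStatus.FINISHED_IGNORED)
        is RequestFinishedReason.LENGTH)

# A non-finished status (e.g. PREEMPTED) still yields None via dict.get().
assert RequestStatus.get_finished_reason(RequestStatus.PREEMPTED) is None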
