Commit 233df6f

[V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
1 parent 18016a5 commit 233df6f

7 files changed: +66 -27 lines changed

tests/entrypoints/openai/test_metrics.py

Lines changed: 1 addition & 0 deletions
@@ -205,6 +205,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:gpu_cache_usage_perc",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:request_success_total",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",
     "vllm:request_prompt_tokens_count",

vllm/v1/engine/__init__.py

Lines changed: 19 additions & 2 deletions
@@ -15,6 +15,23 @@
 from vllm.sampling_params import SamplingParams
 
 
+class RequestFinishedReason(enum.IntEnum):
+    """
+    Reason a request finished - stop, length, or abort.
+
+    stop - a stop string was emitted
+    length - max_tokens was consumed, or max_model_len was reached
+    abort - aborted for another reason
+
+    """
+    STOP = 0
+    LENGTH = 1
+    ABORT = 2
+
+    def __str__(self):
+        return self.name.lower()
+
+
 @dataclass
 class EngineCoreRequest:
 
@@ -45,7 +62,7 @@ class EngineCoreOutput(
     request_id: str
     new_token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -56,7 +73,7 @@ class EngineCoreOutputs(
         gc=False):  # type: ignore[call-arg]
 
     #NOTE(Nick): We could consider ways to make this more compact,
-    # e.g. columnwise layout and using an int enum for finish/stop reason
+    # e.g. columnwise layout
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
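
A short sketch of how the new enum behaves: the IntEnum values keep the serialized payload compact, while __str__ yields the lowercase names used for the OpenAI-style finish reason and the metric label.

# Sketch: the enum stringifies to the lowercase reason names.
from vllm.v1.engine import RequestFinishedReason

assert str(RequestFinishedReason.STOP) == "stop"
assert str(RequestFinishedReason.LENGTH) == "length"
assert str(RequestFinishedReason.ABORT) == "abort"
# As an IntEnum, each member serializes as a small integer (STOP == 0).
assert int(RequestFinishedReason.STOP) == 0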

vllm/v1/engine/detokenizer.py

Lines changed: 5 additions & 4 deletions
@@ -8,7 +8,8 @@
 from vllm.sampling_params import RequestOutputKind
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
+                            RequestFinishedReason)
 
 logger = init_logger(__name__)
 
@@ -18,7 +19,7 @@ class DetokenizerOutput:
     output_text: str
     token_ids: List[int]
     finished: bool
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[RequestFinishedReason] = None
     stop_reason: Union[int, str, None] = None
 
 
@@ -147,13 +148,13 @@ def update_from_output(
                 stop_str, truncate_to = stop
                 if truncate_to != -1:
                     self.output_text = self.output_text[:truncate_to]
-                finish_reason = "stop"  # TODO: use constant
+                finish_reason = RequestFinishedReason.STOP
                 stop_reason = stop_str
 
         # TODO: handle stop_token_ids here too?
 
         # 3) Update the RequestOutput object with the new text.
-        finished = bool(finish_reason)
+        finished = finish_reason is not None
         if self.output_kind == RequestOutputKind.FINAL_ONLY \
             and not finished:
             return None
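
The switch from bool(finish_reason) to an explicit None check matters because STOP has the value 0: the IntEnum member is falsy, so the old truthiness test would have treated a stop-finished request as unfinished. A minimal illustration:

# Why "is not None" instead of bool(): STOP's underlying int is 0.
from vllm.v1.engine import RequestFinishedReason

finish_reason = RequestFinishedReason.STOP

assert bool(finish_reason) is False         # old check: looks unfinished
assert (finish_reason is not None) is True  # new check: correctly finished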

vllm/v1/engine/output_processor.py

Lines changed: 12 additions & 10 deletions
@@ -161,8 +161,10 @@ def process_outputs(
                     engine_core_output)
 
             # 3) Create and handle RequestOutput objects.
-            if request_output := self._make_request_output(
-                    req_state, detokenizer_output):
+            if detokenizer_output is not None:
+                request_output = self._make_request_output(
+                    req_state, detokenizer_output)
+
                 if req_state.queue is not None:
                     # AsyncLLM: put into queue for handling by generate().
                     req_state.queue.put_nowait(request_output)
@@ -172,6 +174,8 @@ def process_outputs(
 
                 # Free completed requests.
                 if request_output.finished:
+                    assert detokenizer_output.finish_reason is not None
+
                     self.request_states.pop(req_id)
                     if not engine_core_output.finished:
                         # If req not finished in EngineCore, but Detokenizer
@@ -180,7 +184,8 @@ def process_outputs(
 
                     # Track per-request stats
                     iteration_stats.update_from_finished_request(
-                        request_output, req_state.stats)
+                        detokenizer_output.finish_reason, request_output,
+                        req_state.stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
@@ -191,12 +196,8 @@ def process_outputs(
     @staticmethod
     def _make_request_output(
         request_state: RequestState,
-        detokenizer_output: Optional[DetokenizerOutput],
-    ) -> Optional[RequestOutput]:
-
-        if detokenizer_output is None:
-            return None
-
+        detokenizer_output: DetokenizerOutput,
+    ) -> RequestOutput:
         request_output = RequestOutput.new(
             request_state.request_id,
             request_state.prompt,
@@ -207,7 +208,8 @@ def _make_request_output(
         )
         if detokenizer_output.finished:
             completion_output = request_output.outputs[0]
-            completion_output.finish_reason = detokenizer_output.finish_reason
+            completion_output.finish_reason = str(
+                detokenizer_output.finish_reason)
             completion_output.stop_reason = detokenizer_output.stop_reason
 
         return request_output

vllm/v1/metrics/loggers.py

Lines changed: 14 additions & 1 deletion
@@ -2,13 +2,14 @@
 
 import time
 from abc import ABC, abstractmethod
-from typing import List
+from typing import Dict, List
 
 import numpy as np
 import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.engine import RequestFinishedReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
@@ -116,6 +117,17 @@ def __init__(self, model_config: ModelConfig):
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_request_success: Dict[RequestFinishedReason,
+                                           prometheus_client.Counter] = {}
+        counter_request_success_base = prometheus_client.Counter(
+            name="vllm:request_success_total",
+            documentation="Count of successfully processed requests.",
+            labelnames=labelnames + ["finished_reason"])
+        for reason in RequestFinishedReason:
+            self.counter_request_success[
+                reason] = counter_request_success_base.labels(*(labelvalues +
+                                                                [str(reason)]))
+
         self.histogram_num_prompt_tokens_request = \
             prometheus_client.Histogram(
                 name="vllm:request_prompt_tokens",
@@ -163,6 +175,7 @@ def log(self, scheduler_stats: SchedulerStats,
             iteration_stats.num_generation_tokens)
 
         for finished_request in iteration_stats.finished_requests:
+            self.counter_request_success[finished_request.finish_reason].inc()
            self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
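
The labelled counters are created eagerly, one child per finish reason, so every finished_reason series is exported (at zero) from the first scrape. Below is a standalone sketch of the same prometheus_client pattern, using an illustrative metric name and model label rather than vLLM's own wiring:

# Standalone sketch of the labelled-counter pattern above.
# The metric name and label values here are illustrative, not vLLM's.
import prometheus_client

labelnames = ["model_name"]
labelvalues = ["example-model"]

base = prometheus_client.Counter(
    name="demo_request_success_total",
    documentation="Count of successfully processed requests.",
    labelnames=labelnames + ["finished_reason"])

# Pre-create one child per reason so every series shows up at zero.
counters = {
    reason: base.labels(*(labelvalues + [reason]))
    for reason in ("stop", "length", "abort")
}

counters["stop"].inc()  # a request finished on a stop string

print(prometheus_client.generate_latest().decode())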

vllm/v1/metrics/stats.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,7 @@
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput
+    from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason
 
 
 @dataclass
@@ -32,6 +32,7 @@ class RequestStateStats:
 class FinishedRequestStats:
     """Stats associated with a finished request."""
 
+    finish_reason: "RequestFinishedReason"
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
 
@@ -73,8 +74,11 @@ def update_from_output(self, output: "EngineCoreOutput",
         request_state_stats.num_generation_tokens += num_new_generation_tokens
         request_state_stats.last_token_time = now
 
-    def update_from_finished_request(self, request_output: "RequestOutput",
+    def update_from_finished_request(self,
+                                     finish_reason: "RequestFinishedReason",
+                                     request_output: "RequestOutput",
                                      request_state_stats: RequestStateStats):
         self.finished_requests.append(
-            FinishedRequestStats(len(request_output.prompt_token_ids),
+            FinishedRequestStats(finish_reason,
+                                 len(request_output.prompt_token_ids),
                                  request_state_stats.num_generation_tokens))

vllm/v1/request.py

Lines changed: 8 additions & 7 deletions
@@ -6,7 +6,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
@@ -109,7 +109,7 @@ def num_output_tokens(self) -> int:
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[str, None]:
+    def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
@@ -149,7 +149,8 @@ def is_finished(status: "RequestStatus") -> bool:
         return status > RequestStatus.PREEMPTED
 
     @staticmethod
-    def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
+    def get_finished_reason(
+            status: "RequestStatus") -> Union[RequestFinishedReason, None]:
         return _FINISHED_REASON_MAP.get(status)
 
 
@@ -158,8 +159,8 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
 # are longer than the model's length cap. Therefore, the stop
 # reason should also be "length" as in OpenAI API.
 _FINISHED_REASON_MAP = {
-    RequestStatus.FINISHED_STOPPED: "stop",
-    RequestStatus.FINISHED_LENGTH_CAPPED: "length",
-    RequestStatus.FINISHED_ABORTED: "abort",
-    RequestStatus.FINISHED_IGNORED: "length",
+    RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
+    RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
+    RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
+    RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
 }
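
With the mapping returning enum members, RequestStatus.get_finished_reason() now yields a RequestFinishedReason for finished statuses and still falls back to None for everything else via dict.get(). A small sketch:

# Sketch: finished statuses map to enum members, everything else to None.
from vllm.v1.engine import RequestFinishedReason
from vllm.v1.request import RequestStatus

# FINISHED_IGNORED is reported as LENGTH, matching the OpenAI finish reason.
assert (RequestStatus.get_finished_reason(RequestStatus.FINISHED_IGNORED)
        is RequestFinishedReason.LENGTH)

# A non-finished status (e.g. PREEMPTED) still yields None via dict.get().
assert RequestStatus.get_finished_reason(RequestStatus.PREEMPTED) is None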
