
Commit cd0952f

SzymonOzog authored and comaniac committed
[V1][Metrics] Add GPU prefix cache hit rate % gauge (vllm-project#12592)
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
1 parent fa6790f commit cd0952f

File tree

7 files changed: +174 -5 lines changed

tests/entrypoints/openai/test_metrics.py

Lines changed: 2 additions & 0 deletions

@@ -203,6 +203,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
+    "vllm:gpu_prefix_cache_queries",
+    "vllm:gpu_prefix_cache_hits",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:request_success_total",

tests/v1/core/test_kv_cache_utils.py

Lines changed: 38 additions & 1 deletion

@@ -5,10 +5,11 @@
 from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+                                         KVCacheBlock, PrefixCachingMetrics,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
 
 
@@ -277,3 +278,39 @@ def test_hash_request_tokens_no_mm_inputs():
     assert block_hashes[0].extra_keys is None
     assert block_hashes[1].token_ids == (3, 4, 5)
     assert block_hashes[1].extra_keys is None
+
+
+def test_metrics():
+    """
+    Test the prefix caching metrics.
+    """
+
+    def stats(requests, queries, hits):
+        return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
+
+    metrics = PrefixCachingMetrics(interval=5)
+    assert metrics.hit_rate == 0.0
+
+    metrics.observe(stats(1, 20, 9))
+    # 9 / 20 = 0.45
+    assert metrics.hit_rate == 0.45
+
+    metrics.observe(stats(4, 80, 16))
+    # 25 / 100 = 0.25
+    assert metrics.hit_rate == 0.25
+
+    metrics.observe(stats(1, 10, 2))
+    # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
+    assert metrics.aggregated_requests == 5
+    assert metrics.aggregated_query_total == 90
+    assert metrics.aggregated_query_hit == 18
+    assert metrics.hit_rate == 0.2
+
+    metrics.reset()
+    assert metrics.hit_rate == 0.0
+    assert metrics.aggregated_requests == 0
+    assert metrics.aggregated_query_total == 0
+    assert metrics.aggregated_query_hit == 0
+    assert not metrics.query_queue

vllm/v1/core/kv_cache_manager.py

Lines changed: 24 additions & 0 deletions

@@ -10,6 +10,7 @@
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request, RequestStatus
 
 logger = init_logger(__name__)

@@ -78,11 +79,28 @@ def __init__(
         self.req_to_block_hashes: DefaultDict[
             str, List[BlockHashType]] = defaultdict(list)
 
+        self.prefix_cache_stats = PrefixCacheStats()
+
     @property
     def usage(self) -> float:
+        """Get the KV cache usage.
+
+        Returns:
+            The KV cache usage (between 0.0 and 1.0).
+        """
         return 1.0 - (self.free_block_queue.num_free_blocks /
                       self.num_gpu_blocks)
 
+    def make_prefix_cache_stats(self) -> PrefixCacheStats:
+        """Get (and reset) the prefix cache stats.
+
+        Returns:
+            The current prefix caching stats.
+        """
+        stats = self.prefix_cache_stats
+        self.prefix_cache_stats = PrefixCacheStats()
+        return stats
+
     def get_computed_blocks(
             self, request: Request) -> Tuple[List[KVCacheBlock], int]:
         """Get the computed (cached) blocks for the request.

@@ -118,6 +136,10 @@ def get_computed_blocks(
             else:
                 break
 
+        self.prefix_cache_stats.requests += 1
+        self.prefix_cache_stats.queries += len(block_hashes)
+        self.prefix_cache_stats.hits += len(computed_blocks)
+
         # NOTE(woosuk): Since incomplete blocks are not eligible for
         # sharing, `num_computed_tokens` is always a multiple of
         # `block_size`.

@@ -280,6 +302,8 @@ def reset_prefix_cache(self) -> bool:
         for block in self.block_pool:
             block.reset_hash()
 
+        self.prefix_cache_stats.reset = True
+
         logger.info("Successfully reset prefix cache")
         return True
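
Editor's note: make_prefix_cache_stats implements a get-and-reset snapshot — the manager hands back its accumulated counters and swaps in a fresh PrefixCacheStats, so every snapshot is a self-contained delta rather than a running total. A minimal standalone sketch of the same pattern (the StatsOwner class is hypothetical; PrefixCacheStats mirrors the dataclass added in vllm/v1/metrics/stats.py below):

from dataclasses import dataclass


@dataclass
class PrefixCacheStats:
    # Mirrors the dataclass in vllm/v1/metrics/stats.py:
    # per-snapshot deltas, not running totals.
    reset: bool = False
    requests: int = 0
    queries: int = 0
    hits: int = 0


class StatsOwner:
    """Hypothetical stand-in for KVCacheManager's stats bookkeeping."""

    def __init__(self):
        self.prefix_cache_stats = PrefixCacheStats()

    def make_prefix_cache_stats(self) -> PrefixCacheStats:
        # Hand back the accumulated delta and start a fresh one, so
        # the caller never double-counts across scheduler steps.
        stats = self.prefix_cache_stats
        self.prefix_cache_stats = PrefixCacheStats()
        return stats


owner = StatsOwner()
owner.prefix_cache_stats.requests += 1
owner.prefix_cache_stats.queries += 20
owner.prefix_cache_stats.hits += 9

snapshot = owner.make_prefix_cache_stats()
assert (snapshot.requests, snapshot.queries, snapshot.hits) == (1, 20, 9)
# After the snapshot, the owner accumulates into a brand-new object.
assert owner.prefix_cache_stats.queries == 0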

vllm/v1/core/kv_cache_utils.py

Lines changed: 64 additions & 0 deletions

@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """KV-Cache Utilities."""
+from collections import deque
 from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any, List, NamedTuple, Optional, Tuple

@@ -8,6 +9,7 @@
 from vllm.logger import init_logger
 from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
                                         KVCacheTensor)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
 
 logger = init_logger(__name__)

@@ -28,6 +30,68 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None
 
 
+class PrefixCachingMetrics:
+    """Metrics for prefix caching with a hit rate of the most recent N requests.
+
+    Args:
+        interval: The number of the most recent requests to aggregate.
+            Defaults to 1000.
+    """
+
+    def __init__(self, interval: int = 1000):
+        self.interval = interval
+        # The current aggregated values.
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        # A deque of (requests, queries, hits) for the most recent requests.
+        self.query_queue: deque[Tuple[int, int, int]] = deque()
+
+    def observe(self, stats: PrefixCacheStats):
+        """Observe the prefix caching for a set of requests.
+
+        This function is called with information gathered when new requests
+        are being scheduled and are looking for computed blocks.
+
+        When there are more than `interval` requests, the oldest set of
+        requests is removed from the metrics.
+
+        Args:
+            stats: The prefix cache stats.
+        """
+        # reset_prefix_cache was invoked before the current update.
+        # Reset the metrics before aggregating the current stats.
+        if stats.reset:
+            self.reset()
+
+        # Update the metrics.
+        self.query_queue.append((stats.requests, stats.queries, stats.hits))
+        self.aggregated_requests += stats.requests
+        self.aggregated_query_total += stats.queries
+        self.aggregated_query_hit += stats.hits
+
+        # Remove the oldest stats if the number of requests exceeds the
+        # interval.
+        if self.aggregated_requests > self.interval:
+            old_requests, old_queries, old_hits = self.query_queue.popleft()
+            self.aggregated_requests -= old_requests
+            self.aggregated_query_total -= old_queries
+            self.aggregated_query_hit -= old_hits
+
+    def reset(self):
+        """Reset the metrics."""
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        self.query_queue.clear()
+
+    @property
+    def hit_rate(self) -> float:
+        """Calculate the hit rate for the past N requests."""
+        if self.aggregated_query_total == 0:
+            return 0.0
+        return self.aggregated_query_hit / self.aggregated_query_total
+
+
 @dataclass
 class KVCacheBlock:
     """KV-cache block metadata."""

vllm/v1/core/scheduler.py

Lines changed: 1 addition & 0 deletions

@@ -593,4 +593,5 @@ def make_stats(self) -> SchedulerStats:
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
+            prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
         )
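
Editor's note: this one-line hunk completes the plumbing between the files above — each call to make_stats() drains the manager's accumulated delta into the SchedulerStats snapshot, which the two stat loggers below then consume exactly once. A condensed, comment-only sketch of the per-step call chain (paraphrased from this commit; logger class names are not shown in the diff):

# Per-step flow introduced by this commit (paraphrased, not verbatim):
#
#   stats = scheduler.make_stats()
#       -> prefix_cache_stats=kv_cache_manager.make_prefix_cache_stats()
#          (returns the delta and resets the manager's counters)
#   logging stat logger .log(stats, ...)
#       -> prefix_caching_metrics.observe(stats.prefix_cache_stats)
#          (sliding-window hit rate for the console log line)
#   Prometheus stat logger .log(stats, ...)
#       -> counter_gpu_prefix_cache_queries.inc(...), .._hits.inc(...)
#          (monotonic counters for external rate queries)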

vllm/v1/metrics/loggers.py

Lines changed: 27 additions & 2 deletions

@@ -9,6 +9,7 @@
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 

@@ -37,6 +38,9 @@ def _reset(self, now):
         self.num_prompt_tokens: List[int] = []
         self.num_generation_tokens: List[int] = []
 
+        # Prefix cache metrics. TODO: Make the interval configurable.
+        self.prefix_caching_metrics = PrefixCachingMetrics()
+
     def _local_interval_elapsed(self, now: float) -> bool:
         # Log every _LOCAL_LOGGING_INTERVAL_SEC.
         elapsed_time = now - self.last_log_time

@@ -58,6 +62,8 @@ def log(self, scheduler_stats: SchedulerStats,
 
         self._track_iteration_stats(iteration_stats)
 
+        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+
         now = time.monotonic()
         if not self._local_interval_elapsed(now):
             return

@@ -72,13 +78,15 @@ def log(self, scheduler_stats: SchedulerStats,
         logger.info(
             "Avg prompt throughput: %.1f tokens/s, "
             "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs "
-            "GPU KV cache usage: %.1f%%.",
+            "Running: %d reqs, Waiting: %d reqs, "
+            "GPU KV cache usage: %.1f%%, "
+            "Prefix cache hit rate: %.1f%%",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
             scheduler_stats.gpu_cache_usage * 100,
+            self.prefix_caching_metrics.hit_rate * 100,
         )
 
 
@@ -107,6 +115,18 @@ def __init__(self, model_config: ModelConfig):
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_queries",
+            documentation=
+            "GPU prefix cache queries, in terms of number of queried blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_hits",
+            documentation=
+            "GPU prefix cache hits, in terms of number of cached blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",

@@ -170,6 +190,11 @@ def log(self, scheduler_stats: SchedulerStats,
 
         self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
 
+        self.counter_gpu_prefix_cache_queries.inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_gpu_prefix_cache_hits.inc(
+            scheduler_stats.prefix_cache_stats.hits)
+
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
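
Editor's note: the Prometheus side deliberately exports raw monotonic counters rather than a precomputed percentage, so the hit rate is derived at query time (e.g., as a ratio of the two counter rates in PromQL). A hedged client-side sketch of the same derivation, assuming a vLLM OpenAI server exposing /metrics at localhost:8000 (the URL is an assumption) and using prometheus_client's text parser:

import urllib.request

from prometheus_client.parser import text_string_to_metric_families


def gpu_prefix_cache_hit_rate(
        url: str = "http://localhost:8000/metrics") -> float:
    """Derive the hit rate % from the two counters added in this commit."""
    text = urllib.request.urlopen(url).read().decode()
    totals = {"vllm:gpu_prefix_cache_queries": 0.0,
              "vllm:gpu_prefix_cache_hits": 0.0}
    for family in text_string_to_metric_families(text):
        if family.name in totals:
            for sample in family.samples:
                # Counters are exposed as <name>_total samples; skip the
                # companion <name>_created timestamp samples.
                if sample.name.endswith("_total"):
                    totals[family.name] += sample.value
    queries = totals["vllm:gpu_prefix_cache_queries"]
    hits = totals["vllm:gpu_prefix_cache_hits"]
    return 100.0 * hits / queries if queries else 0.0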

vllm/v1/metrics/stats.py

Lines changed: 18 additions & 2 deletions

@@ -1,14 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
     from vllm.v1.engine import EngineCoreOutput, FinishReason
 
 
+@dataclass
+class PrefixCacheStats:
+    """Stores prefix cache hit statistics."""
+    # Whether reset_prefix_cache was invoked.
+    reset: bool = False
+    # The number of requests in this update.
+    requests: int = 0
+    # The number of queries in these requests. Note that "queries" here
+    # means the number of blocks that were queried from the cache.
+    queries: int = 0
+    # The number of hits in these requests.
+    hits: int = 0
+
+
 @dataclass
 class SchedulerStats:
     """Stats associated with the scheduler."""

@@ -17,7 +31,9 @@ class SchedulerStats:
     num_waiting_reqs: int = 0
 
     gpu_cache_usage: float = 0.0
-    # gpu_prefix_cache_hit_rate: float = 0.0
+
+    prefix_cache_stats: PrefixCacheStats = field(
+        default_factory=PrefixCacheStats)
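
Editor's note on the field(default_factory=PrefixCacheStats) hunk: a mutable dataclass instance cannot be used as a plain default value (the dataclasses module raises ValueError), and the factory guarantees every SchedulerStats snapshot owns an independent stats object. A quick illustration, assuming a checkout containing this commit:

from vllm.v1.metrics.stats import SchedulerStats

a = SchedulerStats()
b = SchedulerStats()

# default_factory gives each snapshot its own PrefixCacheStats, so
# mutating one snapshot's stats cannot leak into another.
a.prefix_cache_stats.hits += 5
assert b.prefix_cache_stats.hits == 0
assert a.prefix_cache_stats is not b.prefix_cache_stats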
