
Commit 60e1637

markmc authored and comaniac committed
[V1][Metrics] Replace prefix cache hit rate with queries/hits counters
With an all-time hit rate metric, a sudden increase in misses barely moves the needle: the change is diluted by the large accumulated totals. With Prometheus, it is better to export monotonic counters and use Prometheus queries to calculate rates over whatever time interval makes sense to the operator, who can also easily adjust that interval in their PromQL queries. With the logging logger, we continue to aggregate across an interval of (roughly) 1000 requests.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
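As a hedged illustration of what the new counters enable (not part of this commit): an operator can compute a windowed hit rate in PromQL with an expression like rate(vllm:gpu_prefix_cache_hits[5m]) / rate(vllm:gpu_prefix_cache_queries[5m]). The sketch below evaluates that expression through the Prometheus HTTP API; the Prometheus address and the 5-minute window are illustrative assumptions.

# Hedged sketch (not part of this commit): compute a 5-minute prefix cache
# hit rate from the new counters via the Prometheus HTTP API. The Prometheus
# address and the window size are assumptions for illustration.
import requests

PROMETHEUS_URL = "http://localhost:9090/api/v1/query"  # assumed address
HIT_RATE_QUERY = (
    "rate(vllm:gpu_prefix_cache_hits[5m])"
    " / rate(vllm:gpu_prefix_cache_queries[5m])"
)

resp = requests.get(PROMETHEUS_URL, params={"query": HIT_RATE_QUERY}, timeout=5)
resp.raise_for_status()
for sample in resp.json()["data"]["result"]:
    # Each result carries a label set and a [timestamp, value] pair.
    print(sample["metric"], sample["value"][1])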
1 parent fd10665 commit 60e1637

File tree

7 files changed: +90 −47 lines


tests/entrypoints/openai/test_metrics.py

Lines changed: 2 additions & 1 deletion
@@ -203,7 +203,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
         "vllm:num_requests_running",
         "vllm:num_requests_waiting",
         "vllm:gpu_cache_usage_perc",
-        "vllm:gpu_prefix_cache_hit_rate_perc",
+        "vllm:gpu_prefix_cache_queries",
+        "vllm:gpu_prefix_cache_hits",
         "vllm:prompt_tokens_total",
         "vllm:generation_tokens_total",
         "vllm:request_success_total",

tests/v1/core/test_kv_cache_utils.py

Lines changed: 10 additions & 4 deletions
@@ -9,6 +9,7 @@
                                 generate_block_hash_extra_keys,
                                 hash_block_tokens,
                                 hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request


@@ -283,28 +284,33 @@ def test_metrics():
     """
     Test the prefix caching metrics.
     """
+
+    def stats(requests, queries, hits):
+        return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
+
     metrics = PrefixCachingMetrics(interval=5)
     assert metrics.hit_rate == 0.0

-    metrics.add_request_query(20, 9)
+    metrics.observe(stats(1, 20, 9))
     # 9 / 20 = 0.45
     assert metrics.hit_rate == 0.45

-    for _ in range(4):
-        metrics.add_request_query(20, 4)
+    metrics.observe(stats(4, 80, 16))

     # 25 / 100 = 0.25
     assert metrics.hit_rate == 0.25

-    metrics.add_request_query(10, 2)
+    metrics.observe(stats(1, 10, 2))

     # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
+    assert metrics.aggregated_requests == 5
     assert metrics.aggregated_query_total == 90
     assert metrics.aggregated_query_hit == 18
     assert metrics.hit_rate == 0.2

     metrics.reset()
     assert metrics.hit_rate == 0.0
+    assert metrics.aggregated_requests == 0
     assert metrics.aggregated_query_total == 0
     assert metrics.aggregated_query_hit == 0
     assert not metrics.query_queue

vllm/v1/core/kv_cache_manager.py

Lines changed: 13 additions & 14 deletions
@@ -6,10 +6,11 @@
 from vllm.logger import init_logger
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock, PrefixCachingMetrics,
+                                         KVCacheBlock,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request, RequestStatus

 logger = init_logger(__name__)

@@ -78,8 +79,7 @@ def __init__(
         self.req_to_block_hashes: DefaultDict[
             str, List[BlockHashType]] = defaultdict(list)

-        # Prefix cache metrics. TODO: Make the interval configurable.
-        self.prefix_caching_metrics = PrefixCachingMetrics(interval=1000)
+        self.prefix_cache_stats = PrefixCacheStats()

     @property
     def usage(self) -> float:

@@ -91,14 +91,15 @@ def usage(self) -> float:
         return 1.0 - (self.free_block_queue.num_free_blocks /
                       self.num_gpu_blocks)

-    @property
-    def prefix_cache_hit_rate(self) -> float:
-        """Get the prefix caching hit rate.
+    def make_prefix_cache_stats(self) -> PrefixCacheStats:
+        """Get (and reset) the prefix cache query and hit counts.

         Returns:
-            The prefix caching hit rate.
+            The prefix caching stats - query count, and hit count.
         """
-        return self.prefix_caching_metrics.hit_rate
+        stats = self.prefix_cache_stats
+        self.prefix_cache_stats = PrefixCacheStats()
+        return stats

     def get_computed_blocks(
             self, request: Request) -> Tuple[List[KVCacheBlock], int]:

@@ -135,10 +136,9 @@ def get_computed_blocks(
             else:
                 break

-        self.prefix_caching_metrics.add_request_query(
-            num_queries=len(block_hashes),
-            num_hits=len(computed_blocks),
-        )
+        self.prefix_cache_stats.requests += 1
+        self.prefix_cache_stats.queries += len(block_hashes)
+        self.prefix_cache_stats.hits += len(computed_blocks)

         # NOTE(woosuk): Since incomplete blocks are not eligible for
         # sharing, `num_computed_tokens` is always a multiple of

@@ -302,8 +302,7 @@ def reset_prefix_cache(self) -> bool:
         for block in self.block_pool:
             block.reset_hash()

-        # Reset the prefix caching metrics.
-        self.prefix_caching_metrics.reset()
+        self.prefix_cache_stats.reset = True

         logger.info("Successfully reset prefix cache")
         return True
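To make the delta semantics above concrete, here is a minimal standalone sketch (assumed names, not vLLM code) of the take-and-reset pattern that make_prefix_cache_stats() uses: hand back the accumulated stats object and start a fresh one, so each snapshot carries only the counts gathered since the previous snapshot.

# Minimal sketch of the take-and-reset pattern; `Manager` and `record` are
# illustrative stand-ins, not vLLM APIs.
from dataclasses import dataclass


@dataclass
class PrefixCacheStats:
    reset: bool = False
    requests: int = 0
    queries: int = 0
    hits: int = 0


class Manager:
    def __init__(self) -> None:
        self.prefix_cache_stats = PrefixCacheStats()

    def record(self, num_queries: int, num_hits: int) -> None:
        # Called per scheduled request, mirroring get_computed_blocks().
        self.prefix_cache_stats.requests += 1
        self.prefix_cache_stats.queries += num_queries
        self.prefix_cache_stats.hits += num_hits

    def make_prefix_cache_stats(self) -> PrefixCacheStats:
        # Swap in a fresh object; the caller owns the returned deltas.
        stats = self.prefix_cache_stats
        self.prefix_cache_stats = PrefixCacheStats()
        return stats


m = Manager()
m.record(20, 9)
print(m.make_prefix_cache_stats())  # requests=1, queries=20, hits=9
print(m.make_prefix_cache_stats())  # all zeros: only new deltas remain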

vllm/v1/core/kv_cache_utils.py

Lines changed: 29 additions & 17 deletions
@@ -9,6 +9,7 @@
 from vllm.logger import init_logger
 from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
                                         KVCacheTensor)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request

 logger = init_logger(__name__)

@@ -40,33 +41,44 @@ class PrefixCachingMetrics:
     def __init__(self, interval: int = 1000):
         self.interval = interval
         # The current aggregated query total and hit.
+        self.aggregated_requests = 0
         self.aggregated_query_total = 0
         self.aggregated_query_hit = 0
-        # A deque of (num_queries, num_hits) for the most recent requests.
-        self.query_queue: deque[Tuple[int, int]] = deque()
+        # A deque of (requests, queries, hits) for the most recent requests.
+        self.query_queue: deque[Tuple[int, int, int]] = deque()

-    def add_request_query(self, num_queries: int, num_hits: int):
-        """Add a request to the metrics. This function is called when
-        a new request is being scheduled and is looking for computed blocks.
-        When there are more than `interval` requests, the oldest request
-        is removed from the metrics.
+    def observe(self, stats: PrefixCacheStats):
+        """Observe the prefix caching for a set of requests.

-        Args:
-            num_queries: The number of queries in the request.
-            num_hits: The number of hits in the request.
+        This function is called with information gathered when new requests
+        are being scheduled and are looking for computed blocks.
+
+        When there are more than `interval` requests, the oldest set of
+        requests are removed from the metrics.
+
+        Stats:
+            reset: Whether reset_prefix_cache was invoked.
+            requests: The number of requests in this update.
+            queries: The number of queries in these requests.
+            hits: The number of hits in these requests.
         """
+        if stats.reset:
+            self.reset()

-        self.query_queue.append((num_queries, num_hits))
-        if len(self.query_queue) > self.interval:
-            old_num_queries, old_num_hits = self.query_queue.popleft()
-            self.aggregated_query_total -= old_num_queries
-            self.aggregated_query_hit -= old_num_hits
+        self.query_queue.append((stats.requests, stats.queries, stats.hits))
+        self.aggregated_requests += stats.requests
+        self.aggregated_query_total += stats.queries
+        self.aggregated_query_hit += stats.hits

-        self.aggregated_query_total += num_queries
-        self.aggregated_query_hit += num_hits
+        if self.aggregated_requests > self.interval:
+            old_requests, old_queries, old_hits = self.query_queue.popleft()
+            self.aggregated_requests -= old_requests
+            self.aggregated_query_total -= old_queries
+            self.aggregated_query_hit -= old_hits

     def reset(self):
         """Reset the metrics."""
+        self.aggregated_requests = 0
         self.aggregated_query_total = 0
         self.aggregated_query_hit = 0
         self.query_queue.clear()
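The sliding window in observe() is worth a worked example: whole (requests, queries, hits) batches enter the deque, and once the aggregated request count exceeds `interval` the oldest batch is evicted, so the window covers roughly, not exactly, `interval` requests. A minimal standalone sketch of the same arithmetic, using the batch values from test_metrics above:

# Standalone sketch of the windowed aggregation in observe(), with the
# test_metrics batch values and interval=5.
from collections import deque
from typing import Deque, Tuple

interval = 5
queue: Deque[Tuple[int, int, int]] = deque()
requests = queries = hits = 0

for batch in [(1, 20, 9), (4, 80, 16), (1, 10, 2)]:
    queue.append(batch)
    requests += batch[0]
    queries += batch[1]
    hits += batch[2]
    if requests > interval:
        # Evict the oldest whole batch, as observe() does.
        old_r, old_q, old_h = queue.popleft()
        requests -= old_r
        queries -= old_q
        hits -= old_h
    print(f"hit_rate = {hits / queries:.2f}")  # 0.45, then 0.25, then 0.20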

vllm/v1/core/scheduler.py

Lines changed: 1 addition & 2 deletions
@@ -599,8 +599,7 @@ def make_stats(self) -> SchedulerStats:
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
-            gpu_prefix_cache_hit_rate=self.kv_cache_manager.
-            prefix_cache_hit_rate,
+            prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
         )

vllm/v1/metrics/loggers.py

Lines changed: 22 additions & 7 deletions
@@ -9,6 +9,7 @@

 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats

@@ -37,6 +38,9 @@ def _reset(self, now):
         self.num_prompt_tokens: List[int] = []
         self.num_generation_tokens: List[int] = []

+        # Prefix cache metrics. TODO: Make the interval configurable.
+        self.prefix_caching_metrics = PrefixCachingMetrics()
+
     def _local_interval_elapsed(self, now: float) -> bool:
         # Log every _LOCAL_LOGGING_INTERVAL_SEC.
         elapsed_time = now - self.last_log_time

@@ -58,6 +62,8 @@ def log(self, scheduler_stats: SchedulerStats,

         self._track_iteration_stats(iteration_stats)

+        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+
         now = time.monotonic()
         if not self._local_interval_elapsed(now):
             return

@@ -74,13 +80,13 @@ def log(self, scheduler_stats: SchedulerStats,
             "Avg generation throughput: %.1f tokens/s, "
             "Running: %d reqs, Waiting: %d reqs, "
             "GPU KV cache usage: %.1f%%, "
-            "GPU prefix cache hit rate: %.1f%%",
+            "Prefix cache hit rate: %.1f%%",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
             scheduler_stats.gpu_cache_usage * 100,
-            scheduler_stats.gpu_prefix_cache_hit_rate * 100,
+            self.prefix_caching_metrics.hit_rate * 100,
         )


@@ -109,10 +115,16 @@ def __init__(self, model_config: ModelConfig):
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames).labels(*labelvalues)

-        self.gauge_gpu_prefix_cache_hit_rate = prometheus_client.Gauge(
-            name="vllm:gpu_prefix_cache_hit_rate_perc",
+        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_queries",
+            documentation=
+            "GPU prefix cache queries, in terms of number of queried blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_hits",
             documentation=
-            "GPU prefix cache hit rate. 1 means 100 percent hit rate.",
+            "GPU prefix cache hits, in terms of number of cached blocks.",
             labelnames=labelnames).labels(*labelvalues)

         self.counter_prompt_tokens = prometheus_client.Counter(

@@ -177,8 +189,11 @@ def log(self, scheduler_stats: SchedulerStats,
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)

         self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
-        self.gauge_gpu_prefix_cache_hit_rate.set(
-            scheduler_stats.gpu_prefix_cache_hit_rate)
+
+        self.counter_gpu_prefix_cache_queries.inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_gpu_prefix_cache_hits.inc(
+            scheduler_stats.prefix_cache_stats.hits)

         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
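A hedged sketch of the Prometheus pattern this file adopts: export monotonic counters and increment them by the per-snapshot deltas, rather than setting a precomputed rate on a gauge. Metric and label names mirror the diff; the label value is an illustrative assumption.

# Hedged sketch of counter-based export with prometheus_client; the model
# label value "example-model" is an assumption for illustration.
import prometheus_client

labelnames = ["model_name"]
counter_queries = prometheus_client.Counter(
    name="vllm:gpu_prefix_cache_queries",
    documentation="GPU prefix cache queries, in queried blocks.",
    labelnames=labelnames).labels("example-model")
counter_hits = prometheus_client.Counter(
    name="vllm:gpu_prefix_cache_hits",
    documentation="GPU prefix cache hits, in cached blocks.",
    labelnames=labelnames).labels("example-model")

# On each log() call, add only the deltas gathered since the last snapshot;
# Prometheus then derives rates server-side with rate()/increase().
counter_queries.inc(20)
counter_hits.inc(9)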

vllm/v1/metrics/stats.py

Lines changed: 13 additions & 2 deletions
@@ -1,14 +1,23 @@
 # SPDX-License-Identifier: Apache-2.0

 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, List

 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
     from vllm.v1.engine import EngineCoreOutput, FinishReason


+@dataclass
+class PrefixCacheStats:
+    """Stores prefix cache hit statistics."""
+    reset: bool = False
+    requests: int = 0
+    hits: int = 0
+    queries: int = 0
+
+
 @dataclass
 class SchedulerStats:
     """Stats associated with the scheduler."""

@@ -17,7 +26,9 @@ class SchedulerStats:
     num_waiting_reqs: int = 0

     gpu_cache_usage: float = 0.0
-    gpu_prefix_cache_hit_rate: float = 0.0
+
+    prefix_cache_stats: PrefixCacheStats = field(
+        default_factory=PrefixCacheStats)


 @dataclass
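One detail worth noting in the hunk above: field(default_factory=PrefixCacheStats) is used because a bare PrefixCacheStats() default would be a single instance shared across all SchedulerStats (and recent Python versions reject unhashable dataclass defaults outright). A minimal sketch:

# Minimal sketch of why default_factory is used for the nested dataclass.
from dataclasses import dataclass, field


@dataclass
class PrefixCacheStats:
    reset: bool = False
    requests: int = 0
    hits: int = 0
    queries: int = 0


@dataclass
class SchedulerStats:
    # default_factory builds a fresh PrefixCacheStats per instance, so two
    # SchedulerStats never share (and mutate) the same stats object.
    prefix_cache_stats: PrefixCacheStats = field(
        default_factory=PrefixCacheStats)


a, b = SchedulerStats(), SchedulerStats()
a.prefix_cache_stats.hits += 1
print(b.prefix_cache_stats.hits)  # 0: instances are independent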
