Commit afd51dc

[WIP][Metrics] Re-work approach to LoRA metrics
The current `vllm:lora_requests_info` Gauge is somewhat similar to an
Info metric (like cache_config_info) except the value is the current
wall-clock time, and is updated every iteration.

The label names used are:

- running_lora_adapters: a list of adapters with running requests,
  formatted as a comma-separated string.
- waiting_lora_adapters: similar, except listing adapters with requests
  waiting to be scheduled.
- max_lora: the static "max number of LoRAs in a single batch"
  configuration.

It looks like this:

```
vllm:lora_requests_info{max_lora="1",running_lora_adapters="",waiting_lora_adapters=""} 1.7395575657589855e+09
vllm:lora_requests_info{max_lora="1",running_lora_adapters="test-lora",waiting_lora_adapters=""} 1.7395575723949368e+09
vllm:lora_requests_info{max_lora="1",running_lora_adapters="test-lora",waiting_lora_adapters="test-lora"} 1.7395575717647147e+09
```

I can't really make much sense of this. Encoding a running/waiting
status for multiple adapters in a comma-separated string seems quite
misguided - we should use labels to distinguish between per-adapter
counts instead:

```
vllm:num_lora_requests_running{lora_name="test-lora",model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.0
vllm:num_lora_requests_waiting{lora_name="test-lora",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
```

This was added in #9477 and there is at least one known user. If we
revisit this design and deprecate the old metric, we should reduce the
need for a significant deprecation period by making the change in v0
also and asking this project to move to the new metric.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
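As a standalone illustration of the proposed metric shape, here is a minimal sketch using prometheus_client directly, outside vLLM; the metric names mirror the commit, while the label values and counts are invented for illustration:

```python
# Minimal sketch of the proposed per-adapter metrics, using
# prometheus_client directly. Metric names mirror the commit; the
# label values and counts below are invented for illustration.
from prometheus_client import Gauge, generate_latest

labelnames = ["model_name", "lora_name"]
running = Gauge("vllm:num_lora_requests_running",
                "Number of requests currently running, per LoRA.",
                labelnames=labelnames)
waiting = Gauge("vllm:num_lora_requests_waiting",
                "Number of requests waiting, per LoRA.",
                labelnames=labelnames)

# One time series per (model, adapter) pair -- no comma-separated
# strings for a consumer to parse.
running.labels(model_name="meta-llama/Llama-3.1-8B-Instruct",
               lora_name="test-lora").set(8)
waiting.labels(model_name="meta-llama/Llama-3.1-8B-Instruct",
               lora_name="test-lora").set(7)

print(generate_latest().decode())
```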
1 parent: c9e2d64

3 files changed: +29 −6

vllm/engine/llm_engine.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -1812,8 +1812,8 @@ def _get_stats(self,
             max_tokens_requests=max_tokens_requests,
             finished_reason_requests=finished_reason_requests,
             max_lora=str(max_lora_stat),
-            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
-            running_lora_adapters=list(running_lora_adapters.keys()))
+            waiting_lora_adapters=waiting_lora_adapters,
+            running_lora_adapters=running_lora_adapters)

     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
```
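The `Stats` fields now carry per-adapter counts rather than flat name lists. A hedged sketch of how such dicts might be assembled — the request objects and their `lora_request.lora_name` attribute are assumptions for illustration; the real `_get_stats` derives these counts from scheduler state:

```python
# Hedged sketch: how per-adapter request counts might be assembled.
# The `requests` iterable and its `lora_request.lora_name` attribute
# are assumptions for illustration, not vLLM's actual internals.
from collections import Counter
from typing import Dict, Iterable

def count_per_adapter(requests: Iterable) -> Dict[str, int]:
    """Map each LoRA adapter name to its number of requests."""
    return dict(Counter(r.lora_request.lora_name
                        for r in requests
                        if getattr(r, "lora_request", None)))
```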

vllm/engine/metrics.py

Lines changed: 25 additions & 2 deletions

```diff
@@ -53,6 +53,8 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):

         max_model_len = vllm_config.model_config.max_model_len

+        lora_labelnames = labelnames + ["lora_name"]
+
         # System stats
         # Scheduler State
         self.gauge_scheduler_running = self._gauge_cls(
@@ -65,6 +67,17 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+        self.gauge_lora_requests_running = self._gauge_cls(
+            name="vllm:num_lora_requests_running",
+            documentation="Number of requests currently running, per LoRA.",
+            labelnames=lora_labelnames,
+            multiprocess_mode="sum")
+        self.gauge_lora_requests_waiting = self._gauge_cls(
+            name="vllm:num_lora_requests_waiting",
+            documentation="Number of requests waiting, per LoRA.",
+            labelnames=lora_labelnames,
+            multiprocess_mode="sum")
+        # Deprecated
         self.gauge_lora_info = self._gauge_cls(
             name="vllm:lora_requests_info",
             documentation="Running stats on lora requests.",
@@ -517,9 +530,11 @@ def __init__(self, local_interval: float, labels: Dict[str, str],
         self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                          vllm_config=vllm_config)

-    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
+    def _log_gauge(self, gauge, data: Union[int, float],
+                   **extra_labels) -> None:
         # Convenience function for logging to gauge.
-        gauge.labels(**self.labels).set(data)
+        combined_labels = {**self.labels, **extra_labels}
+        gauge.labels(**combined_labels).set(data)

     def _log_counter(self, counter, data: Union[int, float]) -> None:
         # Convenience function for logging to counter.
@@ -561,6 +576,14 @@ def _log_prometheus(self, stats: Stats) -> None:
                         stats.cpu_prefix_cache_hit_rate)
         self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
                         stats.gpu_prefix_cache_hit_rate)
+        for lora_name, lora_running in stats.running_lora_adapters.items():
+            self._log_gauge(self.metrics.gauge_lora_requests_running,
+                            lora_running,
+                            lora_name=lora_name)
+        for lora_name, lora_waiting in stats.waiting_lora_adapters.items():
+            self._log_gauge(self.metrics.gauge_lora_requests_waiting,
+                            lora_waiting,
+                            lora_name=lora_name)
         # Including max-lora in metric, in future this property of lora
         # config maybe extended to be dynamic.
         lora_info = {
```
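To see the `_log_gauge` change in isolation: base labels fixed per logger instance merge with per-call extras, so the same helper serves both plain and per-LoRA gauges. A runnable sketch — `MiniLogger` and the demo values are invented for illustration:

```python
# Standalone sketch of the reworked _log_gauge: the logger's base
# labels (e.g. model_name) merge with per-call extras (e.g. lora_name).
# MiniLogger and the demo values are invented for illustration.
from typing import Dict, Union
from prometheus_client import Gauge

class MiniLogger:
    def __init__(self, labels: Dict[str, str]):
        self.labels = labels

    def _log_gauge(self, gauge: Gauge, data: Union[int, float],
                   **extra_labels) -> None:
        # Merge fixed base labels with per-call extras, then set.
        combined_labels = {**self.labels, **extra_labels}
        gauge.labels(**combined_labels).set(data)

gauge = Gauge("demo:num_lora_requests_running", "Demo gauge.",
              labelnames=["model_name", "lora_name"])
logger = MiniLogger({"model_name": "meta-llama/Llama-3.1-8B-Instruct"})

# Each adapter gets its own labelled time series.
for name, count in {"test-lora": 8}.items():
    logger._log_gauge(gauge, count, lora_name=name)
```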

vllm/engine/metrics_types.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -63,8 +63,8 @@ class Stats:
     max_num_generation_tokens_requests: List[int]
     max_tokens_requests: List[int]
     finished_reason_requests: List[str]
-    waiting_lora_adapters: List[str]
-    running_lora_adapters: List[str]
+    waiting_lora_adapters: Dict[str, int]
+    running_lora_adapters: Dict[str, int]
     max_lora: str

     spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
```
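The type change means producers hand the logger ready-made counts. A trimmed sketch of the resulting shape — `LoRAStats` is a stand-in for the relevant slice of vLLM's `Stats`, with invented values:

```python
# Trimmed sketch of the reworked Stats fields: each dict maps an
# adapter name to a request count, replacing the old List[str] shape.
# LoRAStats and the values are invented for illustration.
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class LoRAStats:
    waiting_lora_adapters: Dict[str, int] = field(default_factory=dict)
    running_lora_adapters: Dict[str, int] = field(default_factory=dict)
    max_lora: str = "0"

stats = LoRAStats(running_lora_adapters={"test-lora": 8},
                  waiting_lora_adapters={"test-lora": 7})
```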
