
Commit 79e8f3a
[V1][Metrics] Make --disable-log-stats more effective
Avoid constructing:

- the logging and Prometheus loggers
- IterationStats and RequestStateStats in the output processor
- SchedulerStats and EngineCoreEvents in the scheduler

Also, add a TODO for vllm-project#12592 to disable prefix cache stats.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Parent: 8b31770

13 files changed: +112 -53 lines
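For orientation, here is the pattern the commit applies throughout: `--disable-log-stats` becomes a `log_stats: bool` that is passed explicitly down the stack, and each layer skips constructing its stats objects when it is `False`. A minimal sketch with toy class names (not the real vLLM types):

```python
# Toy illustration of threading a log_stats flag down the stack.
# ToyClient/ToyCore/ToyScheduler are stand-ins, not vLLM classes.
from typing import Optional


class ToyScheduler:
    def __init__(self, log_stats: bool):
        self.log_stats = log_stats

    def make_stats(self) -> Optional[dict]:
        if not self.log_stats:
            return None  # nothing is constructed when stats are off
        return {"num_running_reqs": 0, "num_waiting_reqs": 0}


class ToyCore:
    def __init__(self, log_stats: bool):
        self.scheduler = ToyScheduler(log_stats=log_stats)


class ToyClient:
    def __init__(self, log_stats: bool):
        # The client forwards the flag instead of hard-coding it.
        self.core = ToyCore(log_stats=log_stats)


client = ToyClient(log_stats=False)  # the --disable-log-stats case
assert client.core.scheduler.make_stats() is None
```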

tests/v1/core/test_scheduler.py (2 additions, 1 deletion)

```diff
@@ -38,7 +38,8 @@ def create_scheduler(
     return Scheduler(scheduler_config,
                      model_config,
                      cache_config,
-                     lora_config=None)
+                     lora_config=None,
+                     log_stats=True)


 def create_requests(
```

tests/v1/engine/test_engine_core.py (4 additions, 2 deletions)

```diff
@@ -50,7 +50,8 @@ def test_engine_core(monkeypatch):
     executor_class = Executor.get_class(vllm_config)

     engine_core = EngineCore(vllm_config=vllm_config,
-                             executor_class=executor_class)
+                             executor_class=executor_class,
+                             log_stats=True)
     """Test basic request lifecycle."""

     # First request.
@@ -157,7 +158,8 @@ def test_engine_core_advanced_sampling(monkeypatch):
     executor_class = Executor.get_class(vllm_config)

     engine_core = EngineCore(vllm_config=vllm_config,
-                             executor_class=executor_class)
+                             executor_class=executor_class,
+                             log_stats=True)
     """Test basic request lifecycle."""
     # First request.
     request: EngineCoreRequest = make_request()
```

tests/v1/engine/test_engine_core_client.py (2 additions, 0 deletions)

```diff
@@ -94,6 +94,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
         asyncio_mode=False,
         vllm_config=vllm_config,
         executor_class=executor_class,
+        log_stats=False,
     )

     MAX_TOKENS = 20
@@ -163,6 +164,7 @@ async def test_engine_core_client_asyncio(monkeypatch):
         asyncio_mode=True,
         vllm_config=vllm_config,
         executor_class=executor_class,
+        log_stats=True,
     )

     MAX_TOKENS = 20
```

tests/v1/engine/test_output_processor.py (4 additions, 4 deletions)

```diff
@@ -633,7 +633,7 @@ def test_iteration_stats(dummy_test_vectors):

     # First iteration has 2 prefills.
     outputs = engine_core.get_outputs()[:num_active]
-    iteration_stats = IterationStats(output_processor.log_stats)
+    iteration_stats = IterationStats()
     output_processor.process_outputs(outputs, engine_core_timestamp,
                                      iteration_stats)
     total_prompt_tokens = sum([
@@ -646,7 +646,7 @@ def test_iteration_stats(dummy_test_vectors):

     # Just decodes in this step.
     outputs = engine_core.get_outputs()[:num_active]
-    iteration_stats = IterationStats(output_processor.log_stats)
+    iteration_stats = IterationStats()
     output_processor.process_outputs(outputs, engine_core_timestamp,
                                      iteration_stats)

@@ -657,7 +657,7 @@ def test_iteration_stats(dummy_test_vectors):
     output_processor.add_request(inactive_request)
     num_active += 1
     outputs = engine_core.get_outputs()[:num_active]
-    iteration_stats = IterationStats(output_processor.log_stats)
+    iteration_stats = IterationStats()
     output_processor.process_outputs(outputs, engine_core_timestamp,
                                      iteration_stats)
     total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1])
@@ -667,7 +667,7 @@ def test_iteration_stats(dummy_test_vectors):

     # Just decodes in this step.
     outputs = engine_core.get_outputs()[:num_active]
-    iteration_stats = IterationStats(output_processor.log_stats)
+    iteration_stats = IterationStats()
     output_processor.process_outputs(outputs, engine_core_timestamp,
                                      iteration_stats)
```

vllm/v1/core/kv_cache_manager.py (3 additions, 0 deletions)

```diff
@@ -25,13 +25,16 @@ def __init__(
         sliding_window: Optional[int] = None,
         enable_caching: bool = True,
         num_preallocate_tokens: int = 64,
+        log_stats: bool = False,
     ) -> None:
         self.block_size = block_size
         self.num_gpu_blocks = num_gpu_blocks
         self.max_model_len = max_model_len
         self.max_num_blocks_per_req = cdiv(max_model_len, block_size)
         self.sliding_window = sliding_window
         self.enable_caching = enable_caching
+        # FIXME: make prefix cache stats conditional on log_stats
+        self.log_stats = log_stats
         # NOTE(woosuk): To avoid frequent block allocation, we preallocate some
         # blocks for each request. For example, when a request reaches the end
         # of its block table, we preallocate N blocks in advance. This way, we
```
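The FIXME above flags that prefix cache stats are still gathered unconditionally. A possible shape for the follow-up (my sketch for vllm-project#12592, not code from this commit):

```python
# Hypothetical gating for prefix-cache counters (not in this commit).
class PrefixCacheStatsSketch:
    def __init__(self, log_stats: bool):
        self.log_stats = log_stats
        self.queries = 0
        self.hits = 0

    def record(self, num_queries: int, num_hits: int) -> None:
        if not self.log_stats:
            return  # skip all bookkeeping when stats are disabled
        self.queries += num_queries
        self.hits += num_hits
```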

vllm/v1/core/scheduler.py (24 additions, 5 deletions)

```diff
@@ -11,7 +11,8 @@
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
                                            SchedulerOutput)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
+from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
+                            EngineCoreOutput, EngineCoreOutputs)
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
@@ -27,10 +28,12 @@ def __init__(
         model_config: ModelConfig,
         cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
+        log_stats: bool,
     ) -> None:
         self.scheduler_config = scheduler_config
         self.cache_config = cache_config
         self.lora_config = lora_config
+        self.log_stats = log_stats

         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
@@ -46,7 +49,8 @@ def __init__(
             num_gpu_blocks=num_gpu_blocks,
             max_model_len=self.max_model_len,
             sliding_window=self.cache_config.sliding_window,
-            enable_caching=self.cache_config.enable_prefix_caching)
+            enable_caching=self.cache_config.enable_prefix_caching,
+            log_stats=self.log_stats)
         self.block_size = self.cache_config.block_size

         # req_id -> Request
@@ -249,7 +253,7 @@ def schedule(self) -> "SchedulerOutput":
             self.running.append(request)
             if request.status == RequestStatus.WAITING:
                 scheduled_new_reqs.append(request)
-                request.scheduled(scheduled_timestamp)
+                self.request_scheduled(request, scheduled_timestamp)
             elif request.status == RequestStatus.PREEMPTED:
                 scheduled_resumed_reqs.append(request)
             else:
@@ -546,7 +550,7 @@ def _check_stop(self, request: Request) -> bool:
     def add_request(self, request: Request) -> None:
         self.waiting.append(request)
         self.requests[request.request_id] = request
-        request.queued()
+        self.request_queued(request)

     def finish_requests(
         self,
@@ -594,7 +598,22 @@ def has_unfinished_requests(self) -> bool:
     def reset_prefix_cache(self) -> bool:
         return self.kv_cache_manager.reset_prefix_cache()

-    def make_stats(self) -> SchedulerStats:
+    def request_queued(self, request: Request):
+        if not self.log_stats:
+            return
+        request.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.QUEUED))
+
+    def request_scheduled(self, request: Request, timestamp: float):
+        if not self.log_stats:
+            return
+        request.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED,
+                                      timestamp))
+
+    def make_stats(self) -> Optional[SchedulerStats]:
+        if not self.log_stats:
+            return None
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
```
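The new `request_queued`/`request_scheduled` helpers mean no `EngineCoreEvent` is ever constructed, and the per-request event list stays empty, when stats are off. A self-contained sketch of the same guard (toy `Event`/`ToyRequest` types; the timestamp handling is an assumption about `EngineCoreEvent.new_event`):

```python
import time
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import List, Optional


class EventType(Enum):
    QUEUED = auto()
    SCHEDULED = auto()


@dataclass
class Event:
    type: EventType
    timestamp: float


@dataclass
class ToyRequest:
    events: List[Event] = field(default_factory=list)


def record_event(log_stats: bool, request: ToyRequest, etype: EventType,
                 timestamp: Optional[float] = None) -> None:
    if not log_stats:
        return  # no Event object is ever allocated
    request.events.append(Event(etype, timestamp or time.monotonic()))


req = ToyRequest()
record_event(False, req, EventType.QUEUED)
assert req.events == []  # --disable-log-stats: nothing is recorded
```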

vllm/v1/engine/__init__.py (1 addition, 1 deletion)

```diff
@@ -117,7 +117,7 @@ class EngineCoreOutputs(

     # [num_reqs]
     outputs: List[EngineCoreOutput]
-    scheduler_stats: SchedulerStats
+    scheduler_stats: Optional[SchedulerStats]
     timestamp: float = 0.0

     def __post_init__(self):
```
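Since `make_stats()` can now return `None`, the field becomes `Optional`, and every consumer of `EngineCoreOutputs.scheduler_stats` must tolerate the `None` case; the `_log_stats` change in `async_llm.py` below handles it with an early return plus asserts.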

vllm/v1/engine/async_llm.py (12 additions, 8 deletions)

```diff
@@ -53,10 +53,12 @@ def __init__(

         self.log_requests = log_requests
         self.log_stats = log_stats
-        self.stat_loggers: List[StatLoggerBase] = [
-            LoggingStatLogger(),
-            PrometheusStatLogger(vllm_config.model_config),
-        ]
+        self.stat_loggers: List[StatLoggerBase] = []
+        if self.log_stats:
+            self.stat_loggers.extend([
+                LoggingStatLogger(),
+                PrometheusStatLogger(vllm_config.model_config),
+            ])

         # Tokenizer (+ ensure liveness if running in another process).
         self.tokenizer = init_tokenizer_from_configs(
@@ -85,6 +87,7 @@ def __init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
+            log_stats=self.log_stats,
         )

         self.output_handler: Optional[asyncio.Task] = None
@@ -246,7 +249,7 @@ async def _run_output_handler(self):
                 # 1) Pull EngineCoreOutputs from the EngineCore.
                 outputs = await self.engine_core.get_output_async()

-                iteration_stats = IterationStats(self.log_stats)
+                iteration_stats = IterationStats() if self.log_stats else None

                 # Split outputs into chunks of at most
                 # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
@@ -277,7 +280,6 @@ async def _run_output_handler(self):
                 # 4) Logging.
                 # TODO(rob): make into a coroutine and launch it in
                 # background thread once Prometheus overhead is non-trivial.
-                assert iteration_stats is not None
                 self._log_stats(
                     scheduler_stats=outputs.scheduler_stats,
                     iteration_stats=iteration_stats,
@@ -299,12 +301,14 @@ async def abort(self, request_id: str) -> None:

     def _log_stats(
         self,
-        scheduler_stats: SchedulerStats,
-        iteration_stats: IterationStats,
+        scheduler_stats: Optional[SchedulerStats],
+        iteration_stats: Optional[IterationStats],
     ):
         if not self.log_stats:
             return

+        assert scheduler_stats is not None
+        assert iteration_stats is not None
         for logger in self.stat_loggers:
             logger.log(scheduler_stats=scheduler_stats,
                        iteration_stats=iteration_stats)
```
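Putting the `async_llm.py` pieces together: loggers are only constructed when stats are on, the per-iteration stats object is conditional, and `_log_stats` is the single point that re-checks the flag. A condensed, runnable sketch (toy logger and dict-based stats; the real classes differ):

```python
from typing import List, Optional


class PrintLogger:
    """Stand-in for LoggingStatLogger / PrometheusStatLogger."""

    def log(self, scheduler_stats: dict, iteration_stats: dict) -> None:
        print(scheduler_stats, iteration_stats)


class ToyAsyncLLM:
    def __init__(self, log_stats: bool):
        self.log_stats = log_stats
        # Loggers are only built when stats are enabled.
        self.stat_loggers: List[PrintLogger] = []
        if log_stats:
            self.stat_loggers.append(PrintLogger())

    def handle_outputs(self, scheduler_stats: Optional[dict]) -> None:
        # Mirrors _run_output_handler: the stats object is conditional.
        iteration_stats = {} if self.log_stats else None
        self._log_stats(scheduler_stats, iteration_stats)

    def _log_stats(self, scheduler_stats: Optional[dict],
                   iteration_stats: Optional[dict]) -> None:
        if not self.log_stats:
            return
        # When the flag is on, upstream must have produced both.
        assert scheduler_stats is not None
        assert iteration_stats is not None
        for logger in self.stat_loggers:
            logger.log(scheduler_stats, iteration_stats)


ToyAsyncLLM(log_stats=False).handle_outputs(scheduler_stats=None)  # no-ops
```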

vllm/v1/engine/core.py (6 additions, 4 deletions)

```diff
@@ -38,12 +38,15 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
+        log_stats: bool,
     ):
         assert vllm_config.model_config.runner_type != "pooling"

         logger.info("Initializing a V1 LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)

+        self.log_stats = log_stats
+
         # Setup Model.
         self.model_executor = executor_class(vllm_config)

@@ -59,6 +62,7 @@ def __init__(
             model_config=vllm_config.model_config,
             cache_config=vllm_config.cache_config,
             lora_config=vllm_config.lora_config,
+            log_stats=self.log_stats,
         )

         self.mm_input_mapper_server = MMInputMapperServer(
@@ -148,11 +152,9 @@ def __init__(
         ready_pipe: Connection,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
+        log_stats: bool,
     ):
-        super().__init__(vllm_config, executor_class)
-
-        self.log_stats = log_stats
+        super().__init__(vllm_config, executor_class, log_stats)

         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
```
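Making `log_stats` a required parameter here (dropping the `= False` default) forces every construction site, including the tests above, to choose explicitly rather than silently inheriting a default that may disagree with the front-end flag.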

vllm/v1/engine/core_client.py (10 additions, 9 deletions)

```diff
@@ -41,6 +41,7 @@ def make_client(
         asyncio_mode: bool,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
+        log_stats: bool,
     ) -> "EngineCoreClient":

         # TODO: support this for debugging purposes.
@@ -50,12 +51,12 @@ def make_client(
                 "is not currently supported.")

         if multiprocess_mode and asyncio_mode:
-            return AsyncMPClient(vllm_config, executor_class)
+            return AsyncMPClient(vllm_config, executor_class, log_stats)

         if multiprocess_mode and not asyncio_mode:
-            return SyncMPClient(vllm_config, executor_class)
+            return SyncMPClient(vllm_config, executor_class, log_stats)

-        return InprocClient(vllm_config, executor_class)
+        return InprocClient(vllm_config, executor_class, log_stats)

     @abstractmethod
     def shutdown(self):
@@ -204,13 +205,13 @@ def shutdown(self):
 class SyncMPClient(MPClient):
     """Synchronous client for multi-proc EngineCore."""

-    def __init__(self, vllm_config: VllmConfig,
-                 executor_class: Type[Executor]):
+    def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor],
+                 log_stats: bool):
         super().__init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=False,
+            log_stats=log_stats,
         )

     def get_output(self) -> EngineCoreOutputs:
@@ -245,13 +246,13 @@ def reset_prefix_cache(self) -> None:
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""

-    def __init__(self, vllm_config: VllmConfig,
-                 executor_class: Type[Executor]):
+    def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor],
+                 log_stats: bool):
         super().__init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=True,
+            log_stats=log_stats,
         )

         self.outputs_queue: Optional[asyncio.Queue[bytes]] = None
```
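Previously `SyncMPClient` hard-coded `log_stats=False` and `AsyncMPClient` hard-coded `log_stats=True`; now `make_client` forwards whatever the caller asked for in every mode. A toy rendition of the dispatch:

```python
# Toy version of the make_client dispatch (simplified return values).
def make_client(multiprocess_mode: bool, asyncio_mode: bool,
                log_stats: bool) -> str:
    if multiprocess_mode and asyncio_mode:
        return f"AsyncMPClient(log_stats={log_stats})"
    if multiprocess_mode:
        return f"SyncMPClient(log_stats={log_stats})"
    return f"InprocClient(log_stats={log_stats})"


# The flag is caller-controlled in every mode, e.g.:
assert make_client(True, True, False) == "AsyncMPClient(log_stats=False)"
```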

vllm/v1/engine/llm_engine.py (1 addition, 0 deletions)

```diff
@@ -73,6 +73,7 @@ def __init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
+            log_stats=False,  # FIXME: implement
         )

     @classmethod
```
