From fb2e6ef76a5811629b9abcde63a4286b8159c0b7 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Thu, 8 May 2025 10:39:14 +0000
Subject: [PATCH 1/4] update

Signed-off-by: inkcherry
---
 vllm/config.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index 40beace3040..64b0d20f4b0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2050,6 +2050,10 @@ def __post_init__(self) -> None:
                     _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
 
+            self.max_num_batched_tokens = min(
+                self.max_num_seqs * self.max_model_len,
+                self.max_num_batched_tokens)
+
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
 
@@ -2090,6 +2094,12 @@ def _verify_args(self) -> None:
                 "be greater than or equal to max_num_seqs "
                 f"({self.max_num_seqs}).")
 
+        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
+            raise ValueError(
+                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
+                "be less than or equal to max_num_seqs * max_model_len "
+                f"({self.max_num_seqs * self.max_model_len}).")
+
         if self.num_lookahead_slots < 0:
             raise ValueError(
                 "num_lookahead_slots "

From 6c3561c01101b774adf4a0885ec9208bae662d90 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Thu, 8 May 2025 10:46:20 +0000
Subject: [PATCH 2/4] update comments

Signed-off-by: inkcherry
---
 vllm/config.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index 64b0d20f4b0..b9d01bb99d5 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2050,6 +2050,8 @@ def __post_init__(self) -> None:
                     _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
 
+            # Ensure max_num_batched_tokens does not exceed model limit.
+            # Some models (e.g., Whisper) have embeddings tied to max length.
             self.max_num_batched_tokens = min(
                 self.max_num_seqs * self.max_model_len,
                 self.max_num_batched_tokens)

From b7b790c0ca4fc3e10343bc6a852afbe386742ca5 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Fri, 9 May 2025 01:49:52 +0000
Subject: [PATCH 3/4] update

Signed-off-by: inkcherry
---
 vllm/config.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index b9d01bb99d5..a947e91b58f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2050,8 +2050,9 @@ def __post_init__(self) -> None:
                     _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
 
-            # Ensure max_num_batched_tokens does not exceed model limit.
-            # Some models (e.g., Whisper) have embeddings tied to max length.
+            # When using default settings,
+            # ensure max_num_batched_tokens does not exceed the model limit.
+            # Some models (e.g., Whisper) have embeddings tied to the maximum sequence length.
             self.max_num_batched_tokens = min(
                 self.max_num_seqs * self.max_model_len,
                 self.max_num_batched_tokens)
@@ -2097,10 +2098,10 @@ def _verify_args(self) -> None:
                 f"({self.max_num_seqs}).")
 
         if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
-            raise ValueError(
-                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
-                "be less than or equal to max_num_seqs * max_model_len "
-                f"({self.max_num_seqs * self.max_model_len}).")
+            logger.warning(
+                f"max_num_batched_tokens ({self.max_num_batched_tokens}) exceeds "
+                f"max_num_seqs * max_model_len ({self.max_num_seqs * self.max_model_len}). "
" + "This may lead to unexpected behavior.") if self.num_lookahead_slots < 0: raise ValueError( From b384745fa0dc7c2bd15e2e3c0ca823ad0e8f1f0e Mon Sep 17 00:00:00 2001 From: inkcherry Date: Fri, 9 May 2025 02:45:55 +0000 Subject: [PATCH 4/4] format Signed-off-by: inkcherry --- vllm/config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index a947e91b58f..4020f8b3ebd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2051,8 +2051,8 @@ def __post_init__(self) -> None: ) # When using default settings, - # ensure max_num_batched_tokens does not exceed the model limit. - # Some models (e.g., Whisper) have embeddings tied to the maximum sequence length.. + # Ensure max_num_batched_tokens does not exceed model limit. + # Some models (e.g., Whisper) have embeddings tied to max length. self.max_num_batched_tokens = min( self.max_num_seqs * self.max_model_len, self.max_num_batched_tokens) @@ -2099,9 +2099,10 @@ def _verify_args(self) -> None: if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len: logger.warning( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) exceeds " - f"max_num_seqs * max_model_len ({self.max_num_seqs * self.max_model_len}). " - "This may lead to unexpected behavior.") + "max_num_batched_tokens (%d) exceeds max_num_seqs" + "* max_model_len (%d). This may lead to unexpected behavior.", + self.max_num_batched_tokens, + self.max_num_seqs * self.max_model_len) if self.num_lookahead_slots < 0: raise ValueError(