
Commit 0a347b2

ywang96 authored and LeiWang1999 committed
[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (vllm-project#8425)
Signed-off-by: LeiWang1999 <leiwang1999@outlook.com>
1 parent a1a4c2c commit 0a347b2

File tree

2 files changed (+13, -3 lines)


vllm/engine/arg_utils.py

Lines changed: 11 additions & 1 deletion
@@ -843,6 +843,13 @@ def create_engine_config(self) -> EngineConfig:
         device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
 
+        if model_config.is_multimodal_model:
+            if self.enable_prefix_caching:
+                logger.warning(
+                    "--enable-prefix-caching is currently not "
+                    "supported for multimodal models and has been disabled.")
+            self.enable_prefix_caching = False
+
         cache_config = CacheConfig(
             block_size=self.block_size if self.device != "neuron" else
             self.max_model_len,  # neuron needs block_size = max_model_len

@@ -874,7 +881,10 @@ def create_engine_config(self) -> EngineConfig:
         # If not explicitly set, enable chunked prefill by default for
         # long context (> 32K) models. This is to avoid OOM errors in the
         # initial memory profiling phase.
-        if use_long_context:
+
+        # Chunked prefill is currently disabled for multimodal models by
+        # default.
+        if use_long_context and not model_config.is_multimodal_model:
             is_gpu = device_config.device_type == "cuda"
             use_sliding_window = (model_config.get_sliding_window()
                                   is not None)
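
Taken together, these two hunks make create_engine_config force prefix caching off for multimodal models (logging a warning if the user asked for it) and skip the automatic long-context chunked-prefill default for them. Below is a minimal sketch of how a caller would observe this, assuming only the EngineArgs fields visible in the diff; the model name is an illustrative multimodal checkpoint, not something this commit prescribes.

# Sketch only: exercises the behavior added above. Assumes the EngineArgs
# fields shown in the diff; the model name is just an illustrative
# multimodal checkpoint.
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="llava-hf/llava-1.5-7b-hf",  # hypothetical multimodal model choice
    enable_prefix_caching=True,        # requested, but will be overridden
)
config = args.create_engine_config()

# For a multimodal model, a warning is logged and prefix caching is forced
# off; chunked prefill is likewise not auto-enabled for long contexts.
assert args.enable_prefix_caching is False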

vllm/model_executor/models/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -90,12 +90,12 @@
     "PaliGemmaForConditionalGeneration": ("paligemma",
                                           "PaliGemmaForConditionalGeneration"),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
-    "UltravoxModel": ("ultravox", "UltravoxModel"),
-    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "PixtralForConditionalGeneration": ("pixtral",
                                         "PixtralForConditionalGeneration"),
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                         "Qwen2VLForConditionalGeneration"),
+    "UltravoxModel": ("ultravox", "UltravoxModel"),
 }
 _CONDITIONAL_GENERATION_MODELS = {
     "BartModel": ("bart", "BartForConditionalGeneration"),
