Commit 99d1b38

[VLM] Merged multi-modal processor and V1 support for Qwen-VL (vllm-project#12504)

DarkLight1337 authored and Isotr0py committed

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>

1 parent e78b3d2 · commit 99d1b38
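For orientation, here is a hedged sketch of how a Qwen-VL model with a merged multi-modal processor is typically exercised through vLLM's offline API. This is not taken from the commit: the prompt template ("Picture 1: <img></img>") and the image file name are assumptions; consult the Qwen-VL model card for the exact placeholder format.

# Minimal usage sketch (assumed prompt template and local file).
from PIL import Image
from vllm import LLM

# Qwen-VL ships custom code on the Hub, hence trust_remote_code=True.
llm = LLM(model="Qwen/Qwen-VL-Chat", trust_remote_code=True)

image = Image.open("example.jpg")  # hypothetical local image
outputs = llm.generate({
    "prompt": "Picture 1: <img></img>\nWhat does this picture show?",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)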

File tree

4 files changed: +387 −477 lines

docs/source/models/supported_models.md (1 addition, 1 deletion)

@@ -745,7 +745,7 @@ See [this page](#generative-models) for more information on how to use generative models.
   - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
   - ✅︎
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2AudioForConditionalGeneration`
   - Qwen2-Audio
   - T + A<sup>+</sup>
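Per the commit title, the flipped cell is the V1 column of the supported-models table: Qwen-VL previously had no entry there, and this commit marks it as supported on the V1 engine.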

tests/models/multimodal/processing/test_common.py (32 additions, 32 deletions)
@@ -16,7 +16,6 @@
 
 def _test_processing_correctness(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -25,11 +24,6 @@ def _test_processing_correctness(
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    limit_mm_per_prompt = {
-        modality: 3 if supports_multi else 1
-        for modality, supports_multi in modalities.items()
-    }
-
     model_config = ModelConfig(
         model_id,
         task="auto",
@@ -40,18 +34,29 @@ def _test_processing_correctness(
         dtype="float16",
         revision=None,
         hf_overrides=model_info.hf_overrides,
-        limit_mm_per_prompt=limit_mm_per_prompt,
     )
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
     ctx = InputProcessingContext(
         model_config,
-        tokenizer=cached_get_tokenizer(model_config.tokenizer),
+        tokenizer=cached_get_tokenizer(
+            model_config.tokenizer,
+            trust_remote_code=model_info.trust_remote_code,
+        ),
     )
     # Ensure that it can fit all of the data
     cache = ProcessingCache(capacity=1 << 30)
 
+    processing_info = factories.info(ctx)
+    supported_mm_limits = processing_info.get_supported_mm_limits()
+    limit_mm_per_prompt = {
+        modality: 3 if limit is None else limit
+        for modality, limit in supported_mm_limits.items()
+    }
+
+    model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
+
     baseline_processor = factories.build_processor(ctx, cache=None)
     cached_processor = factories.build_processor(ctx, cache=cache)
     dummy_inputs = baseline_processor.dummy_inputs
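The substantive change in this hunk: per-prompt multimodal limits are no longer hand-declared per test case but derived from the processor's own get_supported_mm_limits(), where None means the model has no inherent cap and the test clamps it to 3 items. A standalone sketch of that mapping, using a hypothetical return value:

# Hypothetical example: a processor reporting an uncapped image modality
# and at most one video per prompt.
supported_mm_limits = {"image": None, "video": 1}

# The test's rule: clamp "no inherent cap" (None) to 3 items per prompt
# so batches stay small; explicit caps are used as-is.
limit_mm_per_prompt = {
    modality: 3 if limit is None else limit
    for modality, limit in supported_mm_limits.items()
}

assert limit_mm_per_prompt == {"image": 3, "video": 1}

This removes the need to thread a modalities dict through both tests. The tokenizer is now also built with the model's trust_remote_code flag, which matters for Hub models such as Qwen-VL that ship custom tokenizer code.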
@@ -82,8 +87,8 @@ def _test_processing_correctness(
         mm_data = {
             k:
             [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
-            for k in modalities
+             for _ in range(rng.randint(limit))]
+            for k, limit in limit_mm_per_prompt.items()
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
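For context, the surrounding loop stresses ProcessingCache with a controllable hit rate; only the iteration over limits changed here. A standalone sketch of the sampling scheme (hypothetical inputs): each prompt draws between 0 and limit-1 items per modality, and each item repeats a fixed input with probability hit_rate so the cache sees a predictable mix of hits and misses.

import numpy as np

rng = np.random.RandomState(0)
hit_rate = 0.5
limit_mm_per_prompt = {"image": 3}            # as derived above
input_to_hit = {"image": "<fixed image>"}     # identical item -> cache hit
input_factory = {"image": lambda: object()}   # fresh item -> cache miss

# rng.randint(limit) draws from [0, limit), so some prompts carry no items.
mm_data = {
    k: [
        input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()
        for _ in range(rng.randint(limit))
    ]
    for k, limit in limit_mm_per_prompt.items()
}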
@@ -135,53 +140,49 @@ def _test_processing_correctness(
 
 # yapf: disable
 # True if the model supports multiple data items of the modality per request
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("rhymes-ai/Aria", {"image": True}),
-    ("Salesforce/blip2-opt-2.7b", {"image": False}),
-    ("facebook/chameleon-7b", {"image": False}),
-    ("deepseek-ai/deepseek-vl2-tiny", {"image": True}),
-    ("adept/fuyu-8b", {"image": False}),
-    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
-    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
-    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
-    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
-    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
-    ("mistral-community/pixtral-12b", {"image": True}),
-    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
-    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
-    ("fixie-ai/ultravox-v0_3", {"audio": True}),
+@pytest.mark.parametrize("model_id", [
+    "rhymes-ai/Aria",
+    "Salesforce/blip2-opt-2.7b",
+    "facebook/chameleon-7b",
+    "deepseek-ai/deepseek-vl2-tiny",
+    "adept/fuyu-8b",
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    "llava-hf/LLaVA-NeXT-Video-7B-hf",
+    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    "TIGER-Lab/Mantis-8B-siglip-llama3",
+    "mistral-community/pixtral-12b",
+    "Qwen/Qwen-VL-Chat",
+    "Qwen/Qwen2-VL-2B-Instruct",
+    "Qwen/Qwen2-Audio-7B-Instruct",
+    "fixie-ai/ultravox-v0_3",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
 def test_processing_correctness(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
 ):
     _test_processing_correctness(
         model_id,
-        modalities,
         hit_rate=hit_rate,
         num_batches=num_batches,
         simplify_rate=simplify_rate,
     )
 
 
 # yapf: disable
-@pytest.mark.parametrize(("model_id", "modalities"), [
-    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
-])
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
 @pytest.mark.parametrize("simplify_rate", [1.0])
 # yapf: enable
 def test_processing_correctness_phi3v(
     model_id: str,
-    modalities: dict[str, bool],
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
@@ -195,7 +196,6 @@ def test_processing_correctness_phi3v(
 
     _test_processing_correctness(
         model_id,
-        modalities,
         hit_rate=hit_rate,
         num_batches=num_batches,
         simplify_rate=simplify_rate,

tests/models/multimodal/processing/test_qwen.py (0 additions, 144 deletions)

This file was deleted.
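With "Qwen/Qwen-VL-Chat" added to the shared test_processing_correctness parametrization above, the model-specific processing test is redundant, hence the deletion.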
