
Commit d04a106

[Bugfix] Fix crashing for multimodal when image passed with height == 1
1 parent c0d9a98 commit d04a106

File tree

tests/multimodal/test_mapper.py
vllm/multimodal/image.py

2 files changed: +55 -1 lines changed
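Background for the fix: the HuggingFace transformers image processors can fail on images whose height is 1 (see https://github.com/huggingface/transformers/issues/21638, linked in the change below). The sketch here illustrates that failure mode under stated assumptions; the checkpoint name is only an example, and whether the call actually raises depends on the installed transformers version.

    # Hedged repro sketch (assumes Pillow + transformers are installed;
    # "openai/clip-vit-base-patch32" is just an example checkpoint).
    from PIL import Image
    from transformers import CLIPImageProcessor

    processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
    one_row = Image.new("RGB", (336, 1))  # width 336, height 1

    try:
        processor(images=one_row, return_tensors="pt")
    except Exception as exc:
        # Depending on the transformers version, resizing/normalizing a
        # 1-pixel-high image can fail here.
        print(f"Processing a height-1 image failed: {exc}")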

tests/multimodal/test_mapper.py

Lines changed: 29 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pytest
+from PIL import Image
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor
 
 from vllm.config import ModelConfig
@@ -155,3 +156,31 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
 
     mapped_inputs = mm_registry.map_input(model_config, mm_inputs)
     assert len(mapped_inputs["pixel_values"]) == num_images
+
+
+@pytest.mark.parametrize("image_size", [(1, 1), (2, 1), (1, 2), (2, 2)])
+def test_llama3p2_image_processor_when_small_width_or_height(
+        mm_registry, image_size):
+    MODEL_NAME = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        dtype="bfloat16",
+        seed=0,
+        revision=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    mm_registry.init_mm_limits_per_prompt(model_config)
+
+    try:
+        image = Image.new("RGB", image_size)
+        mm_registry.map_input(
+            model_config,
+            {"image": image},
+        )
+    except Exception as e:
+        raise AssertionError(f"Expected no exceptions: {e} ") from e
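The new test wraps the mapper call in try/except and re-raises any failure as an AssertionError, so a crash on tiny images fails the test directly. To run only this test from a vLLM checkout, one option (assuming pytest is installed and the gated Llama-3.2 checkpoint is accessible) is:

    # Assumed invocation; adjust the path and -k expression to your checkout.
    import pytest

    pytest.main([
        "tests/multimodal/test_mapper.py",
        "-k", "llama3p2_image_processor_when_small_width_or_height",
    ])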

vllm/multimodal/image.py

Lines changed: 26 additions & 1 deletion
@@ -45,9 +45,12 @@ def _default_input_mapper(
             return MultiModalInputs(data.data)
 
         # PIL image
-        if isinstance(data, Image.Image) or is_list_of(data, Image.Image):
+        if (isinstance(data, Image.Image)
+                or is_list_of(data, Image.Image, check="all")):
             image_processor = self._get_hf_image_processor(model_config)
 
+            data = self._handle_single_row_images(data)
+
             if image_processor is None:
                 raise RuntimeError("No HuggingFace processor is available "
                                    "to process the image object")
@@ -72,5 +75,27 @@ def _default_input_mapper(
 
         raise TypeError(f"Invalid image type: {type(data)}")
 
+    def _handle_single_row_images(self, data):
+        # transformers library has error when image height is 1
+        # https://github.com/huggingface/transformers/issues/21638
+        if isinstance(data, Image.Image):
+            if data.height == 1:
+                # Pad the image to a height of 2
+                padded_image = Image.new("RGB", (data.width, 2))
+                padded_image.paste(data, (0, 0))
+                return padded_image
+        else:
+            # Pad the images in the list to a height of 2
+            padded_images = []
+            for image in data:
+                if image.height == 1:
+                    padded_image = Image.new("RGB", (image.width, 2))
+                    padded_image.paste(image, (0, 0))
+                    padded_images.append(padded_image)
+                else:
+                    padded_images.append(image)
+            return padded_images
+        return data
+
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 3000
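The workaround pads rather than resizes: a height-1 image is pasted onto a fresh 2-pixel-high canvas, so the original pixel row is kept and the added row is black; images with width == 1 are passed through unchanged. Below is a standalone sketch of the same padding step outside vLLM (pad_to_min_height is a hypothetical helper, not part of vLLM's API):

    # Standalone sketch of the padding idea (hypothetical helper).
    from PIL import Image


    def pad_to_min_height(image: Image.Image, min_height: int = 2) -> Image.Image:
        if image.height >= min_height:
            return image
        # Image.new defaults to black; paste the original row at the top.
        padded = Image.new("RGB", (image.width, min_height))
        padded.paste(image, (0, 0))
        return padded


    one_row = Image.new("RGB", (8, 1), color=(255, 0, 0))
    padded = pad_to_min_height(one_row)
    assert padded.size == (8, 2)
    assert padded.getpixel((0, 0)) == (255, 0, 0)  # original row preserved
    assert padded.getpixel((0, 1)) == (0, 0, 0)    # padding row is black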
