
Commit d04a106

[Bugfix] Fix crashing for multimodal when image passed with height == 1
1 parent c0d9a98 commit d04a106

File tree

tests/multimodal/test_mapper.py
vllm/multimodal/image.py

2 files changed: +55 -1 lines changed
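Background for the fix: the HuggingFace transformers image processors can fail on images whose height is 1 (see https://github.com/huggingface/transformers/issues/21638, linked in the change below). The sketch here illustrates that failure mode under stated assumptions; the checkpoint name is only an example, and whether the call actually raises depends on the installed transformers version.

    # Hedged repro sketch (assumes Pillow + transformers are installed;
    # "openai/clip-vit-base-patch32" is just an example checkpoint).
    from PIL import Image
    from transformers import CLIPImageProcessor

    processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
    one_row = Image.new("RGB", (336, 1))  # width 336, height 1

    try:
        processor(images=one_row, return_tensors="pt")
    except Exception as exc:
        # Depending on the transformers version, resizing/normalizing a
        # 1-pixel-high image can fail here.
        print(f"Processing a height-1 image failed: {exc}")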

tests/multimodal/test_mapper.py

Lines changed: 29 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pytest
+from PIL import Image
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor
 
 from vllm.config import ModelConfig
@@ -155,3 +156,31 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
 
     mapped_inputs = mm_registry.map_input(model_config, mm_inputs)
     assert len(mapped_inputs["pixel_values"]) == num_images
+
+
+@pytest.mark.parametrize("image_size", [(1, 1), (2, 1), (1, 2), (2, 2)])
+def test_llama3p2_image_processor_when_small_width_or_height(
+        mm_registry, image_size):
+    MODEL_NAME = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        dtype="bfloat16",
+        seed=0,
+        revision=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    mm_registry.init_mm_limits_per_prompt(model_config)
+
+    try:
+        image = Image.new("RGB", image_size)
+        mm_registry.map_input(
+            model_config,
+            {"image": image},
+        )
+    except Exception as e:
+        raise AssertionError(f"Expected no exceptions: {e} ") from e
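The new test wraps the mapper call in try/except and re-raises any failure as an AssertionError, so a crash on tiny images fails the test directly. To run only this test from a vLLM checkout, one option (assuming pytest is installed and the gated Llama-3.2 checkpoint is accessible) is:

    # Assumed invocation; adjust the path and -k expression to your checkout.
    import pytest

    pytest.main([
        "tests/multimodal/test_mapper.py",
        "-k", "llama3p2_image_processor_when_small_width_or_height",
    ])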

vllm/multimodal/image.py

Lines changed: 26 additions & 1 deletion
@@ -45,9 +45,12 @@ def _default_input_mapper(
             return MultiModalInputs(data.data)
 
         # PIL image
-        if isinstance(data, Image.Image) or is_list_of(data, Image.Image):
+        if (isinstance(data, Image.Image)
+                or is_list_of(data, Image.Image, check="all")):
             image_processor = self._get_hf_image_processor(model_config)
 
+            data = self._handle_single_row_images(data)
+
             if image_processor is None:
                 raise RuntimeError("No HuggingFace processor is available "
                                    "to process the image object")
@@ -72,5 +75,27 @@ def _default_input_mapper(
 
         raise TypeError(f"Invalid image type: {type(data)}")
 
+    def _handle_single_row_images(self, data):
+        # transformers library has error when image height is 1
+        # https://github.com/huggingface/transformers/issues/21638
+        if isinstance(data, Image.Image):
+            if data.height == 1:
+                # Pad the image to a height of 2
+                padded_image = Image.new("RGB", (data.width, 2))
+                padded_image.paste(data, (0, 0))
+                return padded_image
+        else:
+            # Pad the images in the list to a height of 2
+            padded_images = []
+            for image in data:
+                if image.height == 1:
+                    padded_image = Image.new("RGB", (image.width, 2))
+                    padded_image.paste(image, (0, 0))
+                    padded_images.append(padded_image)
+                else:
+                    padded_images.append(image)
+            return padded_images
+        return data
+
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 3000
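The workaround pads rather than resizes: a height-1 image is pasted onto a fresh 2-pixel-high canvas, so the original pixel row is kept and the added row is black; images with width == 1 are passed through unchanged. Below is a standalone sketch of the same padding step outside vLLM (pad_to_min_height is a hypothetical helper, not part of vLLM's API):

    # Standalone sketch of the padding idea (hypothetical helper).
    from PIL import Image


    def pad_to_min_height(image: Image.Image, min_height: int = 2) -> Image.Image:
        if image.height >= min_height:
            return image
        # Image.new defaults to black; paste the original row at the top.
        padded = Image.new("RGB", (image.width, min_height))
        padded.paste(image, (0, 0))
        return padded


    one_row = Image.new("RGB", (8, 1), color=(255, 0, 0))
    padded = pad_to_min_height(one_row)
    assert padded.size == (8, 2)
    assert padded.getpixel((0, 0)) == (255, 0, 0)  # original row preserved
    assert padded.getpixel((0, 1)) == (0, 0, 0)    # padding row is black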
