@@ -45,9 +45,12 @@ def _default_input_mapper(
45
45
return MultiModalInputs (data .data )
46
46
47
47
# PIL image
48
- if isinstance (data , Image .Image ) or is_list_of (data , Image .Image ):
48
+ if (isinstance (data , Image .Image )
49
+ or is_list_of (data , Image .Image , check = "all" )):
49
50
image_processor = self ._get_hf_image_processor (model_config )
50
51
52
+ data = self ._handle_single_row_images (data )
53
+
51
54
if image_processor is None :
52
55
raise RuntimeError ("No HuggingFace processor is available "
53
56
"to process the image object" )
@@ -72,5 +75,27 @@ def _default_input_mapper(
72
75
73
76
raise TypeError (f"Invalid image type: { type (data )} " )
74
77
78
def _handle_single_row_images(self, data):
    """Pad images that are exactly one pixel tall to a height of two.

    The transformers library has an error when the image height is 1:
    https://github.com/huggingface/transformers/issues/21638

    Args:
        data: Either a single ``Image.Image`` or an iterable of them
            (the caller passes a list of images).

    Returns:
        For a single image: the same object when its height is not 1,
        otherwise a new two-row "RGB" image with the original pasted at
        the top-left corner. For a collection: a new list with every
        element treated the same way, in the original order.
    """

    def pad_if_single_row(image):
        # Images that are not exactly one row tall are passed through
        # untouched (same object, no copy).
        if image.height != 1:
            return image
        # The extra row keeps whatever fill Image.new uses by default.
        padded = Image.new("RGB", (image.width, 2))
        padded.paste(image, (0, 0))
        return padded

    if isinstance(data, Image.Image):
        return pad_if_single_row(data)
    # Anything else is treated as a collection of images; a fresh list is
    # always returned, even when no element needed padding.
    return [pad_if_single_row(image) for image in data]
99
+
75
100
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
    """Return the constant 3000.

    ``ctx`` is accepted but not consulted. Judging by the method name,
    the value is presumably the default upper bound on multimodal tokens
    for this plugin -- confirm against the caller.
    """
    return 3000
0 commit comments