
Commit 48aea73

hhzhang16 authored and sumitd2 committed
[Bugfix] Fixes Phi3v & Ultravox Multimodal EmbeddingInputs (vllm-project#8979)
Signed-off-by: Sumit Dubey <sumit.dubey2@ibm.com>
1 parent 0e48174 commit 48aea73

File tree

2 files changed: +43 additions, -25 deletions

vllm/model_executor/models/phi3v.py

Lines changed: 14 additions & 6 deletions
@@ -467,9 +467,10 @@ def input_processor_for_phi3v(ctx: InputContext,
                 input_height=h,
                 num_crops=num_crops))
     elif isinstance(image_data, torch.Tensor):
-        num_images, image_feature_size, hidden_size = image_data.shape
+        image_feature_size = [image_data.shape[0]]
+        image_data = [image_data]
     elif is_list_of(image_data, torch.Tensor):
-        image_feature_size = [item.shape[1] for item in image_data]
+        image_feature_size = [item.shape[0] for item in image_data]
     else:
         raise TypeError(f"Invalid image type: {type(image_data)}")
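
A minimal sketch of the shape convention this hunk assumes: each precomputed image embedding is a 2D tensor of (feature_size, hidden_size), so the per-image token count now comes from dim 0, and a bare tensor is wrapped into a one-element list so downstream code always sees a list. The shapes below are illustrative, not taken from the Phi-3-Vision config.

import torch

def feature_sizes(image_data):
    # Mirrors the fixed branch: dim 0 is the per-image token count.
    if isinstance(image_data, torch.Tensor):
        return [image_data.shape[0]], [image_data]
    return [item.shape[0] for item in image_data], list(image_data)

one_image = torch.randn(757, 4096)                             # (feature_size, hidden)
two_images = [torch.randn(757, 4096), torch.randn(491, 4096)]
print(feature_sizes(one_image)[0])    # [757]
print(feature_sizes(two_images)[0])   # [757, 491]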

@@ -611,9 +612,6 @@ def _parse_and_validate_image_input(
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)

-        if pixel_values is None:
-            return None
-
         if pixel_values is None and image_embeds is None:
             return None
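
Why the removed guard was a bug: a request that supplies only image_embeds has pixel_values set to None, so the old early return discarded the embeddings before they were ever checked. A hedged sketch with stand-in values:

def old_guard(pixel_values, image_embeds):
    # Pre-fix behavior: bail out whenever pixel values are missing,
    # even if embeddings were provided.
    return None if pixel_values is None else "parsed"

def new_guard(pixel_values, image_embeds):
    # Post-fix behavior: bail out only when *both* inputs are absent.
    return None if pixel_values is None and image_embeds is None else "parsed"

embeds = object()  # stands in for a real torch.Tensor of embeddings
assert old_guard(None, embeds) is None       # embeddings silently dropped
assert new_guard(None, embeds) == "parsed"   # embeddings accepted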

@@ -650,7 +648,17 @@ def _process_image_input(
     ) -> torch.Tensor:

         if image_input["type"] == "image_embeds":
-            return image_input["data"]
+            image_data = image_input["data"]
+            if is_list_of(image_data, torch.Tensor):
+                # It's already a list of per-image tensors.
+                return image_data
+            if len(image_data.shape) == 3:
+                # A single batched 3D tensor: split it into a list.
+                return list(torch.unbind(image_data, dim=0))
+            raise ValueError(
+                "We expect batched 2D tensors; "
+                "this can be either a list of 2D tensors or a single 3D tensor."
+            )

         assert self.vision_embed_tokens is not None
         image_embeds = self.vision_embed_tokens(image_input["data"],
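
A short sketch of the normalization introduced above: a single 3D batch of shape (num_images, feature_size, hidden_size) is unbound along dim 0 into a list of per-image 2D tensors, matching what the list-of-tensors path already returns. Shapes are illustrative.

import torch

batched = torch.randn(2, 757, 4096)           # (num_images, feature_size, hidden)
as_list = list(torch.unbind(batched, dim=0))  # two (757, 4096) tensors
assert len(as_list) == 2
assert as_list[0].shape == (757, 4096)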

vllm/model_executor/models/ultravox.py

Lines changed: 29 additions & 19 deletions
@@ -38,6 +38,7 @@
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+from vllm.utils import is_list_of

 from .interfaces import SupportsMultiModal, SupportsPP

@@ -119,6 +120,10 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
     if not isinstance(data, list):
         data = [data]

+    # If the audio inputs are embeddings, no need for preprocessing
+    if is_list_of(data, torch.Tensor, check="all"):
+        return MultiModalInputs({"audio_embeds": data})
+
     audio_features = []
     for audio_input in data:
         if not isinstance(audio_input, tuple):
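
A hedged sketch of the mapper's new fast path: when every element of the (listified) input is already a tensor of audio embeddings, feature extraction is skipped and the tensors pass through under "audio_embeds". A plain dict stands in for MultiModalInputs here, and the shapes are made up.

import torch

def map_audio(data):
    data = data if isinstance(data, list) else [data]
    if all(isinstance(item, torch.Tensor) for item in data):
        return {"audio_embeds": data}   # embeddings: no preprocessing needed
    return {"audio_features": "run the feature extractor here"}

print(list(map_audio(torch.randn(1, 12, 4096)).keys()))  # ['audio_embeds']
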
@@ -165,25 +170,30 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs):
         audios = [audios]

     audio_token_counts = []
-    for audio_data, sample_rate in audios:
-        audio_length = audio_data.shape[0]
-        if sample_rate != feature_extractor.sampling_rate:
-            # Account for resampling.
-            adjustment = feature_extractor.sampling_rate / sample_rate
-            audio_length = math.ceil(adjustment * audio_length)
-
-        feature_extractor_output_length = math.ceil(
-            (audio_length - (feature_extractor.hop_length - 1)) /
-            feature_extractor.hop_length)
-
-        uv_config = ctx.get_hf_config(UltravoxConfig)
-        audio_num_tokens = min(
-            max(
-                1,
-                math.ceil(feature_extractor_output_length /
-                          (uv_config.stack_factor * 2))),
-            get_ultravox_max_audio_tokens(ctx))
-        audio_token_counts.append(audio_num_tokens)
+    for audio in audios:
+        if isinstance(audio, torch.Tensor):
+            audio_num_tokens = audio.shape[1]
+            audio_token_counts.append(audio_num_tokens)
+        else:
+            audio_data, sample_rate = audio
+            audio_length = audio_data.shape[0]
+            if sample_rate != feature_extractor.sampling_rate:
+                # Account for resampling.
+                adjustment = feature_extractor.sampling_rate / sample_rate
+                audio_length = math.ceil(adjustment * audio_length)
+
+            feature_extractor_output_length = math.ceil(
+                (audio_length - (feature_extractor.hop_length - 1)) /
+                feature_extractor.hop_length)
+
+            uv_config = ctx.get_hf_config(UltravoxConfig)
+            audio_num_tokens = min(
+                max(
+                    1,
+                    math.ceil(feature_extractor_output_length /
+                              (uv_config.stack_factor * 2))),
+                get_ultravox_max_audio_tokens(ctx))
+            audio_token_counts.append(audio_num_tokens)

     tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
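
A worked example of the raw-audio branch's token arithmetic. The constants are illustrative stand-ins (a Whisper-style hop_length of 160 at 16 kHz and a stack_factor of 8), not values read from the real Ultravox or feature-extractor configs, and max_audio_tokens is a hypothetical cap; for embedding inputs, the hunk above instead reads the count directly from the tensor's second dimension.

import math

sampling_rate, hop_length, stack_factor = 16_000, 160, 8
max_audio_tokens = 200   # hypothetical cap from get_ultravox_max_audio_tokens

audio_length, sample_rate = 8_000, 8_000   # one second of 8 kHz audio
if sample_rate != sampling_rate:
    # Account for resampling, as in the loop above.
    audio_length = math.ceil(sampling_rate / sample_rate * audio_length)  # 16_000

output_length = math.ceil(
    (audio_length - (hop_length - 1)) / hop_length)            # 100 frames
num_tokens = min(max(1, math.ceil(output_length / (stack_factor * 2))),
                 max_audio_tokens)
print(num_tokens)   # 7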
