vllm-project · DarkLight1337 · May 16, 2025 · May 2, 2025 · May 2, 2025 · May 2, 2025
@@ -8,14 +8,14 @@
 from pathlib import PosixPath
 
 import pytest
-from transformers import (AutoModelForImageTextToText,
+from transformers import (AutoModel, AutoModelForImageTextToText,
                           AutoModelForTextToWaveform, AutoModelForVision2Seq)
 
 from vllm.platforms import current_platform
 from vllm.utils import identity
 
-from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
-                          VideoTestAssets, VllmRunner)
+from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
+                          ImageTestAssets, VideoTestAssets, VllmRunner)
 from ....utils import (create_new_process_for_each_test, large_gpu_mark,
                        multi_gpu_marks)
 from ...utils import check_outputs_equal
@@ -158,6 +158,17 @@
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
+    "ultravox": VLMTestInfo(
+        models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
+        test_type=VLMTestType.AUDIO,
+        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
+        audio_idx_to_prompt=lambda idx: "<|audio|>",
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModel,
+        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+    ),
     #### Extended model tests
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
@@ -393,7 +404,6 @@
                 formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
             ),
             limit_mm_per_prompt={"video": 4},
-            runner_mm_key="videos",
         )],
     ),
     "llava_next_video": VLMTestInfo(
@@ -706,6 +716,7 @@ def _mark_splits(
 # - multi-image
 # - image embeddings
 # - video
+# - audio
 # - custom inputs
 @pytest.mark.parametrize(
     "model_type,test_case",
@@ -803,6 +814,28 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
     )
 
 
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.AUDIO,
+        create_new_process_for_each_test=False,
+    ))
+def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
+                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
+                      audio_assets: AudioTestAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
+    model_test_info = VLM_TEST_SETTINGS[model_type]
+    runners.run_audio_test(
+        model_test_info=model_test_info,
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+    )
+
+
 @pytest.mark.parametrize(
     "model_type,test_case",
     get_parametrized_options(
@@ -930,6 +963,29 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
     )
 
 
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.AUDIO,
+        create_new_process_for_each_test=True,
+    ))
+def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
+                            hf_runner: type[HfRunner],
+                            vllm_runner: type[VllmRunner],
+                            audio_assets: AudioTestAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
+    model_test_info = VLM_TEST_SETTINGS[model_type]
+    runners.run_audio_test(
+        model_test_info=model_test_info,
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+    )
+
+
 @pytest.mark.parametrize(
     "model_type,test_case",
     get_parametrized_options(

@@ -1,20 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-from typing import Any, Optional
+from typing import Any
 
 import numpy as np
 import pytest
 import pytest_asyncio
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoTokenizer
 
-from vllm.multimodal.audio import resample_audio_librosa
-from vllm.sequence import SampleLogprobs
-
-from ....conftest import AUDIO_ASSETS, AudioTestAssets, HfRunner, VllmRunner
+from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
 from ....utils import RemoteOpenAIServer
 from ...registry import HF_EXAMPLE_MODELS
-from ...utils import check_logprobs_close
 
 MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
@@ -88,79 +84,6 @@ def _get_prompt(audio_count, question, placeholder):
                                          add_generation_prompt=True)
 
 
-def vllm_to_hf_output(vllm_output: tuple[list[int], str,
-                                         Optional[SampleLogprobs]],
-                      model: str):
-    """Sanitize vllm output to be comparable with hf output."""
-    output_ids, output_str, out_logprobs = vllm_output
-
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    eos_token_id = tokenizer.eos_token_id
-
-    hf_output_ids = output_ids[:]
-    hf_output_str = output_str
-    if hf_output_ids[-1] == eos_token_id:
-        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
-
-    return hf_output_ids, hf_output_str, out_logprobs
-
-
-def run_test(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
-    prompts_and_audios: list[tuple[str, str, AudioTuple]],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-    **kwargs,
-):
-    """Inference result should be the same between hf and vllm."""
-    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
-    model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip")
-
-    # NOTE: take care of the order. run vLLM first, and then run HF.
-    # vLLM needs a fresh new process without cuda initialization.
-    # if we run HF first, the cuda initialization will be done and it
-    # will hurt multiprocessing backend with fork method (the default method).
-
-    with vllm_runner(model, dtype=dtype, enforce_eager=True,
-                     **kwargs) as vllm_model:
-        vllm_outputs_per_audio = [
-            vllm_model.generate_greedy_logprobs([vllm_prompt],
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                audios=[audio])
-            for vllm_prompt, _, audio in prompts_and_audios
-        ]
-
-    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
-        hf_outputs_per_audio = [
-            hf_model.generate_greedy_logprobs_limit(
-                [hf_prompt],
-                max_tokens,
-                num_logprobs=num_logprobs,
-                audios=[(resample_audio_librosa(audio[0],
-                                                orig_sr=audio[1],
-                                                target_sr=16000), 16000)])
-            for _, hf_prompt, audio in prompts_and_audios
-        ]
-
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio,
-                                        vllm_outputs_per_audio):
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=[
-                vllm_to_hf_output(vllm_output, model)
-                for vllm_output in vllm_outputs
-            ],
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
 def run_multi_audio_test(
     vllm_runner: type[VllmRunner],
     prompts_and_audios: list[tuple[str, list[AudioTuple]]],
@@ -194,35 +117,6 @@ def run_multi_audio_test(
     assert all(tokens for tokens, *_ in vllm_outputs)
 
 
-@pytest.mark.core_model
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [
-    pytest.param({}, marks=pytest.mark.cpu_model),
-    pytest.param(CHUNKED_PREFILL_KWARGS),
-])
-def test_models(hf_runner, vllm_runner, audio_assets: AudioTestAssets,
-                dtype: str, max_tokens: int, num_logprobs: int,
-                vllm_kwargs: dict) -> None:
-    audio_inputs = [(
-        _get_prompt(1, audio, VLLM_PLACEHOLDER),
-        _get_prompt(1, audio, HF_PLACEHOLDER),
-        audio.audio_and_sample_rate,
-    ) for audio in audio_assets]
-
-    run_test(
-        hf_runner,
-        vllm_runner,
-        audio_inputs,
-        MODEL_NAME,
-        dtype=dtype,
-        max_tokens=max_tokens,
-        num_logprobs=num_logprobs,
-        **vllm_kwargs,
-    )
-
-
 @pytest.mark.core_model
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])