  1 |   1 | # SPDX-License-Identifier: Apache-2.0
  2 |   2 |
  3 |   3 | import json
  4 |     | -from typing import Any, Optional
    |   4 | +from typing import Any
  5 |   5 |
  6 |   6 | import numpy as np
  7 |   7 | import pytest
  8 |   8 | import pytest_asyncio
  9 |     | -from transformers import AutoModel, AutoTokenizer
    |   9 | +from transformers import AutoTokenizer
 10 |  10 |
 11 |     | -from vllm.multimodal.audio import resample_audio_librosa
 12 |     | -from vllm.sequence import SampleLogprobs
 13 |     | -
 14 |     | -from ....conftest import AUDIO_ASSETS, AudioTestAssets, HfRunner, VllmRunner
    |  11 | +from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
 15 |  12 | from ....utils import RemoteOpenAIServer
 16 |  13 | from ...registry import HF_EXAMPLE_MODELS
 17 |     | -from ...utils import check_logprobs_close
 18 |  14 |
 19 |  15 | MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 20 |  16 |
@@ -88,79 +84,6 @@ def _get_prompt(audio_count, question, placeholder):
 88 |  84 |                                          add_generation_prompt=True)
 89 |  85 |
 90 |  86 |
 91 |     | -def vllm_to_hf_output(vllm_output: tuple[list[int], str,
 92 |     | -                                          Optional[SampleLogprobs]],
 93 |     | -                      model: str):
 94 |     | -    """Sanitize vllm output to be comparable with hf output."""
 95 |     | -    output_ids, output_str, out_logprobs = vllm_output
 96 |     | -
 97 |     | -    tokenizer = AutoTokenizer.from_pretrained(model)
 98 |     | -    eos_token_id = tokenizer.eos_token_id
 99 |     | -
100 |     | -    hf_output_ids = output_ids[:]
101 |     | -    hf_output_str = output_str
102 |     | -    if hf_output_ids[-1] == eos_token_id:
103 |     | -        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
104 |     | -
105 |     | -    return hf_output_ids, hf_output_str, out_logprobs
106 |     | -
107 |     | -
108 |     | -def run_test(
109 |     | -    hf_runner: type[HfRunner],
110 |     | -    vllm_runner: type[VllmRunner],
111 |     | -    prompts_and_audios: list[tuple[str, str, AudioTuple]],
112 |     | -    model: str,
113 |     | -    *,
114 |     | -    dtype: str,
115 |     | -    max_tokens: int,
116 |     | -    num_logprobs: int,
117 |     | -    **kwargs,
118 |     | -):
119 |     | -    """Inference result should be the same between hf and vllm."""
120 |     | -    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
121 |     | -    model_info.check_available_online(on_fail="skip")
122 |     | -    model_info.check_transformers_version(on_fail="skip")
123 |     | -
124 |     | -    # NOTE: take care of the order. run vLLM first, and then run HF.
125 |     | -    # vLLM needs a fresh new process without cuda initialization.
126 |     | -    # if we run HF first, the cuda initialization will be done and it
127 |     | -    # will hurt multiprocessing backend with fork method (the default method).
128 |     | -
129 |     | -    with vllm_runner(model, dtype=dtype, enforce_eager=True,
130 |     | -                     **kwargs) as vllm_model:
131 |     | -        vllm_outputs_per_audio = [
132 |     | -            vllm_model.generate_greedy_logprobs([vllm_prompt],
133 |     | -                                                max_tokens,
134 |     | -                                                num_logprobs=num_logprobs,
135 |     | -                                                audios=[audio])
136 |     | -            for vllm_prompt, _, audio in prompts_and_audios
137 |     | -        ]
138 |     | -
139 |     | -    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
140 |     | -        hf_outputs_per_audio = [
141 |     | -            hf_model.generate_greedy_logprobs_limit(
142 |     | -                [hf_prompt],
143 |     | -                max_tokens,
144 |     | -                num_logprobs=num_logprobs,
145 |     | -                audios=[(resample_audio_librosa(audio[0],
146 |     | -                                                orig_sr=audio[1],
147 |     | -                                                target_sr=16000), 16000)])
148 |     | -            for _, hf_prompt, audio in prompts_and_audios
149 |     | -        ]
150 |     | -
151 |     | -    for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio,
152 |     | -                                        vllm_outputs_per_audio):
153 |     | -        check_logprobs_close(
154 |     | -            outputs_0_lst=hf_outputs,
155 |     | -            outputs_1_lst=[
156 |     | -                vllm_to_hf_output(vllm_output, model)
157 |     | -                for vllm_output in vllm_outputs
158 |     | -            ],
159 |     | -            name_0="hf",
160 |     | -            name_1="vllm",
161 |     | -        )
162 |     | -
163 |     | -
164 |  87 | def run_multi_audio_test(
165 |  88 |     vllm_runner: type[VllmRunner],
166 |  89 |     prompts_and_audios: list[tuple[str, list[AudioTuple]]],
@@ -194,35 +117,6 @@ def run_multi_audio_test(
194 | 117 |     assert all(tokens for tokens, *_ in vllm_outputs)
195 | 118 |
196 | 119 |
197 |     | -@pytest.mark.core_model
198 |     | -@pytest.mark.parametrize("dtype", ["bfloat16"])
199 |     | -@pytest.mark.parametrize("max_tokens", [128])
200 |     | -@pytest.mark.parametrize("num_logprobs", [5])
201 |     | -@pytest.mark.parametrize("vllm_kwargs", [
202 |     | -    pytest.param({}, marks=pytest.mark.cpu_model),
203 |     | -    pytest.param(CHUNKED_PREFILL_KWARGS),
204 |     | -])
205 |     | -def test_models(hf_runner, vllm_runner, audio_assets: AudioTestAssets,
206 |     | -                dtype: str, max_tokens: int, num_logprobs: int,
207 |     | -                vllm_kwargs: dict) -> None:
208 |     | -    audio_inputs = [(
209 |     | -        _get_prompt(1, audio, VLLM_PLACEHOLDER),
210 |     | -        _get_prompt(1, audio, HF_PLACEHOLDER),
211 |     | -        audio.audio_and_sample_rate,
212 |     | -    ) for audio in audio_assets]
213 |     | -
214 |     | -    run_test(
215 |     | -        hf_runner,
216 |     | -        vllm_runner,
217 |     | -        audio_inputs,
218 |     | -        MODEL_NAME,
219 |     | -        dtype=dtype,
220 |     | -        max_tokens=max_tokens,
221 |     | -        num_logprobs=num_logprobs,
222 |     | -        **vllm_kwargs,
223 |     | -    )
224 |     | -
225 |     | -
226 | 120 | @pytest.mark.core_model
227 | 121 | @pytest.mark.parametrize("dtype", ["half"])
228 | 122 | @pytest.mark.parametrize("max_tokens", [128])