
Commit 4843252

Merge branch 'main' into min-gpu-memory
2 parents: 137307a + e585b58

21 files changed (+366, -186 lines)


.buildkite/test-pipeline.yaml

Lines changed: 32 additions & 19 deletions
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,7 +40,7 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html

-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -81,7 +82,7 @@ steps:
   commands:
   - pytest -v -s core

-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   mirror_hardwares: [amd]
@@ -151,7 +152,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization

-- label: Examples Test # 12min
+- label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -169,15 +170,15 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py

-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching

-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -193,7 +194,7 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py

-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
@@ -203,30 +204,30 @@ steps:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
   - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test"
+- label: "PyTorch Fullgraph Smoke Test" # 9min
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph_smoke.py

-- label: "PyTorch Fullgraph Test"
+- label: "PyTorch Fullgraph Test" # 18min
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 30min each
+- label: Kernels Test %N # 1h each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
@@ -256,7 +257,7 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh

-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -300,15 +301,15 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test # 1h3min
+- label: Decoder-only Language Models Test # 1h36min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language

-- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Decoder-only Multi-Modal Models Test # 1h31min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -318,15 +319,25 @@ steps:
   - pytest -v -s models/decoder_only/audio_language
   - pytest -v -s models/decoder_only/vision_language

-- label: Other Models Test # 5min
+- label: Other Models Test # 6min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
+  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/embedding/language
   - pytest -v -s models/encoder_decoder/language
+  - pytest -v -s models/encoder_decoder/vision_language
+
+- label: Custom Models Test
+  #mirror_hardwares: [amd]
+  optional: true
+  commands:
+  # PR authors can temporarily add commands below to test individual models
+  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -359,7 +370,7 @@ steps:
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -376,14 +387,16 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -401,7 +414,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py

-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -427,7 +440,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py

-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
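
The header comments above document a new `optional(bool)` step field: a step marked `optional: true` (such as the new "Custom Models Test") is never run by default and has to be unblocked manually. For illustration only (not part of this commit), a small helper along the lines below could list which steps require manual unblocking; it assumes PyYAML is installed and that `PIPELINE_PATH` points at a local checkout of the pipeline file.

```python
# Hypothetical helper, not part of this commit: list pipeline steps marked
# `optional: true`, i.e. steps that are never run by default.
import yaml  # assumes PyYAML is available

PIPELINE_PATH = ".buildkite/test-pipeline.yaml"  # assumed local checkout path


def optional_steps(path: str = PIPELINE_PATH) -> list:
    """Return the labels of steps that must be unblocked manually."""
    with open(path) as f:
        pipeline = yaml.safe_load(f)
    return [
        step.get("label", "<unlabeled>")
        for step in pipeline.get("steps", [])
        if step.get("optional", False)
    ]


if __name__ == "__main__":
    for label in optional_steps():
        print(label)  # after this change, prints "Custom Models Test"
```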

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 5 deletions
@@ -89,8 +89,6 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int, None]]:
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -117,7 +115,7 @@ def sample_sharegpt_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
             # Prune too short sequences.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
@@ -228,10 +226,11 @@ def sample_hf_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
             # Prune too short sequences.
             continue
-        if prompt_len > 1024 or prompt_len + output_len > 2048:
+        if fixed_output_len is None and \
+                (prompt_len > 1024 or prompt_len + output_len > 2048):
             # Prune too long sequences.
             continue
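
Taken together, these hunks change how an explicitly requested fixed output length interacts with the length filters: the hard `ValueError` for `fixed_output_len < 4` is dropped, the ShareGPT sampler no longer discards a request just because the sampled completion is short when `fixed_output_len` is set, and the HF sampler skips both the too-short and too-long checks entirely in that case. A simplified restatement of the updated predicates (a sketch for clarity, not code taken from the benchmark):

```python
# Simplified restatement of the updated pruning logic in benchmark_serving.py
# (sketch only; the real code operates on tokenized dataset entries).
from typing import Optional


def keep_sharegpt_request(prompt_len: int, output_len: int,
                          fixed_output_len: Optional[int]) -> bool:
    # A short sampled completion no longer disqualifies the request when a
    # fixed output length was requested; a short prompt still does.
    if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
        return False  # prune too-short sequences
    if prompt_len > 1024 or prompt_len + output_len > 2048:
        return False  # prune too-long sequences
    return True


def keep_hf_request(prompt_len: int, output_len: int,
                    fixed_output_len: Optional[int]) -> bool:
    # With a fixed output length, both length filters are skipped entirely.
    if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
        return False  # prune too-short sequences
    if fixed_output_len is None and \
            (prompt_len > 1024 or prompt_len + output_len > 2048):
        return False  # prune too-long sequences
    return True
```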

examples/offline_inference_vision_language.py

Lines changed: 19 additions & 9 deletions
@@ -12,14 +12,18 @@
 from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser

+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+

 # LLaVA-1.5
 def run_llava(question, modality):
     assert modality == "image"

     prompt = f"USER: <image>\n{question}\nASSISTANT:"

-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -57,7 +61,7 @@ def run_llava_onevision(question, modality):
     <|im_start|>assistant\n"

     llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
-              max_model_len=32768)
+              max_model_len=16384)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -67,7 +71,7 @@ def run_fuyu(question, modality):
     assert modality == "image"

     prompt = f"{question}\n"
-    llm = LLM(model="adept/fuyu-8b")
+    llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -99,7 +103,8 @@ def run_phi3v(question, modality):
     llm = LLM(
         model="microsoft/Phi-3-vision-128k-instruct",
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=4096,
+        max_num_seqs=2,
         mm_processor_kwargs={"num_crops": 16},
     )
     stop_token_ids = None
@@ -122,7 +127,7 @@ def run_chameleon(question, modality):
     assert modality == "image"

     prompt = f"{question}<image>"
-    llm = LLM(model="facebook/chameleon-7b")
+    llm = LLM(model="facebook/chameleon-7b", max_model_len=4096)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -145,6 +150,8 @@ def run_minicpmv(question, modality):
                                               trust_remote_code=True)
     llm = LLM(
         model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
         trust_remote_code=True,
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
@@ -177,7 +184,7 @@ def run_internvl(question, modality):
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=4096,
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -215,7 +222,8 @@ def run_qwen_vl(question, modality):
     llm = LLM(
         model="Qwen/Qwen-VL",
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=1024,
+        max_num_seqs=2,
     )

     prompt = f"{question}Picture 1: <img></img>\n"
@@ -229,8 +237,10 @@ def run_qwen2_vl(question, modality):

     model_name = "Qwen/Qwen2-VL-7B-Instruct"

+    # Tested on L40
     llm = LLM(
         model=model_name,
+        max_model_len=8192,
         max_num_seqs=5,
     )

@@ -252,10 +262,10 @@ def run_mllama(question, modality):
     # max_model_len (131072) for this model may cause OOM.
     # You may lower either to run this example on lower-end GPUs.

-    # The configuration below has been confirmed to launch on a
-    # single H100 GPU.
+    # The configuration below has been confirmed to launch on a single L40 GPU.
     llm = LLM(
         model=model_name,
+        max_model_len=4096,
         max_num_seqs=16,
         enforce_eager=True,
     )
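
The recurring change in this file is pinning `max_model_len` and/or `max_num_seqs` so the example models fit on lower-end GPUs; per the new NOTE, the settings are expected to work on a single L4 unless stated otherwise. A minimal standalone sketch of the reduced-footprint LLaVA-1.5 configuration, assuming a vLLM install and a single L4-class GPU; the question, image asset, and sampling parameters below are illustrative and not part of the diff:

```python
# Sketch only: run the LLaVA-1.5 example with the memory-friendly settings
# from the hunk above (assumes vLLM is installed and one L4-class GPU).
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Smaller context window keeps the KV cache within a 24 GB card.
llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)

question = "What is shown in this image?"
prompt = f"USER: <image>\n{question}\nASSISTANT:"
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```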

examples/offline_inference_vision_language_multi_image.py

Lines changed: 10 additions & 3 deletions
@@ -28,12 +28,18 @@ class ModelRequestData(NamedTuple):
     chat_template: Optional[str]


+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+
+
 def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=1024,
+        max_num_seqs=2,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
     placeholders = "".join(f"Picture {i}: <img></img>\n"
@@ -83,6 +89,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         max_model_len=4096,
+        max_num_seqs=2,
         limit_mm_per_prompt={"image": len(image_urls)},
         mm_processor_kwargs={"num_crops": 4},
     )
@@ -106,7 +113,6 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
-        max_num_seqs=5,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
@@ -148,10 +154,11 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:

     model_name = "Qwen/Qwen2-VL-7B-Instruct"

+    # Tested on L40
     llm = LLM(
         model=model_name,
-        max_num_seqs=5,
         max_model_len=32768 if process_vision_info is None else 4096,
+        max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
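
The multi-image loaders get the same treatment, with `limit_mm_per_prompt` additionally capping how many images one prompt may carry. A hedged sketch of the Phi-3.5-vision settings from the hunk above; the image URLs, question, and prompt formatting are illustrative assumptions, not content of this commit:

```python
# Sketch only: multi-image inference with the load_phi3v settings shown above
# (assumes vLLM is installed; the URLs below are placeholders).
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

image_urls = [
    "https://example.com/first-image.jpg",   # placeholder URL
    "https://example.com/second-image.jpg",  # placeholder URL
]

llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,                                  # keeps batch memory low
    limit_mm_per_prompt={"image": len(image_urls)},  # allow two images per prompt
    mm_processor_kwargs={"num_crops": 4},
)

placeholders = "\n".join(f"<|image_{i}|>"
                         for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\nWhat is shown in each image?<|end|>\n<|assistant|>\n"

outputs = llm.generate(
    {"prompt": prompt,
     "multi_modal_data": {"image": [fetch_image(url) for url in image_urls]}},
    sampling_params=SamplingParams(temperature=0.0, max_tokens=128),
)
print(outputs[0].outputs[0].text)
```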
