
Commit 4843252

Merge branch 'main' into min-gpu-memory
2 parents: 137307a + e585b58

21 files changed (+366, -186 lines)


.buildkite/test-pipeline.yaml

Lines changed: 32 additions & 19 deletions
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatbile with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,7 +40,7 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html

-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -81,7 +82,7 @@ steps:
   commands:
   - pytest -v -s core

-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   mirror_hardwares: [amd]
@@ -151,7 +152,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization

-- label: Examples Test # 12min
+- label: Examples Test # 15min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -169,15 +170,15 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py

-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching

-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -193,7 +194,7 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py

-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
@@ -203,30 +204,30 @@ steps:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
   - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test"
+- label: "PyTorch Fullgraph Smoke Test" # 9min
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph_smoke.py

-- label: "PyTorch Fullgraph Test"
+- label: "PyTorch Fullgraph Test" # 18min
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 30min each
+- label: Kernels Test %N # 1h each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
@@ -256,7 +257,7 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh

-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -300,15 +301,15 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test # 1h3min
+- label: Decoder-only Language Models Test # 1h36min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language

-- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Decoder-only Multi-Modal Models Test # 1h31min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -318,15 +319,25 @@ steps:
   - pytest -v -s models/decoder_only/audio_language
   - pytest -v -s models/decoder_only/vision_language

-- label: Other Models Test # 5min
+- label: Other Models Test # 6min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
+  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/embedding/language
   - pytest -v -s models/encoder_decoder/language
+  - pytest -v -s models/encoder_decoder/vision_language
+
+- label: Custom Models Test
+  #mirror_hardwares: [amd]
+  optional: true
+  commands:
+  # PR authors can temporarily add commands below to test individual models
+  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -359,7 +370,7 @@ steps:
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -376,14 +387,16 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -401,7 +414,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py

-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -427,7 +440,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py

-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
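
The header comments above document a new `optional(bool)` step field: a step marked `optional: true` (such as the new "Custom Models Test") is never run by default and has to be unblocked manually. For illustration only (not part of this commit), a small helper along the lines below could list which steps require manual unblocking; it assumes PyYAML is installed and that `PIPELINE_PATH` points at a local checkout of the pipeline file.

```python
# Hypothetical helper, not part of this commit: list pipeline steps marked
# `optional: true`, i.e. steps that are never run by default.
import yaml  # assumes PyYAML is available

PIPELINE_PATH = ".buildkite/test-pipeline.yaml"  # assumed local checkout path


def optional_steps(path: str = PIPELINE_PATH) -> list:
    """Return the labels of steps that must be unblocked manually."""
    with open(path) as f:
        pipeline = yaml.safe_load(f)
    return [
        step.get("label", "<unlabeled>")
        for step in pipeline.get("steps", [])
        if step.get("optional", False)
    ]


if __name__ == "__main__":
    for label in optional_steps():
        print(label)  # after this change, prints "Custom Models Test"
```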

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 5 deletions
@@ -89,8 +89,6 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int, None]]:
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -117,7 +115,7 @@ def sample_sharegpt_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
             # Prune too short sequences.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
@@ -228,10 +226,11 @@ def sample_hf_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
+        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
             # Prune too short sequences.
             continue
-        if prompt_len > 1024 or prompt_len + output_len > 2048:
+        if fixed_output_len is None and \
+                (prompt_len > 1024 or prompt_len + output_len > 2048):
             # Prune too long sequences.
             continue
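
Taken together, these hunks change how an explicitly requested fixed output length interacts with the length filters: the hard `ValueError` for `fixed_output_len < 4` is dropped, the ShareGPT sampler no longer discards a request just because the sampled completion is short when `fixed_output_len` is set, and the HF sampler skips both the too-short and too-long checks entirely in that case. A simplified restatement of the updated predicates (a sketch for clarity, not code taken from the benchmark):

```python
# Simplified restatement of the updated pruning logic in benchmark_serving.py
# (sketch only; the real code operates on tokenized dataset entries).
from typing import Optional


def keep_sharegpt_request(prompt_len: int, output_len: int,
                          fixed_output_len: Optional[int]) -> bool:
    # A short sampled completion no longer disqualifies the request when a
    # fixed output length was requested; a short prompt still does.
    if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
        return False  # prune too-short sequences
    if prompt_len > 1024 or prompt_len + output_len > 2048:
        return False  # prune too-long sequences
    return True


def keep_hf_request(prompt_len: int, output_len: int,
                    fixed_output_len: Optional[int]) -> bool:
    # With a fixed output length, both length filters are skipped entirely.
    if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
        return False  # prune too-short sequences
    if fixed_output_len is None and \
            (prompt_len > 1024 or prompt_len + output_len > 2048):
        return False  # prune too-long sequences
    return True
```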

examples/offline_inference_vision_language.py

Lines changed: 19 additions & 9 deletions
@@ -12,14 +12,18 @@
 from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser

+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+

 # LLaVA-1.5
 def run_llava(question, modality):
     assert modality == "image"

     prompt = f"USER: <image>\n{question}\nASSISTANT:"

-    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -57,7 +61,7 @@ def run_llava_onevision(question, modality):
     <|im_start|>assistant\n"

     llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
-              max_model_len=32768)
+              max_model_len=16384)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -67,7 +71,7 @@ def run_fuyu(question, modality):
     assert modality == "image"

     prompt = f"{question}\n"
-    llm = LLM(model="adept/fuyu-8b")
+    llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -99,7 +103,8 @@ def run_phi3v(question, modality):
     llm = LLM(
         model="microsoft/Phi-3-vision-128k-instruct",
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=4096,
+        max_num_seqs=2,
         mm_processor_kwargs={"num_crops": 16},
     )
     stop_token_ids = None
@@ -122,7 +127,7 @@ def run_chameleon(question, modality):
     assert modality == "image"

     prompt = f"{question}<image>"
-    llm = LLM(model="facebook/chameleon-7b")
+    llm = LLM(model="facebook/chameleon-7b", max_model_len=4096)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -145,6 +150,8 @@ def run_minicpmv(question, modality):
                                               trust_remote_code=True)
     llm = LLM(
         model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
         trust_remote_code=True,
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
@@ -177,7 +184,7 @@ def run_internvl(question, modality):
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=4096,
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -215,7 +222,8 @@ def run_qwen_vl(question, modality):
     llm = LLM(
         model="Qwen/Qwen-VL",
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=1024,
+        max_num_seqs=2,
     )

     prompt = f"{question}Picture 1: <img></img>\n"
@@ -229,8 +237,10 @@ def run_qwen2_vl(question, modality):

     model_name = "Qwen/Qwen2-VL-7B-Instruct"

+    # Tested on L40
     llm = LLM(
         model=model_name,
+        max_model_len=8192,
         max_num_seqs=5,
     )

@@ -252,10 +262,10 @@ def run_mllama(question, modality):
     # max_model_len (131072) for this model may cause OOM.
     # You may lower either to run this example on lower-end GPUs.

-    # The configuration below has been confirmed to launch on a
-    # single H100 GPU.
+    # The configuration below has been confirmed to launch on a single L40 GPU.
     llm = LLM(
         model=model_name,
+        max_model_len=4096,
         max_num_seqs=16,
         enforce_eager=True,
     )
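
The recurring change in this file is pinning `max_model_len` and/or `max_num_seqs` so the example models fit on lower-end GPUs; per the new NOTE, the settings are expected to work on a single L4 unless stated otherwise. A minimal standalone sketch of the reduced-footprint LLaVA-1.5 configuration, assuming a vLLM install and a single L4-class GPU; the question, image asset, and sampling parameters below are illustrative and not part of the diff:

```python
# Sketch only: run the LLaVA-1.5 example with the memory-friendly settings
# from the hunk above (assumes vLLM is installed and one L4-class GPU).
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Smaller context window keeps the KV cache within a 24 GB card.
llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)

question = "What is shown in this image?"
prompt = f"USER: <image>\n{question}\nASSISTANT:"
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```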

examples/offline_inference_vision_language_multi_image.py

Lines changed: 10 additions & 3 deletions
@@ -28,12 +28,18 @@ class ModelRequestData(NamedTuple):
     chat_template: Optional[str]


+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+
+
 def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
-        max_num_seqs=5,
+        max_model_len=1024,
+        max_num_seqs=2,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
     placeholders = "".join(f"Picture {i}: <img></img>\n"
@@ -83,6 +89,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         max_model_len=4096,
+        max_num_seqs=2,
         limit_mm_per_prompt={"image": len(image_urls)},
         mm_processor_kwargs={"num_crops": 4},
     )
@@ -106,7 +113,6 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
     llm = LLM(
         model=model_name,
         trust_remote_code=True,
-        max_num_seqs=5,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
@@ -148,10 +154,11 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:

     model_name = "Qwen/Qwen2-VL-7B-Instruct"

+    # Tested on L40
     llm = LLM(
         model=model_name,
-        max_num_seqs=5,
         max_model_len=32768 if process_vision_info is None else 4096,
+        max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
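
The multi-image loaders get the same treatment, with `limit_mm_per_prompt` additionally capping how many images one prompt may carry. A hedged sketch of the Phi-3.5-vision settings from the hunk above; the image URLs, question, and prompt formatting are illustrative assumptions, not content of this commit:

```python
# Sketch only: multi-image inference with the load_phi3v settings shown above
# (assumes vLLM is installed; the URLs below are placeholders).
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

image_urls = [
    "https://example.com/first-image.jpg",   # placeholder URL
    "https://example.com/second-image.jpg",  # placeholder URL
]

llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,                                  # keeps batch memory low
    limit_mm_per_prompt={"image": len(image_urls)},  # allow two images per prompt
    mm_processor_kwargs={"num_crops": 4},
)

placeholders = "\n".join(f"<|image_{i}|>"
                         for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\nWhat is shown in each image?<|end|>\n<|assistant|>\n"

outputs = llm.generate(
    {"prompt": prompt,
     "multi_modal_data": {"image": [fetch_image(url) for url in image_urls]}},
    sampling_params=SamplingParams(temperature=0.0, max_tokens=128),
)
print(outputs[0].outputs[0].text)
```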
