
Commit 47ed7e1

Merge branch 'main' into nvlm_d
2 parents: 6d54d59 + 4f95ffe


66 files changed (+2371 / -1575 lines)

.buildkite/run-cpu-test.sh

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language \
     --ignore=tests/models/test_fp8.py \
     --ignore=tests/models/decoder_only/language/test_jamba.py \

benchmarks/backend_request_func.py

Lines changed: 0 additions & 6 deletions
@@ -23,7 +23,6 @@ class RequestFuncInput:
     output_len: int
     model: str
     best_of: int = 1
-    use_beam_search: bool = False
     logprobs: Optional[int] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
@@ -49,7 +48,6 @@ async def async_request_tgi(
     assert api_url.endswith("generate_stream")
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         params = {
             "best_of": request_func_input.best_of,
             "max_new_tokens": request_func_input.output_len,
@@ -121,7 +119,6 @@ async def async_request_trt_llm(
     assert api_url.endswith("generate_stream")
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         assert request_func_input.best_of == 1
         payload = {
             "accumulate_tokens": True,
@@ -187,7 +184,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1
-        assert not request_func_input.use_beam_search
 
         payload = {
             "prompt": request_func_input.prompt,
@@ -235,7 +231,6 @@ async def async_request_openai_completions(
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         payload = {
             "model": request_func_input.model,
             "prompt": request_func_input.prompt,
@@ -317,7 +312,6 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         content = [{"type": "text", "text": request_func_input.prompt}]
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
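For context, a minimal sketch of how a benchmark request is built against the trimmed-down RequestFuncInput after this change. The prompt, api_url, and prompt_len fields are not visible in the hunks above and are assumed from the surrounding benchmark code; the endpoint URL and model name are illustrative only.

import asyncio

from backend_request_func import (RequestFuncInput,
                                  async_request_openai_completions)

# Build a request without the removed use_beam_search flag.
request_input = RequestFuncInput(
    prompt="Hello, world",                           # assumed field
    api_url="http://localhost:8000/v1/completions",  # illustrative URL
    prompt_len=3,                                    # assumed field
    output_len=32,
    model="facebook/opt-125m",                       # illustrative model
    best_of=1,
    logprobs=None,
    ignore_eos=False,
)

# Fire a single completion request and report whether it succeeded.
output = asyncio.run(
    async_request_openai_completions(request_func_input=request_input))
print(output.success)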

benchmarks/benchmark_latency.py

Lines changed: 1 addition & 2 deletions
@@ -51,9 +51,8 @@ def main(args: argparse.Namespace):
 
     sampling_params = SamplingParams(
         n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
+        temperature=1.0,
         top_p=1.0,
-        use_beam_search=args.use_beam_search,
         ignore_eos=True,
         max_tokens=args.output_len,
     )
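After this change the latency benchmark always samples; beam search is no longer a SamplingParams field. A minimal sketch of the resulting call pattern (model name and prompt are illustrative only):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # illustrative model

# Beam search is gone from SamplingParams, so temperature stays at 1.0
# instead of being switched to 0.0 by --use-beam-search.
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    ignore_eos=True,
    max_tokens=128,
)

outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)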

benchmarks/benchmark_prioritization.py

Lines changed: 11 additions & 13 deletions
@@ -68,7 +68,6 @@ def run_vllm(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -114,9 +113,8 @@ def run_vllm(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -144,15 +142,16 @@ def main(args: argparse.Namespace):
                                    args.output_len)
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
-            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.download_dir)
+        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+                                args.quantization, args.tensor_parallel_size,
+                                args.seed, args.n, args.trust_remote_code,
+                                args.dtype, args.max_model_len,
+                                args.enforce_eager, args.kv_cache_dtype,
+                                args.quantization_param_path, args.device,
+                                args.enable_prefix_caching,
+                                args.enable_chunked_prefill,
+                                args.max_num_batched_tokens,
+                                args.gpu_memory_utilization, args.download_dir)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(prompt_len + output_len
@@ -203,7 +202,6 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1,
                         help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument("--num-prompts",
                         type=int,
                         default=200,

benchmarks/benchmark_serving.py

Lines changed: 0 additions & 7 deletions
@@ -391,7 +391,6 @@ async def benchmark(
     input_requests: List[Tuple[str, int, int]],
     logprobs: Optional[int],
     best_of: int,
-    use_beam_search: bool,
     request_rate: float,
     disable_tqdm: bool,
     profile: bool,
@@ -419,7 +418,6 @@ async def benchmark(
         output_len=test_output_len,
         logprobs=logprobs,
         best_of=best_of,
-        use_beam_search=use_beam_search,
         multi_modal_content=test_mm_content,
         ignore_eos=ignore_eos,
     )
@@ -441,7 +439,6 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
             multi_modal_content=test_mm_content,
         )
         profile_output = await request_func(request_func_input=profile_input)
@@ -464,7 +461,6 @@ async def benchmark(
             output_len=output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
             multi_modal_content=mm_content,
         )
         tasks.append(
@@ -483,7 +479,6 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -679,7 +674,6 @@ def main(args: argparse.Namespace):
             input_requests=input_requests,
             logprobs=args.logprobs,
             best_of=args.best_of,
-            use_beam_search=args.use_beam_search,
             request_rate=args.request_rate,
             disable_tqdm=args.disable_tqdm,
             profile=args.profile,
@@ -701,7 +695,6 @@ def main(args: argparse.Namespace):
         result_json["model_id"] = model_id
         result_json["tokenizer_id"] = tokenizer_id
         result_json["best_of"] = args.best_of
-        result_json["use_beam_search"] = args.use_beam_search
         result_json["num_prompts"] = args.num_prompts
 
         # Metadata

benchmarks/benchmark_throughput.py

Lines changed: 17 additions & 24 deletions
@@ -15,6 +15,7 @@
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 
 
@@ -72,7 +73,6 @@ def run_vllm(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -90,7 +90,6 @@ def run_vllm(
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
     disable_async_output_proc: bool = False,
-    use_new_beam_search_impl: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -126,29 +125,32 @@ def run_vllm(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
 
-    if not use_new_beam_search_impl:
+    use_beam_search = False
+
+    if not use_beam_search:
         start = time.perf_counter()
         llm.generate(prompts, sampling_params, use_tqdm=True)
         end = time.perf_counter()
     else:
-        assert use_beam_search
         prompts = [prompt for prompt, _, _ in requests]
         # output_len should be the same for all requests.
         output_len = requests[0][2]
         for prompt, input_len, _output_len in requests:
             assert _output_len == output_len
         start = time.perf_counter()
-        llm.beam_search(prompts,
-                        beam_width=n,
-                        max_tokens=output_len,
-                        ignore_eos=True)
+        llm.beam_search(
+            prompts,
+            BeamSearchParams(
+                beam_width=n,
+                max_tokens=output_len,
+                ignore_eos=True,
+            ))
         end = time.perf_counter()
     return end - start
 
@@ -161,7 +163,6 @@ async def run_vllm_async(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -220,9 +221,8 @@ async def run_vllm_async(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -244,11 +244,9 @@ def run_hf(
     model: str,
     tokenizer: PreTrainedTokenizerBase,
     n: int,
-    use_beam_search: bool,
     max_batch_size: int,
     trust_remote_code: bool,
 ) -> float:
-    assert not use_beam_search
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
@@ -280,7 +278,7 @@ def run_hf(
                               padding=True).input_ids
         llm_outputs = llm.generate(
             input_ids=input_ids.cuda(),
-            do_sample=not use_beam_search,
+            do_sample=True,
             num_return_sequences=n,
             temperature=1.0,
             top_p=1.0,
@@ -336,7 +334,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         run_args = [
             requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.tensor_parallel_size, args.seed, args.n,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
@@ -351,12 +349,11 @@ def main(args: argparse.Namespace):
             run_args.append(args.disable_frontend_multiprocessing)
             elapsed_time = uvloop.run(run_vllm_async(*run_args))
         else:
-            elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl)
+            elapsed_time = run_vllm(*run_args)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
@@ -410,8 +407,6 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1,
                         help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--use-new-beam-search-impl", action="store_true")
     parser.add_argument("--num-prompts",
                         type=int,
                         default=1000,
@@ -566,8 +561,6 @@ def main(args: argparse.Namespace):
         raise ValueError("dtype must be auto for MII backend.")
     if args.n != 1:
         raise ValueError("n must be 1 for MII backend.")
-    if args.use_beam_search:
-        raise ValueError("Beam search is not supported for MII backend.")
     if args.quantization is not None:
         raise ValueError("Quantization is only for vLLM backend.")
     if args.hf_max_batch_size is not None:
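The substantive change in this file is that beam search now goes through a dedicated entry point instead of a SamplingParams flag. A minimal sketch of timing that path, mirroring the new run_vllm() code above; BeamSearchParams and LLM.beam_search are taken from the diff, while the model name and prompts are illustrative only.

import time

from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="facebook/opt-125m")  # illustrative model
prompts = ["The capital of France is", "The largest planet is"]

# Time beam search through the dedicated API used by run_vllm() above.
start = time.perf_counter()
llm.beam_search(
    prompts,
    BeamSearchParams(
        beam_width=4,
        max_tokens=64,
        ignore_eos=True,
    ))
elapsed = time.perf_counter() - start
print(f"beam search took {elapsed:.2f} s")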
