
Commit 47ed7e1

Merge branch 'main' into nvlm_d
2 parents: 6d54d59 + 4f95ffe


66 files changed (+2371 / -1575 lines)

.buildkite/run-cpu-test.sh

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
+  pytest -v -s tests/models/encoder_decoder/language
   pytest -v -s tests/models/decoder_only/language \
     --ignore=tests/models/test_fp8.py \
     --ignore=tests/models/decoder_only/language/test_jamba.py \

benchmarks/backend_request_func.py

Lines changed: 0 additions & 6 deletions
@@ -23,7 +23,6 @@ class RequestFuncInput:
     output_len: int
     model: str
     best_of: int = 1
-    use_beam_search: bool = False
     logprobs: Optional[int] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
@@ -49,7 +48,6 @@ async def async_request_tgi(
     assert api_url.endswith("generate_stream")
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         params = {
             "best_of": request_func_input.best_of,
             "max_new_tokens": request_func_input.output_len,
@@ -121,7 +119,6 @@ async def async_request_trt_llm(
     assert api_url.endswith("generate_stream")
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         assert request_func_input.best_of == 1
         payload = {
             "accumulate_tokens": True,
@@ -187,7 +184,6 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
         assert request_func_input.best_of == 1
-        assert not request_func_input.use_beam_search
 
         payload = {
             "prompt": request_func_input.prompt,
@@ -235,7 +231,6 @@ async def async_request_openai_completions(
     ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         payload = {
             "model": request_func_input.model,
             "prompt": request_func_input.prompt,
@@ -317,7 +312,6 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
         content = [{"type": "text", "text": request_func_input.prompt}]
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
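For context, a minimal sketch of how a benchmark request is built against the trimmed-down RequestFuncInput after this change. The prompt, api_url, and prompt_len fields are not visible in the hunks above and are assumed from the surrounding benchmark code; the endpoint URL and model name are illustrative only.

import asyncio

from backend_request_func import (RequestFuncInput,
                                  async_request_openai_completions)

# Build a request without the removed use_beam_search flag.
request_input = RequestFuncInput(
    prompt="Hello, world",                           # assumed field
    api_url="http://localhost:8000/v1/completions",  # illustrative URL
    prompt_len=3,                                    # assumed field
    output_len=32,
    model="facebook/opt-125m",                       # illustrative model
    best_of=1,
    logprobs=None,
    ignore_eos=False,
)

# Fire a single completion request and report whether it succeeded.
output = asyncio.run(
    async_request_openai_completions(request_func_input=request_input))
print(output.success)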

benchmarks/benchmark_latency.py

Lines changed: 1 addition & 2 deletions
@@ -51,9 +51,8 @@ def main(args: argparse.Namespace):
 
     sampling_params = SamplingParams(
         n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
+        temperature=1.0,
         top_p=1.0,
-        use_beam_search=args.use_beam_search,
         ignore_eos=True,
         max_tokens=args.output_len,
     )
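After this change the latency benchmark always samples; beam search is no longer a SamplingParams field. A minimal sketch of the resulting call pattern (model name and prompt are illustrative only):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # illustrative model

# Beam search is gone from SamplingParams, so temperature stays at 1.0
# instead of being switched to 0.0 by --use-beam-search.
sampling_params = SamplingParams(
    n=1,
    temperature=1.0,
    top_p=1.0,
    ignore_eos=True,
    max_tokens=128,
)

outputs = llm.generate(["Hello, my name is"], sampling_params)
print(outputs[0].outputs[0].text)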

benchmarks/benchmark_prioritization.py

Lines changed: 11 additions & 13 deletions
@@ -68,7 +68,6 @@ def run_vllm(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -114,9 +113,8 @@ def run_vllm(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -144,15 +142,16 @@ def main(args: argparse.Namespace):
                                    args.output_len)
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
-            args.enforce_eager, args.kv_cache_dtype,
-            args.quantization_param_path, args.device,
-            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.download_dir)
+        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+                                args.quantization, args.tensor_parallel_size,
+                                args.seed, args.n, args.trust_remote_code,
+                                args.dtype, args.max_model_len,
+                                args.enforce_eager, args.kv_cache_dtype,
+                                args.quantization_param_path, args.device,
+                                args.enable_prefix_caching,
+                                args.enable_chunked_prefill,
+                                args.max_num_batched_tokens,
+                                args.gpu_memory_utilization, args.download_dir)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(prompt_len + output_len
@@ -203,7 +202,6 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1,
                         help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument("--num-prompts",
                         type=int,
                         default=200,

benchmarks/benchmark_serving.py

Lines changed: 0 additions & 7 deletions
@@ -391,7 +391,6 @@ async def benchmark(
     input_requests: List[Tuple[str, int, int]],
     logprobs: Optional[int],
     best_of: int,
-    use_beam_search: bool,
     request_rate: float,
     disable_tqdm: bool,
     profile: bool,
@@ -419,7 +418,6 @@ async def benchmark(
         output_len=test_output_len,
         logprobs=logprobs,
         best_of=best_of,
-        use_beam_search=use_beam_search,
         multi_modal_content=test_mm_content,
         ignore_eos=ignore_eos,
     )
@@ -441,7 +439,6 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
             multi_modal_content=test_mm_content,
         )
         profile_output = await request_func(request_func_input=profile_input)
@@ -464,7 +461,6 @@ async def benchmark(
             output_len=output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
             multi_modal_content=mm_content,
         )
         tasks.append(
@@ -483,7 +479,6 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
             best_of=best_of,
-            use_beam_search=use_beam_search,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -679,7 +674,6 @@ def main(args: argparse.Namespace):
             input_requests=input_requests,
             logprobs=args.logprobs,
             best_of=args.best_of,
-            use_beam_search=args.use_beam_search,
             request_rate=args.request_rate,
             disable_tqdm=args.disable_tqdm,
             profile=args.profile,
@@ -701,7 +695,6 @@ def main(args: argparse.Namespace):
         result_json["model_id"] = model_id
         result_json["tokenizer_id"] = tokenizer_id
         result_json["best_of"] = args.best_of
-        result_json["use_beam_search"] = args.use_beam_search
         result_json["num_prompts"] = args.num_prompts
 
         # Metadata

benchmarks/benchmark_throughput.py

Lines changed: 17 additions & 24 deletions
@@ -15,6 +15,7 @@
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 
 
@@ -72,7 +73,6 @@ def run_vllm(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -90,7 +90,6 @@ def run_vllm(
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
     disable_async_output_proc: bool = False,
-    use_new_beam_search_impl: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -126,29 +125,32 @@ def run_vllm(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
 
-    if not use_new_beam_search_impl:
+    use_beam_search = False
+
+    if not use_beam_search:
         start = time.perf_counter()
         llm.generate(prompts, sampling_params, use_tqdm=True)
         end = time.perf_counter()
     else:
-        assert use_beam_search
         prompts = [prompt for prompt, _, _ in requests]
         # output_len should be the same for all requests.
         output_len = requests[0][2]
         for prompt, input_len, _output_len in requests:
             assert _output_len == output_len
         start = time.perf_counter()
-        llm.beam_search(prompts,
-                        beam_width=n,
-                        max_tokens=output_len,
-                        ignore_eos=True)
+        llm.beam_search(
+            prompts,
+            BeamSearchParams(
+                beam_width=n,
+                max_tokens=output_len,
+                ignore_eos=True,
+            ))
         end = time.perf_counter()
     return end - start
 
@@ -161,7 +163,6 @@ async def run_vllm_async(
     tensor_parallel_size: int,
     seed: int,
     n: int,
-    use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
     max_model_len: Optional[int],
@@ -220,9 +221,8 @@ async def run_vllm_async(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0 if use_beam_search else 1.0,
+                temperature=1.0,
                 top_p=1.0,
-                use_beam_search=use_beam_search,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
@@ -244,11 +244,9 @@ def run_hf(
     model: str,
     tokenizer: PreTrainedTokenizerBase,
     n: int,
-    use_beam_search: bool,
     max_batch_size: int,
     trust_remote_code: bool,
 ) -> float:
-    assert not use_beam_search
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
@@ -280,7 +278,7 @@ def run_hf(
                               padding=True).input_ids
         llm_outputs = llm.generate(
             input_ids=input_ids.cuda(),
-            do_sample=not use_beam_search,
+            do_sample=True,
             num_return_sequences=n,
             temperature=1.0,
             top_p=1.0,
@@ -336,7 +334,7 @@ def main(args: argparse.Namespace):
     if args.backend == "vllm":
         run_args = [
             requests, args.model, args.tokenizer, args.quantization,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.tensor_parallel_size, args.seed, args.n,
             args.trust_remote_code, args.dtype, args.max_model_len,
             args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
@@ -351,12 +349,11 @@ def main(args: argparse.Namespace):
             run_args.append(args.disable_frontend_multiprocessing)
             elapsed_time = uvloop.run(run_vllm_async(*run_args))
         else:
-            elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl)
+            elapsed_time = run_vllm(*run_args)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
@@ -410,8 +407,6 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1,
                         help="Number of generated sequences per prompt.")
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--use-new-beam-search-impl", action="store_true")
     parser.add_argument("--num-prompts",
                         type=int,
                         default=1000,
@@ -566,8 +561,6 @@ def main(args: argparse.Namespace):
         raise ValueError("dtype must be auto for MII backend.")
     if args.n != 1:
         raise ValueError("n must be 1 for MII backend.")
-    if args.use_beam_search:
-        raise ValueError("Beam search is not supported for MII backend.")
     if args.quantization is not None:
         raise ValueError("Quantization is only for vLLM backend.")
     if args.hf_max_batch_size is not None:
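The substantive change in this file is that beam search now goes through a dedicated entry point instead of a SamplingParams flag. A minimal sketch of timing that path, mirroring the new run_vllm() code above; BeamSearchParams and LLM.beam_search are taken from the diff, while the model name and prompts are illustrative only.

import time

from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="facebook/opt-125m")  # illustrative model
prompts = ["The capital of France is", "The largest planet is"]

# Time beam search through the dedicated API used by run_vllm() above.
start = time.perf_counter()
llm.beam_search(
    prompts,
    BeamSearchParams(
        beam_width=4,
        max_tokens=64,
        ignore_eos=True,
    ))
elapsed = time.perf_counter() - start
print(f"beam search took {elapsed:.2f} s")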
