@@ -83,6 +83,7 @@ def run_vllm(
83
83
enable_prefix_caching : bool ,
84
84
enable_chunked_prefill : bool ,
85
85
max_num_batched_tokens : int ,
86
+ max_num_seqs : int ,
86
87
distributed_executor_backend : Optional [str ],
87
88
gpu_memory_utilization : float = 0.9 ,
88
89
num_scheduler_steps : int = 1 ,
@@ -111,6 +112,7 @@ def run_vllm(
111
112
download_dir = download_dir ,
112
113
enable_chunked_prefill = enable_chunked_prefill ,
113
114
max_num_batched_tokens = max_num_batched_tokens ,
115
+ max_num_seqs = max_num_seqs ,
114
116
distributed_executor_backend = distributed_executor_backend ,
115
117
load_format = load_format ,
116
118
num_scheduler_steps = num_scheduler_steps ,
@@ -172,6 +174,7 @@ async def run_vllm_async(
172
174
enable_prefix_caching : bool ,
173
175
enable_chunked_prefill : bool ,
174
176
max_num_batched_tokens : int ,
177
+ max_num_seqs : int ,
175
178
distributed_executor_backend : Optional [str ],
176
179
gpu_memory_utilization : float = 0.9 ,
177
180
num_scheduler_steps : int = 1 ,
@@ -200,6 +203,7 @@ async def run_vllm_async(
200
203
download_dir = download_dir ,
201
204
enable_chunked_prefill = enable_chunked_prefill ,
202
205
max_num_batched_tokens = max_num_batched_tokens ,
206
+ max_num_seqs = max_num_seqs ,
203
207
distributed_executor_backend = distributed_executor_backend ,
204
208
load_format = load_format ,
205
209
num_scheduler_steps = num_scheduler_steps ,
@@ -341,7 +345,8 @@ def main(args: argparse.Namespace):
341
345
args .enforce_eager , args .kv_cache_dtype ,
342
346
args .quantization_param_path , args .device ,
343
347
args .enable_prefix_caching , args .enable_chunked_prefill ,
344
- args .max_num_batched_tokens , args .distributed_executor_backend ,
348
+ args .max_num_batched_tokens , args .max_num_seqs ,
349
+ args .distributed_executor_backend ,
345
350
args .gpu_memory_utilization , args .num_scheduler_steps ,
346
351
args .use_v2_block_manager , args .download_dir , args .load_format ,
347
352
args .disable_async_output_proc
@@ -494,6 +499,11 @@ def main(args: argparse.Namespace):
494
499
default = None ,
495
500
help = 'maximum number of batched tokens per '
496
501
'iteration' )
502
+ parser .add_argument ('--max-num-seqs' ,
503
+ type = int ,
504
+ default = None ,
505
+ help = 'maximum number of sequences per '
506
+ 'iteration' )
497
507
parser .add_argument ('--download-dir' ,
498
508
type = str ,
499
509
default = None ,
0 commit comments