@@ -345,11 +345,10 @@ def main(args: argparse.Namespace):
345
345
args .enforce_eager , args .kv_cache_dtype ,
346
346
args .quantization_param_path , args .device ,
347
347
args .enable_prefix_caching , args .enable_chunked_prefill ,
348
- args .max_num_batched_tokens , args .max_num_seqs ,
349
- args .distributed_executor_backend ,
350
- args .gpu_memory_utilization , args .num_scheduler_steps ,
351
- args .use_v2_block_manager , args .download_dir , args .load_format ,
352
- args .disable_async_output_proc
348
+ args .max_num_batched_tokens , args .max_num_seqs ,
349
+ args .distributed_executor_backend , args .gpu_memory_utilization ,
350
+ args .num_scheduler_steps , args .use_v2_block_manager ,
351
+ args .download_dir , args .load_format , args .disable_async_output_proc
353
352
]
354
353
355
354
if args .async_engine :
@@ -369,8 +368,7 @@ def main(args: argparse.Namespace):
369
368
raise ValueError (f"Unknown backend: { args .backend } " )
370
369
total_num_tokens = sum (prompt_len + output_len
371
370
for _ , prompt_len , output_len in requests )
372
- total_output_tokens = sum (output_len
373
- for _ , _ , output_len in requests )
371
+ total_output_tokens = sum (output_len for _ , _ , output_len in requests )
374
372
print (f"Throughput: { len (requests ) / elapsed_time :.2f} requests/s, "
375
373
f"{ total_num_tokens / elapsed_time :.2f} total tokens/s, "
376
374
f"{ total_output_tokens / elapsed_time :.2f} output tokens/s" )
0 commit comments