@@ -18,8 +18,9 @@
                                               CompletionResponseChoice,
                                               CompletionResponseStreamChoice,
                                               CompletionStreamResponse,
-                                              ErrorResponse, UsageInfo,
-                                              RequestResponseMetadata)
+                                              ErrorResponse,
+                                              RequestResponseMetadata,
+                                              UsageInfo)
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
                                                     LoRAModulePath,
@@ -170,14 +171,15 @@ async def create_completion(
 
         # Streaming response
         if stream:
-            return self.completion_stream_generator(request,
-                                                    result_generator,
-                                                    request_id,
-                                                    created_time,
-                                                    model_name,
-                                                    num_prompts=len(prompts),
-                                                    tokenizer=tokenizer,
-                                                    request_metadata=request_metadata)
+            return self.completion_stream_generator(
+                request,
+                result_generator,
+                request_id,
+                created_time,
+                model_name,
+                num_prompts=len(prompts),
+                tokenizer=tokenizer,
+                request_metadata=request_metadata)
 
         # Non-streaming response
         final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts)
@@ -354,12 +356,13 @@ async def completion_stream_generator(
                     exclude_unset=False, exclude_none=True))
                 yield f"data: {final_usage_data}\n\n"
 
-            # report to FastAPI middleware aggregate tokens (all prompts, all completions)
+            # report to FastAPI middleware aggregate usage across all choices
             total_prompt_tokens = sum(num_prompt_tokens)
             total_completion_tokens = sum(previous_num_tokens)
-            request_metadata.final_usage_info = UsageInfo(prompt_tokens=total_prompt_tokens,
-                                                          completion_tokens=total_completion_tokens,
-                                                          total_tokens=total_prompt_tokens + total_completion_tokens)
+            request_metadata.final_usage_info = UsageInfo(
+                prompt_tokens=total_prompt_tokens,
+                completion_tokens=total_completion_tokens,
+                total_tokens=total_prompt_tokens + total_completion_tokens)
 
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
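For reference, here is a minimal sketch of how application code might consume the aggregate usage reported above. It assumes the route handler attached the `RequestResponseMetadata` object to `request.state.request_metadata`; that attribute name, the middleware, and the `_log_usage` helper are illustrative, not part of this change. Because `final_usage_info` is only assigned once the stream generator finishes, the sketch reads it from a background task that runs after the response body has been sent:

```python
# Hypothetical consumer of request_metadata.final_usage_info (not part of
# this diff). Assumes the handler stored the metadata on request.state.
from fastapi import FastAPI, Request
from starlette.background import BackgroundTask

app = FastAPI()


def _log_usage(request: Request) -> None:
    # final_usage_info stays None until the completion has finished.
    metadata = getattr(request.state, "request_metadata", None)
    usage = getattr(metadata, "final_usage_info", None)
    if usage is not None:
        print(f"prompt={usage.prompt_tokens} "
              f"completion={usage.completion_tokens} "
              f"total={usage.total_tokens}")


@app.middleware("http")
async def track_token_usage(request: Request, call_next):
    response = await call_next(request)
    # For streaming completions the usage is summed only after the body is
    # consumed, so defer the read until the response has been fully sent.
    response.background = BackgroundTask(_log_usage, request)
    return response
```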