     ChatCompletionRequest, ChatCompletionResponse,
     ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage,
-    DeltaToolCall, ErrorResponse, FunctionCall, ToolCall, UsageInfo)
+    DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata,
+    ToolCall, UsageInfo)
 from vllm.entrypoints.openai.serving_engine import (BaseModelPath,
                                                     LoRAModulePath,
                                                     OpenAIServing,
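For orientation, here is a minimal, self-contained sketch of the shape the new `RequestResponseMetadata` import suggests. These are not the vLLM definitions (the real classes live in `vllm.entrypoints.openai.protocol`); only the two fields this diff actually touches, `request_id` and `final_usage_info`, are assumed, so treat the field list as illustrative rather than authoritative.

```python
# Illustrative sketch only -- NOT the vLLM definitions. It mirrors just the
# fields exercised by this diff so the later hunks are easier to follow.
from typing import Optional

from pydantic import BaseModel


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0


class RequestResponseMetadata(BaseModel):
    request_id: str
    final_usage_info: Optional[UsageInfo] = None
```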
@@ -175,6 +176,11 @@ async def create_chat_completion(
                 "--enable-auto-tool-choice and --tool-call-parser to be set")
 
         request_id = f"chat-{random_uuid()}"
+
+        request_metadata = RequestResponseMetadata(request_id=request_id)
+        if raw_request:
+            raw_request.state.request_metadata = request_metadata
+
         try:
             guided_decode_logits_processor = (
                 await self._guided_decode_logits_processor(request, tokenizer))
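Stashing the metadata on `raw_request.state` is what lets code outside this handler, such as an HTTP middleware registered on the FastAPI app, read the final token usage without parsing the response body. Below is a hedged sketch of such a consumer; the middleware itself is hypothetical and not part of this diff. Note that for streaming requests the usage is only populated once the stream has been fully consumed, so a production exporter would hook response completion rather than reading it immediately after `call_next`.

```python
# Hypothetical consumer of raw_request.state.request_metadata; not part of
# this change. Shown only to illustrate why the metadata is attached to the
# request state.
from fastapi import FastAPI, Request

app = FastAPI()


@app.middleware("http")
async def log_token_usage(request: Request, call_next):
    response = await call_next(request)
    metadata = getattr(request.state, "request_metadata", None)
    # For streaming requests final_usage_info may still be None here, since
    # the stream generator only fills it in after the last chunk is sent.
    if metadata is not None and metadata.final_usage_info is not None:
        usage = metadata.final_usage_info
        # Replace print with the metrics/logging backend of your choice.
        print(f"{metadata.request_id}: prompt={usage.prompt_tokens} "
              f"completion={usage.completion_tokens} "
              f"total={usage.total_tokens}")
    return response
```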
@@ -241,11 +247,13 @@ async def create_chat_completion(
         # Streaming response
         if request.stream:
             return self.chat_completion_stream_generator(
-                request, result_generator, request_id, conversation, tokenizer)
+                request, result_generator, request_id, conversation, tokenizer,
+                request_metadata)
 
         try:
             return await self.chat_completion_full_generator(
-                request, result_generator, request_id, conversation, tokenizer)
+                request, result_generator, request_id, conversation, tokenizer,
+                request_metadata)
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
@@ -262,6 +270,7 @@ async def chat_completion_stream_generator(
         request_id: str,
         conversation: List[ConversationMessage],
         tokenizer: AnyTokenizer,
+        request_metadata: RequestResponseMetadata,
     ) -> AsyncGenerator[str, None]:
         model_name = self.base_model_paths[0].name
         created_time = int(time.time())
@@ -580,6 +589,13 @@ async def chat_completion_stream_generator(
                     exclude_unset=True, exclude_none=True))
                 yield f"data: {final_usage_data}\n\n"
 
+            # report to FastAPI middleware aggregate usage across all choices
+            num_completion_tokens = sum(previous_num_tokens)
+            request_metadata.final_usage_info = UsageInfo(
+                prompt_tokens=num_prompt_tokens,
+                completion_tokens=num_completion_tokens,
+                total_tokens=num_prompt_tokens + num_completion_tokens)
+
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             logger.error("error in chat completion stream generator: %s", e)
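The `sum` is needed because the stream generator keeps one running token count per sampled choice (one entry per `n` completions), and the usage reported to the middleware should cover all of them, matching the "across all choices" comment above. A toy illustration with made-up numbers:

```python
# Made-up numbers, just to show how the per-choice counts roll up.
num_prompt_tokens = 12
previous_num_tokens = [40, 37, 41]  # one running count per choice (n=3)

num_completion_tokens = sum(previous_num_tokens)
total_tokens = num_prompt_tokens + num_completion_tokens
assert (num_completion_tokens, total_tokens) == (118, 130)
```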
@@ -595,6 +611,7 @@ async def chat_completion_full_generator(
         request_id: str,
         conversation: List[ConversationMessage],
         tokenizer: AnyTokenizer,
+        request_metadata: RequestResponseMetadata,
     ) -> Union[ErrorResponse, ChatCompletionResponse]:
 
         model_name = self.base_model_paths[0].name
@@ -714,6 +731,9 @@ async def chat_completion_full_generator(
             completion_tokens=num_generated_tokens,
             total_tokens=num_prompt_tokens + num_generated_tokens,
         )
+
+        request_metadata.final_usage_info = usage
+
         response = ChatCompletionResponse(
             id=request_id,
             created=created_time,
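On the non-streaming path the usage recorded on the metadata is built from the same counts as the usage embedded in the `ChatCompletionResponse`, so a middleware sees the numbers the client sees. Continuing the toy models from the first sketch (illustrative only):

```python
# Continuing the illustrative models from the first sketch above.
usage = UsageInfo(prompt_tokens=12, completion_tokens=30, total_tokens=42)
request_metadata = RequestResponseMetadata(request_id="chat-123",
                                           final_usage_info=usage)
assert request_metadata.final_usage_info.total_tokens == 42
```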