
Commit f1e15da

[Frontend] Continuous usage stats in OpenAI completion API (#5742)
1 parent 0097bb1 commit f1e15da

File tree

3 files changed: +110 -31 lines

  tests/entrypoints/openai/test_completion.py
  vllm/entrypoints/openai/protocol.py
  vllm/entrypoints/openai/serving_completion.py
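
For orientation, here is a minimal sketch of how a client exercises the new option, modeled on the updated test below. The base URL, API key, and model name are placeholders, and the openai Python client is assumed; with "continuous_usage_stats" enabled, every streamed chunk carries a running usage block rather than only the final one.

# Sketch only: base_url, api_key, and model below are placeholders.
import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    stream = await client.completions.create(
        model="my-model",
        prompt="What is the capital of France?",
        max_tokens=5,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        })
    async for chunk in stream:
        # Each chunk should now report prompt/completion/total token counts.
        print(chunk.usage)


asyncio.run(main())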

tests/entrypoints/openai/test_completion.py

Lines changed: 94 additions & 18 deletions
@@ -295,25 +295,49 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                          model_name: str):
     prompt = "What is the capital of France?"
 
-    # Test stream=True, stream_options={"include_usage": False}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": False})
+    # Test stream=True, stream_options=
+    #     {"include_usage": False, "continuous_usage_stats": False}
+    stream = await client.completions.create(model=model_name,
+                                              prompt=prompt,
+                                              max_tokens=5,
+                                              temperature=0.0,
+                                              stream=True,
+                                              stream_options={
+                                                  "include_usage": False,
+                                                  "continuous_usage_stats":
+                                                  False,
+                                              })
+
     async for chunk in stream:
         assert chunk.usage is None
 
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True})
+    # Test stream=True, stream_options=
+    #     {"include_usage": False, "continuous_usage_stats": True}
+    stream = await client.completions.create(model=model_name,
+                                              prompt=prompt,
+                                              max_tokens=5,
+                                              temperature=0.0,
+                                              stream=True,
+                                              stream_options={
+                                                  "include_usage": False,
+                                                  "continuous_usage_stats":
+                                                  True,
+                                              })
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options=
+    #     {"include_usage": True, "continuous_usage_stats": False}
+    stream = await client.completions.create(model=model_name,
+                                              prompt=prompt,
+                                              max_tokens=5,
+                                              temperature=0.0,
+                                              stream=True,
+                                              stream_options={
+                                                  "include_usage": True,
+                                                  "continuous_usage_stats":
+                                                  False,
+                                              })
     async for chunk in stream:
         if chunk.choices[0].finish_reason is None:
             assert chunk.usage is None
@@ -328,7 +352,36 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                 final_chunk.usage.completion_tokens)
             assert final_chunk.choices == []
 
-    # Test stream=False, stream_options={"include_usage": None}
+    # Test stream=True, stream_options=
+    #     {"include_usage": True, "continuous_usage_stats": True}
+    stream = await client.completions.create(model=model_name,
+                                              prompt=prompt,
+                                              max_tokens=5,
+                                              temperature=0.0,
+                                              stream=True,
+                                              stream_options={
+                                                  "include_usage": True,
+                                                  "continuous_usage_stats":
+                                                  True,
+                                              })
+    async for chunk in stream:
+        assert chunk.usage is not None
+        assert chunk.usage.prompt_tokens > 0
+        assert chunk.usage.completion_tokens > 0
+        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                            chunk.usage.completion_tokens)
+        if chunk.choices[0].finish_reason is not None:
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options=
+    #     {"include_usage": None}
     with pytest.raises(BadRequestError):
         await client.completions.create(model=model_name,
                                         prompt=prompt,
@@ -337,7 +390,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         stream=False,
                                         stream_options={"include_usage": None})
 
-    # Test stream=False, stream_options={"include_usage": True}
+    # Test stream=False, stream_options=
+    #     {"include_usage": True}
     with pytest.raises(BadRequestError):
         await client.completions.create(model=model_name,
                                         prompt=prompt,
@@ -346,6 +400,28 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         stream=False,
                                         stream_options={"include_usage": True})
 
+    # Test stream=False, stream_options=
+    #     {"continuous_usage_stats": None}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"continuous_usage_stats": None})
+
+    # Test stream=False, stream_options=
+    #     {"continuous_usage_stats": True}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"continuous_usage_stats": True})
+
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize(

vllm/entrypoints/openai/protocol.py

Lines changed: 2 additions & 1 deletion
@@ -103,7 +103,8 @@ class ResponseFormat(OpenAIBaseModel):
 
 
 class StreamOptions(OpenAIBaseModel):
-    include_usage: Optional[bool]
+    include_usage: Optional[bool] = True
+    continuous_usage_stats: Optional[bool] = True
 
 
 class FunctionDefinition(OpenAIBaseModel):
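
As a standalone illustration of the new defaults, a rough sketch that mirrors StreamOptions with plain pydantic.BaseModel (the real class derives from vLLM's OpenAIBaseModel): both fields now default to True when a request omits them.

# Illustrative sketch only; not the actual vLLM class.
from typing import Optional

from pydantic import BaseModel


class StreamOptions(BaseModel):
    include_usage: Optional[bool] = True
    continuous_usage_stats: Optional[bool] = True


print(StreamOptions())  # both fields default to True
print(StreamOptions.model_validate({"include_usage": False}))
# include_usage is False; continuous_usage_stats still defaults to True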

vllm/entrypoints/openai/serving_completion.py

Lines changed: 14 additions & 12 deletions
@@ -271,16 +271,6 @@ async def completion_stream_generator(
                     previous_num_tokens[i] = len(output.token_ids)
                     finish_reason = output.finish_reason
                     stop_reason = output.stop_reason
-                    if output.finish_reason is not None:  # return final usage
-                        prompt_tokens = len(res.prompt_token_ids)
-                        completion_tokens = len(output.token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                        )
-                    else:
-                        final_usage = None
 
                     chunk = CompletionStreamResponse(
                         id=request_id,
@@ -297,7 +287,19 @@ async def completion_stream_generator(
                         ])
                     if (request.stream_options
                             and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats
+                                or output.finish_reason is not None):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens + completion_tokens,
+                            )
+                        if request.stream_options.continuous_usage_stats:
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
 
                     response_json = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
@@ -309,7 +311,7 @@ async def completion_stream_generator(
                     created=created_time,
                     model=model_name,
                     choices=[],
-                    usage=final_usage,
+                    usage=usage,
                 )
                 final_usage_data = (final_usage_chunk.model_dump_json(
                     exclude_unset=True, exclude_none=True))
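
To spell out the per-chunk decision introduced in the middle hunk, here is a distilled sketch of that logic; the helper name and the plain-dict return type are illustrative, not vLLM API.

# Distilled sketch of the per-chunk usage logic above (not vLLM code).
from typing import Optional


def usage_for_chunk(include_usage: bool, continuous_usage_stats: bool,
                    finish_reason: Optional[str], prompt_tokens: int,
                    completion_tokens: int) -> Optional[dict]:
    """Return the usage payload a streamed chunk should carry, if any."""
    if not include_usage:
        return None  # usage reporting disabled entirely
    if continuous_usage_stats or finish_reason is not None:
        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        }
    if continuous_usage_stats:
        return usage  # every chunk reports running totals
    # include_usage without continuous stats: per-token chunks stay usage-free;
    # the totals go out afterwards in a dedicated final usage-only chunk.
    return None

Note that the last hunk switches the final usage-only chunk from the removed final_usage variable to the usage value computed in this branch, which is why the old bookkeeping at the top of the loop could be deleted.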
