+ # ruff: noqa: E501
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import json
import re
from enum import Enum
- from typing import Any
+ from typing import TYPE_CHECKING, Any

import jsonschema
import pytest
from pydantic import BaseModel

+ from tests.reasoning.utils import run_reasoning_extraction
from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
+ from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

+ if TYPE_CHECKING:
+     from vllm.config import TokenizerMode
+
NGRAM_SPEC_CONFIG = {
    "model": "[ngram]",
    "num_speculative_tokens": 5,
@@ -444,7 +450,7 @@ def test_structured_output(

    prompt = """
You have access to the following function to retrieve the weather in a city:
-
+
{
    "name": "get_weather",
    "parameters": {
@@ -455,7 +461,7 @@ def test_structured_output(
        }
    }
}
-
+
If you choose to call a function ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where
@@ -476,7 +482,7 @@ def test_structured_output(
- Always add your sources when using search results to answer the user query

You are a helpful assistant.
-
+
Given the previous instructions, what is the weather in New York City? \
Make the response as short as possible.
"""
@@ -514,6 +520,88 @@ def test_structured_output(
                f"{generated_text!r}\nError: {str(e)}")


+ @pytest.mark.skip_global_cleanup
+ @pytest.mark.parametrize(
+     "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
+     [
+         ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
+          "deepseek_r1", NGRAM_SPEC_CONFIG),
+         ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None),
+     ],
+ )
+ def test_structured_output_with_reasoning_matrices(
+     monkeypatch: pytest.MonkeyPatch,
+     guided_decoding_backend: str,
+     tokenizer_mode: TokenizerMode,
+     reasoning_parser: str,
+     model_name: str,
+     speculative_config: dict[str, Any] | None,
+ ):
+     monkeypatch.setenv("VLLM_USE_V1", "1")
+
+     if current_platform.is_tpu() and speculative_config:
+         pytest.skip("TPU does not support speculative decoding")
+
+     # Use a single LLM instance for several scenarios to
+     # speed up the test suite.
+     llm = LLM(
+         model=model_name,
+         # Don't use eager execution on TPUs because we want to test for no
+         # recompilation at runtime
+         enforce_eager=bool(not current_platform.is_tpu()),
+         max_model_len=1024,
+         max_num_seqs=16,
+         guided_decoding_backend=guided_decoding_backend,
+         guided_decoding_disable_any_whitespace=True,
+         tokenizer_mode=tokenizer_mode,
+         reasoning_parser=reasoning_parser,
+         speculative_config=speculative_config,
+     )
+     tokenizer = llm.get_tokenizer(None)
+     reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
+         tokenizer=tokenizer)
+
+     reasoning_prompt = "Solve the following math problem step-by-step, then provide the final answer as a JSON object with a single key 'result'. Make sure to correct your reasoning if any issues arise.\nProblem: What is 5 * 8 + 2?"  # noqa: E501
+     reasoning_schema = {
+         "type": "object",
+         "properties": {
+             "result": {
+                 "type": "integer"
+             }
+         },
+         "required": ["result"],
+         "additionalProperties": False
+     }
+     if "Qwen3" in model_name:
+         reasoning_prompt += "<think>\n"
+
+     sampling_params = SamplingParams(
+         temperature=0.1,
+         max_tokens=8192,
+         guided_decoding=GuidedDecodingParams(json=reasoning_schema),
+     )
+     outputs = llm.generate(
+         [reasoning_prompt],
+         sampling_params=sampling_params,
+         use_tqdm=True,
+     )
+
+     assert outputs is not None
+     output = outputs[0]
+     assert output is not None and isinstance(output, RequestOutput)
+     prompt = output.prompt
+     generated_text = output.outputs[0].text
+     reasoning_content, content = run_reasoning_extraction(
+         reasoner, [generated_text])
+     print(
+         f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}"
+     )
+
+     assert content is not None and reasoning_content is not None
+     output_json = json.loads(content)
+     jsonschema.validate(instance=output_json, schema=reasoning_schema)
+
+
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode",
                         PARAMS_MODELS_TOKENIZER_MODE)
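The core pattern the new test exercises — split a `<think>…</think>` reasoning trace from the final answer, then validate the answer against the JSON schema — can be reproduced without vLLM. A minimal sketch, with a simplified regex splitter standing in for the `deepseek_r1` reasoning parser (`run_reasoning_extraction` is the test helper used above, not reimplemented here):

```python
import json
import re

import jsonschema


def split_reasoning(text: str) -> tuple[str | None, str | None]:
    # Simplified stand-in for the deepseek_r1 reasoning parser:
    # everything inside <think>...</think> is reasoning, the rest is content.
    match = re.match(r"(?:<think>)?(.*?)</think>(.*)", text, re.DOTALL)
    if match is None:
        return None, text
    return match.group(1).strip(), match.group(2).strip()


schema = {
    "type": "object",
    "properties": {"result": {"type": "integer"}},
    "required": ["result"],
    "additionalProperties": False,
}

# Hypothetical model output in the shape the test expects.
generated = '<think>5 * 8 = 40, plus 2 is 42.</think>{"result": 42}'
reasoning, content = split_reasoning(generated)
assert reasoning is not None and content is not None
jsonschema.validate(instance=json.loads(content), schema=schema)  # passes
```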