
Commit f60e62d

feat: support parsing thinking tokens
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
1 parent 7a77454 commit f60e62d

File tree

5 files changed: +109, -24 lines

tests/v1/entrypoints/llm/test_struct_output_generate.py
Lines changed: 72 additions & 11 deletions

@@ -15,14 +15,37 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
-PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
-    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
-     "auto"),
-    ("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace",
-     "auto"),
-    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
-     "mistral"),
-    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"),
+PARAMS_MODELS_BACKENDS_TOKENIZER_MODE_REASONING_PARSER = [
+    (
+        "mistralai/Ministral-8B-Instruct-2410",
+        "xgrammar:disable-any-whitespace",
+        "auto",
+        None,
+    ),
+    (
+        "mistralai/Ministral-8B-Instruct-2410",
+        "guidance:disable-any-whitespace",
+        "auto",
+        None,
+    ),
+    (
+        "mistralai/Ministral-8B-Instruct-2410",
+        "xgrammar:disable-any-whitespace",
+        "mistral",
+        None,
+    ),
+    (
+        "Qwen/Qwen2.5-1.5B-Instruct",
+        "xgrammar:disable-any-whitespace",
+        "auto",
+        None,
+    ),
+    (
+        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+        "xgrammar:disable-any-whitespace",
+        "auto",
+        "deepseek_r1",
+    ),
     #FIXME: This test is flaky on CI thus disabled
     #("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
 ]
@@ -47,8 +70,9 @@ class CarDescription(BaseModel):
 
 
 @pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("model_name, guided_decoding_backend, tokenizer_mode",
-                         PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
+@pytest.mark.parametrize(
+    "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser",
+    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE_REASONING_PARSER)
 def test_structured_output(
     monkeypatch: pytest.MonkeyPatch,
     sample_json_schema: dict[str, Any],
@@ -59,6 +83,7 @@ def test_structured_output(
     sample_guided_choice: str,
     guided_decoding_backend: str,
     tokenizer_mode: str,
+    reasoning_parser: str | None,
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
@@ -69,7 +94,9 @@ def test_structured_output(
               enforce_eager=True,
               max_model_len=1024,
               guided_decoding_backend=guided_decoding_backend,
-              tokenizer_mode=tokenizer_mode)
+              tokenizer_mode=tokenizer_mode,
+              enable_reasoning=reasoning_parser is not None,
+              reasoning_parser=reasoning_parser)
 
     #
     # Test 1: Generate JSON output based on a provided schema
@@ -364,6 +391,40 @@ def test_structured_output(
     output_json = json.loads(generated_text)
     jsonschema.validate(instance=output_json, schema=json_schema)
 
+    #
+    # Test 11: Generate structured output with reasoning step
+    #
+    if reasoning_parser is not None:
+        reasoning_prompt = "Solve the following math problem step-by-step, then provide the final answer as JSON object with a single key 'result'. Problem: What is 5 * 8 + 2?"  # noqa: E501
+        reasoning_schema = {
+            "type": "object",
+            "properties": {
+                "result": {
+                    "type": "integer"
+                }
+            },
+            "required": ["result"]
+        }
+
+        sampling_params = SamplingParams(
+            temperature=0.1,  # Low temp for deterministic reasoning
+            max_tokens=200,
+            guided_decoding=GuidedDecodingParams(json=reasoning_schema))
+        outputs = llm.generate(prompts=[reasoning_prompt],
+                               sampling_params=sampling_params,
+                               use_tqdm=True)
+
+        assert outputs is not None
+        output = outputs[0]
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json, schema=reasoning_schema)
+
 
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("model_name, tokenizer_mode",
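
For orientation, a minimal usage sketch of the new keywords outside the test harness. It simply mirrors the arguments the test above passes to LLM() and SamplingParams(); the model choice and schema are illustrative rather than prescriptive.

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Mirrors the new parametrization: an R1-distilled model with the deepseek_r1
# reasoning parser and xgrammar-backed structured output.
llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
          guided_decoding_backend="xgrammar:disable-any-whitespace",
          enable_reasoning=True,
          reasoning_parser="deepseek_r1")

schema = {
    "type": "object",
    "properties": {"result": {"type": "integer"}},
    "required": ["result"],
}
params = SamplingParams(max_tokens=200,
                        guided_decoding=GuidedDecodingParams(json=schema))
outputs = llm.generate(["What is 5 * 8 + 2? Reply as JSON."], params)
# The schema is enforced only after the model's thinking section has ended.
print(outputs[0].outputs[0].text)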

vllm/reasoning/abs_reasoning_parsers.py
Lines changed: 1 addition & 1 deletion

@@ -106,7 +106,7 @@ class ReasoningParserManager:
     reasoning_parsers: dict[str, type] = {}
 
     @classmethod
-    def get_reasoning_parser(cls, name: str) -> type[ReasoningParser]:
+    def get_reasoning_parser(cls, name: str | None) -> type[ReasoningParser]:
         """
         Get reasoning parser by name which is registered by `register_module`.
 

vllm/v1/core/sched/scheduler.py
Lines changed: 14 additions & 5 deletions

@@ -653,11 +653,20 @@ def update_from_output(
             new_logprobs = logprobs.slice(req_index, req_index + 1)
 
             if new_token_ids and request.use_structured_output:
-                # NOTE: structured_output_request
-                # should not be None if use_structured_output, we have
-                # check above, so safe to ignore type warning
-                request.structured_output_request.grammar.accept_tokens(  # type: ignore[union-attr]
-                    req_id, new_token_ids)
+                advance_fsm = False
+                reasoner = self.structured_output_manager.reasoner
+                if reasoner is None or request.reasoning_ended:
+                    advance_fsm = True
+                elif reasoner.is_reasoning_end(request.all_token_ids):
+                    request.reasoning_ended = True
+                    advance_fsm = True
+
+                if advance_fsm:
+                    # NOTE: structured_output_request
+                    # should not be None if use_structured_output, we have
+                    # check above, so safe to ignore type warning
+                    request.structured_output_request.grammar.accept_tokens(  # type: ignore[union-attr]
+                        req_id, new_token_ids)
 
             # Get prompt logprobs for this request.
             prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
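
The scheduler change gates FSM advancement on the end of the reasoning section: while the model is still emitting thinking tokens, no tokens are fed to the grammar; once the reasoner reports that reasoning has ended, the request is marked and the FSM advances as before. A self-contained sketch of that gate, using a stub reasoner and a placeholder end-of-think token id (both hypothetical, for illustration only):

class StubReasoner:
    # Hypothetical: reasoning ends once a designated end-of-think token id appears.
    END_THINK_ID = 999

    def is_reasoning_end(self, token_ids: list[int]) -> bool:
        return self.END_THINK_ID in token_ids


def should_advance_fsm(reasoner, reasoning_ended: bool,
                       all_token_ids: list[int]) -> tuple[bool, bool]:
    # Mirrors the scheduler logic above: returns (advance_fsm, new reasoning_ended flag).
    if reasoner is None or reasoning_ended:
        return True, reasoning_ended
    if reasoner.is_reasoning_end(all_token_ids):
        return True, True   # first step after the reasoning section closes
    return False, False     # still thinking: don't advance the grammar


# Still inside the thinking section: the grammar is left untouched.
assert should_advance_fsm(StubReasoner(), False, [1, 2, 3]) == (False, False)
# End-of-think marker seen: the request is marked and the FSM starts accepting tokens.
assert should_advance_fsm(StubReasoner(), False, [1, 2, 999]) == (True, True)
# No reasoner configured: behaviour is unchanged from before this commit.
assert should_advance_fsm(None, False, [1, 2, 3]) == (True, False)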

vllm/v1/request.py
Lines changed: 1 addition & 0 deletions

@@ -37,6 +37,7 @@ def __init__(
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
         self.structured_output_request = structured_output_request
+        self.reasoning_ended: bool = False
 
         self.status = (RequestStatus.WAITING_FOR_FSM
                        if sampling_params.guided_decoding is not None else

vllm/v1/structured_output/__init__.py
Lines changed: 21 additions & 7 deletions

@@ -119,15 +119,29 @@ def grammar_bitmask(
         # position in the batch. Resize the bitmask down to the size of
         # the batch.
         bitmask_tensor = self._grammar_bitmask
+        # Reset the relevant part of the bitmask before filling
+        if batch_len > 0:
+            bitmask_tensor[:batch_len].fill_(-1)
+
         for req_id, batch_index in structured_output_request_ids.items():
-            request = requests[req_id].structured_output_request
-            assert request is not None and request.grammar is not None
-            if not request.grammar.is_terminated():
-                request.grammar.fill_bitmask(bitmask_tensor, batch_index)
-        if batch_len < self._grammar_bitmask.shape[0]:
-            bitmask_tensor = self._grammar_bitmask[:batch_len]
+            full_request = requests[req_id]
+            so_request = full_request.structured_output_request
+            assert so_request is not None and so_request.grammar is not None
+
+            apply_bitmask = (self.reasoner is None
+                             or full_request.reasoning_ended
+                             or self.reasoner.is_reasoning_end(
+                                 full_request.all_token_ids))
+
+            if apply_bitmask and not so_request.grammar.is_terminated():
+                so_request.grammar.fill_bitmask(bitmask_tensor, batch_index)
+
+        if batch_len < bitmask_tensor.shape[0]:
+            final_bitmask_tensor = bitmask_tensor[:batch_len]
+        else:
+            final_bitmask_tensor = bitmask_tensor
 
         # After finishing with the xgrammar operations, we convert to
         # np.ndarray, because that is much more efficient for serialization
         # and deserialization when sending this to the GPU workers.
-        return bitmask_tensor.numpy()
+        return final_bitmask_tensor.numpy()
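
Two things happen in this hunk: the live slice of the bitmask is reset to -1 before any grammar writes, and fill_bitmask() is only applied once the request's reasoning section has ended. Assuming xgrammar's packed-bitmask convention (one bit per vocabulary token stored in int32 words, set bit = token allowed), a row of -1 has every bit set, so during the thinking phase no token is masked out. A small sketch of that convention; the vocabulary size and helper are illustrative:

import torch

VOCAB_SIZE = 152_064                    # illustrative
NUM_WORDS = (VOCAB_SIZE + 31) // 32     # one bit per token, packed into int32 words


def token_allowed(row: torch.Tensor, token_id: int) -> bool:
    # Read the packed bit for a single token id.
    word = int(row[token_id // 32].item())
    return bool((word >> (token_id % 32)) & 1)


row = torch.empty(NUM_WORDS, dtype=torch.int32)
row.fill_(-1)                           # -1 == all 32 bits set (two's complement)
assert token_allowed(row, 0)
assert token_allowed(row, VOCAB_SIZE - 1)
# During reasoning the row stays at -1 (nothing masked); after reasoning ends,
# grammar.fill_bitmask() overwrites it with the schema-constrained bits.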
