chore: move reasoning_ended to so_request

aarnphm · aarnphm · commit 0e699ec7d787 · 2025-04-17T15:02:53.000-04:00
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import os
 from abc import abstractmethod
 from collections.abc import Sequence
@@ -33,7 +35,7 @@ def vocab(self) -> dict[str, int]:
         return self.model_tokenizer.get_vocab()
 
     @abstractmethod
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
         """
         Check if the reasoning content ends in the input_ids.
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
@@ -655,21 +655,19 @@ def update_from_output(
             if new_token_ids and request.use_structured_output:
                 advance_fsm = False
                 reasoner = self.structured_output_manager.reasoner
-                is_reasoning_end_this_step = False  # Flag the transition
+                so_request = request.structured_output_request
+                is_reasoning_end_this_step = False
 
-                if reasoner is None or request.reasoning_ended:
-                    # Reasoning was already off or never active
+                if reasoner is None or so_request.reasoning_ended:  # type: ignore[union-attr]
                     advance_fsm = True
-                else:
-                    # Reasoning is active, check if it ends now
+                else:  # type: ignore[union-attr]
                     if reasoner.is_reasoning_end(request.all_token_ids):
-                        request.reasoning_ended = True
+                        so_request.reasoning_ended = True  # type: ignore[union-attr]
                         is_reasoning_end_this_step = True
                         # Don't advance FSM in the step the transition occurs,
                         # as new_token_ids might contain the end marker.
                         advance_fsm = False
                     else:
-                        # Reasoning continues, don't advance FSM
                         advance_fsm = False
 
                 # Only advance FSM if reasoning was already off OR
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
@@ -37,7 +37,6 @@ def __init__(
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
         self.structured_output_request = structured_output_request
-        self.reasoning_ended: bool = False
 
         self.status = (RequestStatus.WAITING_FOR_FSM
                        if sampling_params.guided_decoding is not None else
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
@@ -129,7 +129,7 @@ def grammar_bitmask(
             assert so_request is not None and so_request.grammar is not None
 
             apply_bitmask = (self.reasoner is None
-                             or full_request.reasoning_ended
+                             or so_request.reasoning_ended
                              or self.reasoner.is_reasoning_end(
                                  full_request.all_token_ids))
 
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
@@ -20,6 +20,7 @@ class StructuredOutputRequest:
     sampling_params: SamplingParams
     _grammar: Optional[Union[Future[StructuredOutputGrammar],
                              StructuredOutputGrammar]] = None
+    reasoning_ended: bool = False
 
     def _check_grammar_completion(self) -> bool:
         # NOTE: We have to lazy import to gate circular imports