
Commit cc97560

Authored and committed by Mu Huai
feat:add engine v1 tracing
Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
1 parent: 7ea6cb2

File tree: 6 files changed, +150 -72 lines


vllm/tracing.py
Lines changed: 3 additions & 0 deletions

@@ -118,6 +118,9 @@ class SpanAttributes:
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
     GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
         "gen_ai.latency.time_in_model_execute")
+    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = "gen_ai.latency.time_in_model_prefill"
+    GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
+    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = "gen_ai.latency.time_in_model_inference"


 def contains_trace_headers(headers: Mapping[str, str]) -> bool:
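
The three new constants extend the existing gen_ai.latency.* attribute names with prefill, decode, and whole-inference intervals. As a rough illustration of how such attribute names end up on a span (plain OpenTelemetry, outside vLLM; the tracer name, span name, and timing values below are made up, and the opentelemetry-sdk package is assumed to be installed):

    # Illustrative only: sets the attribute names added above on a plain
    # OpenTelemetry span. Timing values are made up for the example.
    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider

    trace.set_tracer_provider(TracerProvider())
    tracer = trace.get_tracer("example")

    prefill_s, decode_s = 0.12, 0.83  # seconds, illustrative numbers

    with tracer.start_as_current_span("llm_request") as span:
        span.set_attribute("gen_ai.latency.time_in_model_prefill", prefill_s)
        span.set_attribute("gen_ai.latency.time_in_model_decode", decode_s)
        span.set_attribute("gen_ai.latency.time_in_model_inference",
                           prefill_s + decode_s)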

vllm/v1/core/sched/scheduler.py
Lines changed: 3 additions & 1 deletion

@@ -749,7 +749,9 @@ def update_from_output(
                         new_logprobs=new_logprobs,
                         new_prompt_logprobs_tensors=prompt_logprobs_tensors,
                         stop_reason=request.stop_reason,
-                        events=request.take_events()))
+                        events=request.take_events()),
+                    trace_headers=request.trace_headers
+                )
             else:
                 # Invariant: EngineCore returns no partial prefill outputs.
                 assert not prompt_logprobs_tensors

vllm/v1/engine/__init__.py
Lines changed: 21 additions & 17 deletions

@@ -3,7 +3,7 @@
 import enum
 import time
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, Mapping

 import msgspec

@@ -39,10 +39,10 @@ def __str__(self):


 class EngineCoreRequest(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False):  # type: ignore[call-arg]

     # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
     # but this object is currently not playing well with msgspec
@@ -64,6 +64,8 @@ class EngineCoreRequest(
     # a wave finished notification is received.
     current_wave: int = 0

+    trace_headers: Optional[Mapping[str, str]] = None
+

 class EngineCoreEventType(enum.IntEnum):
     """The type of engine core request event."""
@@ -91,10 +93,10 @@ def new_event(cls,


 class EngineCoreOutput(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False):  # type: ignore[call-arg]

     request_id: str
     new_token_ids: list[int]
@@ -106,15 +108,17 @@ class EngineCoreOutput(
     stop_reason: Union[int, str, None] = None
     events: Optional[list[EngineCoreEvent]] = None

+    trace_headers: Optional[Mapping[str, str]] = None
+
     @property
     def finished(self) -> bool:
         return self.finish_reason is not None


 class UtilityOutput(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    gc=False):  # type: ignore[call-arg]

     call_id: int

@@ -124,12 +128,12 @@ class UtilityOutput(


 class EngineCoreOutputs(
-        msgspec.Struct,
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True,  # type: ignore[call-arg]
-        gc=False):  # type: ignore[call-arg]
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False):  # type: ignore[call-arg]

-    #NOTE(Nick): We could consider ways to make this more compact,
+    # NOTE(Nick): We could consider ways to make this more compact,
     # e.g. columnwise layout

     engine_index: int = 0
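
Both EngineCoreRequest and EngineCoreOutput gain an optional trace_headers field so the caller's trace context can travel with the request through the engine core. A minimal, standalone sketch (not vLLM's real structs) of how such an optional trailing field round-trips through msgspec with the same struct options; dict[str, str] stands in for Mapping[str, str], and the traceparent value is the W3C specification's example string:

    # Standalone sketch, not vLLM's actual classes.
    from typing import Optional

    import msgspec


    class MiniRequest(msgspec.Struct,
                      array_like=True,
                      omit_defaults=True,
                      gc=False):
        request_id: str
        trace_headers: Optional[dict[str, str]] = None


    req = MiniRequest(
        request_id="req-0",
        trace_headers={
            "traceparent":
            "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"
        })
    buf = msgspec.msgpack.encode(req)
    decoded = msgspec.msgpack.decode(buf, type=MiniRequest)
    assert decoded.trace_headers == req.trace_headers

    # Older messages encoded without the trailing field still decode,
    # because omit_defaults + array_like let it fall back to None.
    old_buf = msgspec.msgpack.encode(MiniRequest(request_id="req-1"))
    assert msgspec.msgpack.decode(old_buf, type=MiniRequest).trace_headers is None

Appending the new optional field at the end of the struct keeps the array_like wire format backward compatible, which is presumably why trace_headers is added after the existing fields rather than in the middle.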

vllm/v1/engine/output_processor.py
Lines changed: 118 additions & 51 deletions

@@ -15,6 +15,9 @@
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.metrics.stats import (IterationStats, LoRARequestStates,
                                    RequestStateStats)
+from vllm.config import ObservabilityConfig
+from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
+                          init_tracer)


 class RequestOutputCollector:
@@ -64,28 +67,27 @@ def get_nowait(self) -> Optional[RequestOutput]:

 @dataclass
 class OutputProcessorOutput:
-
     request_outputs: list[RequestOutput]
     reqs_to_abort: list[str]


 class RequestState:

     def __init__(
-        self,
-        request_id: str,
-        parent_req: Optional[ParentRequest],
-        request_index: int,
-        lora_name: Optional[str],
-        output_kind: RequestOutputKind,
-        prompt: Optional[str],
-        prompt_token_ids: list[int],
-        logprobs_processor: LogprobsProcessor,
-        detokenizer: IncrementalDetokenizer,
-        max_tokens_param: Optional[int],
-        arrival_time: float,
-        queue: Optional[RequestOutputCollector],
-        log_stats: bool,
+            self,
+            request_id: str,
+            parent_req: Optional[ParentRequest],
+            request_index: int,
+            lora_name: Optional[str],
+            output_kind: RequestOutputKind,
+            prompt: Optional[str],
+            prompt_token_ids: list[int],
+            logprobs_processor: LogprobsProcessor,
+            detokenizer: IncrementalDetokenizer,
+            max_tokens_param: Optional[int],
+            arrival_time: float,
+            queue: Optional[RequestOutputCollector],
+            log_stats: bool,
     ):
         self.request_id = request_id
         self.parent_req = parent_req
@@ -106,14 +108,14 @@ def __init__(

     @classmethod
     def from_new_request(
-        cls,
-        tokenizer: AnyTokenizer,
-        request: EngineCoreRequest,
-        prompt: Optional[str],
-        parent_req: Optional[ParentRequest],
-        request_index: int,
-        queue: Optional[RequestOutputCollector],
-        log_stats: bool,
+            cls,
+            tokenizer: AnyTokenizer,
+            request: EngineCoreRequest,
+            prompt: Optional[str],
+            parent_req: Optional[ParentRequest],
+            request_index: int,
+            queue: Optional[RequestOutputCollector],
+            log_stats: bool,
     ) -> "RequestState":
         if not request.sampling_params.detokenize:
             tokenizer = None
@@ -142,10 +144,10 @@ def from_new_request(
         )

     def make_request_output(
-        self,
-        new_token_ids: list[int],
-        finish_reason: Optional[FinishReason],
-        stop_reason: Union[int, str, None],
+            self,
+            new_token_ids: list[int],
+            finish_reason: Optional[FinishReason],
+            stop_reason: Union[int, str, None],
     ) -> Optional[RequestOutput]:

         finished = finish_reason is not None
@@ -170,10 +172,10 @@ def make_request_output(
         return self._new_request_output(request_id, outputs, finished)

     def _new_request_output(
-        self,
-        request_id: str,
-        outputs: list[CompletionOutput],
-        finished: bool,
+            self,
+            request_id: str,
+            outputs: list[CompletionOutput],
+            finished: bool,
     ) -> RequestOutput:

         if self.output_kind == RequestOutputKind.DELTA:
@@ -192,10 +194,10 @@ def _new_request_output(
         )

     def _new_completion_output(
-        self,
-        token_ids: list[int],
-        finish_reason: Optional[FinishReason],
-        stop_reason: Union[int, str, None],
+            self,
+            token_ids: list[int],
+            finish_reason: Optional[FinishReason],
+            stop_reason: Union[int, str, None],
     ) -> CompletionOutput:

         finished = finish_reason is not None
@@ -225,15 +227,26 @@ class OutputProcessor:
     """Process EngineCoreOutputs into RequestOutputs."""

     def __init__(
-        self,
-        tokenizer: TokenizerGroup,
-        log_stats: bool,
+            self,
+            tokenizer: TokenizerGroup,
+            log_stats: bool,
+            observability_config: Optional[ObservabilityConfig] = None
     ):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()
+        self.observability_config = observability_config
+
+        self.tracer = None
+        if self.observability_config is not None and self.observability_config.otlp_traces_endpoint:
+            self.tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+
+    def is_tracing_enabled(self) -> bool:
+        return self.tracer is not None

     def get_num_unfinished_requests(self):
         return len(self.request_states)
@@ -249,8 +262,8 @@ def propagate_error(self, e: Exception):
             state.queue.put(e)

     def abort_requests(
-        self,
-        request_ids: Iterable[str],
+            self,
+            request_ids: Iterable[str],
     ) -> list[str]:
         request_ids_to_abort = []
         for request_id in request_ids:
@@ -266,12 +279,12 @@ def abort_requests(
         return request_ids_to_abort

     def add_request(
-        self,
-        request: EngineCoreRequest,
-        prompt: Optional[str],
-        parent_req: Optional[ParentRequest] = None,
-        request_index: int = 0,
-        queue: Optional[RequestOutputCollector] = None,
+            self,
+            request: EngineCoreRequest,
+            prompt: Optional[str],
+            parent_req: Optional[ParentRequest] = None,
+            request_index: int = 0,
+            queue: Optional[RequestOutputCollector] = None,
     ) -> None:
         request_id = request.request_id
         if request_id in self.request_states:
@@ -291,10 +304,10 @@ def add_request(
             self.parent_requests[parent_req.request_id] = parent_req

     def process_outputs(
-        self,
-        engine_core_outputs: list[EngineCoreOutput],
-        engine_core_timestamp: Optional[float] = None,
-        iteration_stats: Optional[IterationStats] = None,
+            self,
+            engine_core_outputs: list[EngineCoreOutput],
+            engine_core_timestamp: Optional[float] = None,
+            iteration_stats: Optional[IterationStats] = None,
     ) -> OutputProcessorOutput:
         """
         Process the EngineCoreOutputs:
@@ -373,14 +386,68 @@ def process_outputs(
                 # Track per-request stats
                 self._update_stats_from_finished(req_state, finish_reason,
                                                  iteration_stats)
-
+                self.do_tracing(engine_core_output, req_state, iteration_stats)
         self.lora_states.update_iteration_stats(iteration_stats)

         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
         )

+    def do_tracing(self, engine_core_output: EngineCoreOutput,
+                   req_state: RequestState,
+                   iteration_stats: Optional[IterationStats]):
+        if engine_core_output.finish_reason is None or iteration_stats is None:
+            return
+        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
+
+        trace_context = extract_trace_context(engine_core_output.trace_headers)
+        with tracer.start_as_current_span("llm_request",
+                                          kind=SpanKind.SERVER,
+                                          context=trace_context,
+                                          start_time=arrival_time_nano_seconds) as span:
+            metrics = req_state.stats
+            ttft = metrics.first_token_ts - metrics.arrival_time
+            e2e_time = time.time() - metrics.arrival_time
+            # Queued interval is from first QUEUED event to first SCHEDULED
+            queued_time = metrics.scheduled_ts - metrics.queued_ts
+
+            # Prefill interval is from first SCHEDULED to first NEW_TOKEN
+            # Any preemptions during prefill is included in the interval
+            prefill_time = metrics.first_token_ts - metrics.scheduled_ts
+
+            # Decode interval is from first NEW_TOKEN to last NEW_TOKEN
+            # Any preemptions during decode are included
+            decode_time = metrics.last_token_ts - metrics.first_token_ts
+
+            # Inference interval is from first SCHEDULED to last NEW_TOKEN
+            # Any preemptions during prefill or decode are included
+            inference_time = metrics.last_token_ts - metrics.scheduled_ts
+            span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
+                               self.tokenizer.tokenizer_id)
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
+                               req_state.request_id)
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+                               req_state.max_tokens_param)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+                               len(req_state.prompt_token_ids))
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                               metrics.num_generation_tokens)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               metrics.queued_ts - metrics.arrival_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
+                               ttft)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E,
+                               e2e_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               queued_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
+                               prefill_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
+                               decode_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
+                               inference_time)
+
     def _update_stats_from_output(self, req_state: RequestState,
                                   engine_core_output: EngineCoreOutput,
                                   engine_core_timestamp: Optional[float],
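
do_tracing emits one llm_request span per finished request, and the latency attributes are plain differences over the per-request timestamps recorded in the request's stats. A small worked sketch of that arithmetic with made-up timestamps (the variable names mirror the code above, but nothing here imports or calls vLLM):

    # Made-up timestamps in seconds; only the arithmetic mirrors do_tracing.
    arrival_time = 100.00    # request received by the frontend
    queued_ts = 100.01       # first QUEUED event
    scheduled_ts = 100.05    # first SCHEDULED event
    first_token_ts = 100.35  # first NEW_TOKEN
    last_token_ts = 101.20   # last NEW_TOKEN

    ttft = first_token_ts - arrival_time           # 0.35 s to first token
    queued_time = scheduled_ts - queued_ts         # 0.04 s waiting in the queue
    prefill_time = first_token_ts - scheduled_ts   # 0.30 s, preemptions included
    decode_time = last_token_ts - first_token_ts   # 0.85 s, preemptions included
    inference_time = last_token_ts - scheduled_ts  # 1.15 s = prefill + decode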

vllm/v1/engine/processor.py
Lines changed: 0 additions & 2 deletions

@@ -215,8 +215,6 @@ def process_inputs(
         self._validate_params(params, lora_request)
         if priority != 0:
             raise ValueError("V1 does not support priority yet.")
-        if trace_headers is not None:
-            raise ValueError("V1 does not support tracing yet.")
         if prompt_adapter_request is not None:
             raise ValueError("V1 does not support prompt_adapter_request.")

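
With the "V1 does not support tracing yet" guard removed, trace headers supplied by the caller now flow through process_inputs into EngineCoreRequest. A hedged client-side sketch of producing such headers with the W3C trace-context propagator, assuming the server has tracing enabled via ObservabilityConfig.otlp_traces_endpoint as wired up in output_processor.py above; the URL, model name, and payload are assumptions for the example, not part of this commit, and opentelemetry-sdk plus requests are assumed to be installed:

    import requests
    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.trace.propagation.tracecontext import (
        TraceContextTextMapPropagator)

    trace.set_tracer_provider(TracerProvider())
    tracer = trace.get_tracer("client")

    with tracer.start_as_current_span("client_request"):
        headers: dict[str, str] = {}
        TraceContextTextMapPropagator().inject(headers)  # adds "traceparent"
        requests.post(
            "http://localhost:8000/v1/completions",  # assumed local server
            json={"model": "my-model", "prompt": "Hello", "max_tokens": 8},
            headers=headers,
        )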
