Skip to content

Commit ba83ef1

Browse files
authored
feat(trace-explorer): Add trace breakdowns to trace response (#69042)
To give an overview of where a trace is spending its time, this adds a breakdown of the trace by project.
1 parent f8e8949 commit ba83ef1

File tree

3 files changed

+703
-22
lines changed

3 files changed

+703
-22
lines changed

src/sentry/api/endpoints/organization_traces.py

Lines changed: 180 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from collections import defaultdict
22
from collections.abc import Mapping
33
from datetime import datetime, timedelta
4-
from typing import Any, TypedDict, cast
4+
from typing import Any, Literal, TypedDict, cast
55

6+
import sentry_sdk
67
from rest_framework import serializers
78
from rest_framework.request import Request
89
from rest_framework.response import Response
10+
from snuba_sdk import Column, Condition, Op
911

1012
from sentry import options
1113
from sentry.api.api_owners import ApiOwner
@@ -19,13 +21,24 @@
1921
from sentry.search.events.types import ParamsType, QueryBuilderConfig
2022
from sentry.snuba.dataset import Dataset
2123
from sentry.snuba.referrer import Referrer
24+
from sentry.utils.snuba import bulk_snql_query
25+
26+
27+
class TraceInterval(TypedDict):
    """One contiguous slice of a trace's timeline in the per-trace breakdown.

    Produced by ``process_breakdowns`` and returned under the ``breakdowns``
    key of each ``TraceResult``.
    """

    # Project the interval is attributed to; None for "missing" gaps.
    project: str | None
    # Interval boundaries, in the same units as first_seen()/last_seen()
    # (presumably millisecond timestamps — confirm against the snuba functions).
    start: int
    end: int
    # "project": time covered by a project's segments;
    # "missing": a gap between segments with nothing observed;
    # "unknown": declared here but not produced anywhere in this file.
    kind: Literal["project", "missing", "unknown"]
2232

2333

2434
class TraceResult(TypedDict):
    """Serialized shape of one trace row in the traces endpoint response."""

    # Trace id.
    trace: str
    # Number of segments matched for this trace (snuba count()).
    numSpans: int
    # Display name from trace_name(); may be absent/None.
    name: str | None
    # last_seen() - first_seen() for the trace.
    duration: int
    # Trace bounds from first_seen() / last_seen().
    start: int
    end: int
    # Per-project timeline intervals computed by process_breakdowns().
    breakdowns: list[TraceInterval]
    # The matching spans, reduced to the requested fields.
    spans: list[Mapping[str, Any]]
3043

3144

@@ -118,34 +131,90 @@ def data_fn(offset: int, limit: int):
118131
snuba_params.start = min_timestamp - buffer
119132
snuba_params.end = max_timestamp + buffer
120133

134+
trace_condition = f"trace:[{', '.join(spans_by_trace.keys())}]"
135+
121136
with handle_query_errors():
122-
builder = SpansIndexedQueryBuilder(
137+
breakdowns_query = SpansIndexedQueryBuilder(
138+
Dataset.SpansIndexed,
139+
cast(ParamsType, params),
140+
snuba_params=snuba_params,
141+
query=f"{trace_condition}",
142+
selected_columns=[
143+
"trace",
144+
"project",
145+
"transaction",
146+
"first_seen()",
147+
"last_seen()",
148+
],
149+
orderby=["first_seen()", "last_seen()"],
150+
# limit the number of segments we fetch per trace so a single
151+
# large trace does not result in the rest being blank
152+
limitby=("trace", int(10_000 / len(spans_by_trace))),
153+
config=QueryBuilderConfig(
154+
functions_acl=["trace_name", "first_seen", "last_seen"],
155+
transform_alias_to_input_format=True,
156+
),
157+
)
158+
# TODO: this should be `is_transaction:1` but there's some
159+
# boolean mapping that's not working for this field
160+
breakdowns_query.add_conditions(
161+
[
162+
Condition(Column("is_segment"), Op.EQ, 1),
163+
]
164+
)
165+
166+
with handle_query_errors():
167+
traces_meta_query = SpansIndexedQueryBuilder(
123168
Dataset.SpansIndexed,
124169
cast(ParamsType, params),
125170
snuba_params=snuba_params,
126-
query=f"trace:[{', '.join(spans_by_trace.keys())}]",
127-
selected_columns=["trace", "count()", "trace_name()", "elapsed()"],
171+
query=trace_condition,
172+
selected_columns=[
173+
"trace",
174+
"count()",
175+
"trace_name()",
176+
"first_seen()",
177+
"last_seen()",
178+
],
128179
limit=len(spans_by_trace),
129180
config=QueryBuilderConfig(
130-
functions_acl=["trace_name", "elapsed"],
181+
functions_acl=["trace_name", "first_seen", "last_seen"],
131182
transform_alias_to_input_format=True,
132183
),
133184
)
134-
trace_results = builder.run_query(Referrer.API_TRACE_EXPLORER_TRACES_META.value)
135-
trace_results = builder.process_results(trace_results)
185+
186+
with handle_query_errors():
187+
results = bulk_snql_query(
188+
[
189+
breakdowns_query.get_snql_query(),
190+
traces_meta_query.get_snql_query(),
191+
],
192+
Referrer.API_TRACE_EXPLORER_TRACES_META.value,
193+
)
194+
breakdowns_results = breakdowns_query.process_results(results[0])
195+
traces_meta_results = traces_meta_query.process_results(results[1])
196+
197+
try:
198+
breakdowns = process_breakdowns(breakdowns_results["data"])
199+
except Exception as e:
200+
sentry_sdk.capture_exception(e)
201+
breakdowns = defaultdict(list)
136202

137203
traces: list[TraceResult] = [
138204
{
139205
"trace": row["trace"],
140206
"numSpans": row["count()"],
141207
"name": row["trace_name()"],
142-
"duration": row["elapsed()"],
208+
"duration": row["last_seen()"] - row["first_seen()"],
209+
"start": row["first_seen()"],
210+
"end": row["last_seen()"],
211+
"breakdowns": breakdowns[row["trace"]],
143212
"spans": [
144213
{field: span[field] for field in serialized["field"]}
145214
for span in spans_by_trace[row["trace"]]
146215
],
147216
}
148-
for row in trace_results["data"]
217+
for row in traces_meta_results["data"]
149218
]
150219

151220
return {"data": traces, "meta": meta}
@@ -162,3 +231,105 @@ def data_fn(offset: int, limit: int):
162231
dataset=Dataset.SpansIndexed,
163232
),
164233
)
234+
235+
236+
def process_breakdowns(data):
    """Compute a per-trace timeline breakdown from ordered segment rows.

    ``data`` is a list of rows sorted by ``first_seen()`` then ``last_seen()``,
    each carrying ``trace``, ``project``, ``first_seen()`` and ``last_seen()``
    keys. Returns a mapping of trace id -> list of ``TraceInterval``s covering
    the trace in time order, where overlapping same-project segments are
    merged and gaps with no observed segments are emitted as
    ``kind="missing"`` intervals.
    """
    # Per-trace stack of intervals that are still "open" at the point in time
    # currently being processed (innermost/most recent on top).
    trace_stacks: dict[str, list[TraceInterval]] = defaultdict(list)
    breakdowns_by_trace: dict[str, list[TraceInterval]] = defaultdict(list)

    def stack_pop(stack):
        # Popping an interval means time up to its end has been consumed, so
        # the interval below it only has its remaining tail left.
        interval = stack.pop()
        if stack:
            stack[-1]["start"] = interval["end"]
        return interval

    def stack_clear(breakdown, stack, until=None):
        # Flush intervals that end at or before `until` (all of them when
        # `until` is None) into the breakdown. Returns the last interval
        # actually emitted, if any.
        interval = None
        while stack:
            if until is not None and stack[-1]["end"] > until:
                break

            item = stack_pop(stack)

            # Popping adjusts the start of the intervals below it, so an
            # interval can end up inverted (start > end); filter those out.
            if item["start"] <= item["end"]:
                interval = item
                breakdown.append(interval)
        return interval

    for row in data:
        stack = trace_stacks[row["trace"]]
        cur: TraceInterval = {
            "project": row["project"],
            "start": row["first_seen()"],
            "end": row["last_seen()"],
            "kind": "project",
        }

        # Nothing on the stack yet: push and wait for the next row so an
        # interval boundary can be determined.
        if not stack:
            stack.append(cur)
            continue

        breakdown = breakdowns_by_trace[row["trace"]]
        prev = stack[-1]
        if prev["end"] <= cur["start"]:
            # No overlap between this segment and the previous one, so
            # everything open up to cur's start is finished.
            last = stack_clear(breakdown, stack, until=cur["start"])

            # If nothing left on the stack reaches cur's start, there is a
            # real gap in the trace; emit a "missing" interval for it.
            if not stack or stack[-1]["end"] < cur["start"]:
                if last is not None and last["end"] < cur["start"]:
                    breakdown.append(
                        {
                            "project": None,
                            "start": prev["end"],
                            "end": cur["start"],
                            "kind": "missing",
                        }
                    )
        elif prev["project"] == cur["project"]:
            # Overlapping intervals in the same project merge into one.
            prev["end"] = max(prev["end"], cur["end"])
            continue
        else:
            # This implies some overlap between this segment and the previous
            # one in a different project: emit the portion of the previous
            # interval up to where the current one begins.
            breakdown.append(
                {
                    "project": prev["project"],
                    "start": prev["start"],
                    "end": cur["start"],
                    "kind": "project",
                }
            )

            # The previous interval ends within the current one, so it is
            # fully consumed once cur takes over the top of the stack.
            if prev["end"] <= cur["end"]:
                stack_pop(stack)

        stack.append(cur)

    # Flush whatever is still open on each trace's stack.
    for trace, stack in trace_stacks.items():
        stack_clear(breakdowns_by_trace[trace], stack)

    return breakdowns_by_trace

src/sentry/search/events/datasets/spans_indexed.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -254,19 +254,20 @@ def function_converter(self) -> Mapping[str, SnQLFunction]:
254254
private=True,
255255
),
256256
SnQLFunction(
257-
"elapsed",
257+
"first_seen",
258258
snql_aggregate=lambda args, alias: Function(
259-
"minus",
260-
[
261-
Function(
262-
"max",
263-
[self._resolve_timestamp_with_ms("end_timestamp", "end_ms")],
264-
),
265-
Function(
266-
"min",
267-
[self._resolve_timestamp_with_ms("start_timestamp", "start_ms")],
268-
),
269-
],
259+
"min",
260+
[self._resolve_timestamp_with_ms("start_timestamp", "start_ms")],
261+
alias,
262+
),
263+
default_result_type="duration",
264+
private=True,
265+
),
266+
SnQLFunction(
267+
"last_seen",
268+
snql_aggregate=lambda args, alias: Function(
269+
"max",
270+
[self._resolve_timestamp_with_ms("end_timestamp", "end_ms")],
270271
alias,
271272
),
272273
default_result_type="duration",

0 commit comments

Comments
 (0)