feat(EXP-253): Add attribute comparison endpoint (#92283)

shruthilayaj · web-flow · commit c5379b80cb02 · 2025-05-27T11:11:26.000-04:00
Adds an endpoint that returns a distribution of ranked "suspect" attributes
when comparing two cohorts.
diff --git a/src/sentry/api/endpoints/organization_trace_item_attributes_ranked.py b/src/sentry/api/endpoints/organization_trace_item_attributes_ranked.py
@@ -0,0 +1,164 @@
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any
+
+from rest_framework.request import Request
+from rest_framework.response import Response
+from sentry_protos.snuba.v1.endpoint_trace_item_stats_pb2 import (
+    AttributeDistributionsRequest,
+    StatsType,
+    TraceItemStatsRequest,
+)
+
+from sentry import features
+from sentry.api.api_owners import ApiOwner
+from sentry.api.api_publish_status import ApiPublishStatus
+from sentry.api.base import region_silo_endpoint
+from sentry.api.bases import NoProjects, OrganizationEventsV2EndpointBase
+from sentry.models.organization import Organization
+from sentry.search.eap.resolver import SearchResolver
+from sentry.search.eap.spans.definitions import SPAN_DEFINITIONS
+from sentry.search.eap.types import SearchResolverConfig, SupportedTraceItemType
+from sentry.search.eap.utils import translate_internal_to_public_alias
+from sentry.seer.workflows.compare import keyed_rrf_score
+from sentry.snuba.referrer import Referrer
+from sentry.snuba.spans_rpc import run_table_query
+from sentry.utils.snuba_rpc import trace_item_stats_rpc
+
+# Module-level pool shared across requests; each request queues four queries on it.
+_query_thread_pool = ThreadPoolExecutor(max_workers=4)
+
+
+@region_silo_endpoint
+class OrganizationTraceItemsAttributesRankedEndpoint(OrganizationEventsV2EndpointBase):
+    """Compare two span cohorts and return their attribute distributions in ranked order.
+
+    ``GET`` takes two search queries, ``query_1`` and ``query_2``, fetches the
+    attribute distributions and span count for each, and returns the per-cohort
+    buckets of every attribute in the order produced by ``keyed_rrf_score``.
+    """
+
+    publish_status = {
+        "GET": ApiPublishStatus.PRIVATE,
+    }
+    owner = ApiOwner.PERFORMANCE
+
+    @staticmethod
+    def _stats_request(meta: Any, cohort_filter: Any) -> TraceItemStatsRequest:
+        """Build the attribute-distribution stats request for one cohort filter."""
+        return TraceItemStatsRequest(
+            filter=cohort_filter,
+            meta=meta,
+            stats_types=[
+                StatsType(
+                    attribute_distributions=AttributeDistributionsRequest(
+                        max_buckets=100,
+                    )
+                )
+            ],
+        )
+
+    @staticmethod
+    def _submit_totals(snuba_params: Any, query: str) -> Any:
+        """Queue the ``count(span.duration)`` table query for one cohort on the pool."""
+        return _query_thread_pool.submit(
+            run_table_query,
+            snuba_params,
+            query,
+            ["count(span.duration)"],
+            None,
+            config=SearchResolverConfig(use_aggregate_conditions=False),
+            offset=0,
+            limit=1,
+            sampling_mode=snuba_params.sampling_mode,
+            referrer=Referrer.API_SPAN_SAMPLE_GET_SPAN_DATA.value,
+        )
+
+    @staticmethod
+    def _collect_distribution(stats: Any) -> tuple[list[Any], dict[str, list[dict[str, Any]]]]:
+        """Flatten a stats response into scoring tuples and per-attribute buckets.
+
+        Returns ``(flat, by_attribute)``: ``flat`` holds one
+        ``(attribute_name, label, value)`` tuple per bucket, and ``by_attribute``
+        maps each attribute name to its ``{"label", "value"}`` bucket dicts.
+        """
+        flat = []
+        by_attribute = defaultdict(list)
+        # Only the first entry of ``results`` is read.
+        for attribute in stats.results[0].attribute_distributions.attributes:
+            for bucket in attribute.buckets:
+                flat.append((attribute.attribute_name, bucket.label, bucket.value))
+                by_attribute[attribute.attribute_name].append(
+                    {"label": bucket.label, "value": bucket.value}
+                )
+        return flat, by_attribute
+
+    def get(self, request: Request, organization: Organization) -> Response:
+        """Return attributes ranked by how much they differ between the two cohorts.
+
+        Responds 404 when the suspect-attributes feature is off, and with an empty
+        ``rankedAttributes`` list on ``NoProjects`` or when both queries are equal.
+        """
+        if not features.has(
+            "organizations:performance-spans-suspect-attributes", organization, actor=request.user
+        ):
+            return Response(status=404)
+
+        try:
+            snuba_params = self.get_snuba_params(request, organization)
+        except NoProjects:
+            return Response({"rankedAttributes": []})
+
+        resolver = SearchResolver(
+            params=snuba_params, config=SearchResolverConfig(), definitions=SPAN_DEFINITIONS
+        )
+        meta = resolver.resolve_meta(referrer=Referrer.API_SPANS_FREQUENCY_STATS_RPC.value)
+        query_1 = request.GET.get("query_1", "")
+        query_2 = request.GET.get("query_2", "")
+
+        # Identical query strings select the same cohort, so there is nothing to rank.
+        if query_1 == query_2:
+            return Response({"rankedAttributes": []})
+
+        # Both filters are resolved before any work is queued on the pool.
+        cohort_1, _, _ = resolver.resolve_query(query_1)
+        cohort_1_request = self._stats_request(meta, cohort_1)
+        cohort_2, _, _ = resolver.resolve_query(query_2)
+        cohort_2_request = self._stats_request(meta, cohort_2)
+
+        # Queue all four queries on the shared pool before waiting on any of them.
+        cohort_1_future = _query_thread_pool.submit(trace_item_stats_rpc, cohort_1_request)
+        totals_1_future = self._submit_totals(snuba_params, query_1)
+        cohort_2_future = _query_thread_pool.submit(trace_item_stats_rpc, cohort_2_request)
+        totals_2_future = self._submit_totals(snuba_params, query_2)
+
+        # Wait for both stats responses first, then both totals.
+        cohort_1_data = cohort_1_future.result()
+        cohort_2_data = cohort_2_future.result()
+        totals_1_result = totals_1_future.result()
+        totals_2_result = totals_2_future.result()
+
+        cohort_1_distribution, cohort_1_buckets = self._collect_distribution(cohort_1_data)
+        cohort_2_distribution, cohort_2_buckets = self._collect_distribution(cohort_2_data)
+
+        scored_attrs = keyed_rrf_score(
+            cohort_1_distribution,
+            cohort_2_distribution,
+            totals_1_result["data"][0]["count(span.duration)"],
+            totals_2_result["data"][0]["count(span.duration)"],
+        )
+
+        ranked_distribution: dict[str, list[dict[str, Any]]] = {"rankedAttributes": []}
+        for attr, _ in scored_attrs:
+            distribution = {
+                "attributeName": translate_internal_to_public_alias(
+                    attr, "string", SupportedTraceItemType.SPANS
+                )
+                or attr,
+                # ``.get`` yields ``None`` for an attribute a cohort has no buckets for.
+                "cohort1": cohort_1_buckets.get(attr),
+                "cohort2": cohort_2_buckets.get(attr),
+            }
+            ranked_distribution["rankedAttributes"].append(distribution)
+
+        return Response(ranked_distribution)
diff --git a/src/sentry/api/urls.py b/src/sentry/api/urls.py
@@ -38,6 +38,9 @@
     OrganizationTraceItemAttributesEndpoint,
     OrganizationTraceItemAttributeValuesEndpoint,
 )
+from sentry.api.endpoints.organization_trace_item_attributes_ranked import (
+    OrganizationTraceItemsAttributesRankedEndpoint,
+)
 from sentry.api.endpoints.organization_trace_summary import OrganizationTraceSummaryEndpoint
 from sentry.api.endpoints.organization_unsubscribe import (
     OrganizationUnsubscribeIssue,
@@ -1527,6 +1530,11 @@ def create_group_urls(name_prefix: str) -> list[URLPattern | URLResolver]:
         OrganizationTraceItemAttributeValuesEndpoint.as_view(),
         name="sentry-api-0-organization-trace-item-attribute-values",
     ),
+    re_path(
+        r"^(?P<organization_id_or_slug>[^\/]+)/trace-items/attributes/ranked/$",
+        OrganizationTraceItemsAttributesRankedEndpoint.as_view(),
+        name="sentry-api-0-organization-trace-item-attributes-ranked",
+    ),
     re_path(
         r"^(?P<organization_id_or_slug>[^\/]+)/spans/fields/$",
         OrganizationSpansFieldsEndpoint.as_view(),
diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py
@@ -266,6 +266,8 @@ def register_temporary_features(manager: FeatureManager):
     manager.add("organizations:performance-vitals-standalone-cls-lcp", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
     # Enable default explore queries in the new side nav
     manager.add("organizations:performance-default-explore-queries", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
+    # Enable suspect attributes feature
+    manager.add("organizations:performance-spans-suspect-attributes", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
     # Enables the playstation ingestion in relay
     manager.add("organizations:relay-playstation-ingestion", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
     # Enable profiling
diff --git a/tests/sentry/api/endpoints/test_organization_trace_item_attributes_ranked.py b/tests/sentry/api/endpoints/test_organization_trace_item_attributes_ranked.py
@@ -0,0 +1,88 @@
+from django.urls import reverse
+
+from sentry.testutils.cases import APITransactionTestCase, SnubaTestCase, SpanTestCase
+from sentry.testutils.helpers.datetime import before_now
+
+
+class OrganizationTraceItemsAttributesRankedEndpointTest(
+    APITransactionTestCase,
+    SnubaTestCase,
+    SpanTestCase,
+):
+    """Exercise the ranked trace-item attributes endpoint against stored EAP spans."""
+
+    view = "sentry-api-0-organization-trace-item-attributes-ranked"
+
+    def setUp(self):
+        super().setUp()
+        self.login_as(user=self.user)
+        self.features = {
+            "organizations:performance-spans-suspect-attributes": True,
+        }
+        self.ten_mins_ago = before_now(minutes=10)
+        self.ten_mins_ago_iso = self.ten_mins_ago.replace(microsecond=0).isoformat()
+
+    def do_request(self, query=None, features=None, **kwargs):
+        """GET the endpoint under ``features``, leaving the caller's ``query`` dict untouched."""
+        if features is None:
+            features = ["organizations:performance-spans-suspect-attributes"]
+        if query and "type" not in query:
+            query = {**query, "type": "string"}
+
+        with self.feature(features):
+            return self.client.get(
+                reverse(self.view, kwargs={"organization_id_or_slug": self.organization.slug}),
+                query,
+                format="json",
+                **kwargs,
+            )
+
+    def _store_span(self, description=None, tags=None, duration=None):
+        """Store one EAP span starting at ``self.ten_mins_ago``, filling in default values."""
+        if tags is None:
+            tags = {"foo": "bar"}
+
+        self.store_span(
+            self.create_span(
+                {"description": description or "foo", "sentry_tags": tags},
+                start_ts=self.ten_mins_ago,
+                duration=duration or 1000,
+            ),
+            is_eap=True,
+        )
+
+    def test_no_project(self):
+        response = self.do_request()
+        assert response.status_code == 200, response.data
+        assert response.data == {"rankedAttributes": []}
+
+    def test_no_feature(self):
+        response = self.do_request(features=[])
+        assert response.status_code == 404, response.data
+
+    def test_distribution_values(self):
+        tags = [
+            ({"browser": "chrome", "device": "desktop"}, 500),
+            ({"browser": "chrome", "device": "mobile"}, 100),
+            ({"browser": "chrome", "device": "mobile"}, 100),
+            ({"browser": "chrome", "device": "desktop"}, 100),
+            ({"browser": "safari", "device": "mobile"}, 100),
+            ({"browser": "chrome", "device": "desktop"}, 500),
+            ({"browser": "edge", "device": "desktop"}, 500),
+        ]
+        for tag, duration in tags:
+            self._store_span(tags=tag, duration=duration)
+
+        response = self.do_request(
+            query={"query_1": "span.duration:<=100", "query_2": "span.duration:>100"}
+        )
+        assert response.status_code == 200, response.data
+        distributions = response.data["rankedAttributes"]
+        assert distributions[0]["attributeName"] == "sentry.device"
+        assert distributions[0]["cohort1"] == [
+            {"label": "mobile", "value": 3.0},
+            {"label": "desktop", "value": 1.0},
+        ]
+        assert distributions[0]["cohort2"] == [{"label": "desktop", "value": 3.0}]
+
+        assert distributions[1]["attributeName"] == "browser"