Skip to content

chore(perf-issues): Setup N+1 DB query experiment #90530

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/sentry/issues/grouptype.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,17 @@ class PerformanceNPlusOneGroupType(PerformanceGroupTypeDefaults, GroupType):
released = True


@dataclass(frozen=True)
class PerformanceNPlusOneExperimentalGroupType(PerformanceGroupTypeDefaults, GroupType):
type_id = 1906
slug = "performance_n_plus_one_db_queries_experimental"
description = "N+1 Query (Experimental)"
category = GroupCategory.PERFORMANCE.value
category_v2 = GroupCategory.PERFORMANCE_BEST_PRACTICE.value
default_priority = PriorityLevel.LOW
released = False


@dataclass(frozen=True)
class PerformanceConsecutiveDBQueriesGroupType(PerformanceGroupTypeDefaults, GroupType):
type_id = 1007
Expand Down
1 change: 1 addition & 0 deletions src/sentry/utils/performance_issues/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class DetectorType(Enum):
DB_MAIN_THREAD = "db_main_thread"
HTTP_OVERHEAD = "http_overhead"
EXPERIMENTAL_N_PLUS_ONE_API_CALLS = "experimental_n_plus_one_api_calls"
EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES = "experimental_n_plus_one_db_queries"


# Detector and the corresponding system option must be added to this list to have issues created.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
from __future__ import annotations

import hashlib
from typing import Any

from sentry.issues.grouptype import PerformanceNPlusOneExperimentalGroupType
from sentry.issues.issue_occurrence import IssueEvidence
from sentry.models.organization import Organization
from sentry.models.project import Project
from sentry.utils import metrics
from sentry.utils.performance_issues.base import (
DetectorType,
PerformanceDetector,
get_notification_attachment_body,
get_span_evidence_value,
total_span_time,
)
from sentry.utils.performance_issues.performance_problem import PerformanceProblem
from sentry.utils.performance_issues.types import Span
from sentry.utils.safe import get_path


class NPlusOneDBSpanExperimentalDetector(PerformanceDetector):
"""
Detector goals:
- identify a database N+1 query with high accuracy
- collect enough information to create a good fingerprint (see below)
- only return issues with good fingerprints

A good fingerprint is one that gives us confidence that, if two fingerprints
match, then they correspond to the same issue location in code (and
therefore, the same fix).

To do this we look for a specific structure:

[-------- transaction span -----------]
[-------- parent span -----------]
[source query]
[n0]
[n1]
[n2]
...

If we detect two different N+1 problems, and both have matching parents,
source queries, and repeated (n) queries, then we can be fairly confident
they are the same issue.
"""

__slots__ = (
"stored_problems",
"potential_parents",
"source_span",
"n_hash",
"n_spans",
"transaction",
)

type = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES
settings_key = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES

def __init__(self, settings: dict[DetectorType, Any], event: dict[str, Any]) -> None:
super().__init__(settings, event)

self.stored_problems = {}
self.potential_parents = {}
self.n_hash: str | None = None
self.n_spans: list[Span] = []
self.source_span: Span | None = None
root_span = get_path(self._event, "contexts", "trace")
if root_span:
self.potential_parents[root_span.get("span_id")] = root_span

def is_creation_allowed_for_system(self) -> bool:
# Defer to the issue platform for whether to create issues
# See https://develop.sentry.dev/backend/issue-platform/#releasing-your-issue-type
return True

def is_creation_allowed_for_organization(self, organization: Organization | None) -> bool:
return True # This detector is fully rolled out

def is_creation_allowed_for_project(self, project: Project | None) -> bool:
return self.settings["detection_enabled"]

def visit_span(self, span: Span) -> None:
span_id = span.get("span_id", None)
op = span.get("op", None)
if not span_id or not op:
return

if not self._is_db_op(op):
# This breaks up the N+1 we're currently tracking.
self._maybe_store_problem()
self._reset_detection()
# Treat it as a potential parent as long as it isn't the root span.
if span.get("parent_span_id", None):
self.potential_parents[span_id] = span
return

if not self.source_span:
# We aren't currently tracking an N+1. Maybe this span triggers one!
self._maybe_use_as_source(span)
return

# If we got this far, we know we're a DB span and we're looking for a
# sequence of N identical DB spans.
if self._continues_n_plus_1(span):
self.n_spans.append(span)
else:
previous_span = self.n_spans[-1] if self.n_spans else None
self._maybe_store_problem()
self._reset_detection()

# Maybe this DB span starts a whole new N+1!
if previous_span:
self._maybe_use_as_source(previous_span)
if self.source_span and self._continues_n_plus_1(span):
self.n_spans.append(span)
else:
self.source_span = None
self._maybe_use_as_source(span)

def on_complete(self) -> None:
self._maybe_store_problem()

def _is_db_op(self, op: str) -> bool:
return op.startswith("db") and not op.startswith("db.redis")

def _maybe_use_as_source(self, span: Span) -> None:
parent_span_id = span.get("parent_span_id", None)
if not parent_span_id or parent_span_id not in self.potential_parents:
return

self.source_span = span

def _continues_n_plus_1(self, span: Span) -> bool:
if self.source_span is None:
return False

expected_parent_id = self.source_span.get("parent_span_id", None)
parent_id = span.get("parent_span_id", None)
if not parent_id or parent_id != expected_parent_id:
return False

span_hash = span.get("hash", None)
if not span_hash:
return False

if span_hash == self.source_span.get("hash", None):
# The source span and n repeating spans must have different queries.
return False

if not self.n_hash:
self.n_hash = span_hash
return True

return span_hash == self.n_hash

def _maybe_store_problem(self) -> None:
if not self.source_span or not self.n_spans:
return

# Do we have enough spans?
count = self.settings.get("count")
if len(self.n_spans) < count:
return

# Do the spans take enough total time?
if not self._is_slower_than_threshold():
return

# We require a parent span in order to improve our fingerprint accuracy.
parent_span_id = self.source_span.get("parent_span_id", None)
if not parent_span_id:
return
parent_span = self.potential_parents[parent_span_id]
if not parent_span:
return

# Track how many N+1-looking problems we found but dropped because we
# couldn't be sure (maybe the truncated part of the query differs).
if not contains_complete_query(
self.source_span, is_source=True
) or not contains_complete_query(self.n_spans[0]):
metrics.incr("performance.performance_issue.truncated_np1_db")
return

if not self._contains_valid_repeating_query(self.n_spans[0]):
metrics.incr("performance.performance_issue.unparametrized_first_span")
return

fingerprint = self._fingerprint(
parent_span.get("op", None),
parent_span.get("hash", None),
self.source_span.get("hash", None),
self.n_spans[0].get("hash", None),
)
if fingerprint not in self.stored_problems:
self._metrics_for_extra_matching_spans()

offender_span_ids = [span.get("span_id", None) for span in self.n_spans]

self.stored_problems[fingerprint] = PerformanceProblem(
fingerprint=fingerprint,
op="db",
desc=self.n_spans[0].get("description", ""),
type=PerformanceNPlusOneExperimentalGroupType,
parent_span_ids=[parent_span_id],
cause_span_ids=[self.source_span.get("span_id", None)],
offender_span_ids=offender_span_ids,
evidence_display=[
IssueEvidence(
name="Offending Spans",
value=get_notification_attachment_body(
"db",
self.n_spans[0].get("description", ""),
),
# Has to be marked important to be displayed in the notifications
important=True,
)
],
evidence_data={
"transaction_name": self._event.get("transaction", ""),
"op": "db",
"parent_span_ids": [parent_span_id],
"parent_span": get_span_evidence_value(parent_span),
"cause_span_ids": [self.source_span.get("span_id", None)],
"offender_span_ids": offender_span_ids,
"repeating_spans": get_span_evidence_value(self.n_spans[0]),
"repeating_spans_compact": get_span_evidence_value(
self.n_spans[0], include_op=False
),
"num_repeating_spans": str(len(offender_span_ids)),
},
)

def _is_slower_than_threshold(self) -> bool:
duration_threshold = self.settings.get("duration_threshold")
return total_span_time(self.n_spans) >= duration_threshold

def _contains_valid_repeating_query(self, span: Span) -> bool:
# Make sure we at least have a space, to exclude e.g. MongoDB and
# Prisma's `rawQuery`.
query = span.get("description", None)
return bool(query) and " " in query

def _metrics_for_extra_matching_spans(self) -> None:
# Checks for any extra spans that match the detected problem but are not part of affected spans.
# Temporary check since we eventually want to capture extra perf problems on the initial pass while walking spans.
n_count = len(self.n_spans)
all_matching_spans = [
span
for span in self._event.get("spans", [])
if span.get("span_id", None) == self.n_hash
]
all_count = len(all_matching_spans)
if n_count > 0 and n_count != all_count:
metrics.incr("performance.performance_issue.np1_db.extra_spans")

def _reset_detection(self) -> None:
self.source_span = None
self.n_hash = None
self.n_spans = []

def _fingerprint(self, parent_op: str, parent_hash: str, source_hash: str, n_hash: str) -> str:
# XXX: this has to be a hardcoded string otherwise grouping will break
problem_class = "GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES"
full_fingerprint = hashlib.sha1(
(str(parent_op) + str(parent_hash) + str(source_hash) + str(n_hash)).encode("utf8"),
).hexdigest()
return f"1-{problem_class}-{full_fingerprint}"


def contains_complete_query(span: Span, is_source: bool | None = False) -> bool:
# Remove the truncation check from the n_plus_one db detector.
query = span.get("description", None)
if is_source and query:
return True
else:
return query and not query.endswith("...")
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@ def get_detection_settings(project_id: int | None = None) -> dict[DetectorType,
"duration_threshold": settings["n_plus_one_db_duration_threshold"], # ms
"detection_enabled": settings["n_plus_one_db_queries_detection_enabled"],
},
DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES: {
"count": settings["n_plus_one_db_count"],
"duration_threshold": settings["n_plus_one_db_duration_threshold"], # ms
"detection_enabled": settings["n_plus_one_db_queries_detection_enabled"],
},
DetectorType.CONSECUTIVE_DB_OP: {
# time saved by running all queries in parallel
"min_time_saved": settings["consecutive_db_min_time_saved_threshold"], # ms
Expand Down
Loading
Loading