Skip to content

Commit f4d78c9

Browse files
leeandher and andrewshie-sentry
authored and committed
chore(perf-issues): Setup N+1 DB query experiment (#90530)
Similar to #90070 but for the N+1 DB query detector. This PR has no functional changes, and the new group type is not released, so it will not produce any results. A follow-up PR will add the detector changes and roll them out internally for testing.
1 parent 7f3534e commit f4d78c9

File tree

5 files changed

+605
-0
lines changed

5 files changed

+605
-0
lines changed

src/sentry/issues/grouptype.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,17 @@ class PerformanceNPlusOneGroupType(PerformanceGroupTypeDefaults, GroupType):
332332
released = True
333333

334334

335+
@dataclass(frozen=True)
class PerformanceNPlusOneExperimentalGroupType(PerformanceGroupTypeDefaults, GroupType):
    # Experimental clone of PerformanceNPlusOneGroupType, used to trial N+1 DB
    # query detector changes without touching the released issue type.
    # NOTE(review): 1906 appears to sit in an experimental id range (cf. 1907
    # used by the N+1 API calls experiment) — confirm before reusing nearby ids.
    type_id = 1906
    slug = "performance_n_plus_one_db_queries_experimental"
    description = "N+1 Query (Experimental)"
    category = GroupCategory.PERFORMANCE.value
    category_v2 = GroupCategory.PERFORMANCE_BEST_PRACTICE.value
    default_priority = PriorityLevel.LOW
    # Not released: per the commit message, this group type produces no
    # user-visible results until a follow-up rolls the detector out.
    released = False
344+
345+
335346
@dataclass(frozen=True)
336347
class PerformanceConsecutiveDBQueriesGroupType(PerformanceGroupTypeDefaults, GroupType):
337348
type_id = 1007

src/sentry/utils/performance_issues/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class DetectorType(Enum):
3030
DB_MAIN_THREAD = "db_main_thread"
3131
HTTP_OVERHEAD = "http_overhead"
3232
EXPERIMENTAL_N_PLUS_ONE_API_CALLS = "experimental_n_plus_one_api_calls"
33+
EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES = "experimental_n_plus_one_db_queries"
3334

3435

3536
# Detector and the corresponding system option must be added to this list to have issues created.
Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
from __future__ import annotations
2+
3+
import hashlib
4+
from typing import Any
5+
6+
from sentry.issues.grouptype import PerformanceNPlusOneExperimentalGroupType
7+
from sentry.issues.issue_occurrence import IssueEvidence
8+
from sentry.models.organization import Organization
9+
from sentry.models.project import Project
10+
from sentry.utils import metrics
11+
from sentry.utils.performance_issues.base import (
12+
DetectorType,
13+
PerformanceDetector,
14+
get_notification_attachment_body,
15+
get_span_evidence_value,
16+
total_span_time,
17+
)
18+
from sentry.utils.performance_issues.performance_problem import PerformanceProblem
19+
from sentry.utils.performance_issues.types import Span
20+
from sentry.utils.safe import get_path
21+
22+
23+
class NPlusOneDBSpanExperimentalDetector(PerformanceDetector):
    """
    Detector goals:
      - identify a database N+1 query with high accuracy
      - collect enough information to create a good fingerprint (see below)
      - only return issues with good fingerprints

    A good fingerprint is one that gives us confidence that, if two fingerprints
    match, then they correspond to the same issue location in code (and
    therefore, the same fix).

    To do this we look for a specific structure:

      [-------- transaction span -----------]
         [-------- parent span -----------]
            [source query]
                          [n0]
                              [n1]
                                  [n2]
                                      ...

    If we detect two different N+1 problems, and both have matching parents,
    source queries, and repeated (n) queries, then we can be fairly confident
    they are the same issue.
    """

    __slots__ = (
        "stored_problems",
        "potential_parents",
        "source_span",
        "n_hash",
        "n_spans",
        "transaction",
    )

    type = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES
    settings_key = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES

    def __init__(self, settings: dict[DetectorType, Any], event: dict[str, Any]) -> None:
        super().__init__(settings, event)

        # Detected problems keyed by fingerprint, so duplicates collapse.
        self.stored_problems: dict[str, PerformanceProblem] = {}
        # Non-DB spans seen so far, keyed by span_id. A source query's parent
        # must be one of these for us to trust the fingerprint.
        self.potential_parents: dict[str, Span] = {}
        self.n_hash: str | None = None  # query hash shared by the repeating spans
        self.n_spans: list[Span] = []  # the repeating (N) spans collected so far
        self.source_span: Span | None = None  # the query preceding the repeats
        root_span = get_path(self._event, "contexts", "trace")
        if root_span:
            self.potential_parents[root_span.get("span_id")] = root_span

    def is_creation_allowed_for_system(self) -> bool:
        # Defer to the issue platform for whether to create issues
        # See https://develop.sentry.dev/backend/issue-platform/#releasing-your-issue-type
        return True

    def is_creation_allowed_for_organization(self, organization: Organization | None) -> bool:
        # No organization-level gating for this detector.
        return True

    def is_creation_allowed_for_project(self, project: Project | None) -> bool:
        return self.settings["detection_enabled"]

    def visit_span(self, span: Span) -> None:
        """Feed the next span (in event order) into the detection state machine."""
        span_id = span.get("span_id", None)
        op = span.get("op", None)
        if not span_id or not op:
            return

        if not self._is_db_op(op):
            # This breaks up the N+1 we're currently tracking.
            self._maybe_store_problem()
            self._reset_detection()
            # Treat it as a potential parent as long as it isn't the root span.
            if span.get("parent_span_id", None):
                self.potential_parents[span_id] = span
            return

        if not self.source_span:
            # We aren't currently tracking an N+1. Maybe this span triggers one!
            self._maybe_use_as_source(span)
            return

        # If we got this far, we know we're a DB span and we're looking for a
        # sequence of N identical DB spans.
        if self._continues_n_plus_1(span):
            self.n_spans.append(span)
        else:
            previous_span = self.n_spans[-1] if self.n_spans else None
            self._maybe_store_problem()
            self._reset_detection()

            # Maybe this DB span starts a whole new N+1!
            if previous_span:
                self._maybe_use_as_source(previous_span)
            if self.source_span and self._continues_n_plus_1(span):
                self.n_spans.append(span)
            else:
                self.source_span = None
                self._maybe_use_as_source(span)

    def on_complete(self) -> None:
        # Flush any N+1 still being tracked when we run out of spans.
        self._maybe_store_problem()

    def _is_db_op(self, op: str) -> bool:
        # "db.redis" ops are excluded — they aren't relational queries.
        return op.startswith("db") and not op.startswith("db.redis")

    def _maybe_use_as_source(self, span: Span) -> None:
        # Only spans whose parent we have already seen can anchor a
        # trustworthy fingerprint.
        parent_span_id = span.get("parent_span_id", None)
        if not parent_span_id or parent_span_id not in self.potential_parents:
            return

        self.source_span = span

    def _continues_n_plus_1(self, span: Span) -> bool:
        """Return True if `span` is another repetition of the tracked N+1."""
        if self.source_span is None:
            return False

        # All repeats must share the source span's parent.
        expected_parent_id = self.source_span.get("parent_span_id", None)
        parent_id = span.get("parent_span_id", None)
        if not parent_id or parent_id != expected_parent_id:
            return False

        span_hash = span.get("hash", None)
        if not span_hash:
            return False

        if span_hash == self.source_span.get("hash", None):
            # The source span and n repeating spans must have different queries.
            return False

        if not self.n_hash:
            # First repeat: lock in the hash that all later repeats must match.
            self.n_hash = span_hash
            return True

        return span_hash == self.n_hash

    def _maybe_store_problem(self) -> None:
        """Store a PerformanceProblem for the tracked N+1 if it clears every bar."""
        if not self.source_span or not self.n_spans:
            return

        # Do we have enough spans?
        count = self.settings.get("count")
        if len(self.n_spans) < count:
            return

        # Do the spans take enough total time?
        if not self._is_slower_than_threshold():
            return

        # We require a parent span in order to improve our fingerprint accuracy.
        parent_span_id = self.source_span.get("parent_span_id", None)
        if not parent_span_id:
            return
        # Use .get() instead of [] so an untracked parent skips the problem
        # (making the guard below reachable) rather than raising KeyError.
        parent_span = self.potential_parents.get(parent_span_id)
        if not parent_span:
            return

        # Track how many N+1-looking problems we found but dropped because we
        # couldn't be sure (maybe the truncated part of the query differs).
        if not contains_complete_query(
            self.source_span, is_source=True
        ) or not contains_complete_query(self.n_spans[0]):
            metrics.incr("performance.performance_issue.truncated_np1_db")
            return

        if not self._contains_valid_repeating_query(self.n_spans[0]):
            metrics.incr("performance.performance_issue.unparametrized_first_span")
            return

        fingerprint = self._fingerprint(
            parent_span.get("op", None),
            parent_span.get("hash", None),
            self.source_span.get("hash", None),
            self.n_spans[0].get("hash", None),
        )
        if fingerprint not in self.stored_problems:
            self._metrics_for_extra_matching_spans()

            offender_span_ids = [span.get("span_id", None) for span in self.n_spans]

            self.stored_problems[fingerprint] = PerformanceProblem(
                fingerprint=fingerprint,
                op="db",
                desc=self.n_spans[0].get("description", ""),
                type=PerformanceNPlusOneExperimentalGroupType,
                parent_span_ids=[parent_span_id],
                cause_span_ids=[self.source_span.get("span_id", None)],
                offender_span_ids=offender_span_ids,
                evidence_display=[
                    IssueEvidence(
                        name="Offending Spans",
                        value=get_notification_attachment_body(
                            "db",
                            self.n_spans[0].get("description", ""),
                        ),
                        # Has to be marked important to be displayed in the notifications
                        important=True,
                    )
                ],
                evidence_data={
                    "transaction_name": self._event.get("transaction", ""),
                    "op": "db",
                    "parent_span_ids": [parent_span_id],
                    "parent_span": get_span_evidence_value(parent_span),
                    "cause_span_ids": [self.source_span.get("span_id", None)],
                    "offender_span_ids": offender_span_ids,
                    "repeating_spans": get_span_evidence_value(self.n_spans[0]),
                    "repeating_spans_compact": get_span_evidence_value(
                        self.n_spans[0], include_op=False
                    ),
                    "num_repeating_spans": str(len(offender_span_ids)),
                },
            )

    def _is_slower_than_threshold(self) -> bool:
        duration_threshold = self.settings.get("duration_threshold")
        return total_span_time(self.n_spans) >= duration_threshold

    def _contains_valid_repeating_query(self, span: Span) -> bool:
        # Make sure we at least have a space, to exclude e.g. MongoDB and
        # Prisma's `rawQuery`.
        query = span.get("description", None)
        return bool(query) and " " in query

    def _metrics_for_extra_matching_spans(self) -> None:
        # Checks for any extra spans that match the detected problem but are not part of affected spans.
        # Temporary check since we eventually want to capture extra perf problems on the initial pass while walking spans.
        n_count = len(self.n_spans)
        all_matching_spans = [
            span
            for span in self._event.get("spans", [])
            # Fix: n_hash holds a query *hash* (set from span.get("hash") in
            # _continues_n_plus_1), so compare against the span's hash — the
            # previous span_id comparison effectively never matched.
            if span.get("hash", None) == self.n_hash
        ]
        all_count = len(all_matching_spans)
        if n_count > 0 and n_count != all_count:
            metrics.incr("performance.performance_issue.np1_db.extra_spans")

    def _reset_detection(self) -> None:
        # Clear all N+1 tracking state; potential_parents is kept on purpose.
        self.source_span = None
        self.n_hash = None
        self.n_spans = []

    def _fingerprint(
        self,
        parent_op: str | None,
        parent_hash: str | None,
        source_hash: str | None,
        n_hash: str | None,
    ) -> str:
        """Build a stable fingerprint from the parent/source/repeat identities."""
        # XXX: this has to be a hardcoded string otherwise grouping will break
        problem_class = "GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES"
        full_fingerprint = hashlib.sha1(
            (str(parent_op) + str(parent_hash) + str(source_hash) + str(n_hash)).encode("utf8"),
        ).hexdigest()
        return f"1-{problem_class}-{full_fingerprint}"
271+
272+
273+
def contains_complete_query(span: Span, is_source: bool | None = False) -> bool:
    """Return True if the span's query description is present and not truncated.

    Source spans are exempt from the truncation check — any non-empty query is
    accepted (the truncation check was removed for the source span of the
    N+1 DB detector).

    Fix: the previous implementation returned ``query and not
    query.endswith("...")``, which yields ``None`` or ``""`` (not a bool) when
    the description is missing or empty, despite the ``-> bool`` annotation.

    :param span: span whose ``description`` holds the query text.
    :param is_source: True when the span is the N+1 source query.
    """
    query = span.get("description", None)
    if not query:
        # No query at all — never "complete", regardless of is_source.
        return False
    if is_source:
        return True
    # Truncated descriptions end with an ellipsis.
    return not query.endswith("...")

src/sentry/utils/performance_issues/performance_detection.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,11 @@ def get_detection_settings(project_id: int | None = None) -> dict[DetectorType,
243243
"duration_threshold": settings["n_plus_one_db_duration_threshold"], # ms
244244
"detection_enabled": settings["n_plus_one_db_queries_detection_enabled"],
245245
},
246+
DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES: {
247+
"count": settings["n_plus_one_db_count"],
248+
"duration_threshold": settings["n_plus_one_db_duration_threshold"], # ms
249+
"detection_enabled": settings["n_plus_one_db_queries_detection_enabled"],
250+
},
246251
DetectorType.CONSECUTIVE_DB_OP: {
247252
# time saved by running all queries in parallel
248253
"min_time_saved": settings["consecutive_db_min_time_saved_threshold"], # ms

0 commit comments

Comments
 (0)