|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import hashlib |
| 4 | +from typing import Any |
| 5 | + |
| 6 | +from sentry.issues.grouptype import PerformanceNPlusOneExperimentalGroupType |
| 7 | +from sentry.issues.issue_occurrence import IssueEvidence |
| 8 | +from sentry.models.organization import Organization |
| 9 | +from sentry.models.project import Project |
| 10 | +from sentry.utils import metrics |
| 11 | +from sentry.utils.performance_issues.base import ( |
| 12 | + DetectorType, |
| 13 | + PerformanceDetector, |
| 14 | + get_notification_attachment_body, |
| 15 | + get_span_evidence_value, |
| 16 | + total_span_time, |
| 17 | +) |
| 18 | +from sentry.utils.performance_issues.performance_problem import PerformanceProblem |
| 19 | +from sentry.utils.performance_issues.types import Span |
| 20 | +from sentry.utils.safe import get_path |
| 21 | + |
| 22 | + |
| 23 | +class NPlusOneDBSpanExperimentalDetector(PerformanceDetector): |
| 24 | + """ |
| 25 | + Detector goals: |
| 26 | + - identify a database N+1 query with high accuracy |
| 27 | + - collect enough information to create a good fingerprint (see below) |
| 28 | + - only return issues with good fingerprints |
| 29 | + |
| 30 | + A good fingerprint is one that gives us confidence that, if two fingerprints |
| 31 | + match, then they correspond to the same issue location in code (and |
| 32 | + therefore, the same fix). |
| 33 | + |
| 34 | + To do this we look for a specific structure: |
| 35 | + |
| 36 | + [-------- transaction span -----------] |
| 37 | + [-------- parent span -----------] |
| 38 | + [source query] |
| 39 | + [n0] |
| 40 | + [n1] |
| 41 | + [n2] |
| 42 | + ... |
| 43 | + |
| 44 | + If we detect two different N+1 problems, and both have matching parents, |
| 45 | + source queries, and repeated (n) queries, then we can be fairly confident |
| 46 | + they are the same issue. |
| 47 | + """ |
| 48 | + |
| 49 | + __slots__ = ( |
| 50 | + "stored_problems", |
| 51 | + "potential_parents", |
| 52 | + "source_span", |
| 53 | + "n_hash", |
| 54 | + "n_spans", |
| 55 | + "transaction", |
| 56 | + ) |
| 57 | + |
| 58 | + type = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES |
| 59 | + settings_key = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES |
| 60 | + |
| 61 | + def __init__(self, settings: dict[DetectorType, Any], event: dict[str, Any]) -> None: |
| 62 | + super().__init__(settings, event) |
| 63 | + |
| 64 | + self.stored_problems = {} |
| 65 | + self.potential_parents = {} |
| 66 | + self.n_hash: str | None = None |
| 67 | + self.n_spans: list[Span] = [] |
| 68 | + self.source_span: Span | None = None |
| 69 | + root_span = get_path(self._event, "contexts", "trace") |
| 70 | + if root_span: |
| 71 | + self.potential_parents[root_span.get("span_id")] = root_span |
| 72 | + |
| 73 | + def is_creation_allowed_for_system(self) -> bool: |
| 74 | + # Defer to the issue platform for whether to create issues |
| 75 | + # See https://develop.sentry.dev/backend/issue-platform/#releasing-your-issue-type |
| 76 | + return True |
| 77 | + |
| 78 | + def is_creation_allowed_for_organization(self, organization: Organization | None) -> bool: |
| 79 | + return True # This detector is fully rolled out |
| 80 | + |
| 81 | + def is_creation_allowed_for_project(self, project: Project | None) -> bool: |
| 82 | + return self.settings["detection_enabled"] |
| 83 | + |
| 84 | + def visit_span(self, span: Span) -> None: |
| 85 | + span_id = span.get("span_id", None) |
| 86 | + op = span.get("op", None) |
| 87 | + if not span_id or not op: |
| 88 | + return |
| 89 | + |
| 90 | + if not self._is_db_op(op): |
| 91 | + # This breaks up the N+1 we're currently tracking. |
| 92 | + self._maybe_store_problem() |
| 93 | + self._reset_detection() |
| 94 | + # Treat it as a potential parent as long as it isn't the root span. |
| 95 | + if span.get("parent_span_id", None): |
| 96 | + self.potential_parents[span_id] = span |
| 97 | + return |
| 98 | + |
| 99 | + if not self.source_span: |
| 100 | + # We aren't currently tracking an N+1. Maybe this span triggers one! |
| 101 | + self._maybe_use_as_source(span) |
| 102 | + return |
| 103 | + |
| 104 | + # If we got this far, we know we're a DB span and we're looking for a |
| 105 | + # sequence of N identical DB spans. |
| 106 | + if self._continues_n_plus_1(span): |
| 107 | + self.n_spans.append(span) |
| 108 | + else: |
| 109 | + previous_span = self.n_spans[-1] if self.n_spans else None |
| 110 | + self._maybe_store_problem() |
| 111 | + self._reset_detection() |
| 112 | + |
| 113 | + # Maybe this DB span starts a whole new N+1! |
| 114 | + if previous_span: |
| 115 | + self._maybe_use_as_source(previous_span) |
| 116 | + if self.source_span and self._continues_n_plus_1(span): |
| 117 | + self.n_spans.append(span) |
| 118 | + else: |
| 119 | + self.source_span = None |
| 120 | + self._maybe_use_as_source(span) |
| 121 | + |
| 122 | + def on_complete(self) -> None: |
| 123 | + self._maybe_store_problem() |
| 124 | + |
| 125 | + def _is_db_op(self, op: str) -> bool: |
| 126 | + return op.startswith("db") and not op.startswith("db.redis") |
| 127 | + |
| 128 | + def _maybe_use_as_source(self, span: Span) -> None: |
| 129 | + parent_span_id = span.get("parent_span_id", None) |
| 130 | + if not parent_span_id or parent_span_id not in self.potential_parents: |
| 131 | + return |
| 132 | + |
| 133 | + self.source_span = span |
| 134 | + |
| 135 | + def _continues_n_plus_1(self, span: Span) -> bool: |
| 136 | + if self.source_span is None: |
| 137 | + return False |
| 138 | + |
| 139 | + expected_parent_id = self.source_span.get("parent_span_id", None) |
| 140 | + parent_id = span.get("parent_span_id", None) |
| 141 | + if not parent_id or parent_id != expected_parent_id: |
| 142 | + return False |
| 143 | + |
| 144 | + span_hash = span.get("hash", None) |
| 145 | + if not span_hash: |
| 146 | + return False |
| 147 | + |
| 148 | + if span_hash == self.source_span.get("hash", None): |
| 149 | + # The source span and n repeating spans must have different queries. |
| 150 | + return False |
| 151 | + |
| 152 | + if not self.n_hash: |
| 153 | + self.n_hash = span_hash |
| 154 | + return True |
| 155 | + |
| 156 | + return span_hash == self.n_hash |
| 157 | + |
| 158 | + def _maybe_store_problem(self) -> None: |
| 159 | + if not self.source_span or not self.n_spans: |
| 160 | + return |
| 161 | + |
| 162 | + # Do we have enough spans? |
| 163 | + count = self.settings.get("count") |
| 164 | + if len(self.n_spans) < count: |
| 165 | + return |
| 166 | + |
| 167 | + # Do the spans take enough total time? |
| 168 | + if not self._is_slower_than_threshold(): |
| 169 | + return |
| 170 | + |
| 171 | + # We require a parent span in order to improve our fingerprint accuracy. |
| 172 | + parent_span_id = self.source_span.get("parent_span_id", None) |
| 173 | + if not parent_span_id: |
| 174 | + return |
| 175 | + parent_span = self.potential_parents[parent_span_id] |
| 176 | + if not parent_span: |
| 177 | + return |
| 178 | + |
| 179 | + # Track how many N+1-looking problems we found but dropped because we |
| 180 | + # couldn't be sure (maybe the truncated part of the query differs). |
| 181 | + if not contains_complete_query( |
| 182 | + self.source_span, is_source=True |
| 183 | + ) or not contains_complete_query(self.n_spans[0]): |
| 184 | + metrics.incr("performance.performance_issue.truncated_np1_db") |
| 185 | + return |
| 186 | + |
| 187 | + if not self._contains_valid_repeating_query(self.n_spans[0]): |
| 188 | + metrics.incr("performance.performance_issue.unparametrized_first_span") |
| 189 | + return |
| 190 | + |
| 191 | + fingerprint = self._fingerprint( |
| 192 | + parent_span.get("op", None), |
| 193 | + parent_span.get("hash", None), |
| 194 | + self.source_span.get("hash", None), |
| 195 | + self.n_spans[0].get("hash", None), |
| 196 | + ) |
| 197 | + if fingerprint not in self.stored_problems: |
| 198 | + self._metrics_for_extra_matching_spans() |
| 199 | + |
| 200 | + offender_span_ids = [span.get("span_id", None) for span in self.n_spans] |
| 201 | + |
| 202 | + self.stored_problems[fingerprint] = PerformanceProblem( |
| 203 | + fingerprint=fingerprint, |
| 204 | + op="db", |
| 205 | + desc=self.n_spans[0].get("description", ""), |
| 206 | + type=PerformanceNPlusOneExperimentalGroupType, |
| 207 | + parent_span_ids=[parent_span_id], |
| 208 | + cause_span_ids=[self.source_span.get("span_id", None)], |
| 209 | + offender_span_ids=offender_span_ids, |
| 210 | + evidence_display=[ |
| 211 | + IssueEvidence( |
| 212 | + name="Offending Spans", |
| 213 | + value=get_notification_attachment_body( |
| 214 | + "db", |
| 215 | + self.n_spans[0].get("description", ""), |
| 216 | + ), |
| 217 | + # Has to be marked important to be displayed in the notifications |
| 218 | + important=True, |
| 219 | + ) |
| 220 | + ], |
| 221 | + evidence_data={ |
| 222 | + "transaction_name": self._event.get("transaction", ""), |
| 223 | + "op": "db", |
| 224 | + "parent_span_ids": [parent_span_id], |
| 225 | + "parent_span": get_span_evidence_value(parent_span), |
| 226 | + "cause_span_ids": [self.source_span.get("span_id", None)], |
| 227 | + "offender_span_ids": offender_span_ids, |
| 228 | + "repeating_spans": get_span_evidence_value(self.n_spans[0]), |
| 229 | + "repeating_spans_compact": get_span_evidence_value( |
| 230 | + self.n_spans[0], include_op=False |
| 231 | + ), |
| 232 | + "num_repeating_spans": str(len(offender_span_ids)), |
| 233 | + }, |
| 234 | + ) |
| 235 | + |
| 236 | + def _is_slower_than_threshold(self) -> bool: |
| 237 | + duration_threshold = self.settings.get("duration_threshold") |
| 238 | + return total_span_time(self.n_spans) >= duration_threshold |
| 239 | + |
| 240 | + def _contains_valid_repeating_query(self, span: Span) -> bool: |
| 241 | + # Make sure we at least have a space, to exclude e.g. MongoDB and |
| 242 | + # Prisma's `rawQuery`. |
| 243 | + query = span.get("description", None) |
| 244 | + return bool(query) and " " in query |
| 245 | + |
| 246 | + def _metrics_for_extra_matching_spans(self) -> None: |
| 247 | + # Checks for any extra spans that match the detected problem but are not part of affected spans. |
| 248 | + # Temporary check since we eventually want to capture extra perf problems on the initial pass while walking spans. |
| 249 | + n_count = len(self.n_spans) |
| 250 | + all_matching_spans = [ |
| 251 | + span |
| 252 | + for span in self._event.get("spans", []) |
| 253 | +            if span.get("hash", None) == self.n_hash |
| 254 | + ] |
| 255 | + all_count = len(all_matching_spans) |
| 256 | + if n_count > 0 and n_count != all_count: |
| 257 | + metrics.incr("performance.performance_issue.np1_db.extra_spans") |
| 258 | + |
| 259 | + def _reset_detection(self) -> None: |
| 260 | + self.source_span = None |
| 261 | + self.n_hash = None |
| 262 | + self.n_spans = [] |
| 263 | + |
| 264 | + def _fingerprint(self, parent_op: str, parent_hash: str, source_hash: str, n_hash: str) -> str: |
| 265 | + # XXX: this has to be a hardcoded string otherwise grouping will break |
| 266 | + problem_class = "GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES" |
| 267 | + full_fingerprint = hashlib.sha1( |
| 268 | + (str(parent_op) + str(parent_hash) + str(source_hash) + str(n_hash)).encode("utf8"), |
| 269 | + ).hexdigest() |
| 270 | + return f"1-{problem_class}-{full_fingerprint}" |
| 271 | + |
| 272 | + |
| 273 | +def contains_complete_query(span: Span, is_source: bool | None = False) -> bool: |
| 274 | +    # Remove the truncation check from the n_plus_one db detector. For now, source spans are exempt; repeating spans must not end with a truncated ("...") query. |
| 275 | + query = span.get("description", None) |
| 276 | + if is_source and query: |
| 277 | + return True |
| 278 | + else: |
| 279 | +        return bool(query) and not query.endswith("...") |
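
The class docstring above describes the structure the detector walks for: a parent span that contains one source query followed by N repeated queries sharing a hash. Below is a rough, test-style sketch of an event in that shape, driven directly through `visit_span()` and `on_complete()`. Everything in it is hypothetical: the ids, hashes, queries, timestamps, and settings values are invented, the detector's import path is not shown in this diff, and the assumptions that the base class resolves `self.settings` from a `DetectorType`-keyed dict and that `total_span_time()` reads `start_timestamp`/`timestamp` come from how these names are used here, not from code in this commit.

```python
# Hypothetical sketch only; see the caveats above. NPlusOneDBSpanExperimentalDetector
# is assumed to be in scope (same module, or imported in a test).
from sentry.utils.performance_issues.base import DetectorType


def make_span(span_id, parent_span_id, op, span_hash, description, start, end):
    # Only the fields the detector reads are filled in; the timing fields are an
    # assumption about what total_span_time() consumes.
    return {
        "span_id": span_id,
        "parent_span_id": parent_span_id,
        "op": op,
        "hash": span_hash,
        "description": description,
        "start_timestamp": start,
        "timestamp": end,
    }


event = {
    "transaction": "/books/",
    "contexts": {"trace": {"span_id": "root0000root0000"}},  # first potential parent
    "spans": [
        # Non-db span with a parent: breaks any running N+1 and is stored as a potential parent.
        make_span("parent00parent00", "root0000root0000", "view.render", "p0", "books index", 0.00, 0.50),
        # Source query: a db span whose parent is a known potential parent.
        make_span("source00source00", "parent00parent00", "db", "s0", "SELECT id FROM books", 0.00, 0.01),
        # Repeated queries: same parent as the source, identical hash to each other,
        # and a different hash from the source query.
        *[
            make_span(f"n{i:015d}", "parent00parent00", "db", "n0",
                      "SELECT name FROM authors WHERE id = %s", 0.02 * i, 0.02 * i + 0.02)
            for i in range(6)
        ],
    ],
}

settings = {
    DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES: {
        "count": 5,                # minimum number of repeated spans
        "duration_threshold": 50,  # assumed to be milliseconds of total span time
        "detection_enabled": True,
    }
}

detector = NPlusOneDBSpanExperimentalDetector(settings, event)
for span in event["spans"]:
    detector.visit_span(span)
detector.on_complete()

# One PerformanceProblem, keyed by its fingerprint, with the six repeated spans as offenders.
assert len(detector.stored_problems) == 1
```

In production the spans arrive via the shared performance-detection pass rather than a hand-rolled loop; the sketch only makes the docstring's "parent, source, n0..n2" diagram concrete.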
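
The docstring's fingerprinting claim rests on combining four components that are stable across events: the parent span's op and hash, the source query's hash, and the repeated query's hash. A small illustration of the key format `_fingerprint()` produces, using made-up component values:

```python
import hashlib

# Hypothetical component values; in the detector they come from the parent,
# source, and repeating spans of a detected problem.
parent_op, parent_hash, source_hash, n_hash = "view.render", "p0", "s0", "n0"

# Hardcoded group-type string, mirroring _fingerprint() above.
problem_class = "GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES"
digest = hashlib.sha1((parent_op + parent_hash + source_hash + n_hash).encode("utf8")).hexdigest()
fingerprint = f"1-{problem_class}-{digest}"
# -> "1-GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES-<40-char sha1 hex digest>"
```

Per the `XXX` comment in `_fingerprint()`, the group-type string stays hardcoded rather than being derived from `PerformanceNPlusOneExperimentalGroupType`, since grouping would otherwise break.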