diff --git a/src/sentry/issues/grouptype.py b/src/sentry/issues/grouptype.py index 53b0e28f9f8b8e..01158376dd6313 100644 --- a/src/sentry/issues/grouptype.py +++ b/src/sentry/issues/grouptype.py @@ -332,6 +332,17 @@ class PerformanceNPlusOneGroupType(PerformanceGroupTypeDefaults, GroupType): released = True +@dataclass(frozen=True) +class PerformanceNPlusOneExperimentalGroupType(PerformanceGroupTypeDefaults, GroupType): + type_id = 1906 + slug = "performance_n_plus_one_db_queries_experimental" + description = "N+1 Query (Experimental)" + category = GroupCategory.PERFORMANCE.value + category_v2 = GroupCategory.PERFORMANCE_BEST_PRACTICE.value + default_priority = PriorityLevel.LOW + released = False + + @dataclass(frozen=True) class PerformanceConsecutiveDBQueriesGroupType(PerformanceGroupTypeDefaults, GroupType): type_id = 1007 diff --git a/src/sentry/utils/performance_issues/base.py b/src/sentry/utils/performance_issues/base.py index 589d55368300e8..65ac2354161ad9 100644 --- a/src/sentry/utils/performance_issues/base.py +++ b/src/sentry/utils/performance_issues/base.py @@ -30,6 +30,7 @@ class DetectorType(Enum): DB_MAIN_THREAD = "db_main_thread" HTTP_OVERHEAD = "http_overhead" EXPERIMENTAL_N_PLUS_ONE_API_CALLS = "experimental_n_plus_one_api_calls" + EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES = "experimental_n_plus_one_db_queries" # Detector and the corresponding system option must be added to this list to have issues created. diff --git a/src/sentry/utils/performance_issues/detectors/experiments/n_plus_one_db_span_detector.py b/src/sentry/utils/performance_issues/detectors/experiments/n_plus_one_db_span_detector.py new file mode 100644 index 00000000000000..33f0c49f32a945 --- /dev/null +++ b/src/sentry/utils/performance_issues/detectors/experiments/n_plus_one_db_span_detector.py @@ -0,0 +1,279 @@ +from __future__ import annotations + +import hashlib +from typing import Any + +from sentry.issues.grouptype import PerformanceNPlusOneExperimentalGroupType +from sentry.issues.issue_occurrence import IssueEvidence +from sentry.models.organization import Organization +from sentry.models.project import Project +from sentry.utils import metrics +from sentry.utils.performance_issues.base import ( + DetectorType, + PerformanceDetector, + get_notification_attachment_body, + get_span_evidence_value, + total_span_time, +) +from sentry.utils.performance_issues.performance_problem import PerformanceProblem +from sentry.utils.performance_issues.types import Span +from sentry.utils.safe import get_path + + +class NPlusOneDBSpanExperimentalDetector(PerformanceDetector): + """ + Detector goals: + - identify a database N+1 query with high accuracy + - collect enough information to create a good fingerprint (see below) + - only return issues with good fingerprints + + A good fingerprint is one that gives us confidence that, if two fingerprints + match, then they correspond to the same issue location in code (and + therefore, the same fix). + + To do this we look for a specific structure: + + [-------- transaction span -----------] + [-------- parent span -----------] + [source query] + [n0] + [n1] + [n2] + ... + + If we detect two different N+1 problems, and both have matching parents, + source queries, and repeated (n) queries, then we can be fairly confident + they are the same issue. + """ + + __slots__ = ( + "stored_problems", + "potential_parents", + "source_span", + "n_hash", + "n_spans", + "transaction", + ) + + type = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES + settings_key = DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES + + def __init__(self, settings: dict[DetectorType, Any], event: dict[str, Any]) -> None: + super().__init__(settings, event) + + self.stored_problems = {} + self.potential_parents = {} + self.n_hash: str | None = None + self.n_spans: list[Span] = [] + self.source_span: Span | None = None + root_span = get_path(self._event, "contexts", "trace") + if root_span: + self.potential_parents[root_span.get("span_id")] = root_span + + def is_creation_allowed_for_system(self) -> bool: + # Defer to the issue platform for whether to create issues + # See https://develop.sentry.dev/backend/issue-platform/#releasing-your-issue-type + return True + + def is_creation_allowed_for_organization(self, organization: Organization | None) -> bool: + return True # This detector is fully rolled out + + def is_creation_allowed_for_project(self, project: Project | None) -> bool: + return self.settings["detection_enabled"] + + def visit_span(self, span: Span) -> None: + span_id = span.get("span_id", None) + op = span.get("op", None) + if not span_id or not op: + return + + if not self._is_db_op(op): + # This breaks up the N+1 we're currently tracking. + self._maybe_store_problem() + self._reset_detection() + # Treat it as a potential parent as long as it isn't the root span. + if span.get("parent_span_id", None): + self.potential_parents[span_id] = span + return + + if not self.source_span: + # We aren't currently tracking an N+1. Maybe this span triggers one! + self._maybe_use_as_source(span) + return + + # If we got this far, we know we're a DB span and we're looking for a + # sequence of N identical DB spans. + if self._continues_n_plus_1(span): + self.n_spans.append(span) + else: + previous_span = self.n_spans[-1] if self.n_spans else None + self._maybe_store_problem() + self._reset_detection() + + # Maybe this DB span starts a whole new N+1! + if previous_span: + self._maybe_use_as_source(previous_span) + if self.source_span and self._continues_n_plus_1(span): + self.n_spans.append(span) + else: + self.source_span = None + self._maybe_use_as_source(span) + + def on_complete(self) -> None: + self._maybe_store_problem() + + def _is_db_op(self, op: str) -> bool: + return op.startswith("db") and not op.startswith("db.redis") + + def _maybe_use_as_source(self, span: Span) -> None: + parent_span_id = span.get("parent_span_id", None) + if not parent_span_id or parent_span_id not in self.potential_parents: + return + + self.source_span = span + + def _continues_n_plus_1(self, span: Span) -> bool: + if self.source_span is None: + return False + + expected_parent_id = self.source_span.get("parent_span_id", None) + parent_id = span.get("parent_span_id", None) + if not parent_id or parent_id != expected_parent_id: + return False + + span_hash = span.get("hash", None) + if not span_hash: + return False + + if span_hash == self.source_span.get("hash", None): + # The source span and n repeating spans must have different queries. + return False + + if not self.n_hash: + self.n_hash = span_hash + return True + + return span_hash == self.n_hash + + def _maybe_store_problem(self) -> None: + if not self.source_span or not self.n_spans: + return + + # Do we have enough spans? + count = self.settings.get("count") + if len(self.n_spans) < count: + return + + # Do the spans take enough total time? + if not self._is_slower_than_threshold(): + return + + # We require a parent span in order to improve our fingerprint accuracy. + parent_span_id = self.source_span.get("parent_span_id", None) + if not parent_span_id: + return + parent_span = self.potential_parents[parent_span_id] + if not parent_span: + return + + # Track how many N+1-looking problems we found but dropped because we + # couldn't be sure (maybe the truncated part of the query differs). + if not contains_complete_query( + self.source_span, is_source=True + ) or not contains_complete_query(self.n_spans[0]): + metrics.incr("performance.performance_issue.truncated_np1_db") + return + + if not self._contains_valid_repeating_query(self.n_spans[0]): + metrics.incr("performance.performance_issue.unparametrized_first_span") + return + + fingerprint = self._fingerprint( + parent_span.get("op", None), + parent_span.get("hash", None), + self.source_span.get("hash", None), + self.n_spans[0].get("hash", None), + ) + if fingerprint not in self.stored_problems: + self._metrics_for_extra_matching_spans() + + offender_span_ids = [span.get("span_id", None) for span in self.n_spans] + + self.stored_problems[fingerprint] = PerformanceProblem( + fingerprint=fingerprint, + op="db", + desc=self.n_spans[0].get("description", ""), + type=PerformanceNPlusOneExperimentalGroupType, + parent_span_ids=[parent_span_id], + cause_span_ids=[self.source_span.get("span_id", None)], + offender_span_ids=offender_span_ids, + evidence_display=[ + IssueEvidence( + name="Offending Spans", + value=get_notification_attachment_body( + "db", + self.n_spans[0].get("description", ""), + ), + # Has to be marked important to be displayed in the notifications + important=True, + ) + ], + evidence_data={ + "transaction_name": self._event.get("transaction", ""), + "op": "db", + "parent_span_ids": [parent_span_id], + "parent_span": get_span_evidence_value(parent_span), + "cause_span_ids": [self.source_span.get("span_id", None)], + "offender_span_ids": offender_span_ids, + "repeating_spans": get_span_evidence_value(self.n_spans[0]), + "repeating_spans_compact": get_span_evidence_value( + self.n_spans[0], include_op=False + ), + "num_repeating_spans": str(len(offender_span_ids)), + }, + ) + + def _is_slower_than_threshold(self) -> bool: + duration_threshold = self.settings.get("duration_threshold") + return total_span_time(self.n_spans) >= duration_threshold + + def _contains_valid_repeating_query(self, span: Span) -> bool: + # Make sure we at least have a space, to exclude e.g. MongoDB and + # Prisma's `rawQuery`. + query = span.get("description", None) + return bool(query) and " " in query + + def _metrics_for_extra_matching_spans(self) -> None: + # Checks for any extra spans that match the detected problem but are not part of affected spans. + # Temporary check since we eventually want to capture extra perf problems on the initial pass while walking spans. + n_count = len(self.n_spans) + all_matching_spans = [ + span + for span in self._event.get("spans", []) + if span.get("span_id", None) == self.n_hash + ] + all_count = len(all_matching_spans) + if n_count > 0 and n_count != all_count: + metrics.incr("performance.performance_issue.np1_db.extra_spans") + + def _reset_detection(self) -> None: + self.source_span = None + self.n_hash = None + self.n_spans = [] + + def _fingerprint(self, parent_op: str, parent_hash: str, source_hash: str, n_hash: str) -> str: + # XXX: this has to be a hardcoded string otherwise grouping will break + problem_class = "GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES" + full_fingerprint = hashlib.sha1( + (str(parent_op) + str(parent_hash) + str(source_hash) + str(n_hash)).encode("utf8"), + ).hexdigest() + return f"1-{problem_class}-{full_fingerprint}" + + +def contains_complete_query(span: Span, is_source: bool | None = False) -> bool: + # Remove the truncation check from the n_plus_one db detector. + query = span.get("description", None) + if is_source and query: + return True + else: + return query and not query.endswith("...") diff --git a/src/sentry/utils/performance_issues/performance_detection.py b/src/sentry/utils/performance_issues/performance_detection.py index e2a1f043d1a27a..5b4f2c11bc1bec 100644 --- a/src/sentry/utils/performance_issues/performance_detection.py +++ b/src/sentry/utils/performance_issues/performance_detection.py @@ -243,6 +243,11 @@ def get_detection_settings(project_id: int | None = None) -> dict[DetectorType, "duration_threshold": settings["n_plus_one_db_duration_threshold"], # ms "detection_enabled": settings["n_plus_one_db_queries_detection_enabled"], }, + DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES: { + "count": settings["n_plus_one_db_count"], + "duration_threshold": settings["n_plus_one_db_duration_threshold"], # ms + "detection_enabled": settings["n_plus_one_db_queries_detection_enabled"], + }, DetectorType.CONSECUTIVE_DB_OP: { # time saved by running all queries in parallel "min_time_saved": settings["consecutive_db_min_time_saved_threshold"], # ms diff --git a/tests/sentry/utils/performance_issues/experiments/test_n_plus_one_db_span_detector.py b/tests/sentry/utils/performance_issues/experiments/test_n_plus_one_db_span_detector.py new file mode 100644 index 00000000000000..32da9b0d044efc --- /dev/null +++ b/tests/sentry/utils/performance_issues/experiments/test_n_plus_one_db_span_detector.py @@ -0,0 +1,309 @@ +from __future__ import annotations + +import unittest +from typing import Any + +import pytest + +from sentry.issues.grouptype import PerformanceNPlusOneExperimentalGroupType +from sentry.issues.issue_occurrence import IssueEvidence +from sentry.models.options.project_option import ProjectOption +from sentry.testutils.cases import TestCase +from sentry.testutils.performance_issues.event_generators import get_event +from sentry.utils.performance_issues.base import DetectorType +from sentry.utils.performance_issues.detectors.experiments.n_plus_one_db_span_detector import ( + NPlusOneDBSpanExperimentalDetector, +) +from sentry.utils.performance_issues.performance_detection import ( + get_detection_settings, + run_detector_on_data, +) +from sentry.utils.performance_issues.performance_problem import PerformanceProblem + + +@pytest.mark.django_db +class NPlusOneDBSpanExperimentalDetectorTest(unittest.TestCase): + def setUp(self): + super().setUp() + self._settings = get_detection_settings() + + def find_problems( + self, event: dict[str, Any], setting_overides: dict[str, Any] | None = None + ) -> list[PerformanceProblem]: + if setting_overides: + for option_name, value in setting_overides.items(): + self._settings[DetectorType.EXPERIMENTAL_N_PLUS_ONE_DB_QUERIES][option_name] = value + + detector = NPlusOneDBSpanExperimentalDetector(self._settings, event) + run_detector_on_data(detector, event) + return list(detector.stored_problems.values()) + + def test_does_not_detect_issues_in_fast_transaction(self): + event = get_event("no-issue-in-django-detail-view") + assert self.find_problems(event) == [] + + def test_detects_n_plus_one_with_unparameterized_query( + self, + ): + event = get_event("n-plus-one-in-django-index-view-unparameterized") + assert self.find_problems(event) == [ + PerformanceProblem( + fingerprint="1-GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES-8d86357da4d8a866b19c97670edee38d037a7bc8", + op="db", + desc="SELECT `books_author`.`id`, `books_author`.`name` FROM `books_author` WHERE `books_author`.`id` = 1 LIMIT 21", + type=PerformanceNPlusOneExperimentalGroupType, + parent_span_ids=["8dd7a5869a4f4583"], + cause_span_ids=["9179e43ae844b174"], + offender_span_ids=[ + "b8be6138369491dd", + "b2d4826e7b618f1b", + "b3fdeea42536dbf1", + "b409e78a092e642f", + "86d2ede57bbf48d4", + "8e554c84cdc9731e", + "94d6230f3f910e12", + "a210b87a2191ceb6", + "88a5ccaf25b9bd8f", + "bb32cf50fc56b296", + ], + evidence_data={ + "transaction_name": "/books/", + "op": "db", + "parent_span_ids": ["8dd7a5869a4f4583"], + "parent_span": "django.view - index", + "cause_span_ids": ["9179e43ae844b174"], + "offender_span_ids": [ + "b8be6138369491dd", + "b2d4826e7b618f1b", + "b3fdeea42536dbf1", + "b409e78a092e642f", + "86d2ede57bbf48d4", + "8e554c84cdc9731e", + "94d6230f3f910e12", + "a210b87a2191ceb6", + "88a5ccaf25b9bd8f", + "bb32cf50fc56b296", + ], + "repeating_spans": "db - SELECT `books_author`.`id`, `books_author`.`name` FROM `books_author` WHERE `books_author`.`id` = 1 LIMIT 21", + "repeating_spans_compact": "SELECT `books_author`.`id`, `books_author`.`name` FROM `books_author` WHERE `books_author`.`id` = 1 LIMIT 21", + "num_repeating_spans": "10", + }, + evidence_display=[ + IssueEvidence( + name="Offending Spans", + value="db - SELECT `books_author`.`id`, `books_author`.`name` FROM `books_author` WHERE `books_author`.`id` = 1 LIMIT 21", + important=True, + ) + ], + ) + ] + + def test_does_not_detect_n_plus_one_with_source_redis_query_with_noredis_detector( + self, + ): + event = get_event("n-plus-one-in-django-index-view-source-redis") + assert self.find_problems(event) == [] + + def test_does_not_detect_n_plus_one_with_repeating_redis_query_with_noredis_detector( + self, + ): + event = get_event("n-plus-one-in-django-index-view-repeating-redis") + assert self.find_problems(event) == [] + + def test_ignores_fast_n_plus_one(self): + event = get_event("fast-n-plus-one-in-django-new-view") + assert self.find_problems(event) == [] + + def test_detects_slow_span_but_not_n_plus_one_in_query_waterfall(self): + event = get_event("query-waterfall-in-django-random-view") + assert self.find_problems(event) == [] + + def test_finds_n_plus_one_with_db_dot_something_spans(self): + event = get_event("n-plus-one-in-django-index-view-activerecord") + assert self.find_problems(event) == [ + PerformanceProblem( + fingerprint="1-GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES-8d86357da4d8a866b19c97670edee38d037a7bc8", + op="db", + desc="SELECT `books_author`.`id`, `books_author`.`name` FROM `books_author` WHERE `books_author`.`id` = %s LIMIT 21", + type=PerformanceNPlusOneExperimentalGroupType, + parent_span_ids=["8dd7a5869a4f4583"], + cause_span_ids=["9179e43ae844b174"], + offender_span_ids=[ + "b8be6138369491dd", + "b2d4826e7b618f1b", + "b3fdeea42536dbf1", + "b409e78a092e642f", + "86d2ede57bbf48d4", + "8e554c84cdc9731e", + "94d6230f3f910e12", + "a210b87a2191ceb6", + "88a5ccaf25b9bd8f", + "bb32cf50fc56b296", + ], + evidence_data={ + "op": "db", + "parent_span_ids": ["8dd7a5869a4f4583"], + "cause_span_ids": ["9179e43ae844b174"], + "offender_span_ids": [ + "b8be6138369491dd", + "b2d4826e7b618f1b", + "b3fdeea42536dbf1", + "b409e78a092e642f", + "86d2ede57bbf48d4", + "8e554c84cdc9731e", + "94d6230f3f910e12", + "a210b87a2191ceb6", + "88a5ccaf25b9bd8f", + "bb32cf50fc56b296", + ], + }, + evidence_display=[], + ) + ] + + def test_n_plus_one_db_detector_has_different_fingerprints_for_different_n_plus_one_events( + self, + ): + index_n_plus_one_event = get_event("n-plus-one-in-django-index-view") + new_n_plus_one_event = get_event("n-plus-one-in-django-new-view") + + index_problems = self.find_problems(index_n_plus_one_event) + new_problems = self.find_problems(new_n_plus_one_event) + + index_fingerprint = index_problems[0].fingerprint + new_fingerprint = new_problems[0].fingerprint + + assert index_fingerprint + assert new_fingerprint + assert index_fingerprint != new_fingerprint + + def test_detects_n_plus_one_with_multiple_potential_sources(self): + event = get_event("n-plus-one-in-django-with-odd-db-sources") + + assert self.find_problems(event, {"duration_threshold": 0}) == [ + PerformanceProblem( + fingerprint="1-GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES-e55ea09e1cff0ca2369f287cf624700f98cf4b50", + op="db", + type=PerformanceNPlusOneExperimentalGroupType, + desc='SELECT "expense_expenses"."id", "expense_expenses"."report_id", "expense_expenses"."amount" FROM "expense_expenses" WHERE "expense_expenses"."report_id" = %s', + parent_span_ids=["81a4b462bdc5c764"], + cause_span_ids=["99797d06e2fa9750"], + offender_span_ids=[ + "9c7876a6d7a26c72", + "b31f67541d38ad0c", + "aff9d1545b41f1de", + "86a56025d94edb85", + "b5e340041cfc2532", + "b77a0b154e782baa", + "9c46a977962d6ed1", + "b03da8752eeddebe", + "8c173716d4c7e41b", + "b4e6f90c66e90238", + "987affc4f2faa24b", + "b7d323b4f5f8b2b0", + "a4f0a57410b61072", + "a6120e2d88c86ea4", + "a87019f03438311e", + "b5487ad7228cfd6e", + "bc44d59a63a4115c", + "84b05df439e4a6ee", + "be85dffe4a9a3120", + "a3c381b1952dd7fb", + ], + evidence_data={ + "op": "db", + "parent_span_ids": ["81a4b462bdc5c764"], + "cause_span_ids": ["99797d06e2fa9750"], + "offender_span_ids": [ + "9c7876a6d7a26c72", + "b31f67541d38ad0c", + "aff9d1545b41f1de", + "86a56025d94edb85", + "b5e340041cfc2532", + "b77a0b154e782baa", + "9c46a977962d6ed1", + "b03da8752eeddebe", + "8c173716d4c7e41b", + "b4e6f90c66e90238", + "987affc4f2faa24b", + "b7d323b4f5f8b2b0", + "a4f0a57410b61072", + "a6120e2d88c86ea4", + "a87019f03438311e", + "b5487ad7228cfd6e", + "bc44d59a63a4115c", + "84b05df439e4a6ee", + "be85dffe4a9a3120", + "a3c381b1952dd7fb", + ], + }, + evidence_display=[], + ), + ] + + def test_detects_overlapping_n_plus_one(self): + event = get_event("parallel-n-plus-one-in-django-index-view") + assert self.find_problems(event) == [ + PerformanceProblem( + fingerprint="1-GroupType.PERFORMANCE_N_PLUS_ONE_DB_QUERIES-8d86357da4d8a866b19c97670edee38d037a7bc8", + op="db", + desc="SELECT `books_author`.`id`, `books_author`.`name` FROM `books_author` WHERE `books_author`.`id` = %s LIMIT 21", + type=PerformanceNPlusOneExperimentalGroupType, + parent_span_ids=["8dd7a5869a4f4583"], + cause_span_ids=["9179e43ae844b174"], + offender_span_ids=[ + "b8be6138369491dd", + "b2d4826e7b618f1b", + "b3fdeea42536dbf1", + "b409e78a092e642f", + "86d2ede57bbf48d4", + "8e554c84cdc9731e", + "94d6230f3f910e12", + "a210b87a2191ceb6", + "88a5ccaf25b9bd8f", + "bb32cf50fc56b296", + ], + evidence_data={ + "op": "db", + "parent_span_ids": ["8dd7a5869a4f4583"], + "cause_span_ids": ["9179e43ae844b174"], + "offender_span_ids": [ + "b8be6138369491dd", + "b2d4826e7b618f1b", + "b3fdeea42536dbf1", + "b409e78a092e642f", + "86d2ede57bbf48d4", + "8e554c84cdc9731e", + "94d6230f3f910e12", + "a210b87a2191ceb6", + "88a5ccaf25b9bd8f", + "bb32cf50fc56b296", + ], + }, + evidence_display=[], + ) + ] + + +@pytest.mark.django_db +class NPlusOneDbSettingTest(TestCase): + def test_respects_project_option(self): + project = self.create_project() + event = get_event("n-plus-one-in-django-index-view-activerecord") + event["project_id"] = project.id + + settings = get_detection_settings(project.id) + detector = NPlusOneDBSpanExperimentalDetector(settings, event) + + assert detector.is_creation_allowed_for_project(project) + + ProjectOption.objects.set_value( + project=project, + key="sentry:performance_issue_settings", + value={"n_plus_one_db_queries_detection_enabled": False}, + ) + + settings = get_detection_settings(project.id) + detector = NPlusOneDBSpanExperimentalDetector(settings, event) + + assert not detector.is_creation_allowed_for_project(project)