From 97105898d7645917c1d1f8a2dbd64186fe41f518 Mon Sep 17 00:00:00 2001 From: Leander Rodrigues Date: Thu, 10 Apr 2025 19:52:55 -0400 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=9A=A7=20Use=20the=20parameterized=20?= =?UTF-8?q?URL=20for=20checking=20similarity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../n_plus_one_api_calls_detector.py | 63 +------------------ 1 file changed, 3 insertions(+), 60 deletions(-) diff --git a/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py b/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py index c6f95296039b98..6483937b8061c8 100644 --- a/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py +++ b/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py @@ -1,15 +1,12 @@ from __future__ import annotations -import hashlib import os from collections import defaultdict -from collections.abc import Mapping, Sequence +from collections.abc import Mapping from datetime import timedelta from typing import Any from urllib.parse import parse_qs, urlparse -from django.utils.encoding import force_bytes - from sentry.issues.grouptype import PerformanceNPlusOneAPICallsGroupType from sentry.issues.issue_occurrence import IssueEvidence from sentry.models.organization import Organization @@ -51,7 +48,6 @@ def __init__(self, settings: dict[DetectorType, Any], event: dict[str, Any]) -> # TODO: Only store the span IDs and timestamps instead of entire span objects self.stored_problems: PerformanceProblemsMap = {} self.spans: list[Span] = [] - self.span_hashes: dict[str, str | None] = {} def visit_span(self, span: Span) -> None: if not NPlusOneAPICallsDetector.is_span_eligible(span): @@ -61,8 +57,6 @@ def visit_span(self, span: Span) -> None: if op not in self.settings.get("allowed_span_ops", []): return - self.span_hashes[span["span_id"]] = get_span_hash(span) - previous_span = self.spans[-1] if len(self.spans) > 0 else None if previous_span is None: @@ -250,62 +244,11 @@ def _spans_are_concurrent(self, span_a: Span, span_b: Span) -> bool: def _spans_are_similar(self, span_a: Span, span_b: Span) -> bool: return ( - self.span_hashes[span_a["span_id"]] == self.span_hashes[span_b["span_id"]] + parameterize_url(get_url_from_span(span_a)) + == parameterize_url(get_url_from_span(span_b)) and span_a["parent_span_id"] == span_b["parent_span_id"] ) -HTTP_METHODS = { - "GET", - "HEAD", - "POST", - "PUT", - "DELETE", - "CONNECT", - "OPTIONS", - "TRACE", - "PATCH", -} - - -def get_span_hash(span: Span) -> str | None: - if span.get("op") != "http.client": - return span.get("hash") - - parts = remove_http_client_query_string_strategy(span) - if not parts: - return None - - hash = hashlib.md5() - for part in parts: - hash.update(force_bytes(part, errors="replace")) - - return hash.hexdigest()[:16] - - -def remove_http_client_query_string_strategy(span: Span) -> Sequence[str] | None: - """ - This is an inline version of the `http.client` parameterization code in - `"default:2022-10-27"`, the default span grouping strategy at time of - writing. It's inlined here to insulate this detector from changes in the - strategy, which are coming soon. - """ - - # Check the description is of the form ` ` - description = span.get("description") or "" - parts = description.split(" ", 1) - if len(parts) != 2: - return None - - # Ensure that this is a valid http method - method, url_str = parts - method = method.upper() - if method not in HTTP_METHODS: - return None - - url = urlparse(url_str) - return [method, url.scheme, url.netloc, url.path] - - def without_query_params(url: str) -> str: return urlparse(url)._replace(query="").geturl() From 749da92dcab2315e4c3f5341e9dd6c04f0e7452b Mon Sep 17 00:00:00 2001 From: Leander Rodrigues Date: Mon, 14 Apr 2025 16:34:16 -0400 Subject: [PATCH 2/2] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20better=20evidence=20da?= =?UTF-8?q?ta,=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/sentry/utils/performance_issues/base.py | 31 ++++++-- .../n_plus_one_api_calls_detector.py | 60 ++++++++-------- .../performance_issues/performance_problem.py | 2 +- .../test_n_plus_one_api_calls_detector.py | 71 +++++++++++++------ 4 files changed, 107 insertions(+), 57 deletions(-) diff --git a/src/sentry/utils/performance_issues/base.py b/src/sentry/utils/performance_issues/base.py index e9063c98408628..40fe6456fb37ac 100644 --- a/src/sentry/utils/performance_issues/base.py +++ b/src/sentry/utils/performance_issues/base.py @@ -6,7 +6,7 @@ from abc import ABC, abstractmethod from datetime import timedelta from enum import Enum -from typing import Any, ClassVar +from typing import Any, ClassVar, TypedDict from urllib.parse import parse_qs, urlparse from sentry import options @@ -292,7 +292,17 @@ def fingerprint_resource_span(span: Span): return hashlib.sha1(stripped_url.encode("utf-8")).hexdigest() -def parameterize_url(url: str) -> str: +class ParameterizedUrl(TypedDict): + url: str + path_params: list[str] # e.g. ["123", "abc123de-1024-4321-abcd-1234567890ab"] + query_params: dict[str, list[str]] # e.g. {"limit": "50", "offset": "100"} + + +def parameterize_url_with_result(url: str) -> ParameterizedUrl: + """ + Given a URL, return the URL with parsed path and query parameters replaced with '*', + a list of the path parameters, and a dict of the query parameters. + """ parsed_url = urlparse(str(url)) protocol_fragments = [] @@ -305,15 +315,18 @@ def parameterize_url(url: str) -> str: host_fragments.append(str(fragment)) path_fragments = [] + path_params = [] for fragment in parsed_url.path.split("/"): - if PARAMETERIZED_URL_REGEX.search(fragment): + path_param = PARAMETERIZED_URL_REGEX.search(fragment) + if path_param: path_fragments.append("*") + path_params.append(path_param.group()) else: path_fragments.append(str(fragment)) query = parse_qs(parsed_url.query) - return "".join( + parameterized_url = "".join( [ "".join(protocol_fragments), ".".join(host_fragments), @@ -323,6 +336,16 @@ def parameterize_url(url: str) -> str: ] ).rstrip("?") + return ParameterizedUrl( + url=parameterized_url, + path_params=path_params, + query_params=query, + ) + + +def parameterize_url(url: str) -> str: + return parameterize_url_with_result(url).get("url", "") + def fingerprint_http_spans(spans: list[Span]) -> str: """ diff --git a/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py b/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py index 6483937b8061c8..831d2af4634fef 100644 --- a/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py +++ b/src/sentry/utils/performance_issues/detectors/n_plus_one_api_calls_detector.py @@ -5,7 +5,7 @@ from collections.abc import Mapping from datetime import timedelta from typing import Any -from urllib.parse import parse_qs, urlparse +from urllib.parse import urlparse from sentry.issues.grouptype import PerformanceNPlusOneAPICallsGroupType from sentry.issues.issue_occurrence import IssueEvidence @@ -21,6 +21,7 @@ get_span_evidence_value, get_url_from_span, parameterize_url, + parameterize_url_with_result, ) from ..performance_problem import PerformanceProblem from ..types import PerformanceProblemsMap, Span @@ -157,11 +158,16 @@ def _maybe_store_problem(self) -> None: return offender_span_ids = [span["span_id"] for span in self.spans] + problem_description = self._get_parameterized_url(self.spans[0]) + if problem_description == "": + problem_description = os.path.commonprefix( + [span.get("description", "") or "" for span in self.spans] + ) self.stored_problems[fingerprint] = PerformanceProblem( fingerprint=fingerprint, op=last_span["op"], - desc=os.path.commonprefix([span.get("description", "") or "" for span in self.spans]), + desc=problem_description, type=PerformanceNPlusOneAPICallsGroupType, cause_span_ids=[], parent_span_ids=[last_span.get("parent_span_id", None)], @@ -175,16 +181,14 @@ def _maybe_store_problem(self) -> None: "num_repeating_spans": str(len(offender_span_ids)) if offender_span_ids else "", "repeating_spans": self._get_path_prefix(self.spans[0]), "repeating_spans_compact": get_span_evidence_value(self.spans[0], include_op=False), - "parameters": self._get_parameters(), + "parameters": self._get_parameters()["query_params"], + "path_parameters": self._get_parameters()["path_params"], }, evidence_display=[ IssueEvidence( name="Offending Spans", value=get_notification_attachment_body( - last_span["op"], - os.path.commonprefix( - [span.get("description", "") or "" for span in self.spans] - ), + op=last_span["op"], desc=problem_description ), # Has to be marked important to be displayed in the notifications important=True, @@ -192,24 +196,28 @@ def _maybe_store_problem(self) -> None: ], ) - def _get_parameters(self) -> list[str]: + def _get_parameters(self) -> dict[str, list[str]]: if not self.spans or len(self.spans) == 0: - return [] + return {"query_params": [], "path_params": []} - urls = [get_url_from_span(span) for span in self.spans] - - all_parameters: Mapping[str, list[str]] = defaultdict(list) + parameterized_urls = [ + parameterize_url_with_result(get_url_from_span(span)) for span in self.spans + ] + path_params = [param["path_params"] for param in parameterized_urls] + query_dict: Mapping[str, list[str]] = defaultdict(list) - for url in urls: - parsed_url = urlparse(url) - parameters = parse_qs(parsed_url.query) + for parameterized_url in parameterized_urls: + query_params = parameterized_url["query_params"] - for key, value in parameters.items(): - all_parameters[key] += value + for key, value in query_params.items(): + query_dict[key] += value + return { + "path_params": [f"{', '.join(param_group)}" for param_group in path_params], + "query_params": [f"{key}: {', '.join(values)}" for key, values in query_dict.items()], + } - return [ - "{{{}: {}}}".format(key, ",".join(values)) for key, values in all_parameters.items() - ] + def _get_parameterized_url(self, span: Span) -> str: + return parameterize_url(get_url_from_span(span)) def _get_path_prefix(self, repeating_span: Span) -> str: if not repeating_span: @@ -225,9 +233,8 @@ def _fingerprint(self) -> str | None: # Check if we parameterized the URL at all. If not, do not attempt # fingerprinting. Unparameterized URLs run too high a risk of - # fingerprinting explosions. Query parameters are parameterized by - # definition, so exclude them from comparison - if without_query_params(parameterized_first_url) == without_query_params(first_url): + # fingerprinting explosions. + if parameterized_first_url == first_url: return None fingerprint = fingerprint_http_spans([self.spans[0]]) @@ -244,11 +251,6 @@ def _spans_are_concurrent(self, span_a: Span, span_b: Span) -> bool: def _spans_are_similar(self, span_a: Span, span_b: Span) -> bool: return ( - parameterize_url(get_url_from_span(span_a)) - == parameterize_url(get_url_from_span(span_b)) + self._get_parameterized_url(span_a) == self._get_parameterized_url(span_b) and span_a["parent_span_id"] == span_b["parent_span_id"] ) - - -def without_query_params(url: str) -> str: - return urlparse(url)._replace(query="").geturl() diff --git a/src/sentry/utils/performance_issues/performance_problem.py b/src/sentry/utils/performance_issues/performance_problem.py index 11c978539f3468..abfd297a321d04 100644 --- a/src/sentry/utils/performance_issues/performance_problem.py +++ b/src/sentry/utils/performance_issues/performance_problem.py @@ -22,7 +22,7 @@ class PerformanceProblem: # We can't make it required until we stop loading these from nodestore via EventPerformanceProblem, # since there's legacy data in there that won't have these fields. # So until we disable transaction based perf issues we'll need to keep this optional. - evidence_data: Mapping[str, Any] | None + evidence_data: dict[str, Any] | None # User-friendly evidence to be displayed directly evidence_display: Sequence[IssueEvidence] diff --git a/tests/sentry/utils/performance_issues/test_n_plus_one_api_calls_detector.py b/tests/sentry/utils/performance_issues/test_n_plus_one_api_calls_detector.py index 5ac6cae47bb448..0fc67b35ccf122 100644 --- a/tests/sentry/utils/performance_issues/test_n_plus_one_api_calls_detector.py +++ b/tests/sentry/utils/performance_issues/test_n_plus_one_api_calls_detector.py @@ -16,7 +16,6 @@ from sentry.utils.performance_issues.base import DetectorType, parameterize_url from sentry.utils.performance_issues.detectors.n_plus_one_api_calls_detector import ( NPlusOneAPICallsDetector, - without_query_params, ) from sentry.utils.performance_issues.performance_detection import ( get_detection_settings, @@ -188,9 +187,22 @@ def test_does_not_detect_problems_with_low_span_count(self): problems = self.find_problems(event) assert problems == [] - def test_does_not_detect_problem_with_unparameterized_urls(self): + def test_does_detect_problem_with_unparameterized_urls(self): event = get_event("n-plus-one-api-calls/n-plus-one-api-calls-in-weather-app") - assert self.find_problems(event) == [] + [problem] = self.find_problems(event) + + assert problem.fingerprint == "1-1010-bf7ad6b20bb345ae327362c849427956862bf839" + + def test_does_detect_problem_with_parameterized_urls(self): + event = self.create_event(lambda i: f"GET /clients/{i}/info/{i*100}/") + [problem] = self.find_problems(event) + assert problem.desc == "/clients/*/info/*/" + assert problem.evidence_data is not None + path_params = problem.evidence_data.get("path_parameters", []) + # It should sequentially store sets of path parameters on the evidence data + for i in range(len(path_params)): + assert path_params[i] == f"{i}, {i*100}" + assert problem.fingerprint == "1-1010-8bf177290e2d78550fef5a1f6e9ddf115e4b0614" def test_does_not_detect_problem_with_concurrent_calls_to_different_urls(self): event = get_event("n-plus-one-api-calls/not-n-plus-one-api-calls") @@ -221,28 +233,55 @@ def test_fingerprints_same_relative_urls_together(self): assert problem1.fingerprint == problem2.fingerprint def test_fingerprints_same_parameterized_integer_relative_urls_together(self): - event1 = self.create_event(lambda i: f"GET /clients/17/info?id={i}") + event1 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}") [problem1] = self.find_problems(event1) - event2 = self.create_event(lambda i: f"GET /clients/16/info?id={i*2}") + event2 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i*2}") [problem2] = self.find_problems(event2) assert problem1.fingerprint == problem2.fingerprint def test_fingerprints_different_relative_url_separately(self): - event1 = self.create_event(lambda i: f"GET /clients/11/info?id={i}") + event1 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}") [problem1] = self.find_problems(event1) - event2 = self.create_event(lambda i: f"GET /projects/11/details?pid={i}") + event2 = self.create_event(lambda i: f"GET /projects/{i}/details?pid={i}") + [problem2] = self.find_problems(event2) + + assert problem1.fingerprint != problem2.fingerprint + + def test_fingerprints_relative_urls_with_query_params_together(self): + event1 = self.create_event(lambda i: f"GET /clients/{i}/info") + [problem1] = self.find_problems(event1) + + event2 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}") + [problem2] = self.find_problems(event2) + + assert problem1.fingerprint == problem2.fingerprint + + def test_fingerprints_multiple_parameterized_integer_relative_urls_together(self): + event1 = self.create_event(lambda i: f"GET /clients/{i}/organization/{i}/info") + [problem1] = self.find_problems(event1) + + event2 = self.create_event(lambda i: f"GET /clients/{i*100}/organization/{i*100}/info") + [problem2] = self.find_problems(event2) + + assert problem1.fingerprint == problem2.fingerprint + + def test_fingerprints_different_parameterized_integer_relative_urls_separately(self): + event1 = self.create_event(lambda i: f"GET /clients/{i}/mario/{i}/info") + [problem1] = self.find_problems(event1) + + event2 = self.create_event(lambda i: f"GET /clients/{i}/luigi/{i}/info") [problem2] = self.find_problems(event2) assert problem1.fingerprint != problem2.fingerprint def test_ignores_hostname_for_fingerprinting(self): - event1 = self.create_event(lambda i: f"GET http://service.io/clients/42/info?id={i}") + event1 = self.create_event(lambda i: f"GET http://service.io/clients/{i}/info?id={i}") [problem1] = self.find_problems(event1) - event2 = self.create_event(lambda i: f"GET /clients/42/info?id={i}") + event2 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}") [problem2] = self.find_problems(event2) assert problem1.fingerprint == problem2.fingerprint @@ -433,20 +472,6 @@ def test_rejects_ineligible_spans(span): assert not NPlusOneAPICallsDetector.is_span_eligible(span) -@pytest.mark.parametrize( - "url,url_without_query", - [ - ("", ""), - ("http://service.io", "http://service.io"), - ("http://service.io/resource", "http://service.io/resource"), - ("/resource?id=1", "/resource"), - ("/resource?id=1&sort=down", "/resource"), - ], -) -def test_removes_query_params(url, url_without_query): - assert without_query_params(url) == url_without_query - - @pytest.mark.parametrize( "event", [get_event("n-plus-one-api-calls/not-n-plus-one-api-calls")],