Skip to content

fix(perf-issues): Use parameterized URL for similarities on N+1 API Calls #89362

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions src/sentry/utils/performance_issues/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from abc import ABC, abstractmethod
from datetime import timedelta
from enum import Enum
from typing import Any, ClassVar
from typing import Any, ClassVar, TypedDict
from urllib.parse import parse_qs, urlparse

from sentry import options
Expand Down Expand Up @@ -292,7 +292,17 @@ def fingerprint_resource_span(span: Span):
return hashlib.sha1(stripped_url.encode("utf-8")).hexdigest()


def parameterize_url(url: str) -> str:
class ParameterizedUrl(TypedDict):
url: str
path_params: list[str] # e.g. ["123", "abc123de-1024-4321-abcd-1234567890ab"]
query_params: dict[str, list[str]] # e.g. {"limit": "50", "offset": "100"}


def parameterize_url_with_result(url: str) -> ParameterizedUrl:
"""
Given a URL, return the URL with parsed path and query parameters replaced with '*',
a list of the path parameters, and a dict of the query parameters.
"""
parsed_url = urlparse(str(url))

protocol_fragments = []
Expand All @@ -305,15 +315,18 @@ def parameterize_url(url: str) -> str:
host_fragments.append(str(fragment))

path_fragments = []
path_params = []
for fragment in parsed_url.path.split("/"):
if PARAMETERIZED_URL_REGEX.search(fragment):
path_param = PARAMETERIZED_URL_REGEX.search(fragment)
if path_param:
path_fragments.append("*")
path_params.append(path_param.group())
else:
path_fragments.append(str(fragment))

query = parse_qs(parsed_url.query)

return "".join(
parameterized_url = "".join(
[
"".join(protocol_fragments),
".".join(host_fragments),
Expand All @@ -323,6 +336,16 @@ def parameterize_url(url: str) -> str:
]
).rstrip("?")

return ParameterizedUrl(
url=parameterized_url,
path_params=path_params,
query_params=query,
)


def parameterize_url(url: str) -> str:
return parameterize_url_with_result(url).get("url", "")


def fingerprint_http_spans(spans: list[Span]) -> str:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from __future__ import annotations

import hashlib
import os
from collections import defaultdict
from collections.abc import Mapping, Sequence
from collections.abc import Mapping
from datetime import timedelta
from typing import Any
from urllib.parse import parse_qs, urlparse

from django.utils.encoding import force_bytes
from urllib.parse import urlparse

from sentry.issues.grouptype import PerformanceNPlusOneAPICallsGroupType
from sentry.issues.issue_occurrence import IssueEvidence
Expand All @@ -24,6 +21,7 @@
get_span_evidence_value,
get_url_from_span,
parameterize_url,
parameterize_url_with_result,
)
from ..performance_problem import PerformanceProblem
from ..types import PerformanceProblemsMap, Span
Expand Down Expand Up @@ -51,7 +49,6 @@ def __init__(self, settings: dict[DetectorType, Any], event: dict[str, Any]) ->
# TODO: Only store the span IDs and timestamps instead of entire span objects
self.stored_problems: PerformanceProblemsMap = {}
self.spans: list[Span] = []
self.span_hashes: dict[str, str | None] = {}

def visit_span(self, span: Span) -> None:
if not NPlusOneAPICallsDetector.is_span_eligible(span):
Expand All @@ -61,8 +58,6 @@ def visit_span(self, span: Span) -> None:
if op not in self.settings.get("allowed_span_ops", []):
return

self.span_hashes[span["span_id"]] = get_span_hash(span)

previous_span = self.spans[-1] if len(self.spans) > 0 else None

if previous_span is None:
Expand Down Expand Up @@ -163,11 +158,16 @@ def _maybe_store_problem(self) -> None:
return

offender_span_ids = [span["span_id"] for span in self.spans]
problem_description = self._get_parameterized_url(self.spans[0])
if problem_description == "":
problem_description = os.path.commonprefix(
[span.get("description", "") or "" for span in self.spans]
)

self.stored_problems[fingerprint] = PerformanceProblem(
fingerprint=fingerprint,
op=last_span["op"],
desc=os.path.commonprefix([span.get("description", "") or "" for span in self.spans]),
desc=problem_description,
type=PerformanceNPlusOneAPICallsGroupType,
cause_span_ids=[],
parent_span_ids=[last_span.get("parent_span_id", None)],
Expand All @@ -181,41 +181,43 @@ def _maybe_store_problem(self) -> None:
"num_repeating_spans": str(len(offender_span_ids)) if offender_span_ids else "",
"repeating_spans": self._get_path_prefix(self.spans[0]),
"repeating_spans_compact": get_span_evidence_value(self.spans[0], include_op=False),
"parameters": self._get_parameters(),
"parameters": self._get_parameters()["query_params"],
"path_parameters": self._get_parameters()["path_params"],
},
evidence_display=[
IssueEvidence(
name="Offending Spans",
value=get_notification_attachment_body(
last_span["op"],
os.path.commonprefix(
[span.get("description", "") or "" for span in self.spans]
),
op=last_span["op"], desc=problem_description
),
# Has to be marked important to be displayed in the notifications
important=True,
)
],
)

def _get_parameters(self) -> list[str]:
def _get_parameters(self) -> dict[str, list[str]]:
if not self.spans or len(self.spans) == 0:
return []

urls = [get_url_from_span(span) for span in self.spans]
return {"query_params": [], "path_params": []}

all_parameters: Mapping[str, list[str]] = defaultdict(list)
parameterized_urls = [
parameterize_url_with_result(get_url_from_span(span)) for span in self.spans
]
path_params = [param["path_params"] for param in parameterized_urls]
query_dict: Mapping[str, list[str]] = defaultdict(list)

for url in urls:
parsed_url = urlparse(url)
parameters = parse_qs(parsed_url.query)
for parameterized_url in parameterized_urls:
query_params = parameterized_url["query_params"]

for key, value in parameters.items():
all_parameters[key] += value
for key, value in query_params.items():
query_dict[key] += value
return {
"path_params": [f"{', '.join(param_group)}" for param_group in path_params],
"query_params": [f"{key}: {', '.join(values)}" for key, values in query_dict.items()],
}

return [
"{{{}: {}}}".format(key, ",".join(values)) for key, values in all_parameters.items()
]
def _get_parameterized_url(self, span: Span) -> str:
return parameterize_url(get_url_from_span(span))

def _get_path_prefix(self, repeating_span: Span) -> str:
if not repeating_span:
Expand All @@ -231,9 +233,8 @@ def _fingerprint(self) -> str | None:

# Check if we parameterized the URL at all. If not, do not attempt
# fingerprinting. Unparameterized URLs run too high a risk of
# fingerprinting explosions. Query parameters are parameterized by
# definition, so exclude them from comparison
if without_query_params(parameterized_first_url) == without_query_params(first_url):
# fingerprinting explosions.
if parameterized_first_url == first_url:
return None

fingerprint = fingerprint_http_spans([self.spans[0]])
Expand All @@ -250,62 +251,6 @@ def _spans_are_concurrent(self, span_a: Span, span_b: Span) -> bool:

def _spans_are_similar(self, span_a: Span, span_b: Span) -> bool:
return (
self.span_hashes[span_a["span_id"]] == self.span_hashes[span_b["span_id"]]
self._get_parameterized_url(span_a) == self._get_parameterized_url(span_b)
and span_a["parent_span_id"] == span_b["parent_span_id"]
)


HTTP_METHODS = {
"GET",
"HEAD",
"POST",
"PUT",
"DELETE",
"CONNECT",
"OPTIONS",
"TRACE",
"PATCH",
}


def get_span_hash(span: Span) -> str | None:
if span.get("op") != "http.client":
return span.get("hash")

parts = remove_http_client_query_string_strategy(span)
if not parts:
return None

hash = hashlib.md5()
for part in parts:
hash.update(force_bytes(part, errors="replace"))

return hash.hexdigest()[:16]


def remove_http_client_query_string_strategy(span: Span) -> Sequence[str] | None:
"""
This is an inline version of the `http.client` parameterization code in
`"default:2022-10-27"`, the default span grouping strategy at time of
writing. It's inlined here to insulate this detector from changes in the
strategy, which are coming soon.
"""

# Check the description is of the form `<HTTP METHOD> <URL>`
description = span.get("description") or ""
parts = description.split(" ", 1)
if len(parts) != 2:
return None

# Ensure that this is a valid http method
method, url_str = parts
method = method.upper()
if method not in HTTP_METHODS:
return None

url = urlparse(url_str)
return [method, url.scheme, url.netloc, url.path]


def without_query_params(url: str) -> str:
return urlparse(url)._replace(query="").geturl()
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class PerformanceProblem:
# We can't make it required until we stop loading these from nodestore via EventPerformanceProblem,
# since there's legacy data in there that won't have these fields.
# So until we disable transaction based perf issues we'll need to keep this optional.
evidence_data: Mapping[str, Any] | None
evidence_data: dict[str, Any] | None
# User-friendly evidence to be displayed directly
evidence_display: Sequence[IssueEvidence]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from sentry.utils.performance_issues.base import DetectorType, parameterize_url
from sentry.utils.performance_issues.detectors.n_plus_one_api_calls_detector import (
NPlusOneAPICallsDetector,
without_query_params,
)
from sentry.utils.performance_issues.performance_detection import (
get_detection_settings,
Expand Down Expand Up @@ -188,9 +187,22 @@ def test_does_not_detect_problems_with_low_span_count(self):
problems = self.find_problems(event)
assert problems == []

def test_does_not_detect_problem_with_unparameterized_urls(self):
def test_does_detect_problem_with_unparameterized_urls(self):
event = get_event("n-plus-one-api-calls/n-plus-one-api-calls-in-weather-app")
assert self.find_problems(event) == []
[problem] = self.find_problems(event)

assert problem.fingerprint == "1-1010-bf7ad6b20bb345ae327362c849427956862bf839"

def test_does_detect_problem_with_parameterized_urls(self):
event = self.create_event(lambda i: f"GET /clients/{i}/info/{i*100}/")
[problem] = self.find_problems(event)
assert problem.desc == "/clients/*/info/*/"
assert problem.evidence_data is not None
path_params = problem.evidence_data.get("path_parameters", [])
# It should sequentially store sets of path parameters on the evidence data
for i in range(len(path_params)):
assert path_params[i] == f"{i}, {i*100}"
assert problem.fingerprint == "1-1010-8bf177290e2d78550fef5a1f6e9ddf115e4b0614"

def test_does_not_detect_problem_with_concurrent_calls_to_different_urls(self):
event = get_event("n-plus-one-api-calls/not-n-plus-one-api-calls")
Expand Down Expand Up @@ -221,28 +233,55 @@ def test_fingerprints_same_relative_urls_together(self):
assert problem1.fingerprint == problem2.fingerprint

def test_fingerprints_same_parameterized_integer_relative_urls_together(self):
event1 = self.create_event(lambda i: f"GET /clients/17/info?id={i}")
event1 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}")
[problem1] = self.find_problems(event1)

event2 = self.create_event(lambda i: f"GET /clients/16/info?id={i*2}")
event2 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i*2}")
[problem2] = self.find_problems(event2)

assert problem1.fingerprint == problem2.fingerprint

def test_fingerprints_different_relative_url_separately(self):
event1 = self.create_event(lambda i: f"GET /clients/11/info?id={i}")
event1 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}")
[problem1] = self.find_problems(event1)

event2 = self.create_event(lambda i: f"GET /projects/11/details?pid={i}")
event2 = self.create_event(lambda i: f"GET /projects/{i}/details?pid={i}")
[problem2] = self.find_problems(event2)

assert problem1.fingerprint != problem2.fingerprint

def test_fingerprints_relative_urls_with_query_params_together(self):
event1 = self.create_event(lambda i: f"GET /clients/{i}/info")
[problem1] = self.find_problems(event1)

event2 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}")
[problem2] = self.find_problems(event2)

assert problem1.fingerprint == problem2.fingerprint

def test_fingerprints_multiple_parameterized_integer_relative_urls_together(self):
event1 = self.create_event(lambda i: f"GET /clients/{i}/organization/{i}/info")
[problem1] = self.find_problems(event1)

event2 = self.create_event(lambda i: f"GET /clients/{i*100}/organization/{i*100}/info")
[problem2] = self.find_problems(event2)

assert problem1.fingerprint == problem2.fingerprint

def test_fingerprints_different_parameterized_integer_relative_urls_separately(self):
event1 = self.create_event(lambda i: f"GET /clients/{i}/mario/{i}/info")
[problem1] = self.find_problems(event1)

event2 = self.create_event(lambda i: f"GET /clients/{i}/luigi/{i}/info")
[problem2] = self.find_problems(event2)

assert problem1.fingerprint != problem2.fingerprint

def test_ignores_hostname_for_fingerprinting(self):
event1 = self.create_event(lambda i: f"GET http://service.io/clients/42/info?id={i}")
event1 = self.create_event(lambda i: f"GET http://service.io/clients/{i}/info?id={i}")
[problem1] = self.find_problems(event1)

event2 = self.create_event(lambda i: f"GET /clients/42/info?id={i}")
event2 = self.create_event(lambda i: f"GET /clients/{i}/info?id={i}")
[problem2] = self.find_problems(event2)

assert problem1.fingerprint == problem2.fingerprint
Expand Down Expand Up @@ -433,20 +472,6 @@ def test_rejects_ineligible_spans(span):
assert not NPlusOneAPICallsDetector.is_span_eligible(span)


@pytest.mark.parametrize(
"url,url_without_query",
[
("", ""),
("http://service.io", "http://service.io"),
("http://service.io/resource", "http://service.io/resource"),
("/resource?id=1", "/resource"),
("/resource?id=1&sort=down", "/resource"),
],
)
def test_removes_query_params(url, url_without_query):
assert without_query_params(url) == url_without_query


@pytest.mark.parametrize(
"event",
[get_event("n-plus-one-api-calls/not-n-plus-one-api-calls")],
Expand Down
Loading