Skip to content

Commit 03d50bd

Browse files
feat(issue-priority): Add a circuit breaker on consecutive failures (#67913)
If Seer fails for multiple requests in succession, we want a way to start skipping Seer until the service recovers. This circuit breaker keeps track of consecutive failures on the service and will start throttling requests if the limit is hit. In order to detect if the service is recovered, the circuit breaker will allow a configurable number of requests through per minute. If that request succeeds, the cache key will be cleared and we will resume making requests to Seer. --------- Co-authored-by: getsantry[bot] <66042841+getsantry[bot]@users.noreply.github.com>
1 parent f92d9f4 commit 03d50bd

File tree

5 files changed

+154
-13
lines changed

5 files changed

+154
-13
lines changed

src/sentry/event_manager.py

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,11 @@
116116
from sentry.utils import json, metrics
117117
from sentry.utils.cache import cache_key_for_event
118118
from sentry.utils.canonical import CanonicalKeyDict
119+
from sentry.utils.circuit_breaker import (
120+
ERROR_COUNT_CACHE_KEY,
121+
CircuitBreakerPassthrough,
122+
circuit_breaker_activated,
123+
)
119124
from sentry.utils.dates import to_datetime
120125
from sentry.utils.event import has_event_minified_stack_trace, has_stacktrace, is_handled
121126
from sentry.utils.eventuser import EventUser
@@ -141,6 +146,10 @@
141146

142147
HIGH_SEVERITY_THRESHOLD = 0.1
143148

149+
SEER_GLOBAL_RATE_LIMIT_DEFAULT = 20 # 20 requests per second
150+
SEER_PROJECT_RATE_LIMIT_DEFAULT = 5 # 5 requests per second
151+
SEER_ERROR_COUNT_KEY = ERROR_COUNT_CACHE_KEY("sentry.seer.severity-failures")
152+
144153

145154
@dataclass
146155
class GroupInfo:
@@ -1829,9 +1838,16 @@ def _create_group(project: Project, event: Event, **group_creation_kwargs: Any)
18291838
group_data.setdefault("metadata", {}).update(sdk_metadata_from_event(event))
18301839

18311840
# add severity to metadata for alert filtering
1832-
group_type = group_creation_kwargs.get("type", None)
1833-
severity = _get_severity_metadata_for_group(event, project.id, group_type)
1834-
group_data["metadata"].update(severity)
1841+
try:
1842+
group_type = group_creation_kwargs.get("type", None)
1843+
severity = _get_severity_metadata_for_group(event, project.id, group_type)
1844+
group_data["metadata"].update(severity)
1845+
except Exception as e:
1846+
logger.exception(
1847+
"Failed to get severity metadata for group",
1848+
repr(e),
1849+
extra={"event_id": event.event_id},
1850+
)
18351851

18361852
if features.has("projects:issue-priority", project, actor=None):
18371853
# the kwargs only include priority for non-error issue platform events, which takes precedence.
@@ -2163,7 +2179,10 @@ def _get_severity_metadata_for_group(
21632179
from sentry.receivers.rules import PLATFORMS_WITH_PRIORITY_ALERTS
21642180

21652181
if killswitch_matches_context("issues.skip-seer-requests", {"project_id": event.project_id}):
2166-
logger.warning("get_severity_metadata_for_group.seer_killswitch_enabled")
2182+
logger.warning(
2183+
"get_severity_metadata_for_group.seer_killswitch_enabled",
2184+
extra={"event_id": event.event_id, "project_id": project_id},
2185+
)
21672186
metrics.incr("issues.severity.seer_killswitch_enabled")
21682187
return {}
21692188

@@ -2180,25 +2199,32 @@ def _get_severity_metadata_for_group(
21802199
if not is_supported_platform or not is_error_group:
21812200
return {}
21822201

2202+
passthrough_data = options.get(
2203+
"issues.severity.seer-circuit-breaker-passthrough-limit",
2204+
CircuitBreakerPassthrough(limit=1, window=10),
2205+
)
2206+
if circuit_breaker_activated("sentry.seer.severity", passthrough_data=passthrough_data):
2207+
logger.warning(
2208+
"get_severity_metadata_for_group.circuit_breaker_activated",
2209+
extra={"event_id": event.event_id, "project_id": project_id},
2210+
)
2211+
return {}
2212+
21832213
from sentry import ratelimits as ratelimiter
21842214

2185-
limit = options.get("issues.severity.seer-global-rate-limit", 20)
2215+
limit = options.get("issues.severity.seer-global-rate-limit", SEER_GLOBAL_RATE_LIMIT_DEFAULT)
21862216
if ratelimiter.backend.is_limited(
2187-
"seer:severity-calculation:global-limit",
2188-
limit=limit,
2189-
window=1, # starting this out 20 requests per second
2217+
"seer:severity-calculation:global-limit", limit=limit, window=1
21902218
):
21912219
logger.warning(
21922220
"get_severity_metadata_for_group.rate_limited_globally",
21932221
extra={"event_id": event.event_id, "project_id": project_id},
21942222
)
21952223
metrics.incr("issues.severity.rate_limited_globally")
21962224

2197-
limit = options.get("issues.severity.seer-project-rate-limit", 5)
2225+
limit = options.get("issues.severity.seer-project-rate-limit", SEER_PROJECT_RATE_LIMIT_DEFAULT)
21982226
if ratelimiter.backend.is_limited(
2199-
f"seer:severity-calculation:{project_id}",
2200-
limit=limit,
2201-
window=1, # starting this out 5 requests per second
2227+
f"seer:severity-calculation:{project_id}", limit=limit, window=1
22022228
):
22032229
logger.warning(
22042230
"get_severity_metadata_for_group.rate_limited_for_project",
@@ -2215,7 +2241,7 @@ def _get_severity_metadata_for_group(
22152241
}
22162242
except Exception as e:
22172243
logger.warning("Failed to calculate severity score for group", repr(e))
2218-
2244+
update_severity_error_count()
22192245
return {}
22202246

22212247

@@ -2259,6 +2285,19 @@ def _get_priority_for_group(severity: Mapping[str, Any], kwargs: Mapping[str, An
22592285
return PriorityLevel.MEDIUM
22602286

22612287

2288+
def update_severity_error_count(reset=False) -> None:
2289+
timeout = 60 * 60 # 1 hour
2290+
if reset:
2291+
cache.set(SEER_ERROR_COUNT_KEY, 0, timeout=timeout)
2292+
return
2293+
2294+
try:
2295+
cache.incr(SEER_ERROR_COUNT_KEY)
2296+
cache.touch(SEER_ERROR_COUNT_KEY, timeout=timeout)
2297+
except ValueError:
2298+
cache.set(SEER_ERROR_COUNT_KEY, 1, timeout=timeout)
2299+
2300+
22622301
def _get_severity_score(event: Event) -> tuple[float, str]:
22632302
# Short circuit the severity value if we know the event is fatal or info/debug
22642303
level = str(event.data.get("level", "error"))
@@ -2344,20 +2383,23 @@ def _get_severity_score(event: Event) -> tuple[float, str]:
23442383
extra=logger_data,
23452384
)
23462385
reason = "microservice_max_retry"
2386+
update_severity_error_count()
23472387
except Exception as e:
23482388
logger.warning(
23492389
"Unable to get severity score from microservice. Got: %s.",
23502390
repr(e),
23512391
extra=logger_data,
23522392
)
23532393
reason = "microservice_error"
2394+
update_severity_error_count()
23542395
else:
23552396
logger.info(
23562397
"Got severity score of %s for event %s",
23572398
severity,
23582399
event.data["event_id"],
23592400
extra=logger_data,
23602401
)
2402+
update_severity_error_count(reset=True)
23612403

23622404
return severity, reason
23632405

src/sentry/options/defaults.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,13 @@
776776
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
777777
)
778778

779+
register(
780+
"issues.severity.seer-circuit-breaker-passthrough-limit",
781+
type=Dict,
782+
default={"limit": 1, "window": 10},
783+
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
784+
)
785+
779786
register(
780787
"issues.severity.seer-timout",
781788
type=Float,

src/sentry/utils/circuit_breaker.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from typing import TypedDict
2+
3+
from django.core.cache import cache
4+
5+
from sentry import ratelimits as ratelimiter
6+
from sentry.utils import metrics
7+
8+
DEFAULT_ERROR_LIMIT = 30
9+
ERROR_COUNT_CACHE_KEY = lambda key: f"circuit_breaker:{key}-error-count"
10+
PASSTHROUGH_RATELIMIT_KEY = lambda key: f"circuit_breaker:{key}-passthrough"
11+
12+
13+
class CircuitBreakerPassthrough(TypedDict, total=True):
14+
limit: int
15+
window: int
16+
17+
18+
def circuit_breaker_activated(
19+
key: str,
20+
error_limit: int = DEFAULT_ERROR_LIMIT,
21+
passthrough_data: CircuitBreakerPassthrough | None = None,
22+
) -> bool:
23+
"""
24+
Activates the circuit breaker if the error count for a cache key exceeds the error limit.
25+
26+
The circuit breaker can allow a certain number of requests to pass through per minute, defined by
27+
the passthrough limit if provided.
28+
"""
29+
failure_count = cache.get_or_set(ERROR_COUNT_CACHE_KEY(key), default=0, timeout=60 * 60) or 0
30+
if failure_count < error_limit:
31+
return False # not blocked
32+
33+
# Limit has been exceeded, check if we should allow any requests to pass through
34+
if passthrough_data:
35+
if not ratelimiter.backend.is_limited(
36+
PASSTHROUGH_RATELIMIT_KEY(key),
37+
limit=passthrough_data["limit"],
38+
window=passthrough_data["window"],
39+
):
40+
metrics.incr(f"circuit_breaker.{key}.bypassed")
41+
return False # not blocked
42+
43+
metrics.incr(f"circuit_breaker.{key}.throttled")
44+
return True # blocked

tests/sentry/event_manager/test_severity.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
from typing import Any
55
from unittest.mock import MagicMock, patch
66

7+
from django.core.cache import cache
78
from urllib3 import HTTPResponse
89
from urllib3.exceptions import MaxRetryError
910

1011
from sentry import options
1112
from sentry.event_manager import (
1213
NON_TITLE_EVENT_TITLES,
14+
SEER_ERROR_COUNT_KEY,
1315
EventManager,
1416
_get_severity_score,
1517
severity_connection_pool,
@@ -86,6 +88,7 @@ def test_error_event_simple(
8688
)
8789
assert severity == 0.1231
8890
assert reason == "ml"
91+
assert cache.get(SEER_ERROR_COUNT_KEY) == 0
8992

9093
@patch(
9194
"sentry.event_manager.severity_connection_pool.urlopen",
@@ -133,6 +136,7 @@ def test_message_event_simple(
133136
)
134137
assert severity == 0.1231
135138
assert reason == "ml"
139+
assert cache.get(SEER_ERROR_COUNT_KEY) == 0
136140

137141
@patch(
138142
"sentry.event_manager.severity_connection_pool.urlopen",
@@ -270,6 +274,7 @@ def test_max_retry_exception(
270274
)
271275
assert severity == 1.0
272276
assert reason == "microservice_max_retry"
277+
assert cache.get(SEER_ERROR_COUNT_KEY) == 1
273278

274279
@patch(
275280
"sentry.event_manager.severity_connection_pool.urlopen",
@@ -314,6 +319,7 @@ def test_other_exception(
314319
)
315320
assert severity == 1.0
316321
assert reason == "microservice_error"
322+
assert cache.get(SEER_ERROR_COUNT_KEY) == 1
317323

318324

319325
@apply_feature_flag_on_cls("projects:first-event-severity-calculation")
@@ -426,4 +432,5 @@ def test_killswitch_on(self, mock_get_severity_score: MagicMock):
426432

427433
assert event.group
428434
assert "severity" not in event.group.get_event_metadata()
435+
assert cache.get(SEER_ERROR_COUNT_KEY) is None
429436
assert mock_get_severity_score.call_count == 0
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import time
2+
from unittest.mock import patch
3+
4+
from django.core.cache import cache
5+
6+
from sentry.testutils.cases import TestCase
7+
from sentry.utils.circuit_breaker import (
8+
ERROR_COUNT_CACHE_KEY,
9+
CircuitBreakerPassthrough,
10+
circuit_breaker_activated,
11+
)
12+
13+
14+
class TestCircuitBreaker(TestCase):
15+
def setUp(self):
16+
self.key = "test"
17+
self.error_limit = 5
18+
self.passthrough_data = CircuitBreakerPassthrough(limit=2, window=1)
19+
cache.set(ERROR_COUNT_CACHE_KEY(self.key), self.error_limit)
20+
21+
def test_not_activated(self):
22+
assert not circuit_breaker_activated(self.key, self.error_limit + 1)
23+
24+
def test_activated_at_error_limit(self):
25+
assert circuit_breaker_activated(key=self.key, error_limit=self.error_limit)
26+
27+
@patch("sentry.utils.circuit_breaker.metrics.incr")
28+
def test_passthrough(self, mock_metrics):
29+
assert not circuit_breaker_activated(self.key, self.error_limit, self.passthrough_data)
30+
mock_metrics.assert_called_with(f"circuit_breaker.{self.key}.bypassed")
31+
32+
assert not circuit_breaker_activated(self.key, self.error_limit, self.passthrough_data)
33+
mock_metrics.assert_called_with(f"circuit_breaker.{self.key}.bypassed")
34+
35+
assert circuit_breaker_activated(self.key, self.error_limit, self.passthrough_data)
36+
mock_metrics.assert_called_with(f"circuit_breaker.{self.key}.throttled")
37+
38+
# Wait for the passthrough window to expire and try again
39+
time.sleep(1)
40+
assert not circuit_breaker_activated(self.key, self.error_limit, self.passthrough_data)
41+
mock_metrics.assert_called_with(f"circuit_breaker.{self.key}.bypassed")

0 commit comments

Comments
 (0)