Skip to content

Commit f9f9d4c

Browse files
author
Bartek Ogryczak
authored
perf(issues): getting users seen counts from cache rather than directly from Snuba (#69556)
This most basic is an optimization to not go to Snuba for _every_ new event in the issue. While the number of events seen since the last time we checked unique users count is less than then number of users needed to reach the threshold, we can be sure that the threshold was _not_ reached. Caveat: when the cached count is just one away from triggering, we'll still call Snuba on every new event in issue until new user is seen.
1 parent ad2814f commit f9f9d4c

File tree

4 files changed

+313
-20
lines changed

4 files changed

+313
-20
lines changed

src/sentry/conf/server.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1582,6 +1582,8 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]:
15821582
"organizations:grouping-title-ui": False,
15831583
# Enable experimental new version of Merged Issues where sub-hashes are shown
15841584
"organizations:grouping-tree-ui": False,
1585+
# Enable caching group counts in GroupSnooze
1586+
"organizations:groupsnooze-cached-counts": False,
15851587
# Allows an org to have a larger set of project ownership rules per project
15861588
"organizations:higher-ownership-limit": False,
15871589
# Enable incidents feature

src/sentry/features/temporary.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def register_temporary_features(manager: FeatureManager):
8080
manager.add("organizations:grouping-suppress-unnecessary-secondary-hash", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
8181
manager.add("organizations:grouping-title-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
8282
manager.add("organizations:grouping-tree-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
83+
manager.add("organizations:groupsnooze-cached-counts", OrganizationFeature, FeatureHandlerStrategy.OPTIONS)
8384
manager.add("organizations:higher-ownership-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
8485
manager.add("organizations:increased-issue-owners-rate-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
8586
manager.add("organizations:integrations-custom-alert-priorities", OrganizationFeature, FeatureHandlerStrategy.REMOTE)

src/sentry/models/groupsnooze.py

Lines changed: 127 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
from __future__ import annotations
22

33
from datetime import timedelta
4-
from typing import ClassVar, Self
4+
from typing import TYPE_CHECKING, ClassVar, Self
55

66
from django.db import models
77
from django.db.models.signals import post_delete, post_save
88
from django.utils import timezone
99

10+
from sentry import features, tsdb
1011
from sentry.backup.scopes import RelocationScope
1112
from sentry.db.models import (
1213
BaseManager,
@@ -22,6 +23,9 @@
2223
from sentry.utils import metrics
2324
from sentry.utils.cache import cache
2425

26+
if TYPE_CHECKING:
27+
from sentry.models.group import Group
28+
2529

2630
@region_silo_only_model
2731
class GroupSnooze(Model):
@@ -61,10 +65,12 @@ class Meta:
6165
__repr__ = sane_repr("group_id")
6266

6367
@classmethod
64-
def get_cache_key(cls, group_id):
65-
return "groupsnooze_group_id:1:%s" % (group_id)
68+
def get_cache_key(cls, group_id: int) -> str:
69+
return f"groupsnooze_group_id:1:{group_id}"
6670

67-
def is_valid(self, group=None, test_rates=False, use_pending_data=False):
71+
def is_valid(
72+
self, group: Group | None = None, test_rates: bool = False, use_pending_data: bool = False
73+
) -> bool:
6874
if group is None:
6975
group = self.group
7076
elif group.id != self.group_id:
@@ -85,22 +91,12 @@ def is_valid(self, group=None, test_rates=False, use_pending_data=False):
8591
return False
8692

8793
if self.user_count and test_rates:
88-
if self.user_window:
89-
if not self.test_user_rates():
90-
return False
91-
elif (
92-
self.user_count
93-
<= group.count_users_seen(
94-
referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
95-
)
96-
- self.state["users_seen"]
97-
):
94+
if not self.test_user_rates_or_counts(group):
9895
return False
99-
return True
10096

101-
def test_frequency_rates(self):
102-
from sentry import tsdb
97+
return True
10398

99+
def test_frequency_rates(self) -> bool:
104100
metrics.incr("groupsnooze.test_frequency_rates")
105101

106102
end = timezone.now()
@@ -120,10 +116,38 @@ def test_frequency_rates(self):
120116

121117
return True
122118

123-
def test_user_rates(self):
124-
from sentry import tsdb
119+
def test_user_rates_or_counts(self, group: Group) -> bool:
120+
"""
121+
Test if the number of unique users or rate of users seen by the group is below the snooze threshold.
122+
Returns: True if below threshold, False otherwise.
123+
124+
- Non-cached version of the function queries Snuba for the real count every time.
125+
- Cached version uses Redis counters to store the number of events seen since last check,
126+
if it's less than the number of users needed to reach the threshold, we can be sure
127+
that we couldn't have reach enough users to reach the threshold, so there's no need
128+
to query Snuba. This functionality relies on the fact that this is called in
129+
post-processing for every event, so we can assume that the call-count == event count.
130+
"""
131+
if features.has(
132+
"organizations:groupsnooze-cached-counts", organization=self.group.project.organization
133+
):
134+
if self.user_window:
135+
if not self.test_user_rates_w_cache():
136+
return False
137+
elif not self.test_user_counts_w_cache(group):
138+
return False
139+
return True
140+
else:
141+
if self.user_window:
142+
if not self.test_user_rates_no_cache():
143+
return False
144+
elif not self.test_user_counts_no_cache(group):
145+
return False
146+
return True
125147

126-
metrics.incr("groupsnooze.test_user_rates")
148+
def test_user_rates_no_cache(self) -> bool:
149+
metrics.incr("groupsnooze.test_user_rates", tags={"cached": "false"})
150+
metrics.incr("groupsnooze.test_user_rates.snuba_call")
127151

128152
end = timezone.now()
129153
start = end - timedelta(minutes=self.user_window)
@@ -142,6 +166,89 @@ def test_user_rates(self):
142166

143167
return True
144168

169+
def test_user_rates_w_cache(self) -> bool:
170+
cache_key = f"groupsnooze:v1:{self.id}:test_user_rate:events_seen_counter"
171+
172+
cache_ttl = self.user_window * 60 # Redis TTL in seconds (window is in minutes)
173+
174+
value: int | float = float("inf") # using +inf as a sentinel value
175+
176+
try:
177+
value = cache.incr(cache_key)
178+
cache.touch(cache_key, cache_ttl)
179+
except ValueError:
180+
# key doesn't exist, fall back on sentinel value
181+
pass
182+
183+
if value < self.user_count:
184+
# if number of hits within the window is less than the threshold, we can't have reached enough users
185+
metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "true"})
186+
return True
187+
188+
metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "false"})
189+
metrics.incr("groupsnooze.test_user_rates.snuba_call")
190+
end = timezone.now()
191+
start = end - timedelta(minutes=self.user_window)
192+
193+
rate = tsdb.backend.get_distinct_counts_totals(
194+
model=get_issue_tsdb_user_group_model(self.group.issue_category),
195+
keys=[self.group_id],
196+
start=start,
197+
end=end,
198+
tenant_ids={"organization_id": self.group.project.organization_id},
199+
referrer_suffix="user_count_snoozes",
200+
)[self.group_id]
201+
202+
# TTL is further into the future than it needs to be, but we'd rather over-estimate
203+
# and call Snuba more often than under-estimate and not trigger
204+
cache.set(cache_key, rate, cache_ttl)
205+
206+
if rate >= self.user_count:
207+
return False
208+
209+
return True
210+
211+
def test_user_counts_no_cache(self, group: Group) -> bool:
212+
metrics.incr("groupsnooze.test_user_counts", tags={"cached": "false"})
213+
metrics.incr("groupsnooze.test_user_counts.snuba_call")
214+
215+
threshold = self.user_count + self.state["users_seen"]
216+
real_count = group.count_users_seen(
217+
referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
218+
)
219+
return real_count < threshold
220+
221+
def test_user_counts_w_cache(self, group: Group) -> bool:
222+
cache_key = f"groupsnooze:v1:{self.id}:test_user_counts:events_seen_counter"
223+
try:
224+
users_seen = self.state["users_seen"]
225+
except (KeyError, TypeError):
226+
users_seen = 0
227+
228+
threshold = self.user_count + users_seen
229+
230+
CACHE_TTL = 300 # Redis TTL in seconds
231+
232+
value: int | float = float("inf") # using +inf as a sentinel value
233+
try:
234+
value = cache.incr(cache_key)
235+
except ValueError:
236+
# key doesn't exist, fall back on sentinel value
237+
pass
238+
239+
if value < threshold:
240+
# if we've seen less than that many events, we can't possibly have seen enough users
241+
metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "true"})
242+
return True
243+
244+
metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "false"})
245+
metrics.incr("groupsnooze.test_user_counts.snuba_call")
246+
real_count = group.count_users_seen(
247+
referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
248+
)
249+
cache.set(cache_key, real_count, CACHE_TTL)
250+
return real_count < threshold
251+
145252

146253
post_save.connect(
147254
lambda instance, **kwargs: cache.set(

0 commit comments

Comments
 (0)