diff --git a/src/sentry/conf/server.py b/src/sentry/conf/server.py index f08d20b2e55946..b734b524c92927 100644 --- a/src/sentry/conf/server.py +++ b/src/sentry/conf/server.py @@ -1582,6 +1582,8 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]: "organizations:grouping-title-ui": False, # Enable experimental new version of Merged Issues where sub-hashes are shown "organizations:grouping-tree-ui": False, + # Enable caching group counts in GroupSnooze + "organizations:groupsnooze-cached-counts": False, # Allows an org to have a larger set of project ownership rules per project "organizations:higher-ownership-limit": False, # Enable incidents feature diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py index e2362d07d5e5c5..b48c92a99ef63c 100644 --- a/src/sentry/features/temporary.py +++ b/src/sentry/features/temporary.py @@ -80,6 +80,7 @@ def register_temporary_features(manager: FeatureManager): manager.add("organizations:grouping-suppress-unnecessary-secondary-hash", OrganizationFeature, FeatureHandlerStrategy.INTERNAL) manager.add("organizations:grouping-title-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE) manager.add("organizations:grouping-tree-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE) + manager.add("organizations:groupsnooze-cached-counts", OrganizationFeature, FeatureHandlerStrategy.OPTIONS) manager.add("organizations:higher-ownership-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL) manager.add("organizations:increased-issue-owners-rate-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL) manager.add("organizations:integrations-custom-alert-priorities", OrganizationFeature, FeatureHandlerStrategy.REMOTE) diff --git a/src/sentry/models/groupsnooze.py b/src/sentry/models/groupsnooze.py index 5446677b049a58..02e7896a3a1f94 100644 --- a/src/sentry/models/groupsnooze.py +++ b/src/sentry/models/groupsnooze.py @@ -1,12 +1,13 @@ from __future__ import annotations from datetime import timedelta -from typing import ClassVar, Self +from typing import TYPE_CHECKING, ClassVar, Self from django.db import models from django.db.models.signals import post_delete, post_save from django.utils import timezone +from sentry import features, tsdb from sentry.backup.scopes import RelocationScope from sentry.db.models import ( BaseManager, @@ -22,6 +23,9 @@ from sentry.utils import metrics from sentry.utils.cache import cache +if TYPE_CHECKING: + from sentry.models.group import Group + @region_silo_only_model class GroupSnooze(Model): @@ -61,10 +65,12 @@ class Meta: __repr__ = sane_repr("group_id") @classmethod - def get_cache_key(cls, group_id): - return "groupsnooze_group_id:1:%s" % (group_id) + def get_cache_key(cls, group_id: int) -> str: + return f"groupsnooze_group_id:1:{group_id}" - def is_valid(self, group=None, test_rates=False, use_pending_data=False): + def is_valid( + self, group: Group | None = None, test_rates: bool = False, use_pending_data: bool = False + ) -> bool: if group is None: group = self.group elif group.id != self.group_id: @@ -85,22 +91,12 @@ def is_valid(self, group=None, test_rates=False, use_pending_data=False): return False if self.user_count and test_rates: - if self.user_window: - if not self.test_user_rates(): - return False - elif ( - self.user_count - <= group.count_users_seen( - referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value - ) - - self.state["users_seen"] - ): + if not self.test_user_rates_or_counts(group): return False - return True - def test_frequency_rates(self): - from sentry import tsdb + return True + def test_frequency_rates(self) -> bool: metrics.incr("groupsnooze.test_frequency_rates") end = timezone.now() @@ -120,10 +116,38 @@ def test_frequency_rates(self): return True - def test_user_rates(self): - from sentry import tsdb + def test_user_rates_or_counts(self, group: Group) -> bool: + """ + Test if the number of unique users or rate of users seen by the group is below the snooze threshold. + Returns: True if below threshold, False otherwise. + + - Non-cached version of the function queries Snuba for the real count every time. + - Cached version uses Redis counters to store the number of events seen since last check, + if it's less than the number of users needed to reach the threshold, we can be sure + that we couldn't have reach enough users to reach the threshold, so there's no need + to query Snuba. This functionality relies on the fact that this is called in + post-processing for every event, so we can assume that the call-count == event count. + """ + if features.has( + "organizations:groupsnooze-cached-counts", organization=self.group.project.organization + ): + if self.user_window: + if not self.test_user_rates_w_cache(): + return False + elif not self.test_user_counts_w_cache(group): + return False + return True + else: + if self.user_window: + if not self.test_user_rates_no_cache(): + return False + elif not self.test_user_counts_no_cache(group): + return False + return True - metrics.incr("groupsnooze.test_user_rates") + def test_user_rates_no_cache(self) -> bool: + metrics.incr("groupsnooze.test_user_rates", tags={"cached": "false"}) + metrics.incr("groupsnooze.test_user_rates.snuba_call") end = timezone.now() start = end - timedelta(minutes=self.user_window) @@ -142,6 +166,89 @@ def test_user_rates(self): return True + def test_user_rates_w_cache(self) -> bool: + cache_key = f"groupsnooze:v1:{self.id}:test_user_rate:events_seen_counter" + + cache_ttl = self.user_window * 60 # Redis TTL in seconds (window is in minutes) + + value: int | float = float("inf") # using +inf as a sentinel value + + try: + value = cache.incr(cache_key) + cache.touch(cache_key, cache_ttl) + except ValueError: + # key doesn't exist, fall back on sentinel value + pass + + if value < self.user_count: + # if number of hits within the window is less than the threshold, we can't have reached enough users + metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "true"}) + return True + + metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "false"}) + metrics.incr("groupsnooze.test_user_rates.snuba_call") + end = timezone.now() + start = end - timedelta(minutes=self.user_window) + + rate = tsdb.backend.get_distinct_counts_totals( + model=get_issue_tsdb_user_group_model(self.group.issue_category), + keys=[self.group_id], + start=start, + end=end, + tenant_ids={"organization_id": self.group.project.organization_id}, + referrer_suffix="user_count_snoozes", + )[self.group_id] + + # TTL is further into the future than it needs to be, but we'd rather over-estimate + # and call Snuba more often than under-estimate and not trigger + cache.set(cache_key, rate, cache_ttl) + + if rate >= self.user_count: + return False + + return True + + def test_user_counts_no_cache(self, group: Group) -> bool: + metrics.incr("groupsnooze.test_user_counts", tags={"cached": "false"}) + metrics.incr("groupsnooze.test_user_counts.snuba_call") + + threshold = self.user_count + self.state["users_seen"] + real_count = group.count_users_seen( + referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value + ) + return real_count < threshold + + def test_user_counts_w_cache(self, group: Group) -> bool: + cache_key = f"groupsnooze:v1:{self.id}:test_user_counts:events_seen_counter" + try: + users_seen = self.state["users_seen"] + except (KeyError, TypeError): + users_seen = 0 + + threshold = self.user_count + users_seen + + CACHE_TTL = 300 # Redis TTL in seconds + + value: int | float = float("inf") # using +inf as a sentinel value + try: + value = cache.incr(cache_key) + except ValueError: + # key doesn't exist, fall back on sentinel value + pass + + if value < threshold: + # if we've seen less than that many events, we can't possibly have seen enough users + metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "true"}) + return True + + metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "false"}) + metrics.incr("groupsnooze.test_user_counts.snuba_call") + real_count = group.count_users_seen( + referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value + ) + cache.set(cache_key, real_count, CACHE_TTL) + return real_count < threshold + post_save.connect( lambda instance, **kwargs: cache.set( diff --git a/tests/sentry/models/test_groupsnooze.py b/tests/sentry/models/test_groupsnooze.py index e0b7ff02346dbe..c37bc73ce92b71 100644 --- a/tests/sentry/models/test_groupsnooze.py +++ b/tests/sentry/models/test_groupsnooze.py @@ -1,13 +1,16 @@ import itertools from datetime import timedelta +from unittest import mock import pytest from django.utils import timezone +import sentry.models.groupsnooze from sentry.models.group import Group from sentry.models.groupsnooze import GroupSnooze from sentry.testutils.cases import PerformanceIssueTestCase, SnubaTestCase, TestCase from sentry.testutils.helpers.datetime import before_now, freeze_time, iso_format +from sentry.testutils.helpers.features import apply_feature_flag_on_cls from sentry.testutils.performance_issues.store_transaction import PerfIssueTransactionTestMixin from sentry.utils.samples import load_data from tests.sentry.issues.test_utils import SearchIssueTestMixin @@ -186,3 +189,183 @@ def test_rate_reached_generic_issue(self): assert generic_group is not None snooze = GroupSnooze.objects.create(group=generic_group, count=10, window=24 * 60) assert not snooze.is_valid(test_rates=True) + + +@apply_feature_flag_on_cls("organizations:groupsnooze-cached-counts") +class GroupSnoozeWCacheTest(GroupSnoozeTest): + """ + Test the cached version of the snooze. + Runs all the test defined in GroupSnoozeTest with the cached version of the snooze. + Plus the tests defined below. + """ + + def test_test_user_rates_w_cache(self): + snooze = GroupSnooze.objects.create(group=self.group, user_count=100, user_window=60) + + cache_key = f"groupsnooze:v1:{snooze.id}:test_user_rate:events_seen_counter" + + with ( + mock.patch( + "sentry.models.groupsnooze.tsdb.backend.get_distinct_counts_totals" + ) as mocked_get_distinct_counts_totals, + mock.patch.object( + sentry.models.groupsnooze, "cache", wraps=sentry.models.groupsnooze.cache # type: ignore[attr-defined] + ) as cache_spy, + ): + mocked_get_distinct_counts_totals.side_effect = [ + {snooze.group_id: c} for c in [95, 98, 100] + ] + + cache_spy.set = mock.Mock(side_effect=cache_spy.set) + cache_spy.incr = mock.Mock(side_effect=cache_spy.incr) + + assert snooze.is_valid(test_rates=True) + assert mocked_get_distinct_counts_totals.call_count == 1 + assert cache_spy.set.called_with(cache_key, 95, 3600) + + assert snooze.is_valid(test_rates=True) + assert mocked_get_distinct_counts_totals.call_count == 1 + assert cache_spy.incr.called_with(cache_key) + assert cache_spy.get(cache_key) == 96 + + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 97 + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 98 + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 99 + + # cache counter reaches 100, but gets 98 from get_distinct_counts_totals + + assert snooze.is_valid(test_rates=True) + assert mocked_get_distinct_counts_totals.call_count == 2 + assert cache_spy.set.called_with(cache_key, 98, 3600) + assert cache_spy.get(cache_key) == 98 + + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 99 + # with this call counter reaches 100, gets 100 from get_distinct_counts_totals, so is_valid returns False + assert not snooze.is_valid(test_rates=True) + assert mocked_get_distinct_counts_totals.call_count == 3 + + def test_test_user_rates_w_cache_expired(self): + snooze = GroupSnooze.objects.create(group=self.group, user_count=100, user_window=60) + + cache_key = f"groupsnooze:v1:{snooze.id}:test_user_rate:events_seen_counter" + + with ( + mock.patch( + "sentry.models.groupsnooze.tsdb.backend.get_distinct_counts_totals" + ) as mocked_get_distinct_counts_totals, + mock.patch.object( + sentry.models.groupsnooze, "cache", wraps=sentry.models.groupsnooze.cache # type: ignore[attr-defined] + ) as cache_spy, + ): + mocked_get_distinct_counts_totals.side_effect = [ + {snooze.group_id: c} for c in [98, 99, 100] + ] + + cache_spy.set = mock.Mock(side_effect=cache_spy.set) + cache_spy.incr = mock.Mock(side_effect=cache_spy.incr) + + assert snooze.is_valid(test_rates=True) + assert mocked_get_distinct_counts_totals.call_count == 1 + assert cache_spy.set.called_with(cache_key, 98, 3600) + + # simulate cache expiration + cache_spy.delete(cache_key) + + assert snooze.is_valid(test_rates=True) + assert mocked_get_distinct_counts_totals.call_count == 2 + assert cache_spy.set.called_with(cache_key, 99, 3600) + + # simulate cache expiration + cache_spy.delete(cache_key) + + assert not snooze.is_valid(test_rates=True) + assert mocked_get_distinct_counts_totals.call_count == 3 + assert cache_spy.set.called_with(cache_key, 100, 3600) + + def test_test_user_count_w_cache(self): + snooze = GroupSnooze.objects.create(group=self.group, user_count=100) + + cache_key = f"groupsnooze:v1:{snooze.id}:test_user_counts:events_seen_counter" + + with ( + mock.patch.object( + snooze.group, + "count_users_seen", + side_effect=[95, 98, 100], + ) as mocked_count_users_seen, + mock.patch.object( + sentry.models.groupsnooze, "cache", wraps=sentry.models.groupsnooze.cache # type: ignore[attr-defined] + ) as cache_spy, + ): + + cache_spy.set = mock.Mock(side_effect=cache_spy.set) + cache_spy.incr = mock.Mock(side_effect=cache_spy.incr) + + assert snooze.is_valid(test_rates=True) + assert mocked_count_users_seen.call_count == 1 + assert cache_spy.set.called_with(cache_key, 95, 300) + + assert snooze.is_valid(test_rates=True) + assert mocked_count_users_seen.call_count == 1 + assert cache_spy.incr.called_with(cache_key) + assert cache_spy.get(cache_key) == 96 + + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 97 + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 98 + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 99 + + # cache counter reaches 100, but gets 98 from count_users_seen + + assert snooze.is_valid(test_rates=True) + assert mocked_count_users_seen.call_count == 2 + assert cache_spy.set.called_with(cache_key, 98, 300) + assert cache_spy.get(cache_key) == 98 + + assert snooze.is_valid(test_rates=True) + assert cache_spy.get(cache_key) == 99 + # with this call counter reaches 100, gets 100 from count_users_seen, so is_valid returns False + assert not snooze.is_valid(test_rates=True) + assert mocked_count_users_seen.call_count == 3 + + def test_test_user_count_w_cache_expired(self): + snooze = GroupSnooze.objects.create(group=self.group, user_count=100) + + cache_key = f"groupsnooze:v1:{snooze.id}:test_user_counts:events_seen_counter" + + with ( + mock.patch.object( + snooze.group, + "count_users_seen", + side_effect=[95, 98, 100], + ) as mocked_count_users_seen, + mock.patch.object( + sentry.models.groupsnooze, "cache", wraps=sentry.models.groupsnooze.cache # type: ignore[attr-defined] + ) as cache_spy, + ): + cache_spy.set = mock.Mock(side_effect=cache_spy.set) + cache_spy.incr = mock.Mock(side_effect=cache_spy.incr) + + assert snooze.is_valid(test_rates=True) + assert mocked_count_users_seen.call_count == 1 + assert cache_spy.set.called_with(cache_key, 98, 300) + + # simulate cache expiration + cache_spy.delete(cache_key) + + assert snooze.is_valid(test_rates=True) + assert mocked_count_users_seen.call_count == 2 + assert cache_spy.set.called_with(cache_key, 99, 300) + + # simulate cache expiration + cache_spy.delete(cache_key) + + assert not snooze.is_valid(test_rates=True) + assert mocked_count_users_seen.call_count == 3 + assert cache_spy.set.called_with(cache_key, 100, 300)