getsentry · vartec · Apr 25, 2024 · Apr 23, 2024 · Apr 23, 2024 · Apr 23, 2024
diff --git a/src/sentry/conf/server.py b/src/sentry/conf/server.py
@@ -1582,6 +1582,8 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]:
     "organizations:grouping-title-ui": False,
     # Enable experimental new version of Merged Issues where sub-hashes are shown
     "organizations:grouping-tree-ui": False,
+    # Enable caching group counts in GroupSnooze
+    "organizations:groupsnooze-cached-counts": False,
     # Allows an org to have a larger set of project ownership rules per project
     "organizations:higher-ownership-limit": False,
     # Enable incidents feature

diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py
@@ -80,6 +80,7 @@ def register_temporary_features(manager: FeatureManager):
     manager.add("organizations:grouping-suppress-unnecessary-secondary-hash", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
     manager.add("organizations:grouping-title-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
     manager.add("organizations:grouping-tree-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
+    manager.add("organizations:groupsnooze-cached-counts", OrganizationFeature, FeatureHandlerStrategy.OPTIONS)
     manager.add("organizations:higher-ownership-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
     manager.add("organizations:increased-issue-owners-rate-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
     manager.add("organizations:integrations-custom-alert-priorities", OrganizationFeature, FeatureHandlerStrategy.REMOTE)

diff --git a/src/sentry/models/groupsnooze.py b/src/sentry/models/groupsnooze.py
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
 from datetime import timedelta
-from typing import ClassVar, Self
+from typing import TYPE_CHECKING, ClassVar, Self
 
 from django.db import models
 from django.db.models.signals import post_delete, post_save
 from django.utils import timezone
 
+from sentry import features, tsdb
 from sentry.backup.scopes import RelocationScope
 from sentry.db.models import (
     BaseManager,
@@ -22,6 +23,9 @@
 from sentry.utils import metrics
 from sentry.utils.cache import cache
 
+if TYPE_CHECKING:
+    from sentry.models.group import Group
+
 
 @region_silo_only_model
 class GroupSnooze(Model):
@@ -61,10 +65,12 @@ class Meta:
     __repr__ = sane_repr("group_id")
 
     @classmethod
-    def get_cache_key(cls, group_id):
-        return "groupsnooze_group_id:1:%s" % (group_id)
+    def get_cache_key(cls, group_id: int) -> str:
+        return f"groupsnooze_group_id:1:{group_id}"
 
-    def is_valid(self, group=None, test_rates=False, use_pending_data=False):
+    def is_valid(
+        self, group: Group | None = None, test_rates: bool = False, use_pending_data: bool = False
+    ) -> bool:
         if group is None:
             group = self.group
         elif group.id != self.group_id:
@@ -85,22 +91,12 @@ def is_valid(self, group=None, test_rates=False, use_pending_data=False):
                     return False
 
         if self.user_count and test_rates:
-            if self.user_window:
-                if not self.test_user_rates():
-                    return False
-            elif (
-                self.user_count
-                <= group.count_users_seen(
-                    referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
-                )
-                - self.state["users_seen"]
-            ):
+            if not self.test_user_rates_or_counts(group):
                 return False
-        return True
 
-    def test_frequency_rates(self):
-        from sentry import tsdb
+        return True
 
+    def test_frequency_rates(self) -> bool:
         metrics.incr("groupsnooze.test_frequency_rates")
 
         end = timezone.now()
@@ -120,10 +116,38 @@ def test_frequency_rates(self):
 
         return True
 
-    def test_user_rates(self):
-        from sentry import tsdb
+    def test_user_rates_or_counts(self, group: Group) -> bool:
+        """
+        Test if the number of unique users or rate of users seen by the group is below the snooze threshold.
+        Returns: True if below threshold, False otherwise.
+
+        - Non-cached version of the function queries Snuba for the real count every time.
+        - Cached version uses Redis counters to store the number of events seen since last check,
+          if it's less than the number of users needed to reach the threshold, we can be sure
+          that we couldn't have reach enough users to reach the threshold, so there's no need
+          to query Snuba. This functionality relies on the fact that this is called in
+          post-processing for every event, so we can assume that the call-count == event count.
+        """
+        if features.has(
+            "organizations:groupsnooze-cached-counts", organization=self.group.project.organization
+        ):
+            if self.user_window:
+                if not self.test_user_rates_w_cache():
+                    return False
+            elif not self.test_user_counts_w_cache(group):
+                return False
+            return True
+        else:
+            if self.user_window:
+                if not self.test_user_rates_no_cache():
+                    return False
+            elif not self.test_user_counts_no_cache(group):
+                return False
+            return True
 
-        metrics.incr("groupsnooze.test_user_rates")
+    def test_user_rates_no_cache(self) -> bool:
+        metrics.incr("groupsnooze.test_user_rates", tags={"cached": "false"})
+        metrics.incr("groupsnooze.test_user_rates.snuba_call")
 
         end = timezone.now()
         start = end - timedelta(minutes=self.user_window)
@@ -142,6 +166,89 @@ def test_user_rates(self):
 
         return True
 
+    def test_user_rates_w_cache(self) -> bool:
+        cache_key = f"groupsnooze:v1:{self.id}:test_user_rate:events_seen_counter"
+
+        cache_ttl = self.user_window * 60  # Redis TTL in seconds (window is in minutes)
+
+        value: int | float = float("inf")  # using +inf as a sentinel value
+
+        try:
+            value = cache.incr(cache_key)
+            cache.touch(cache_key, cache_ttl)
+        except ValueError:
+            # key doesn't exist, fall back on sentinel value
+            pass
+
+        if value < self.user_count:
+            # if number of hits within the window is less than the threshold, we can't have reached enough users
+            metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "true"})
+            return True
+
+        metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "false"})
+        metrics.incr("groupsnooze.test_user_rates.snuba_call")
+        end = timezone.now()
+        start = end - timedelta(minutes=self.user_window)
+
+        rate = tsdb.backend.get_distinct_counts_totals(
+            model=get_issue_tsdb_user_group_model(self.group.issue_category),
+            keys=[self.group_id],
+            start=start,
+            end=end,
+            tenant_ids={"organization_id": self.group.project.organization_id},
+            referrer_suffix="user_count_snoozes",
+        )[self.group_id]
+
+        # TTL is further into the future than it needs to be, but we'd rather over-estimate
+        # and call Snuba more often than under-estimate and not trigger
+        cache.set(cache_key, rate, cache_ttl)
+
+        if rate >= self.user_count:
+            return False
+
+        return True
+
+    def test_user_counts_no_cache(self, group: Group) -> bool:
+        metrics.incr("groupsnooze.test_user_counts", tags={"cached": "false"})
+        metrics.incr("groupsnooze.test_user_counts.snuba_call")
+
+        threshold = self.user_count + self.state["users_seen"]
+        real_count = group.count_users_seen(
+            referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
+        )
+        return real_count < threshold
+
+    def test_user_counts_w_cache(self, group: Group) -> bool:
+        cache_key = f"groupsnooze:v1:{self.id}:test_user_counts:events_seen_counter"
+        try:
+            users_seen = self.state["users_seen"]
+        except (KeyError, TypeError):
+            users_seen = 0
+
+        threshold = self.user_count + users_seen
+
+        CACHE_TTL = 300  # Redis TTL in seconds
+
+        value: int | float = float("inf")  # using +inf as a sentinel value
+        try:
+            value = cache.incr(cache_key)
+        except ValueError:
+            # key doesn't exist, fall back on sentinel value
+            pass
+
+        if value < threshold:
+            # if we've seen less than that many events, we can't possibly have seen enough users
+            metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "true"})
+            return True
+
+        metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "false"})
+        metrics.incr("groupsnooze.test_user_counts.snuba_call")
+        real_count = group.count_users_seen(
+            referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
+        )
+        cache.set(cache_key, real_count, CACHE_TTL)
+        return real_count < threshold
+
 
 post_save.connect(
     lambda instance, **kwargs: cache.set(