perf(issues): getting users seen counts from cache rather than directly from Snuba (#69556)

Bartek Ogryczak · web-flow · commit f9f9d4c89522 · 2024-04-25T23:22:06.000Z
This most basic is an optimization to not go to Snuba for _every_ new
event in the issue.
While the number of events seen since the last time we checked unique
users count is less than then number of users needed to reach the
threshold, we can be sure that the threshold was _not_ reached.

Caveat: when the cached count is just one away from triggering, we'll
still call Snuba on every new event in issue until new user is seen.
diff --git a/src/sentry/conf/server.py b/src/sentry/conf/server.py
@@ -1582,6 +1582,8 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]:
     "organizations:grouping-title-ui": False,
     # Enable experimental new version of Merged Issues where sub-hashes are shown
     "organizations:grouping-tree-ui": False,
+    # Enable caching group counts in GroupSnooze
+    "organizations:groupsnooze-cached-counts": False,
     # Allows an org to have a larger set of project ownership rules per project
     "organizations:higher-ownership-limit": False,
     # Enable incidents feature
diff --git a/src/sentry/features/temporary.py b/src/sentry/features/temporary.py
@@ -80,6 +80,7 @@ def register_temporary_features(manager: FeatureManager):
     manager.add("organizations:grouping-suppress-unnecessary-secondary-hash", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
     manager.add("organizations:grouping-title-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
     manager.add("organizations:grouping-tree-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
+    manager.add("organizations:groupsnooze-cached-counts", OrganizationFeature, FeatureHandlerStrategy.OPTIONS)
     manager.add("organizations:higher-ownership-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
     manager.add("organizations:increased-issue-owners-rate-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
     manager.add("organizations:integrations-custom-alert-priorities", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
diff --git a/src/sentry/models/groupsnooze.py b/src/sentry/models/groupsnooze.py
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
 from datetime import timedelta
-from typing import ClassVar, Self
+from typing import TYPE_CHECKING, ClassVar, Self
 
 from django.db import models
 from django.db.models.signals import post_delete, post_save
 from django.utils import timezone
 
+from sentry import features, tsdb
 from sentry.backup.scopes import RelocationScope
 from sentry.db.models import (
     BaseManager,
@@ -22,6 +23,9 @@
 from sentry.utils import metrics
 from sentry.utils.cache import cache
 
+if TYPE_CHECKING:
+    from sentry.models.group import Group
+
 
 @region_silo_only_model
 class GroupSnooze(Model):
@@ -61,10 +65,12 @@ class Meta:
     __repr__ = sane_repr("group_id")
 
     @classmethod
-    def get_cache_key(cls, group_id):
-        return "groupsnooze_group_id:1:%s" % (group_id)
+    def get_cache_key(cls, group_id: int) -> str:
+        return f"groupsnooze_group_id:1:{group_id}"
 
-    def is_valid(self, group=None, test_rates=False, use_pending_data=False):
+    def is_valid(
+        self, group: Group | None = None, test_rates: bool = False, use_pending_data: bool = False
+    ) -> bool:
         if group is None:
             group = self.group
         elif group.id != self.group_id:
@@ -85,22 +91,12 @@ def is_valid(self, group=None, test_rates=False, use_pending_data=False):
                     return False
 
         if self.user_count and test_rates:
-            if self.user_window:
-                if not self.test_user_rates():
-                    return False
-            elif (
-                self.user_count
-                <= group.count_users_seen(
-                    referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
-                )
-                - self.state["users_seen"]
-            ):
+            if not self.test_user_rates_or_counts(group):
                 return False
-        return True
 
-    def test_frequency_rates(self):
-        from sentry import tsdb
+        return True
 
+    def test_frequency_rates(self) -> bool:
         metrics.incr("groupsnooze.test_frequency_rates")
 
         end = timezone.now()
@@ -120,10 +116,38 @@ def test_frequency_rates(self):
 
         return True
 
-    def test_user_rates(self):
-        from sentry import tsdb
+    def test_user_rates_or_counts(self, group: Group) -> bool:
+        """
+        Test if the number of unique users or rate of users seen by the group is below the snooze threshold.
+        Returns: True if below threshold, False otherwise.
+
+        - Non-cached version of the function queries Snuba for the real count every time.
+        - Cached version uses Redis counters to store the number of events seen since last check,
+          if it's less than the number of users needed to reach the threshold, we can be sure
+          that we couldn't have reach enough users to reach the threshold, so there's no need
+          to query Snuba. This functionality relies on the fact that this is called in
+          post-processing for every event, so we can assume that the call-count == event count.
+        """
+        if features.has(
+            "organizations:groupsnooze-cached-counts", organization=self.group.project.organization
+        ):
+            if self.user_window:
+                if not self.test_user_rates_w_cache():
+                    return False
+            elif not self.test_user_counts_w_cache(group):
+                return False
+            return True
+        else:
+            if self.user_window:
+                if not self.test_user_rates_no_cache():
+                    return False
+            elif not self.test_user_counts_no_cache(group):
+                return False
+            return True
 
-        metrics.incr("groupsnooze.test_user_rates")
+    def test_user_rates_no_cache(self) -> bool:
+        metrics.incr("groupsnooze.test_user_rates", tags={"cached": "false"})
+        metrics.incr("groupsnooze.test_user_rates.snuba_call")
 
         end = timezone.now()
         start = end - timedelta(minutes=self.user_window)
@@ -142,6 +166,89 @@ def test_user_rates(self):
 
         return True
 
+    def test_user_rates_w_cache(self) -> bool:
+        cache_key = f"groupsnooze:v1:{self.id}:test_user_rate:events_seen_counter"
+
+        cache_ttl = self.user_window * 60  # Redis TTL in seconds (window is in minutes)
+
+        value: int | float = float("inf")  # using +inf as a sentinel value
+
+        try:
+            value = cache.incr(cache_key)
+            cache.touch(cache_key, cache_ttl)
+        except ValueError:
+            # key doesn't exist, fall back on sentinel value
+            pass
+
+        if value < self.user_count:
+            # if number of hits within the window is less than the threshold, we can't have reached enough users
+            metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "true"})
+            return True
+
+        metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "false"})
+        metrics.incr("groupsnooze.test_user_rates.snuba_call")
+        end = timezone.now()
+        start = end - timedelta(minutes=self.user_window)
+
+        rate = tsdb.backend.get_distinct_counts_totals(
+            model=get_issue_tsdb_user_group_model(self.group.issue_category),
+            keys=[self.group_id],
+            start=start,
+            end=end,
+            tenant_ids={"organization_id": self.group.project.organization_id},
+            referrer_suffix="user_count_snoozes",
+        )[self.group_id]
+
+        # TTL is further into the future than it needs to be, but we'd rather over-estimate
+        # and call Snuba more often than under-estimate and not trigger
+        cache.set(cache_key, rate, cache_ttl)
+
+        if rate >= self.user_count:
+            return False
+
+        return True
+
+    def test_user_counts_no_cache(self, group: Group) -> bool:
+        metrics.incr("groupsnooze.test_user_counts", tags={"cached": "false"})
+        metrics.incr("groupsnooze.test_user_counts.snuba_call")
+
+        threshold = self.user_count + self.state["users_seen"]
+        real_count = group.count_users_seen(
+            referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
+        )
+        return real_count < threshold
+
+    def test_user_counts_w_cache(self, group: Group) -> bool:
+        cache_key = f"groupsnooze:v1:{self.id}:test_user_counts:events_seen_counter"
+        try:
+            users_seen = self.state["users_seen"]
+        except (KeyError, TypeError):
+            users_seen = 0
+
+        threshold = self.user_count + users_seen
+
+        CACHE_TTL = 300  # Redis TTL in seconds
+
+        value: int | float = float("inf")  # using +inf as a sentinel value
+        try:
+            value = cache.incr(cache_key)
+        except ValueError:
+            # key doesn't exist, fall back on sentinel value
+            pass
+
+        if value < threshold:
+            # if we've seen less than that many events, we can't possibly have seen enough users
+            metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "true"})
+            return True
+
+        metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "false"})
+        metrics.incr("groupsnooze.test_user_counts.snuba_call")
+        real_count = group.count_users_seen(
+            referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
+        )
+        cache.set(cache_key, real_count, CACHE_TTL)
+        return real_count < threshold
+
 
 post_save.connect(
     lambda instance, **kwargs: cache.set(
diff --git a/tests/sentry/models/test_groupsnooze.py b/tests/sentry/models/test_groupsnooze.py