Skip to content

perf(issues): getting users seen counts from cache rather than directly from Snuba #69556

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/sentry/conf/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1582,6 +1582,8 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]:
"organizations:grouping-title-ui": False,
# Enable experimental new version of Merged Issues where sub-hashes are shown
"organizations:grouping-tree-ui": False,
# Enable caching group counts in GroupSnooze
"organizations:groupsnooze-cached-counts": False,
# Allows an org to have a larger set of project ownership rules per project
"organizations:higher-ownership-limit": False,
# Enable incidents feature
Expand Down
1 change: 1 addition & 0 deletions src/sentry/features/temporary.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def register_temporary_features(manager: FeatureManager):
manager.add("organizations:grouping-suppress-unnecessary-secondary-hash", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
manager.add("organizations:grouping-title-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
manager.add("organizations:grouping-tree-ui", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
manager.add("organizations:groupsnooze-cached-counts", OrganizationFeature, FeatureHandlerStrategy.OPTIONS)
manager.add("organizations:higher-ownership-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
manager.add("organizations:increased-issue-owners-rate-limit", OrganizationFeature, FeatureHandlerStrategy.INTERNAL)
manager.add("organizations:integrations-custom-alert-priorities", OrganizationFeature, FeatureHandlerStrategy.REMOTE)
Expand Down
147 changes: 127 additions & 20 deletions src/sentry/models/groupsnooze.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from __future__ import annotations

from datetime import timedelta
from typing import ClassVar, Self
from typing import TYPE_CHECKING, ClassVar, Self

from django.db import models
from django.db.models.signals import post_delete, post_save
from django.utils import timezone

from sentry import features, tsdb
from sentry.backup.scopes import RelocationScope
from sentry.db.models import (
BaseManager,
Expand All @@ -22,6 +23,9 @@
from sentry.utils import metrics
from sentry.utils.cache import cache

if TYPE_CHECKING:
from sentry.models.group import Group


@region_silo_only_model
class GroupSnooze(Model):
Expand Down Expand Up @@ -61,10 +65,12 @@ class Meta:
__repr__ = sane_repr("group_id")

@classmethod
def get_cache_key(cls, group_id):
return "groupsnooze_group_id:1:%s" % (group_id)
def get_cache_key(cls, group_id: int) -> str:
return f"groupsnooze_group_id:1:{group_id}"

def is_valid(self, group=None, test_rates=False, use_pending_data=False):
def is_valid(
self, group: Group | None = None, test_rates: bool = False, use_pending_data: bool = False
) -> bool:
if group is None:
group = self.group
elif group.id != self.group_id:
Expand All @@ -85,22 +91,12 @@ def is_valid(self, group=None, test_rates=False, use_pending_data=False):
return False

if self.user_count and test_rates:
if self.user_window:
if not self.test_user_rates():
return False
elif (
self.user_count
<= group.count_users_seen(
referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
)
- self.state["users_seen"]
):
if not self.test_user_rates_or_counts(group):
return False
return True

def test_frequency_rates(self):
from sentry import tsdb
return True

def test_frequency_rates(self) -> bool:
metrics.incr("groupsnooze.test_frequency_rates")

end = timezone.now()
Expand All @@ -120,10 +116,38 @@ def test_frequency_rates(self):

return True

def test_user_rates(self):
from sentry import tsdb
def test_user_rates_or_counts(self, group: Group) -> bool:
"""
Test if the number of unique users or rate of users seen by the group is below the snooze threshold.
Returns: True if below threshold, False otherwise.

- Non-cached version of the function queries Snuba for the real count every time.
- Cached version uses Redis counters to store the number of events seen since last check,
if it's less than the number of users needed to reach the threshold, we can be sure
that we couldn't have reach enough users to reach the threshold, so there's no need
to query Snuba. This functionality relies on the fact that this is called in
post-processing for every event, so we can assume that the call-count == event count.
"""
if features.has(
"organizations:groupsnooze-cached-counts", organization=self.group.project.organization
):
if self.user_window:
if not self.test_user_rates_w_cache():
return False
elif not self.test_user_counts_w_cache(group):
return False
return True
else:
if self.user_window:
if not self.test_user_rates_no_cache():
return False
elif not self.test_user_counts_no_cache(group):
return False
return True

metrics.incr("groupsnooze.test_user_rates")
def test_user_rates_no_cache(self) -> bool:
metrics.incr("groupsnooze.test_user_rates", tags={"cached": "false"})
metrics.incr("groupsnooze.test_user_rates.snuba_call")

end = timezone.now()
start = end - timedelta(minutes=self.user_window)
Expand All @@ -142,6 +166,89 @@ def test_user_rates(self):

return True

def test_user_rates_w_cache(self) -> bool:
cache_key = f"groupsnooze:v1:{self.id}:test_user_rate:events_seen_counter"

cache_ttl = self.user_window * 60 # Redis TTL in seconds (window is in minutes)

value: int | float = float("inf") # using +inf as a sentinel value

try:
value = cache.incr(cache_key)
cache.touch(cache_key, cache_ttl)
except ValueError:
# key doesn't exist, fall back on sentinel value
pass

if value < self.user_count:
# if number of hits within the window is less than the threshold, we can't have reached enough users
metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "true"})
return True

metrics.incr("groupsnooze.test_user_rates", tags={"cached": "true", "hit": "false"})
metrics.incr("groupsnooze.test_user_rates.snuba_call")
end = timezone.now()
start = end - timedelta(minutes=self.user_window)

rate = tsdb.backend.get_distinct_counts_totals(
model=get_issue_tsdb_user_group_model(self.group.issue_category),
keys=[self.group_id],
start=start,
end=end,
tenant_ids={"organization_id": self.group.project.organization_id},
referrer_suffix="user_count_snoozes",
)[self.group_id]

# TTL is further into the future than it needs to be, but we'd rather over-estimate
# and call Snuba more often than under-estimate and not trigger
cache.set(cache_key, rate, cache_ttl)

if rate >= self.user_count:
return False

return True

def test_user_counts_no_cache(self, group: Group) -> bool:
metrics.incr("groupsnooze.test_user_counts", tags={"cached": "false"})
metrics.incr("groupsnooze.test_user_counts.snuba_call")

threshold = self.user_count + self.state["users_seen"]
real_count = group.count_users_seen(
referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
)
return real_count < threshold

def test_user_counts_w_cache(self, group: Group) -> bool:
cache_key = f"groupsnooze:v1:{self.id}:test_user_counts:events_seen_counter"
try:
users_seen = self.state["users_seen"]
except (KeyError, TypeError):
users_seen = 0

threshold = self.user_count + users_seen

CACHE_TTL = 300 # Redis TTL in seconds

value: int | float = float("inf") # using +inf as a sentinel value
try:
value = cache.incr(cache_key)
except ValueError:
# key doesn't exist, fall back on sentinel value
pass

if value < threshold:
# if we've seen less than that many events, we can't possibly have seen enough users
metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "true"})
return True

metrics.incr("groupsnooze.test_user_counts", tags={"cached": "true", "hit": "false"})
metrics.incr("groupsnooze.test_user_counts.snuba_call")
real_count = group.count_users_seen(
referrer=Referrer.TAGSTORE_GET_GROUPS_USER_COUNTS_GROUP_SNOOZE.value
)
cache.set(cache_key, real_count, CACHE_TTL)
return real_count < threshold


post_save.connect(
lambda instance, **kwargs: cache.set(
Expand Down
Loading
Loading