Skip to content

Commit bbca58a

Browse files
committed
ref(crons): Normalize crons incident issues
Prior to incidents, we created issues for each type of failure (error, timeout, missed). This is because only one failed check-in was needed to create an issue. With incidents you can configure how many failures are needed, meaning there could be 2 missed, 1 timeout, and 1 error. This removes the various issue occurrence types and replaces them with a single MonitorIncidentType.
1 parent 083a28a commit bbca58a

File tree

10 files changed

+39
-322
lines changed

10 files changed

+39
-322
lines changed

src/sentry/issues/grouptype.py

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -247,10 +247,6 @@ class PerformanceGroupTypeDefaults:
247247
noise_config = NoiseConfig()
248248

249249

250-
class CronGroupTypeDefaults:
251-
notification_config = NotificationConfig(context=[])
252-
253-
254250
class ReplayGroupTypeDefaults:
255251
notification_config = NotificationConfig(context=[])
256252

@@ -518,36 +514,27 @@ class ProfileFunctionRegressionType(GroupType):
518514

519515

520516
@dataclass(frozen=True)
521-
class MonitorCheckInFailure(CronGroupTypeDefaults, GroupType):
517+
class MonitorIncidentType(GroupType):
522518
type_id = 4001
523519
slug = "monitor_check_in_failure"
524-
description = "Monitor Check In Failed"
520+
description = "Crons Monitor Failed"
525521
category = GroupCategory.CRON.value
526522
released = True
527523
creation_quota = Quota(3600, 60, 60_000) # 60,000 per hour, sliding window of 60 seconds
528524
default_priority = PriorityLevel.HIGH
525+
notification_config = NotificationConfig(context=[])
529526

530527

531528
@dataclass(frozen=True)
532-
class MonitorCheckInTimeout(CronGroupTypeDefaults, GroupType):
529+
class MonitorCheckInTimeoutDeprecated(MonitorIncidentType, GroupType):
530+
# This is deprecated, only kept around for its type_id
533531
type_id = 4002
534-
slug = "monitor_check_in_timeout"
535-
description = "Monitor Check In Timeout"
536-
category = GroupCategory.CRON.value
537-
released = True
538-
creation_quota = Quota(3600, 60, 60_000) # 60,000 per hour, sliding window of 60 seconds
539-
default_priority = PriorityLevel.HIGH
540532

541533

542534
@dataclass(frozen=True)
543-
class MonitorCheckInMissed(CronGroupTypeDefaults, GroupType):
535+
class MonitorCheckInMissedDeprecated(MonitorIncidentType, GroupType):
536+
# This is deprecated, only kept around for its type_id
544537
type_id = 4003
545-
slug = "monitor_check_in_missed"
546-
description = "Monitor Check In Missed"
547-
category = GroupCategory.CRON.value
548-
released = True
549-
creation_quota = Quota(3600, 60, 60_000) # 60,000 per hour, sliding window of 60 seconds
550-
default_priority = PriorityLevel.HIGH
551538

552539

553540
@dataclass(frozen=True)

src/sentry/monitors/constants.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@
77
# current limit is 28 days
88
MAX_TIMEOUT = 40_320
99

10-
# Format to use in the issue subtitle for the missed check-in timestamp
11-
SUBTITLE_DATETIME_FORMAT = "%b %d, %I:%M %p %Z"
12-
1310
# maximum value for incident + recovery thresholds to be set
1411
# affects the performance of recent check-ins query
1512
# lowering this may invalidate monitors + block check-ins

src/sentry/monitors/logic/mark_failed.py

Lines changed: 7 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,8 @@
77
from django.db.models import Q
88

99
from sentry import features
10-
from sentry.issues.grouptype import (
11-
MonitorCheckInFailure,
12-
MonitorCheckInMissed,
13-
MonitorCheckInTimeout,
14-
)
10+
from sentry.issues.grouptype import MonitorIncidentType
1511
from sentry.models.organization import Organization
16-
from sentry.monitors.constants import SUBTITLE_DATETIME_FORMAT, TIMEOUT
1712
from sentry.monitors.models import (
1813
CheckInStatus,
1914
MonitorCheckIn,
@@ -243,10 +238,8 @@ def create_issue_platform_occurrence(
243238
monitor_env = failed_checkin.monitor_environment
244239
current_timestamp = datetime.now(timezone.utc)
245240

246-
occurrence_data = get_occurrence_data(failed_checkin)
247-
248241
# Get last successful check-in to show in evidence display
249-
last_successful_checkin_timestamp = "None"
242+
last_successful_checkin_timestamp = "Never"
250243
last_successful_checkin = monitor_env.get_last_successful_checkin()
251244
if last_successful_checkin:
252245
last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat()
@@ -257,11 +250,11 @@ def create_issue_platform_occurrence(
257250
project_id=monitor_env.monitor.project_id,
258251
event_id=uuid.uuid4().hex,
259252
fingerprint=[incident.grouphash],
260-
type=occurrence_data["group_type"],
253+
type=MonitorIncidentType,
261254
issue_title=f"Monitor failure: {monitor_env.monitor.name}",
262-
subtitle=occurrence_data["subtitle"],
255+
subtitle="Your monitor has reached its failure threshold.",
263256
evidence_display=[
264-
IssueEvidence(name="Failure reason", value=occurrence_data["reason"], important=True),
257+
IssueEvidence(name="Failure reason", value="incident", important=True),
265258
IssueEvidence(
266259
name="Environment", value=monitor_env.get_environment().name, important=False
267260
),
@@ -272,9 +265,9 @@ def create_issue_platform_occurrence(
272265
),
273266
],
274267
evidence_data={},
275-
culprit=occurrence_data["reason"],
268+
culprit="incident",
276269
detection_time=current_timestamp,
277-
level=occurrence_data["level"],
270+
level="error",
278271
assignee=monitor_env.monitor.owner_actor,
279272
)
280273

@@ -324,36 +317,3 @@ def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
324317
"status": monitor_environment.get_status_display(),
325318
"type": monitor_environment.monitor.get_type_display(),
326319
}
327-
328-
329-
def get_occurrence_data(checkin: MonitorCheckIn):
330-
if checkin.status == CheckInStatus.MISSED:
331-
expected_time = (
332-
checkin.expected_time.astimezone(checkin.monitor.timezone).strftime(
333-
SUBTITLE_DATETIME_FORMAT
334-
)
335-
if checkin.expected_time
336-
else "the expected time"
337-
)
338-
return {
339-
"group_type": MonitorCheckInMissed,
340-
"level": "warning",
341-
"reason": "missed_checkin",
342-
"subtitle": f"No check-in reported on {expected_time}.",
343-
}
344-
345-
if checkin.status == CheckInStatus.TIMEOUT:
346-
duration = (checkin.monitor.config or {}).get("max_runtime") or TIMEOUT
347-
return {
348-
"group_type": MonitorCheckInTimeout,
349-
"level": "error",
350-
"reason": "duration",
351-
"subtitle": f"Check-in exceeded maximum duration of {duration} minutes.",
352-
}
353-
354-
return {
355-
"group_type": MonitorCheckInFailure,
356-
"level": "error",
357-
"reason": "error",
358-
"subtitle": "An error occurred during the latest check-in.",
359-
}

tests/sentry/integrations/slack/notifications/test_issue_alert.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from sentry.digests.backends.redis import RedisBackend
1212
from sentry.digests.notifications import event_to_record
1313
from sentry.integrations.slack.message_builder.issues import get_tags
14-
from sentry.issues.grouptype import MonitorCheckInFailure
14+
from sentry.issues.grouptype import MonitorIncidentType
1515
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
1616
from sentry.models.identity import Identity, IdentityStatus
1717
from sentry.models.integrations.external_actor import ExternalActor
@@ -157,15 +157,15 @@ def test_crons_issue_alert_user_block(self):
157157
IssueEvidence("Evidence 2", "Value 2", False),
158158
IssueEvidence("Evidence 3", "Value 3", False),
159159
],
160-
MonitorCheckInFailure,
160+
MonitorIncidentType,
161161
datetime.now(UTC),
162162
"info",
163163
"/api/123",
164164
)
165165
occurrence.save()
166166
event.occurrence = occurrence
167167

168-
event.group.type = MonitorCheckInFailure.type_id
168+
event.group.type = MonitorIncidentType.type_id
169169
notification = AlertRuleNotification(
170170
Notification(event=event, rule=self.rule), ActionTargetType.MEMBER, self.user.id
171171
)

tests/sentry/integrations/slack/test_message_builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from sentry.issues.grouptype import (
2727
ErrorGroupType,
2828
FeedbackGroup,
29-
MonitorCheckInFailure,
29+
MonitorIncidentType,
3030
PerformanceP95EndpointRegressionGroupType,
3131
ProfileFileIOGroupType,
3232
)
@@ -1321,7 +1321,7 @@ def setUp(self):
13211321
type=PerformanceP95EndpointRegressionGroupType.type_id
13221322
)
13231323

1324-
self.cron_issue = self.create_group(type=MonitorCheckInFailure.type_id)
1324+
self.cron_issue = self.create_group(type=MonitorIncidentType.type_id)
13251325
self.feedback_issue = self.create_group(
13261326
type=FeedbackGroup.type_id, substatus=GroupSubStatus.NEW
13271327
)

tests/sentry/issues/test_ingest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
GroupCategory,
1515
GroupType,
1616
GroupTypeRegistry,
17-
MonitorCheckInFailure,
17+
MonitorIncidentType,
1818
NoiseConfig,
1919
)
2020
from sentry.issues.ingest import (
@@ -248,7 +248,7 @@ def test_existing_group_different_category(self) -> None:
248248

249249
new_event = self.store_event(data={}, project_id=self.project.id)
250250
new_occurrence = self.build_occurrence(
251-
fingerprint=["some-fingerprint"], type=MonitorCheckInFailure.type_id
251+
fingerprint=["some-fingerprint"], type=MonitorIncidentType.type_id
252252
)
253253
with mock.patch("sentry.issues.ingest.logger") as logger:
254254
assert save_issue_from_occurrence(new_occurrence, new_event, None) is None

tests/sentry/mail/test_adapter.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from sentry.api.serializers.models.userreport import UserReportWithGroupSerializer
1818
from sentry.digests.notifications import build_digest, event_to_record
1919
from sentry.event_manager import EventManager, get_event_type
20-
from sentry.issues.grouptype import MonitorCheckInFailure
20+
from sentry.issues.grouptype import MonitorIncidentType
2121
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
2222
from sentry.mail import build_subject_prefix, mail_adapter
2323
from sentry.models.activity import Activity
@@ -328,15 +328,15 @@ def test_simple_notification_generic(self):
328328
IssueEvidence("Evidence 2", "Value 2", False),
329329
IssueEvidence("Evidence 3", "Value 3", False),
330330
],
331-
MonitorCheckInFailure,
331+
MonitorIncidentType,
332332
timezone.now(),
333333
"info",
334334
"/api/123",
335335
)
336336
occurrence.save()
337337
event.occurrence = occurrence
338338

339-
event.group.type = MonitorCheckInFailure.type_id
339+
event.group.type = MonitorIncidentType.type_id
340340

341341
rule = Rule.objects.create(project=self.project, label="my rule")
342342
ProjectOwnership.objects.create(project_id=self.project.id, fallthrough=True)
@@ -384,15 +384,15 @@ def test_simple_notification_generic_no_evidence(self):
384384
"1234",
385385
{"Test": 123},
386386
[], # no evidence
387-
MonitorCheckInFailure,
387+
MonitorIncidentType,
388388
timezone.now(),
389389
"info",
390390
"/api/123",
391391
)
392392
occurrence.save()
393393
event.occurrence = occurrence
394394

395-
event.group.type = MonitorCheckInFailure.type_id
395+
event.group.type = MonitorIncidentType.type_id
396396

397397
rule = Rule.objects.create(project=self.project, label="my rule")
398398
ProjectOwnership.objects.create(project_id=self.project.id, fallthrough=True)

tests/sentry/migrations/test_0692_backfill_group_priority_again.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from sentry.issues.grouptype import (
77
ErrorGroupType,
88
FeedbackGroup,
9-
MonitorCheckInFailure,
9+
MonitorIncidentType,
1010
PerformanceConsecutiveHTTPQueriesGroupType,
1111
PerformanceP95EndpointRegressionGroupType,
1212
ReplayDeadClickType,
@@ -114,7 +114,7 @@ def _create_groups_to_backfill(self, project: Project) -> None:
114114
{
115115
"status": GroupStatus.UNRESOLVED,
116116
"substatus": GroupSubStatus.ESCALATING,
117-
"type": MonitorCheckInFailure.type_id,
117+
"type": MonitorIncidentType.type_id,
118118
},
119119
PriorityLevel.HIGH,
120120
),
@@ -181,7 +181,7 @@ def _create_groups_to_backfill(self, project: Project) -> None:
181181
(
182182
"cron group with log level WARNING",
183183
{
184-
"type": MonitorCheckInFailure.type_id,
184+
"type": MonitorIncidentType.type_id,
185185
"level": logging.WARNING,
186186
},
187187
PriorityLevel.MEDIUM,
@@ -190,15 +190,15 @@ def _create_groups_to_backfill(self, project: Project) -> None:
190190
"cron group with log level ERROR",
191191
{
192192
"substatus": GroupSubStatus.ONGOING,
193-
"type": MonitorCheckInFailure.type_id,
193+
"type": MonitorIncidentType.type_id,
194194
"level": logging.ERROR,
195195
},
196196
PriorityLevel.HIGH,
197197
),
198198
(
199199
"cron group with log level DEBUG",
200200
{
201-
"type": MonitorCheckInFailure.type_id,
201+
"type": MonitorIncidentType.type_id,
202202
"level": logging.DEBUG,
203203
},
204204
PriorityLevel.HIGH,

0 commit comments

Comments
 (0)