Skip to content

Commit bd7c8a6

Browse files
David Wang, MichaelSun48
David Wang
authored and committed
feat(crons): Record broken monitor recovery analytic (#69260)
Records the event from #69259 when a monitor environment with a broken detection is resolved
1 parent d8eb54b commit bd7c8a6

File tree

2 files changed

+88
-1
lines changed

2 files changed

+88
-1
lines changed

src/sentry/monitors/logic/mark_ok.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import logging
2-
from datetime import datetime
2+
from datetime import datetime, timedelta
33

4+
from django.utils import timezone
5+
6+
from sentry import analytics
47
from sentry.monitors.models import CheckInStatus, MonitorCheckIn, MonitorEnvironment, MonitorStatus
8+
from sentry.monitors.tasks.detect_broken_monitor_envs import NUM_DAYS_BROKEN_PERIOD
59

610
logger = logging.getLogger(__name__)
711

@@ -59,6 +63,18 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
5963
"grouphash": incident.grouphash,
6064
},
6165
)
66+
# if incident was longer than the broken env time, check if there was a broken detection that is also now resolved
67+
if incident.starting_timestamp <= timezone.now() - timedelta(
68+
days=NUM_DAYS_BROKEN_PERIOD
69+
):
70+
if incident.monitorenvbrokendetection_set.exists():
71+
analytics.record(
72+
"cron_monitor_broken_status.recovery",
73+
organization_id=monitor_env.monitor.organization_id,
74+
project_id=monitor_env.monitor.project_id,
75+
monitor_id=monitor_env.monitor.id,
76+
monitor_env_id=monitor_env.id,
77+
)
6278

6379
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
6480
**params

tests/sentry/monitors/logic/test_mark_ok.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datetime import timedelta
2+
from unittest import mock
23
from unittest.mock import patch
34

45
from django.utils import timezone
@@ -10,6 +11,7 @@
1011
CheckInStatus,
1112
Monitor,
1213
MonitorCheckIn,
14+
MonitorEnvBrokenDetection,
1315
MonitorEnvironment,
1416
MonitorIncident,
1517
MonitorStatus,
@@ -240,3 +242,72 @@ def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
240242
"new_substatus": None,
241243
},
242244
) == dict(status_change)
245+
246+
@mock.patch("sentry.analytics.record")
def test_mark_ok_broken_recovery(self, mock_record):
    """Check that an OK check-in resolving a broken-flagged incident records the recovery analytic.

    The incident is started 14 days in the past and has a
    MonitorEnvBrokenDetection row attached before ``mark_ok`` is called.
    """
    now = timezone.now().replace(second=0, microsecond=0)
    one_minute = timedelta(minutes=1)
    org_id = self.organization.id
    proj_id = self.project.id

    crontab_config = {
        "schedule": "* * * * *",
        "schedule_type": ScheduleType.CRONTAB,
        "max_runtime": None,
        "checkin_margin": None,
        "recovery_threshold": None,
    }
    monitor = Monitor.objects.create(
        name="test monitor",
        organization_id=org_id,
        project_id=proj_id,
        type=MonitorType.CRON_JOB,
        config=crontab_config,
    )

    # The environment begins in ERROR, with its last check-in one minute ago.
    env = MonitorEnvironment.objects.create(
        monitor=monitor,
        environment_id=self.environment.id,
        status=MonitorStatus.ERROR,
        last_checkin=now - one_minute,
        next_checkin=now,
    )

    # Failing check-in dated 14 days back; it opens the incident below.
    incident_start = timezone.now() - timedelta(days=14)
    failed_checkin = MonitorCheckIn.objects.create(
        monitor=monitor,
        monitor_environment=env,
        project_id=proj_id,
        status=CheckInStatus.ERROR,
        date_added=incident_start,
    )
    open_incident = MonitorIncident.objects.create(
        monitor=monitor,
        monitor_environment=env,
        starting_checkin=failed_checkin,
        starting_timestamp=failed_checkin.date_added,
    )
    # Attach a broken detection record to the incident.
    MonitorEnvBrokenDetection.objects.create(monitor_incident=open_incident)

    # A successful check-in arrives and is processed.
    ok_checkin = MonitorCheckIn.objects.create(
        monitor=monitor,
        monitor_environment=env,
        project_id=proj_id,
        status=CheckInStatus.OK,
        date_added=now,
    )
    mark_ok(ok_checkin, ts=now)

    # The environment is back to OK and its schedule timestamps moved forward.
    env.refresh_from_db()
    assert env.status == MonitorStatus.OK
    assert env.next_checkin == now + one_minute
    assert env.next_checkin_latest == now + 2 * one_minute
    assert env.last_checkin == now

    # The recovery analytics event was recorded with the monitor's identifiers.
    expected_payload = {
        "organization_id": org_id,
        "project_id": proj_id,
        "monitor_id": monitor.id,
        "monitor_env_id": env.id,
    }
    mock_record.assert_called_with(
        "cron_monitor_broken_status.recovery",
        **expected_payload,
    )

0 commit comments

Comments
 (0)