
Commit ef1347a

ceorourke authored and andrewshie-sentry committed
chore(ACI): Add 1:1 metrics for dual processing (#91840)
In order to determine whether workflow engine fires metric alerts at the same rate as the existing system, add the organization ID to the existing metric alert fire/resolve metrics and add a new process_workflows metric that is counted once per run rather than once per fired action. We'll have to be careful not to add too many orgs to this flag, as that would result in high tag cardinality in Datadog.
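The tagging pattern is the same in both code paths. A minimal sketch of the workflow_engine side follows; build_workflow_metric_tags is a hypothetical helper used only for illustration (the commit inlines this logic in subscription_processor.py and workflow.py, where the alert path keys the tag on detection_type instead), while features.has and metrics.incr are the existing Sentry helpers the diff uses:

from sentry import features
from sentry.utils import metrics


def build_workflow_metric_tags(detector, organization) -> dict:
    # Hypothetical helper for illustration; not part of the commit itself.
    # organization_id stays None unless the org is enrolled in the flag, which
    # keeps Datadog tag cardinality low for unflagged orgs.
    tags = {"detector_type": detector.type, "organization_id": None}
    if features.has(
        "organizations:workflow-engine-metric-alert-dual-processing-logs", organization
    ):
        tags["organization_id"] = organization.id
    return tags


# The new counter is incremented once per process_workflows run (not once per
# action), so it can be compared 1:1 against the legacy
# incidents.alert_rules.threshold.* counters:
# metrics.incr("workflow_engine.process_workflows.fired_actions", tags=tags)

Keeping organization_id as an explicit None for unflagged orgs means the tag set has the same shape everywhere; only flagged orgs contribute org-level cardinality.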
1 parent ee9affd

File tree

4 files changed: +117 −13 lines

src/sentry/incidents/subscription_processor.py
src/sentry/workflow_engine/processors/workflow.py
tests/sentry/incidents/test_subscription_processor.py
tests/sentry/workflow_engine/processors/test_workflow.py

src/sentry/incidents/subscription_processor.py

Lines changed: 14 additions & 4 deletions
@@ -443,6 +443,16 @@ def process_update(self, subscription_update: QuerySubscriptionUpdate) -> None:
             "organizations:anomaly-detection-rollout", self.subscription.project.organization
         )
 
+        alert_triggered_tags = {
+            "detection_type": self.alert_rule.detection_type,
+            "organization_id": None,
+        }
+        if features.has(
+            "organizations:workflow-engine-metric-alert-dual-processing-logs",
+            self.subscription.project.organization,
+        ):
+            alert_triggered_tags["organization_id"] = self.alert_rule.organization_id
+
         potential_anomalies = None
         if (
             has_anomaly_detection
@@ -505,7 +515,7 @@ def process_update(self, subscription_update: QuerySubscriptionUpdate) -> None:
             ) and not self.check_trigger_matches_status(trigger, TriggerStatus.ACTIVE):
                 metrics.incr(
                     "incidents.alert_rules.threshold.alert",
-                    tags={"detection_type": self.alert_rule.detection_type},
+                    tags=alert_triggered_tags,
                 )
                 incident_trigger = self.trigger_alert_threshold(
                     trigger, aggregation_value
@@ -522,7 +532,7 @@ def process_update(self, subscription_update: QuerySubscriptionUpdate) -> None:
             ):
                 metrics.incr(
                     "incidents.alert_rules.threshold.resolve",
-                    tags={"detection_type": self.alert_rule.detection_type},
+                    tags=alert_triggered_tags,
                 )
                 incident_trigger = self.trigger_resolve_threshold(
                     trigger, aggregation_value
@@ -544,7 +554,7 @@ def process_update(self, subscription_update: QuerySubscriptionUpdate) -> None:
                 # And the trigger is not yet active
                 metrics.incr(
                     "incidents.alert_rules.threshold.alert",
-                    tags={"detection_type": self.alert_rule.detection_type},
+                    tags=alert_triggered_tags,
                 )
                 # triggering a threshold will create an incident and set the status to active
                 incident_trigger = self.trigger_alert_threshold(trigger, aggregation_value)
@@ -562,7 +572,7 @@ def process_update(self, subscription_update: QuerySubscriptionUpdate) -> None:
             ):
                 metrics.incr(
                     "incidents.alert_rules.threshold.resolve",
-                    tags={"detection_type": self.alert_rule.detection_type},
+                    tags=alert_triggered_tags,
                 )
                 incident_trigger = self.trigger_resolve_threshold(
                     trigger, aggregation_value

src/sentry/workflow_engine/processors/workflow.py

Lines changed: 18 additions & 4 deletions
@@ -1,6 +1,7 @@
 import logging
 from dataclasses import asdict, replace
 from enum import StrEnum
+from typing import Any
 
 import sentry_sdk
 from django.db import router, transaction
@@ -217,6 +218,14 @@ def process_workflows(event_data: WorkflowEventData) -> set[Workflow]:
 
     # TODO: remove fetching org, only used for feature flag checks
     organization = detector.project.organization
+    workflow_metric_tags: dict[str, Any] = {
+        "detector_type": detector.type,
+        "organization_id": None,
+    }
+    if features.has(
+        "organizations:workflow-engine-metric-alert-dual-processing-logs", organization
+    ):
+        workflow_metric_tags["organization_id"] = organization.id
 
     # Get the workflows, evaluate the when_condition_group, finally evaluate the actions for workflows that are triggered
     workflows = set(
@@ -231,7 +240,7 @@ def process_workflows(event_data: WorkflowEventData) -> set[Workflow]:
     metrics.incr(
         "workflow_engine.process_workflows",
         len(workflows),
-        tags={"detector_type": detector.type},
+        tags=workflow_metric_tags,
     )
 
     logger.info(
@@ -254,7 +263,7 @@ def process_workflows(event_data: WorkflowEventData) -> set[Workflow]:
     metrics.incr(
         "workflow_engine.process_workflows.triggered_workflows",
         len(triggered_workflows),
-        tags={"detector_type": detector.type},
+        tags=workflow_metric_tags,
     )
 
     logger.info(
@@ -275,7 +284,7 @@ def process_workflows(event_data: WorkflowEventData) -> set[Workflow]:
     metrics.incr(
         "workflow_engine.process_workflows.actions",
         amount=len(actions),
-        tags={"detector_type": detector.type},
+        tags=workflow_metric_tags,
     )
 
     logger.info(
@@ -300,7 +309,7 @@ def process_workflows(event_data: WorkflowEventData) -> set[Workflow]:
     metrics.incr(
         "workflow_engine.process_workflows.triggered_actions",
         amount=len(actions),
-        tags={"detector_type": detector.type},
+        tags=workflow_metric_tags,
     )
     logger.info(
         "workflow_engine.process_workflows.triggered_actions (batch)",
@@ -312,5 +321,10 @@ def process_workflows(event_data: WorkflowEventData) -> set[Workflow]:
             "detector_type": detector.type,
         },
     )
+    # in order to check if workflow engine is firing 1:1 with the old system, we must only count once rather than each action
+    metrics.incr(
+        "workflow_engine.process_workflows.fired_actions",
+        tags=workflow_metric_tags,
+    )
 
     return triggered_workflows

tests/sentry/incidents/test_subscription_processor.py

Lines changed: 65 additions & 2 deletions
@@ -1024,8 +1024,9 @@ def test_seer_call_failed_parse(self, mock_logger, mock_seer_request):
         )
         assert result is None
 
+    @patch("sentry.incidents.subscription_processor.metrics")
     @patch("sentry.incidents.utils.metric_issue_poc.create_or_update_metric_issue")
-    def test_alert(self, create_metric_issue_mock):
+    def test_alert(self, create_metric_issue_mock, mock_metrics):
         # Verify that an alert rule that only expects a single update to be over the
         # alert threshold triggers correctly
         rule = self.rule
@@ -1054,6 +1055,31 @@ def test_alert(self, create_metric_issue_mock):
             ],
         )
         create_metric_issue_mock.assert_not_called()
+        mock_metrics.incr.assert_has_calls(
+            [
+                call(
+                    "incidents.alert_rules.threshold.alert",
+                    tags={"detection_type": "static", "organization_id": None},
+                ),
+                call("incidents.alert_rules.trigger", tags={"type": "fire"}),
+            ],
+        )
+
+    @with_feature("organizations:workflow-engine-metric-alert-dual-processing-logs")
+    @patch("sentry.incidents.subscription_processor.metrics")
+    def test_alert_metrics(self, mock_metrics):
+        rule = self.rule
+        trigger = self.trigger
+        self.send_update(rule, trigger.alert_threshold + 1)
+        mock_metrics.incr.assert_has_calls(
+            [
+                call(
+                    "incidents.alert_rules.threshold.alert",
+                    tags={"detection_type": "static", "organization_id": rule.organization_id},
+                ),
+                call("incidents.alert_rules.trigger", tags={"type": "fire"}),
+            ],
+        )
 
     def test_alert_dedupe(self):
         # Verify that an alert rule that only expects a single update to be over the
@@ -1174,8 +1200,9 @@ def test_no_active_incident_resolve(self):
         self.assert_trigger_does_not_exist(trigger)
         self.assert_action_handler_called_with_actions(None, [])
 
+    @patch("sentry.incidents.subscription_processor.metrics")
     @patch("sentry.incidents.utils.metric_issue_poc.create_or_update_metric_issue")
-    def test_resolve(self, create_metric_issue_mock):
+    def test_resolve(self, create_metric_issue_mock, mock_metrics):
         # Verify that an alert rule that only expects a single update to be under the
         # resolve threshold triggers correctly
         rule = self.rule
@@ -1218,6 +1245,42 @@ def test_resolve(self, create_metric_issue_mock):
             ],
         )
         create_metric_issue_mock.assert_not_called()
+        mock_metrics.incr.assert_has_calls(
+            [
+                call(
+                    "incidents.alert_rules.threshold.alert",
+                    tags={"detection_type": "static", "organization_id": None},
+                ),
+                call("incidents.alert_rules.trigger", tags={"type": "fire"}),
+                call(
+                    "incidents.alert_rules.threshold.resolve",
+                    tags={"detection_type": "static", "organization_id": None},
+                ),
+                call("incidents.alert_rules.trigger", tags={"type": "resolve"}),
+            ]
+        )
+
+    @with_feature("organizations:workflow-engine-metric-alert-dual-processing-logs")
+    @patch("sentry.incidents.subscription_processor.metrics")
+    def test_resolve_metrics(self, mock_metrics):
+        rule = self.rule
+        trigger = self.trigger
+        self.send_update(rule, trigger.alert_threshold + 1, timedelta(minutes=-2))
+        self.send_update(rule, rule.resolve_threshold - 1, timedelta(minutes=-1))
+        mock_metrics.incr.assert_has_calls(
+            [
+                call(
+                    "incidents.alert_rules.threshold.alert",
+                    tags={"detection_type": "static", "organization_id": rule.organization_id},
+                ),
+                call("incidents.alert_rules.trigger", tags={"type": "fire"}),
+                call(
+                    "incidents.alert_rules.threshold.resolve",
+                    tags={"detection_type": "static", "organization_id": rule.organization_id},
+                ),
+                call("incidents.alert_rules.trigger", tags={"type": "resolve"}),
+            ]
+        )
 
     def test_resolve_multiple_threshold_periods(self):
         # Verify that a rule that expects two consecutive updates to be under the

tests/sentry/workflow_engine/processors/test_workflow.py

Lines changed: 20 additions & 3 deletions
@@ -62,6 +62,10 @@ def setUp(self):
                 id=1, is_new=False, is_regression=True, is_new_group_environment=False
             ),
         )
+        self.workflow_metric_tags = {
+            "detector_type": self.error_detector.type,
+            "organization_id": None,
+        }
 
     def test_skips_disabled_workflows(self):
         workflow_triggers = self.create_data_condition_group()
@@ -276,7 +280,7 @@ def test_metrics_with_workflows(self, mock_incr):
         mock_incr.assert_any_call(
             "workflow_engine.process_workflows",
             1,
-            tags={"detector_type": self.error_detector.type},
+            tags=self.workflow_metric_tags,
         )
 
     @patch("sentry.utils.metrics.incr")
@@ -286,7 +290,7 @@ def test_metrics_triggered_workflows(self, mock_incr):
         mock_incr.assert_any_call(
             "workflow_engine.process_workflows.triggered_workflows",
             1,
-            tags={"detector_type": self.error_detector.type},
+            tags=self.workflow_metric_tags,
        )
 
     @with_feature("organizations:workflow-engine-process-workflows")
@@ -298,7 +302,20 @@ def test_metrics_triggered_actions(self, mock_incr):
         mock_incr.assert_any_call(
             "workflow_engine.process_workflows.triggered_actions",
             amount=0,
-            tags={"detector_type": self.error_detector.type},
+            tags=self.workflow_metric_tags,
+        )
+
+    @with_feature("organizations:workflow-engine-process-workflows")
+    @with_feature("organizations:workflow-engine-metric-alert-dual-processing-logs")
+    @patch("sentry.utils.metrics.incr")
+    def test_metrics_issue_dual_processing_metrics(self, mock_incr):
+        process_workflows(self.event_data)
+        mock_incr.assert_any_call(
+            "workflow_engine.process_workflows.fired_actions",
+            tags={
+                "detector_type": self.error_detector.type,
+                "organization_id": self.error_detector.project.organization_id,
+            },
         )
