Skip to content

Commit

Permalink
[OBX-UX-MGTM][ALERTING] Add the reason message to the rules recovery …
Browse files Browse the repository at this point in the history
…context (#211411)

## Summary

It fixes #184803 by:

### Adding the reason message to recovery context variables in the following rules:
- Inventory Threshold
- Metric threshold
- Custom threshold
- Log threshold

### Enabling recovery context and handling the recovery alert context for APM (except Anomaly)
- Latency threshold
- Error count
- Failed transaction rate
  • Loading branch information
fkanout authored Mar 3, 2025
1 parent 3b3bbb1 commit 9a6b4ec
Show file tree
Hide file tree
Showing 20 changed files with 572 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -307,3 +307,7 @@ export const ANOMALY_DETECTOR_SELECTOR_OPTIONS = [
// Client side registrations:
// x-pack/solutions/observability/plugins/apm/public/components/alerting/<alert>/index.tsx
// x-pack/solutions/observability/plugins/apm/public/components/alerting/register_apm_alerts

/**
 * Open-ended, string-keyed bag of values.
 *
 * Any string key is accepted and every value is typed as `any`, so reads are
 * unchecked by the compiler; callers must validate values themselves.
 */
export interface AdditionalContext {
  [fieldName: string]: any;
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export const errorCountMessage = i18n.translate(
export const errorCountRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.errorCount.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down Expand Up @@ -61,7 +61,7 @@ export const transactionDurationMessage = i18n.translate(
export const transactionDurationRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.transactionDuration.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down Expand Up @@ -97,7 +97,7 @@ export const transactionErrorRateMessage = i18n.translate(
export const transactionErrorRateRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.transactionErrorRate.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down Expand Up @@ -132,7 +132,7 @@ export const anomalyMessage = i18n.translate(
export const anomalyRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.transactionDurationAnomaly.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@ import { getAlertUrlErrorCount, getAlertUrlTransaction } from '../../../../commo
import {
anomalyMessage,
errorCountMessage,
errorCountRecoveryMessage,
transactionDurationMessage,
transactionDurationRecoveryMessage,
transactionErrorRateMessage,
transactionErrorRateRecoveryMessage,
} from '../../../../common/rules/default_action_message';
import type { AlertParams } from './anomaly_rule_type';

Expand Down Expand Up @@ -49,6 +52,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
}),
requiresAppContext: false,
defaultActionMessage: errorCountMessage,
defaultRecoveryMessage: errorCountRecoveryMessage,
priority: 80,
});

Expand Down Expand Up @@ -80,6 +84,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
alertDetailsAppSection: lazy(() => import('../ui_components/alert_details_app_section')),
requiresAppContext: false,
defaultActionMessage: transactionDurationMessage,
defaultRecoveryMessage: transactionDurationRecoveryMessage,
priority: 60,
});

Expand Down Expand Up @@ -108,6 +113,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
}),
requiresAppContext: false,
defaultActionMessage: transactionErrorRateMessage,
defaultRecoveryMessage: transactionErrorRateRecoveryMessage,
priority: 70,
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1025,4 +1025,113 @@ describe('Error count alert', () => {
},
});
});
// Recovery scenario for the error count rule: the search mock returns no `error_counts`
// buckets, the alerts client mock returns one recovered alert, and the test asserts the
// exact payload handed to `alertsClient.setAlertData`.
it('sends recovered alerts with their context', async () => {
const { services, dependencies, executor } = createRuleTypeMocks();

registerErrorCountRuleType(dependencies);

// Rule params for this run. The threshold (2) and window (5 m) deliberately differ from the
// values recorded in the stored alert document below (24999998 and "5 days" in the reason).
const params = {
threshold: 2,
windowSize: 5,
windowUnit: 'm',
};

// Search response with an empty `error_counts.buckets` array.
services.scopedClusterClient.asCurrentUser.search.mockResponse({
hits: {
hits: [],
total: {
relation: 'eq',
value: 1,
},
},
aggregations: {
error_counts: {
buckets: [],
},
},
took: 0,
timed_out: false,
_shards: {
failed: 0,
skipped: 0,
successful: 1,
total: 1,
},
});
// One recovered alert: `alert` exposes the id/uuid getters, `hit` is the stored alert document.
services.alertsClient.getRecoveredAlerts.mockReturnValue([
{
alert: {
// 'test-id' and 'test-uuid' reappear in the expected `id` and `alertDetailsUrl` below.
getId: jest.fn().mockReturnValue('test-id'),
getUuid: jest.fn().mockReturnValue('test-uuid'),
scheduledExecutionOptions: undefined,
meta: [],
state: [],
context: {},
id: 'synthtrace-high-cardinality-0_Synthtrace: many_errors',
alertAsData: undefined,
},
hit: {
'processor.event': 'error',
// Matches the expected `triggerValue`.
'kibana.alert.evaluation.value': 60568922,
'kibana.alert.evaluation.threshold': 24999998,
// Matches the expected `reason`.
'kibana.alert.reason':
'Error count is 60568922 in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors. Alert when > 24999998.',
'agent.name': 'java',
// Match the expected `environment` and `serviceName`.
'service.environment': 'Synthtrace: many_errors',
'service.name': 'synthtrace-high-cardinality-0',
'kibana.alert.rule.category': 'Error count threshold',
'kibana.alert.rule.consumer': 'alerts',
'kibana.alert.rule.execution.uuid': '8ecb0754-1220-4b6b-b95d-87b3594e925a',
'kibana.alert.rule.name': 'Error count threshold rule',
'kibana.alert.rule.parameters': [],
'kibana.alert.rule.producer': 'apm',
'kibana.alert.rule.revision': 8,
'kibana.alert.rule.rule_type_id': 'apm.error_rate',
'kibana.alert.rule.tags': [],
'kibana.alert.rule.uuid': '63028cf5-c059-4a6b-b375-fd9007233223',
'kibana.space_ids': [],
'@timestamp': '2025-02-20T12:11:51.960Z',
'event.action': 'active',
'event.kind': 'signal',
'kibana.alert.rule.execution.timestamp': '2025-02-20T12:11:51.960Z',
'kibana.alert.action_group': 'threshold_met',
'kibana.alert.flapping': true,
'kibana.alert.flapping_history': [],
'kibana.alert.instance.id': 'synthtrace-high-cardinality-0_Synthtrace: many_errors',
'kibana.alert.maintenance_window_ids': [],
'kibana.alert.consecutive_matches': 2,
'kibana.alert.status': 'active',
'kibana.alert.uuid': '81617b97-02d2-413a-9f64-77161de80df4',
'kibana.alert.workflow_status': 'open',
'kibana.alert.duration.us': 12012000,
'kibana.alert.start': '2025-02-20T12:11:39.948Z',
'kibana.alert.time_range': [],
'kibana.version': '9.1.0',
tags: [],
'kibana.alert.previous_action_group': 'threshold_met',
},
},
]);

await executor({ params });

// Exactly one setAlertData call is expected, matching the single recovered alert above.
expect(services.alertsClient.setAlertData).toHaveBeenCalledTimes(1);

// Expected payload: `reason`, `triggerValue`, `serviceName` and `environment` equal the values
// in the mocked hit; `threshold` and `interval` equal the rule params; `errorGroupingKey` is
// undefined (params does not set one).
expect(services.alertsClient.setAlertData).toHaveBeenCalledWith({
context: {
alertDetailsUrl: 'http://localhost:5601/eyr/app/observability/alerts/test-uuid',
environment: 'Synthtrace: many_errors',
errorGroupingKey: undefined,
interval: '5 mins',
reason:
'Error count is 60568922 in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors. Alert when > 24999998.',
serviceName: 'synthtrace-high-cardinality-0',
threshold: 2,
triggerValue: 60568922,
viewInAppUrl:
'http://localhost:5601/eyr/app/apm/services/synthtrace-high-cardinality-0/errors?environment=Synthtrace%3A%20many_errors',
},
id: 'test-id',
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import {
import type {
THRESHOLD_MET_GROUP,
ApmRuleParamsType,
AdditionalContext,
} from '../../../../../common/rules/apm_rule_types';
import {
APM_SERVER_FEATURE_ID,
Expand Down Expand Up @@ -105,6 +106,7 @@ export function registerErrorCountRuleType({
actionGroups: ruleTypeConfig.actionGroups,
defaultActionGroupId: ruleTypeConfig.defaultActionGroupId,
validate: { params: errorCountParamsSchema },
doesSetRecoveryContext: true,
schemas: {
params: {
type: 'config-schema',
Expand Down Expand Up @@ -269,6 +271,53 @@ export function registerErrorCountRuleType({
});
}
);
// Recovered alerts: rebuild an action context for each one from its stored alert
// document plus the current rule params, then attach it through the alerts client.
for (const recovered of alertsClient.getRecoveredAlerts() ?? []) {
  const storedFields = recovered.hit as AdditionalContext;
  const recoveredAlertId = recovered.alert.getId();
  const detailsUrl = getAlertDetailsUrl(basePath, spaceId, recovered.alert.getUuid());

  // Keep only the group-by fields that are actually present on the stored document.
  const presentGroupBy: Record<string, string> = {};
  allGroupByFields.forEach((fieldName: string) => {
    const fieldValue = storedFields?.[fieldName];
    if (fieldValue !== undefined) {
      presentGroupBy[fieldName] = fieldValue;
    }
  });

  // Build the relative "view in app" link, then prefix it with the public base URL and space.
  const environmentEsField = getEnvironmentEsField(presentGroupBy[SERVICE_ENVIRONMENT]);
  const relativeAppUrl = getAlertUrlErrorCount(
    presentGroupBy[SERVICE_NAME],
    environmentEsField?.[SERVICE_ENVIRONMENT]
  );
  const absoluteAppUrl = addSpaceIdToPath(basePath.publicBaseUrl, spaceId, relativeAppUrl);

  const groupByVariables = getGroupByActionVariables(presentGroupBy);
  const windowLabel = formatDurationFromTimeUnitChar(
    ruleParams.windowSize,
    ruleParams.windowUnit as TimeUnitChar
  );

  // `reason` and `triggerValue` are read from the stored alert document; `interval`,
  // `threshold` and `errorGroupingKey` come from the rule params. The group-by variables
  // are spread last, so they take precedence over any same-named key above.
  const contextForRecovery = {
    alertDetailsUrl: detailsUrl,
    interval: windowLabel,
    reason: storedFields?.[ALERT_REASON],
    // When group by doesn't include error.grouping_key, the context.error.grouping_key action variable will contain value of the Error Grouping Key filter
    errorGroupingKey: ruleParams.errorGroupingKey,
    threshold: ruleParams.threshold,
    triggerValue: storedFields?.[ALERT_EVALUATION_VALUE],
    viewInAppUrl: absoluteAppUrl,
    ...groupByVariables,
  };

  alertsClient.setAlertData({
    id: recoveredAlertId,
    context: contextForRecovery,
  });
}

return { state: {} };
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -411,4 +411,126 @@ describe('registerTransactionDurationRuleType', () => {
},
});
});
// Recovery scenario for the transaction duration (latency) rule: the search mock returns no
// `series` buckets, the alerts client mock returns one recovered alert, and the test asserts
// the exact payload handed to `alertsClient.setAlertData`.
it('sends recovered alert with their context', async () => {
const { services, dependencies, executor } = createRuleTypeMocks();

registerTransactionDurationRuleType(dependencies);

// Search response with an empty `series.buckets` array.
services.scopedClusterClient.asCurrentUser.search.mockResponse({
hits: {
hits: [],
total: {
relation: 'eq',
value: 0,
},
},
aggregations: {
series: {
buckets: [],
},
},
took: 0,
timed_out: false,
_shards: {
failed: 0,
skipped: 0,
successful: 1,
total: 1,
},
});

// One recovered alert: `alert` exposes the id/uuid getters, `hit` is the stored alert document.
services.alertsClient.getRecoveredAlerts.mockReturnValue([
{
alert: {
// 'test-id' and 'test-uuid' reappear in the expected `id` and `alertDetailsUrl` below.
getId: jest.fn().mockReturnValue('test-id'),
getUuid: jest.fn().mockReturnValue('test-uuid'),
scheduledExecutionOptions: undefined,
meta: {},
state: {},
context: {},
id: 'synthtrace-high-cardinality-0_Synthtrace: many_errors_request',
alertAsData: undefined,
},
hit: {
'processor.event': 'transaction',
// The expected `triggerValue` below is the string '1,000 ms'; presumably the executor
// formats this stored number as a duration — TODO confirm the unit against the rule code.
'kibana.alert.evaluation.value': 1000000,
'kibana.alert.evaluation.threshold': 149000,
// Matches the expected `reason`.
'kibana.alert.reason':
'Avg. latency is 1,000 ms in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors, type: request. Alert when > 149 ms.',
'agent.name': 'java',
labels: { custom_label: [] },
// Match the expected `environment`, `serviceName` and `transactionType`.
'service.environment': 'Synthtrace: many_errors',
'service.name': 'synthtrace-high-cardinality-0',
'transaction.type': 'request',
'kibana.alert.rule.category': 'Latency threshold',
'kibana.alert.rule.consumer': 'alerts',
'kibana.alert.rule.execution.uuid': '646b1ca4-5799-4b3f-b253-593941da2c2f',
'kibana.alert.rule.name': 'Latency threshold rule',
'kibana.alert.rule.parameters': {
aggregationType: 'avg',
threshold: 149,
windowSize: 5,
windowUnit: 'd',
environment: 'ENVIRONMENT_ALL',
},
'kibana.alert.rule.producer': 'apm',
'kibana.alert.rule.revision': 15,
'kibana.alert.rule.rule_type_id': 'apm.transaction_duration',
'kibana.alert.rule.tags': [],
'kibana.alert.rule.uuid': '9c4a8e4f-b55c-426c-b4cc-fd2c9cb8bf89',
'kibana.space_ids': ['default'],
'@timestamp': '2025-02-20T12:40:40.956Z',
'event.action': 'open',
'event.kind': 'signal',
'kibana.alert.rule.execution.timestamp': '2025-02-20T12:40:40.956Z',
'kibana.alert.action_group': 'threshold_met',
'kibana.alert.flapping': false,
'kibana.alert.flapping_history': [true],
'kibana.alert.instance.id':
'synthtrace-high-cardinality-0_Synthtrace: many_errors_request',
'kibana.alert.maintenance_window_ids': [],
'kibana.alert.consecutive_matches': 1,
'kibana.alert.status': 'active',
'kibana.alert.uuid': 'b60476e6-f4e3-47a1-ac1a-a53616411b66',
'kibana.alert.severity_improving': false,
'kibana.alert.workflow_status': 'open',
'kibana.alert.duration.us': 0,
'kibana.alert.start': '2025-02-20T12:40:40.956Z',
'kibana.alert.time_range': { gte: '2025-02-20T12:40:40.956Z' },
'kibana.version': '9.1.0',
tags: [],
},
},
]);
// NOTE(review): `report` is stubbed too; with empty buckets it is unclear from this test
// whether the executor calls it in this run — confirm before relying on it.
services.alertsClient.report.mockReturnValue({ uuid: 'test-uuid' });

// Rule params for this run. `serviceName` ('opbeans-java') and `threshold` (3000) differ from
// the stored document, which makes the origin of each expected field visible below.
const params = {
threshold: 3000,
windowSize: 5,
windowUnit: 'm',
transactionType: 'request',
serviceName: 'opbeans-java',
aggregationType: 'avg',
transactionName: 'GET /orders',
};
await executor({ params });
// Exactly one setAlertData call is expected, matching the single recovered alert above.
expect(services.alertsClient.setAlertData).toHaveBeenCalledTimes(1);
// Expected payload: `reason`, `serviceName` and `environment` equal the values in the mocked
// hit; `threshold`, `interval` and `transactionName` equal the rule params.
expect(services.alertsClient.setAlertData).toHaveBeenCalledWith({
context: {
alertDetailsUrl: 'http://localhost:5601/eyr/app/observability/alerts/test-uuid',
environment: 'Synthtrace: many_errors',
interval: '5 mins',
reason:
'Avg. latency is 1,000 ms in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors, type: request. Alert when > 149 ms.',
serviceName: 'synthtrace-high-cardinality-0',
threshold: 3000,
transactionName: 'GET /orders',
transactionType: 'request',
triggerValue: '1,000 ms',
viewInAppUrl:
'http://localhost:5601/eyr/app/apm/services/synthtrace-high-cardinality-0?transactionType=request&environment=Synthtrace%3A%20many_errors',
},
id: 'test-id',
});
});
});
Loading

0 comments on commit 9a6b4ec

Please sign in to comment.