Skip to content

Commit

Permalink
[9.0] [OBX-UX-MGTM][ALERTING] Add the reason message to the rules rec…
Browse files Browse the repository at this point in the history
…overy context (#211411) (#212896)

# Backport

This will backport the following commits from `main` to `9.0`:
- [[OBX-UX-MGTM][ALERTING] Add the reason message to the rules recovery
context (#211411)](#211411)

<!--- Backport version: 9.6.6 -->

### Questions?
Please refer to the [Backport tool
documentation](https://github.com/sorenlouv/backport)

<!--BACKPORT [{"author":{"name":"Faisal
Kanout","email":"faisal.kanout@elastic.co"},"sourceCommit":{"committedDate":"2025-03-03T12:04:52Z","message":"[OBX-UX-MGTM][ALERTING]
Add the reason message to the rules recovery context (#211411)\n\n##
Summary\n\nIt fixes #184803 by:\n\n### Adding the reason message to
recovery context variables in the\nfollowing rules:\n- Inventory
Threshold\n- Metric threshold\n- Custom threshold\n- Log
threshold\n\n### Enabling recovery context and handling the recovery
alert context\nfor APM (except Anomaly)\n- Latency threshold\n- Error
count\n- Failed transaction
rate","sha":"9a6b4ecda3c0f1db49f8f93cd6968b4e2f2055d7","branchLabelMapping":{"^v9.1.0$":"main","^v8.19.0$":"8.x","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:enhancement","Feature:Alerting","backport:prev-minor","Team:obs-ux-infra_services","Team:obs-ux-management","v9.1.0"],"title":"[OBX-UX-MGTM][ALERTING]
Add the reason message to the rules recovery
context","number":211411,"url":"https://github.com/elastic/kibana/pull/211411","mergeCommit":{"message":"[OBX-UX-MGTM][ALERTING]
Add the reason message to the rules recovery context (#211411)\n\n##
Summary\n\nIt fixes #184803 by:\n\n### Adding the reason message to
recovery context variables in the\nfollowing rules:\n- Inventory
Threshold\n- Metric threshold\n- Custom threshold\n- Log
threshold\n\n### Enabling recovery context and handling the recovery
alert context\nfor APM (except Anomaly)\n- Latency threshold\n- Error
count\n- Failed transaction
rate","sha":"9a6b4ecda3c0f1db49f8f93cd6968b4e2f2055d7"}},"sourceBranch":"main","suggestedTargetBranches":[],"targetPullRequestStates":[{"branch":"main","label":"v9.1.0","branchLabelMappingKey":"^v9.1.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/211411","number":211411,"mergeCommit":{"message":"[OBX-UX-MGTM][ALERTING]
Add the reason message to the rules recovery context (#211411)\n\n##
Summary\n\nIt fixes #184803 by:\n\n### Adding the reason message to
recovery context variables in the\nfollowing rules:\n- Inventory
Threshold\n- Metric threshold\n- Custom threshold\n- Log
threshold\n\n### Enabling recovery context and handling the recovery
alert context\nfor APM (except Anomaly)\n- Latency threshold\n- Error
count\n- Failed transaction
rate","sha":"9a6b4ecda3c0f1db49f8f93cd6968b4e2f2055d7"}}]}] BACKPORT-->

Co-authored-by: Faisal Kanout <faisal.kanout@elastic.co>
  • Loading branch information
kibanamachine and fkanout authored Mar 3, 2025
1 parent 9953d52 commit 2fc95c6
Show file tree
Hide file tree
Showing 20 changed files with 572 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -307,3 +307,7 @@ export const ANOMALY_DETECTOR_SELECTOR_OPTIONS = [
// Client side registrations:
// x-pack/solutions/observability/plugins/apm/public/components/alerting/<alert>/index.tsx
// x-pack/solutions/observability/plugins/apm/public/components/alerting/register_apm_alerts

/**
 * Arbitrary extra fields carried on an alert document, e.g. the `hit`
 * of a recovered alert returned by `alertsClient.getRecoveredAlerts()`.
 * The open index signature mirrors the open-ended shape of
 * alert-as-data documents (flattened `kibana.alert.*`, `service.*`,
 * etc. keys). NOTE(review): `any` (rather than `unknown`) appears
 * deliberate so call sites can read fields without casting — confirm
 * before tightening, as consumers assign these values directly.
 */
export interface AdditionalContext {
[x: string]: any;
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export const errorCountMessage = i18n.translate(
export const errorCountRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.errorCount.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down Expand Up @@ -61,7 +61,7 @@ export const transactionDurationMessage = i18n.translate(
export const transactionDurationRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.transactionDuration.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down Expand Up @@ -97,7 +97,7 @@ export const transactionErrorRateMessage = i18n.translate(
export const transactionErrorRateRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.transactionErrorRate.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down Expand Up @@ -132,7 +132,7 @@ export const anomalyMessage = i18n.translate(
export const anomalyRecoveryMessage = i18n.translate(
'xpack.apm.alertTypes.transactionDurationAnomaly.defaultRecoveryMessage',
{
defaultMessage: `'{{context.reason}}'
defaultMessage: `Recovered: '{{context.reason}}'
'{{rule.name}}' has recovered.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@ import { getAlertUrlErrorCount, getAlertUrlTransaction } from '../../../../commo
import {
anomalyMessage,
errorCountMessage,
errorCountRecoveryMessage,
transactionDurationMessage,
transactionDurationRecoveryMessage,
transactionErrorRateMessage,
transactionErrorRateRecoveryMessage,
} from '../../../../common/rules/default_action_message';
import type { AlertParams } from './anomaly_rule_type';

Expand Down Expand Up @@ -49,6 +52,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
}),
requiresAppContext: false,
defaultActionMessage: errorCountMessage,
defaultRecoveryMessage: errorCountRecoveryMessage,
priority: 80,
});

Expand Down Expand Up @@ -80,6 +84,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
alertDetailsAppSection: lazy(() => import('../ui_components/alert_details_app_section')),
requiresAppContext: false,
defaultActionMessage: transactionDurationMessage,
defaultRecoveryMessage: transactionDurationRecoveryMessage,
priority: 60,
});

Expand Down Expand Up @@ -108,6 +113,7 @@ export function registerApmRuleTypes(observabilityRuleTypeRegistry: Observabilit
}),
requiresAppContext: false,
defaultActionMessage: transactionErrorRateMessage,
defaultRecoveryMessage: transactionErrorRateRecoveryMessage,
priority: 70,
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1025,4 +1025,113 @@ describe('Error count alert', () => {
},
});
});
// Verifies recovery-context propagation for the error-count rule: when the
// executor finds no breaching data, previously active alerts recover, and the
// executor must copy the stored reason/value fields from the alert document
// into the recovered alert's action context via alertsClient.setAlertData().
it('sends recovered alerts with their context', async () => {
const { services, dependencies, executor } = createRuleTypeMocks();

registerErrorCountRuleType(dependencies);

const params = {
threshold: 2,
windowSize: 5,
windowUnit: 'm',
};

// ES returns no error_counts buckets, so the executor reports no
// new/ongoing alerts for this run.
services.scopedClusterClient.asCurrentUser.search.mockResponse({
hits: {
hits: [],
total: {
relation: 'eq',
value: 1,
},
},
aggregations: {
error_counts: {
buckets: [],
},
},
took: 0,
timed_out: false,
_shards: {
failed: 0,
skipped: 0,
successful: 1,
total: 1,
},
});
// One recovered alert, shaped as the alerting framework would return it:
// `alert` exposes the id/uuid accessors the executor calls, and `hit` is
// the last-written alert document — the source of the reason, evaluation
// value, and group-by fields asserted below.
services.alertsClient.getRecoveredAlerts.mockReturnValue([
{
alert: {
getId: jest.fn().mockReturnValue('test-id'),
getUuid: jest.fn().mockReturnValue('test-uuid'),
scheduledExecutionOptions: undefined,
meta: [],
state: [],
context: {},
id: 'synthtrace-high-cardinality-0_Synthtrace: many_errors',
alertAsData: undefined,
},
hit: {
'processor.event': 'error',
'kibana.alert.evaluation.value': 60568922,
'kibana.alert.evaluation.threshold': 24999998,
'kibana.alert.reason':
'Error count is 60568922 in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors. Alert when > 24999998.',
'agent.name': 'java',
'service.environment': 'Synthtrace: many_errors',
'service.name': 'synthtrace-high-cardinality-0',
'kibana.alert.rule.category': 'Error count threshold',
'kibana.alert.rule.consumer': 'alerts',
'kibana.alert.rule.execution.uuid': '8ecb0754-1220-4b6b-b95d-87b3594e925a',
'kibana.alert.rule.name': 'Error count threshold rule',
'kibana.alert.rule.parameters': [],
'kibana.alert.rule.producer': 'apm',
'kibana.alert.rule.revision': 8,
'kibana.alert.rule.rule_type_id': 'apm.error_rate',
'kibana.alert.rule.tags': [],
'kibana.alert.rule.uuid': '63028cf5-c059-4a6b-b375-fd9007233223',
'kibana.space_ids': [],
'@timestamp': '2025-02-20T12:11:51.960Z',
'event.action': 'active',
'event.kind': 'signal',
'kibana.alert.rule.execution.timestamp': '2025-02-20T12:11:51.960Z',
'kibana.alert.action_group': 'threshold_met',
'kibana.alert.flapping': true,
'kibana.alert.flapping_history': [],
'kibana.alert.instance.id': 'synthtrace-high-cardinality-0_Synthtrace: many_errors',
'kibana.alert.maintenance_window_ids': [],
'kibana.alert.consecutive_matches': 2,
'kibana.alert.status': 'active',
'kibana.alert.uuid': '81617b97-02d2-413a-9f64-77161de80df4',
'kibana.alert.workflow_status': 'open',
'kibana.alert.duration.us': 12012000,
'kibana.alert.start': '2025-02-20T12:11:39.948Z',
'kibana.alert.time_range': [],
'kibana.version': '9.1.0',
tags: [],
'kibana.alert.previous_action_group': 'threshold_met',
},
},
]);

await executor({ params });

// Exactly one recovered alert => exactly one setAlertData call.
expect(services.alertsClient.setAlertData).toHaveBeenCalledTimes(1);

// The context mixes rule params (threshold, interval from windowSize/Unit)
// with fields read from the stored alert document (reason, triggerValue,
// service/environment group-by fields) plus the computed URLs.
expect(services.alertsClient.setAlertData).toHaveBeenCalledWith({
context: {
alertDetailsUrl: 'http://localhost:5601/eyr/app/observability/alerts/test-uuid',
environment: 'Synthtrace: many_errors',
errorGroupingKey: undefined,
interval: '5 mins',
reason:
'Error count is 60568922 in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors. Alert when > 24999998.',
serviceName: 'synthtrace-high-cardinality-0',
threshold: 2,
triggerValue: 60568922,
viewInAppUrl:
'http://localhost:5601/eyr/app/apm/services/synthtrace-high-cardinality-0/errors?environment=Synthtrace%3A%20many_errors',
},
id: 'test-id',
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import {
import type {
THRESHOLD_MET_GROUP,
ApmRuleParamsType,
AdditionalContext,
} from '../../../../../common/rules/apm_rule_types';
import {
APM_SERVER_FEATURE_ID,
Expand Down Expand Up @@ -105,6 +106,7 @@ export function registerErrorCountRuleType({
actionGroups: ruleTypeConfig.actionGroups,
defaultActionGroupId: ruleTypeConfig.defaultActionGroupId,
validate: { params: errorCountParamsSchema },
doesSetRecoveryContext: true,
schemas: {
params: {
type: 'config-schema',
Expand Down Expand Up @@ -271,6 +273,53 @@ export function registerErrorCountRuleType({
});
}
);
// Handle recovered alerts context: this rule type sets
// doesSetRecoveryContext, so the framework expects setAlertData() to be
// called with a populated context for every alert recovered in this run.
const recoveredAlerts = alertsClient.getRecoveredAlerts() ?? [];
for (const recoveredAlert of recoveredAlerts) {
// Last-written alert document — supplies the reason, trigger value and
// group-by fields that were computed while the alert was active.
const alertHits = recoveredAlert.hit as AdditionalContext;
const recoveredAlertId = recoveredAlert.alert.getId();
const alertUuid = recoveredAlert.alert.getUuid();
const alertDetailsUrl = getAlertDetailsUrl(basePath, spaceId, alertUuid);
// Rebuild the group-by key/value pairs from the stored document, skipping
// any group-by field the document does not carry.
const groupByFields: Record<string, string> = allGroupByFields.reduce(
(acc, sourceField: string) => {
if (alertHits?.[sourceField] !== undefined) {
acc[sourceField] = alertHits[sourceField];
}
return acc;
},
{} as Record<string, string>
);

// Deep link into APM for the alert's service/environment.
const relativeViewInAppUrl = getAlertUrlErrorCount(
groupByFields[SERVICE_NAME],
getEnvironmentEsField(groupByFields[SERVICE_ENVIRONMENT])?.[SERVICE_ENVIRONMENT]
);
const viewInAppUrl = addSpaceIdToPath(
basePath.publicBaseUrl,
spaceId,
relativeViewInAppUrl
);
const groupByActionVariables = getGroupByActionVariables(groupByFields);
// Mirrors the active-alert action context so recovery notifications can
// use the same template variables (reason, threshold, triggerValue, ...).
const recoveredContext = {
alertDetailsUrl,
interval: formatDurationFromTimeUnitChar(
ruleParams.windowSize,
ruleParams.windowUnit as TimeUnitChar
),
reason: alertHits?.[ALERT_REASON],
// When group by doesn't include error.grouping_key, the context.error.grouping_key action variable will contain value of the Error Grouping Key filter
errorGroupingKey: ruleParams.errorGroupingKey,
threshold: ruleParams.threshold,
triggerValue: alertHits?.[ALERT_EVALUATION_VALUE],
viewInAppUrl,
...groupByActionVariables,
};

alertsClient.setAlertData({
id: recoveredAlertId,
context: recoveredContext,
});
}

return { state: {} };
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -411,4 +411,126 @@ describe('registerTransactionDurationRuleType', () => {
},
});
});
// Same recovery-context check for the transaction-duration (latency) rule:
// an empty search response means the previously active alert recovers, and
// its action context must be rebuilt from the stored alert document.
it('sends recovered alert with their context', async () => {
const { services, dependencies, executor } = createRuleTypeMocks();

registerTransactionDurationRuleType(dependencies);

// ES returns no series buckets, so no new/ongoing alerts are reported.
services.scopedClusterClient.asCurrentUser.search.mockResponse({
hits: {
hits: [],
total: {
relation: 'eq',
value: 0,
},
},
aggregations: {
series: {
buckets: [],
},
},
took: 0,
timed_out: false,
_shards: {
failed: 0,
skipped: 0,
successful: 1,
total: 1,
},
});

// One recovered alert as the framework would return it; `hit` is the
// last-written alert document providing reason, evaluation value and the
// service/environment/transaction-type group-by fields asserted below.
services.alertsClient.getRecoveredAlerts.mockReturnValue([
{
alert: {
getId: jest.fn().mockReturnValue('test-id'),
getUuid: jest.fn().mockReturnValue('test-uuid'),
scheduledExecutionOptions: undefined,
meta: {},
state: {},
context: {},
id: 'synthtrace-high-cardinality-0_Synthtrace: many_errors_request',
alertAsData: undefined,
},
hit: {
'processor.event': 'transaction',
'kibana.alert.evaluation.value': 1000000,
'kibana.alert.evaluation.threshold': 149000,
'kibana.alert.reason':
'Avg. latency is 1,000 ms in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors, type: request. Alert when > 149 ms.',
'agent.name': 'java',
labels: { custom_label: [] },
'service.environment': 'Synthtrace: many_errors',
'service.name': 'synthtrace-high-cardinality-0',
'transaction.type': 'request',
'kibana.alert.rule.category': 'Latency threshold',
'kibana.alert.rule.consumer': 'alerts',
'kibana.alert.rule.execution.uuid': '646b1ca4-5799-4b3f-b253-593941da2c2f',
'kibana.alert.rule.name': 'Latency threshold rule',
'kibana.alert.rule.parameters': {
aggregationType: 'avg',
threshold: 149,
windowSize: 5,
windowUnit: 'd',
environment: 'ENVIRONMENT_ALL',
},
'kibana.alert.rule.producer': 'apm',
'kibana.alert.rule.revision': 15,
'kibana.alert.rule.rule_type_id': 'apm.transaction_duration',
'kibana.alert.rule.tags': [],
'kibana.alert.rule.uuid': '9c4a8e4f-b55c-426c-b4cc-fd2c9cb8bf89',
'kibana.space_ids': ['default'],
'@timestamp': '2025-02-20T12:40:40.956Z',
'event.action': 'open',
'event.kind': 'signal',
'kibana.alert.rule.execution.timestamp': '2025-02-20T12:40:40.956Z',
'kibana.alert.action_group': 'threshold_met',
'kibana.alert.flapping': false,
'kibana.alert.flapping_history': [true],
'kibana.alert.instance.id':
'synthtrace-high-cardinality-0_Synthtrace: many_errors_request',
'kibana.alert.maintenance_window_ids': [],
'kibana.alert.consecutive_matches': 1,
'kibana.alert.status': 'active',
'kibana.alert.uuid': 'b60476e6-f4e3-47a1-ac1a-a53616411b66',
'kibana.alert.severity_improving': false,
'kibana.alert.workflow_status': 'open',
'kibana.alert.duration.us': 0,
'kibana.alert.start': '2025-02-20T12:40:40.956Z',
'kibana.alert.time_range': { gte: '2025-02-20T12:40:40.956Z' },
'kibana.version': '9.1.0',
tags: [],
},
},
]);
services.alertsClient.report.mockReturnValue({ uuid: 'test-uuid' });

const params = {
threshold: 3000,
windowSize: 5,
windowUnit: 'm',
transactionType: 'request',
serviceName: 'opbeans-java',
aggregationType: 'avg',
transactionName: 'GET /orders',
};
await executor({ params });
// Exactly one recovered alert => exactly one setAlertData call. Note the
// context mixes current rule params (threshold, transactionName) with
// fields read from the stored alert document (reason, formatted
// triggerValue, service/environment/type) plus the computed URLs.
expect(services.alertsClient.setAlertData).toHaveBeenCalledTimes(1);
expect(services.alertsClient.setAlertData).toHaveBeenCalledWith({
context: {
alertDetailsUrl: 'http://localhost:5601/eyr/app/observability/alerts/test-uuid',
environment: 'Synthtrace: many_errors',
interval: '5 mins',
reason:
'Avg. latency is 1,000 ms in the last 5 days for service: synthtrace-high-cardinality-0, env: Synthtrace: many_errors, type: request. Alert when > 149 ms.',
serviceName: 'synthtrace-high-cardinality-0',
threshold: 3000,
transactionName: 'GET /orders',
transactionType: 'request',
triggerValue: '1,000 ms',
viewInAppUrl:
'http://localhost:5601/eyr/app/apm/services/synthtrace-high-cardinality-0?transactionType=request&environment=Synthtrace%3A%20many_errors',
},
id: 'test-id',
});
});
});
Loading

0 comments on commit 2fc95c6

Please sign in to comment.