From 59055c687b69957a30d1a3faec5d812815313ddd Mon Sep 17 00:00:00 2001 From: Marshall Main <55718608+marshallmain@users.noreply.github.com> Date: Mon, 15 Apr 2024 08:34:57 -0700 Subject: [PATCH] [Security Solution] Add retry to getMetrics to reduce flake (#180704) ## Summary API integration tests added in #180094 fail occasionally on main, but pass consistently locally and in the flaky test runner. This PR adds a retry to the `getMetrics` request in hopes of removing the flakiness. Flake issues: https://github.com/elastic/kibana/issues/180530 https://github.com/elastic/kibana/issues/180641 --------- Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com> --- .../execution_logic/eql.ts | 22 +++++----- .../execution_logic/machine_learning.ts | 24 +++++------ .../execution_logic/utils.ts | 41 +++++++++++++++++++ 3 files changed, 63 insertions(+), 24 deletions(-) create mode 100644 x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/utils.ts diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/eql.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/eql.ts index faa2697e39039..bd594fc8c94a9 100644 --- a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/eql.ts +++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/eql.ts @@ -52,6 +52,7 @@ import { } from '../../../../../../../common/utils/security_solution'; import { FtrProviderContext } from '../../../../../../ftr_provider_context'; import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder'; +import { getMetricsRequest, getMetricsWithRetry } from './utils'; /** * Specific AGENT_ID to use for some of the tests. If the archiver changes and you see errors @@ -66,6 +67,7 @@ export default ({ getService }: FtrProviderContext) => { const es = getService('es'); const log = getService('log'); const kibanaServer = getService('kibanaServer'); + const retry = getService('retry'); // TODO: add a new service for loading archiver files similar to "getService('es')" const config = getService('config'); @@ -205,15 +207,7 @@ export default ({ getService }: FtrProviderContext) => { }); it('classifies verification_exception errors as user errors', async () => { - function getMetricsRequest(reset: boolean = false) { - return request - .get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`) - .set('kbn-xsrf', 'foo') - .expect(200) - .then((response) => response.body); - } - - await getMetricsRequest(true); + await getMetricsRequest(request, true); const rule: EqlRuleCreateProps = { ...getEqlRuleForAlertTesting(['auditbeat-*']), query: 'file where field.doesnt.exist == true', @@ -234,9 +228,15 @@ export default ({ getService }: FtrProviderContext) => { ruleResponse.execution_summary.last_execution.message.includes('verification_exception') ).eql(true); - const metricsResponse = await getMetricsRequest(); + const metricsResponse = await getMetricsWithRetry( + request, + retry, + false, + (metrics) => + metrics.metrics?.task_run?.value.by_type['alerting:siem__eqlRule'].user_errors === 1 + ); expect( - metricsResponse.metrics.task_run.value.by_type['alerting:siem__eqlRule'].user_errors + metricsResponse.metrics?.task_run?.value.by_type['alerting:siem__eqlRule'].user_errors ).eql(1); }); diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts index 04454e80707a0..a822fb5f82b96 100644 --- a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts +++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/machine_learning.ts @@ -56,6 +56,7 @@ import { } from '../../../../../../../common/utils/security_solution'; import { FtrProviderContext } from '../../../../../../ftr_provider_context'; import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder'; +import { getMetricsRequest, getMetricsWithRetry } from './utils'; export default ({ getService }: FtrProviderContext) => { const supertest = getService('supertest'); @@ -69,6 +70,7 @@ export default ({ getService }: FtrProviderContext) => { const isServerless = config.get('serverless'); const dataPathBuilder = new EsArchivePathBuilder(isServerless); const auditPath = dataPathBuilder.getPath('auditbeat/hosts'); + const retry = getService('retry'); const siemModule = 'security_linux_v3'; const mlJobId = 'v3_linux_anomalous_network_activity'; @@ -181,15 +183,7 @@ export default ({ getService }: FtrProviderContext) => { }); it('classifies ml job missing errors as user errors', async () => { - function getMetricsRequest(reset: boolean = false) { - return request - .get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`) - .set('kbn-xsrf', 'foo') - .expect(200) - .then((response) => response.body); - } - - await getMetricsRequest(true); + await getMetricsRequest(request, true); const badRule: MachineLearningRuleCreateProps = { ...rule, machine_learning_job_id: 'doesNotExist', @@ -210,10 +204,14 @@ export default ({ getService }: FtrProviderContext) => { true ); - const metricsResponse = await getMetricsRequest(); - expect( - metricsResponse.metrics.task_run.value.by_type['alerting:siem__mlRule'].user_errors - ).toEqual(1); + const metricsResponse = await getMetricsWithRetry( + request, + retry, + false, + (metrics) => + metrics.metrics?.task_run?.value.by_type['alerting:siem__mlRule'].user_errors === 1 + ); + expect(metricsResponse.metrics?.task_run?.value.by_type['alerting:siem__mlRule']).toEqual(1); }); it('@skipInQA generates max alerts warning when circuit breaker is exceeded', async () => { diff --git a/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/utils.ts b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/utils.ts new file mode 100644 index 0000000000000..ec6fac6cf2bcd --- /dev/null +++ b/x-pack/test/security_solution_api_integration/test_suites/detections_response/detection_engine/rule_execution_logic/trial_license_complete_tier/execution_logic/utils.ts @@ -0,0 +1,41 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import supertest from 'supertest'; + +import { NodeMetrics } from '@kbn/task-manager-plugin/server/routes/metrics'; +import { RetryService } from '@kbn/ftr-common-functional-services'; + +export const getMetricsRequest = ( + request: supertest.SuperTest, + reset: boolean = false +) => { + return request + .get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`) + .set('kbn-xsrf', 'foo') + .expect(200) + .then((response) => response.body); +}; + +export const getMetricsWithRetry = ( + request: supertest.SuperTest, + retry: RetryService, + reset: boolean = false, + callback?: (metrics: NodeMetrics) => boolean +): Promise => { + return retry.try(async () => { + const metrics = await getMetricsRequest(request, reset); + + if (metrics.metrics) { + if ((callback && callback(metrics)) || !callback) { + return metrics; + } + } + + throw new Error(`Expected metrics not received: ${JSON.stringify(metrics)}`); + }); +};