Skip to content

Commit 689a65d

Browse files
authored
fix(ci): e2e dd logs failure (#23038)
* wip
* bump to stable debian
* fix bug with docker ARG evaluation
* more dbg statements
* try with more recent docker.io/datadog/fakeintake
* test with bigger timeout
* make test runner usable locally
* add retries
* more tweaks to config
* cleanup
1 parent a39d60a commit 689a65d

File tree

8 files changed

+50
-31
lines changed

8 files changed

+50
-31
lines changed

.github/workflows/ci-integration-review.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -113,7 +113,7 @@ jobs:
113113
with:
114114
timeout_minutes: 30
115115
max_attempts: 3
116-
command: bash scripts/ci-int-e2e-test.sh int ${{ matrix.service }}
116+
command: bash scripts/int-e2e-test.sh int ${{ matrix.service }}
117117

118118
e2e-tests:
119119
needs: prep-pr
@@ -136,7 +136,7 @@ jobs:
136136
with:
137137
timeout_minutes: 35
138138
max_attempts: 3
139-
command: bash scripts/ci-int-e2e-test.sh e2e datadog-logs
139+
command: bash scripts/int-e2e-test.sh e2e datadog-logs
140140

141141
- name: datadog-e2e-metrics
142142
if: ${{ startsWith(github.event.review.body, '/ci-run-e2e-datadog-metrics')
@@ -146,7 +146,7 @@ jobs:
146146
with:
147147
timeout_minutes: 35
148148
max_attempts: 3
149-
command: bash scripts/ci-int-e2e-test.sh e2e datadog-metrics
149+
command: bash scripts/int-e2e-test.sh e2e datadog-metrics
150150

151151
update-pr-status:
152152
name: Signal result to PR

.github/workflows/e2e.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ jobs:
8989
with:
9090
timeout_minutes: 35
9191
max_attempts: 3
92-
command: bash scripts/ci-int-e2e-test.sh e2e datadog-logs
92+
command: bash scripts/int-e2e-test.sh e2e datadog-logs
9393

9494
- if: (github.event_name == 'schedule' || needs.changes.outputs.all-e2e == 'true' || needs.changes.outputs.e2e-datadog-metrics == 'true') &&
9595
(github.event_name != 'pull_request' || env.PR_HAS_ACCESS_TO_SECRETS == 'true')
@@ -98,7 +98,7 @@ jobs:
9898
with:
9999
timeout_minutes: 35
100100
max_attempts: 3
101-
command: bash scripts/ci-int-e2e-test.sh e2e datadog-metrics
101+
command: bash scripts/int-e2e-test.sh e2e datadog-metrics
102102

103103

104104
e2e-test-suite:

.github/workflows/integration.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ jobs:
148148
"${{ needs.changes.outputs.all-int }}" == "true" || \
149149
"$should_run" == "true" ]]; then
150150
echo "Running test for ${{ matrix.service }}"
151-
bash scripts/ci-int-e2e-test.sh int ${{ matrix.service }}
151+
bash scripts/int-e2e-test.sh int ${{ matrix.service }}
152152
else
153153
echo "Skipping ${{ matrix.service }} test as the value is false or conditions not met."
154154
fi

scripts/e2e/datadog-logs/compose.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -85,13 +85,13 @@ services:
8585
# which does the validation of consistency with the other fakeintake service.
8686
fakeintake-agent:
8787
# TODO: temporarily pegging the image as latest results in failures
88-
image: docker.io/datadog/fakeintake:v77a06f2b
88+
image: docker.io/datadog/fakeintake:ved764626
8989

9090
# Receives log data from the `datadog-agent-vector` service. Is queried by the test runner
9191
# which does the validation of consistency with the other fakeintake service.
9292
fakeintake-vector:
9393
# TODO: temporarily pegging the image as latest results in failures
94-
image: docker.io/datadog/fakeintake:v77a06f2b
94+
image: docker.io/datadog/fakeintake:ved764626
9595

9696
networks:
9797
default:

scripts/ci-int-e2e-test.sh renamed to scripts/int-e2e-test.sh

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6,11 +6,6 @@
66

77
set -u
88

9-
if [[ -z "${CI:-}" ]]; then
10-
echo "Aborted: this script is for use in CI." >&2
11-
exit 1
12-
fi
13-
149
if [ $# -ne 2 ]
1510
then
1611
echo "usage: $0 [int|e2e] TEST_NAME"
@@ -27,5 +22,10 @@ sleep 30
2722
cargo vdev -v "${TEST_TYPE}" test --retries 2 -a "${TEST_NAME}"
2823
RET=$?
2924
cargo vdev -v "${TEST_TYPE}" stop -a "${TEST_NAME}"
30-
./scripts/upload-test-results.sh
25+
26+
# Only upload test results if CI is defined
27+
if [[ -n "${CI:-}" ]]; then
28+
./scripts/upload-test-results.sh
29+
fi
30+
3131
exit $RET

tests/data/e2e/datadog/logs/agent_only.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,3 +27,6 @@ logs_config:
2727
logs_no_ssl: true
2828
force_use_http: true
2929
batch_wait: 1
30+
31+
# Required per https://github.com/DataDog/datadog-agent/tree/main/test/fakeintake#docker
32+
dd_url: 'http://fakeintake-agent:80'

tests/data/e2e/datadog/logs/agent_vector.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,3 +32,6 @@ vector:
3232
logs:
3333
enabled: true
3434
url: "http://vector:8181"
35+
36+
# Required per https://github.com/DataDog/datadog-agent/tree/main/test/fakeintake#docker
37+
dd_url: 'http://fakeintake-agent:80'

tests/e2e/datadog/logs/mod.rs

Lines changed: 30 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
11
use serde_json::Value;
22
use tracing::info;
33

4-
use vector::test_util::trace_init;
5-
64
use super::*;
5+
use std::time::Duration;
6+
use vector::test_util::trace_init;
77

88
const LOGS_ENDPOINT: &str = "/api/v2/logs";
9+
const MAX_RETRIES: usize = 10;
10+
const WAIT_INTERVAL: Duration = Duration::from_secs(1);
911

1012
fn expected_log_events() -> usize {
1113
std::env::var("EXPECTED_LOG_EVENTS")
@@ -70,25 +72,36 @@ fn reduce_to_data(payloads: Vec<FakeIntakePayload<Value>>) -> Vec<Value> {
7072
async fn validate() {
7173
trace_init();
7274

73-
// Even with configuring docker service dependencies, we need a small buffer of time
74-
// to ensure events flow through to fakeintake before asking for them
75-
std::thread::sleep(std::time::Duration::from_secs(2));
76-
75+
// Retry until we have log payloads or hit max retries.
76+
// This is to ensure events flow through to fakeintake before asking for them.
7777
info!("getting log payloads from agent-only pipeline");
78-
let mut agent_payloads = get_fakeintake_payloads::<FakeIntakeResponseJson>(
79-
&fake_intake_agent_address(),
80-
LOGS_ENDPOINT,
81-
)
82-
.await
83-
.payloads;
84-
85-
// the logs endpoint receives an empty healthcheck payload in the beginning
86-
if !agent_payloads.is_empty() {
87-
agent_payloads.retain(|raw_payload| !raw_payload.data.as_array().unwrap().is_empty())
78+
let mut agent_payloads = Vec::new();
79+
for _ in 0..MAX_RETRIES {
80+
agent_payloads = get_fakeintake_payloads::<FakeIntakeResponseJson>(
81+
&fake_intake_agent_address(),
82+
LOGS_ENDPOINT,
83+
)
84+
.await
85+
.payloads;
86+
87+
if !agent_payloads.is_empty() {
88+
break;
89+
}
90+
91+
info!("No valid payloads yet, retrying...");
92+
tokio::time::sleep(WAIT_INTERVAL).await;
8893
}
8994

90-
let mut agent_payloads = reduce_to_data(agent_payloads);
95+
// If we still don't have valid payloads after retries, fail the test
96+
assert!(
97+
!agent_payloads.is_empty(),
98+
"Failed to get valid log payloads from agent pipeline after {MAX_RETRIES} retries"
99+
);
100+
101+
// The logs endpoint receives an empty healthcheck payload in the beginning
102+
agent_payloads.retain(|raw_payload| !raw_payload.data.as_array().unwrap().is_empty());
91103

104+
let mut agent_payloads = reduce_to_data(agent_payloads);
92105
common_assertions(&mut agent_payloads);
93106

94107
info!("getting log payloads from agent-vector pipeline");

0 commit comments

Comments
 (0)