Skip to content

Commit 6583312

Browse files
authored
Fix a spike in monotonic counter values after a failed openmetrics scrape (#20063)
* AGTMETRICS-177 Skip flush_first_value on error. flush_first_value indicates that new monotonic counter values should assume a previous value of zero and be included in the very next flush, instead of waiting for another value to compute a diff. This breaks when a scrape fails: we can no longer assume that the counter just appeared, and reporting it may cause an unwanted spike. Aggregator retention for previous values is also short-lived, so if a few scrapes are missed, even old counters will appear as new to the aggregator, causing even bigger spikes. * Changelog * Preserve use of process_start_time on the first successful scrape. If the target process starts after the agent, we may have a few failed scrapes, but the values on the first successful scrape can be fresh enough to use as is.
1 parent 501217e commit 6583312

File tree

3 files changed

+63
-44
lines changed

3 files changed

+63
-44
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a spike in monotonic counter values after a failed openmetrics scrape

datadog_checks_base/datadog_checks/base/checks/openmetrics/v2/scraper.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -230,13 +230,13 @@ def __init__(self, check, config):
230230
self.use_process_start_time = is_affirmative(config.get('use_process_start_time'))
231231

232232
# Used for monotonic counts
233-
self.flush_first_value = False
233+
self.flush_first_value = None
234234

235-
def scrape(self):
235+
def _scrape(self):
236236
"""
237237
Execute a scrape, and for each metric collected, transform the metric.
238238
"""
239-
runtime_data = {'flush_first_value': self.flush_first_value, 'static_tags': self.static_tags}
239+
runtime_data = {'flush_first_value': bool(self.flush_first_value), 'static_tags': self.static_tags}
240240

241241
# Determine which consume method to use based on target_info config
242242
if self.target_info:
@@ -251,7 +251,18 @@ def scrape(self):
251251

252252
transformer(metric, self.generate_sample_data(metric), runtime_data)
253253

254-
self.flush_first_value = True
254+
def scrape(self):
255+
try:
256+
self._scrape()
257+
self.flush_first_value = True
258+
except:
259+
# Don't flush new monotonic counts on next scrape:
260+
# 1. Previous value may have expired in the aggregator, causing a spike
261+
# 2. New counter itself may be too old and large when we discover it next time.
262+
# If we didn't have a successful scrape yet, keep the initial value (use process_start_time to decide).
263+
if self.flush_first_value:
264+
self.flush_first_value = False
265+
raise
255266

256267
def consume_metrics(self, runtime_data):
257268
"""
@@ -260,7 +271,7 @@ def consume_metrics(self, runtime_data):
260271

261272
metric_parser = self.parse_metrics()
262273

263-
if not self.flush_first_value and self.use_process_start_time:
274+
if self.flush_first_value is None and self.use_process_start_time:
264275
metric_parser = first_scrape_handler(metric_parser, runtime_data, datadog_agent.get_process_start_time())
265276
if self.label_aggregator.configured:
266277
metric_parser = self.label_aggregator(metric_parser)
@@ -283,7 +294,7 @@ def consume_metrics_w_target_info(self, runtime_data):
283294

284295
metric_parser = self.parse_metrics()
285296

286-
if not self.flush_first_value and self.use_process_start_time:
297+
if self.flush_first_value is None and self.use_process_start_time:
287298
metric_parser = first_scrape_handler(metric_parser, runtime_data, datadog_agent.get_process_start_time())
288299
if self.label_aggregator.configured:
289300
metric_parser = self.label_aggregator(metric_parser)

datadog_checks_base/tests/base/checks/openmetrics/test_v2/test_first_scrape_handler.py

Lines changed: 45 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ def test_first_scrape_handler(
8686
agent_start_time,
8787
process_start_time,
8888
):
89-
mock_http_response(_make_test_data(process_start_time))
90-
9189
check = get_check(
9290
{
9391
'metrics': ['.*'],
@@ -98,53 +96,62 @@ def test_first_scrape_handler(
9896

9997
datadog_agent.set_process_start_time(agent_start_time)
10098

101-
for _ in range(0, 5):
99+
for idx in range(0, 10):
102100
aggregator.reset()
103-
dd_run_check(check)
104101

105-
aggregator.assert_metric(
106-
'test.go_memstats_alloc_bytes.count',
107-
metric_type=aggregator.MONOTONIC_COUNT,
108-
count=1,
109-
flush_first_value=expect_first_flush,
110-
)
111-
aggregator.assert_metric(
112-
'test.go_gc_duration_seconds.count',
113-
metric_type=aggregator.MONOTONIC_COUNT,
114-
count=1,
115-
flush_first_value=expect_first_flush,
116-
)
102+
if idx != 5:
103+
mock_http_response(_make_test_data(process_start_time))
104+
dd_run_check(check)
117105

118-
if with_buckets:
119-
aggregator.assert_histogram_bucket(
120-
'test.skydns_skydns_dns_request_duration_seconds',
121-
None,
122-
None,
123-
None,
124-
True,
125-
None,
126-
None,
127-
count=2,
128-
flush_first_value=expect_first_flush,
129-
)
130-
else:
131-
aggregator.assert_metric(
132-
'test.skydns_skydns_dns_request_duration_seconds.count',
133-
metric_type=aggregator.MONOTONIC_COUNT,
134-
count=1,
135-
flush_first_value=expect_first_flush,
136-
)
137106
aggregator.assert_metric(
138-
'test.skydns_skydns_dns_request_duration_seconds.sum',
107+
'test.go_memstats_alloc_bytes.count',
139108
metric_type=aggregator.MONOTONIC_COUNT,
140109
count=1,
141110
flush_first_value=expect_first_flush,
142111
)
143112
aggregator.assert_metric(
144-
'test.skydns_skydns_dns_request_duration_seconds.bucket',
113+
'test.go_gc_duration_seconds.count',
145114
metric_type=aggregator.MONOTONIC_COUNT,
146115
count=1,
147116
flush_first_value=expect_first_flush,
148117
)
149118

150-
expect_first_flush = True
119+
if with_buckets:
120+
aggregator.assert_histogram_bucket(
121+
'test.skydns_skydns_dns_request_duration_seconds',
122+
None,
123+
None,
124+
None,
125+
True,
126+
None,
127+
None,
128+
count=2,
129+
flush_first_value=expect_first_flush,
130+
)
131+
else:
132+
aggregator.assert_metric(
133+
'test.skydns_skydns_dns_request_duration_seconds.count',
134+
metric_type=aggregator.MONOTONIC_COUNT,
135+
count=1,
136+
flush_first_value=expect_first_flush,
137+
)
138+
aggregator.assert_metric(
139+
'test.skydns_skydns_dns_request_duration_seconds.sum',
140+
metric_type=aggregator.MONOTONIC_COUNT,
141+
count=1,
142+
flush_first_value=expect_first_flush,
143+
)
144+
aggregator.assert_metric(
145+
'test.skydns_skydns_dns_request_duration_seconds.bucket',
146+
metric_type=aggregator.MONOTONIC_COUNT,
147+
count=1,
148+
flush_first_value=expect_first_flush,
149+
)
150+
151+
expect_first_flush = True
152+
else:
153+
mock_http_response("", status_code=500)
154+
with pytest.raises(Exception):
155+
dd_run_check(check)
156+
157+
expect_first_flush = False

0 commit comments

Comments
 (0)