Skip to content

Commit bd73500

Browse files
authored
Add metric to track seconds since last backfill cycle (#20165)
* add metric to track seconds since last backfill * changelog * Update check.py * lint
1 parent 9b63e46 commit bd73500

File tree

5 files changed

+32
-1
lines changed

5 files changed

+32
-1
lines changed

slurm/changelog.d/20165.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add metric to track seconds since last backfill cycle from sdiag

slurm/datadog_checks/slurm/check.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
# All rights reserved
33
# Licensed under a 3-clause BSD style license (see LICENSE)
44
import os
5+
import re
56
import subprocess
7+
import time
68
from datetime import timedelta
79

810
from datadog_checks.base import AgentCheck, is_affirmative
@@ -311,6 +313,7 @@ def process_sdiag(self, output):
311313
# we see the 'Backfilling stats' line.
312314
current_map = SDIAG_MAP['backfill_stats'] if backfill_section else SDIAG_MAP['main_stats']
313315

316+
# Try to match known metrics
314317
for metric, pattern in current_map.items():
315318
if pattern in line:
316319
try:
@@ -321,6 +324,17 @@ def process_sdiag(self, output):
321324
continue
322325
break
323326

327+
if 'Last cycle when' in line:
328+
try:
329+
match = re.search(r'\((\d+)\)', line)
330+
if match:
331+
last_cycle_epoch = int(match.group(1))
332+
now = int(time.time())
333+
diff = now - last_cycle_epoch
334+
self.gauge('sdiag.last_cycle_seconds_ago', diff, tags=self.tags)
335+
except Exception as e:
336+
self.log.debug("Failed to parse last cycle epoch from line '%s': %s", line, e)
337+
324338
for name, value in metrics.items():
325339
self.gauge(f'sdiag.{name}', value, tags=self.tags)
326340

slurm/metadata.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ slurm.sdiag.jobs_running,gauge,,job,,Number of jobs running since last reset.,1,
5454
slurm.sdiag.jobs_started,gauge,,job,,Number of jobs started since last reset.,1,slurm,slurm_sdiag_jobs_started,,
5555
slurm.sdiag.jobs_submitted,gauge,,job,,Number of jobs submitted since last reset.,1,slurm,slurm_sdiag_jobs_submitted,,
5656
slurm.sdiag.last_cycle,gauge,,microsecond,,Time in microseconds for last scheduling cycle.,1,slurm,slurm_sdiag_last_cycle,,
57+
slurm.sdiag.last_cycle_seconds_ago,gauge,,second,,Time in seconds since the last backfill scheduling cycle.,1,slurm,slurm_sdiag_last_cycle_seconds_ago,,
5758
slurm.sdiag.last_queue_length,gauge,,job,,Length of jobs pending queue.,1,slurm,slurm_sdiag_last_queue_length,,
5859
slurm.sdiag.max_cycle,gauge,,microsecond,,Maximum time in microseconds for any scheduling cycle since last reset.,1,slurm,slurm_sdiag_max_cycle,,
5960
slurm.sdiag.mean_cycle,gauge,,microsecond,,Mean time in microseconds for all scheduling cycles since last reset.,1,slurm,slurm_sdiag_mean_cycle,,

slurm/tests/common.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1138,6 +1138,11 @@ def mock_output(filename):
11381138
{'name': 'slurm.sdiag.backfill.depth_mean_try_depth', 'value': 27, 'tags': []},
11391139
{'name': 'slurm.sdiag.backfill.queue_length_mean', 'value': 28, 'tags': []},
11401140
{'name': 'slurm.sdiag.backfill.mean_table_size', 'value': 29, 'tags': []},
1141+
{
1142+
'name': 'slurm.sdiag.last_cycle_seconds_ago',
1143+
'value': 1000,
1144+
'tags': [],
1145+
}, # mocked to be 1000 seconds in the test
11411146
]
11421147
}
11431148

slurm/tests/test_unit.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,17 @@ def test_slurm_binary_processing(mock_get_subprocess_output, instance, aggregato
118118
else:
119119
mock_get_subprocess_output.side_effect = [mock_output_main]
120120

121-
check.check(None)
121+
# Patch time.time only for sdiag to make the test deterministic
122+
if binary == 'sdiag':
123+
from unittest.mock import patch
124+
125+
# The epoch in sdiag.txt is 1726207912, mocking current time to 1726208912 (diff = 1000)
126+
with patch('datadog_checks.slurm.check.time') as mock_time:
127+
mock_time.time.return_value = 1726208912
128+
check.check(None)
129+
else:
130+
check.check(None)
131+
122132
if binary == 'sacct':
123133
# This one doesn't collect anything on the first run. It only collects on the second run.
124134
check.check(None)

0 commit comments

Comments
 (0)