
Commit cca6d27

Change tagging logic in sinfo partition and node metrics (#20257)
* move variable tags away from metrics
* change tagging logic
* changelog
* refactor my tests
* change fixtures
* test tweak
* change sinfo logic
* comment lint
* lint
* refactor gpu processing function
1 parent 3aa1bec commit cca6d27

8 files changed: +243 −505 lines changed


slurm/changelog.d/20257.changed

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+Change tagging logic for node and partition metrics
+
slurm/datadog_checks/slurm/check.py

Lines changed: 62 additions & 29 deletions
@@ -13,6 +13,7 @@
 from .config_models import ConfigMixin
 from .constants import (
     GPU_PARAMS,
+    GPU_TOTAL,
     NODE_MAP,
     PARTITION_MAP,
     SACCT_MAP,
@@ -22,6 +23,7 @@
     SDIAG_MAP,
     SINFO_ADDITIONAL_NODE_PARAMS,
     SINFO_NODE_PARAMS,
+    SINFO_PARTITION_INFO_PARAMS,
     SINFO_PARTITION_PARAMS,
     SINFO_STATE_CODE,
     SQUEUE_MAP,
@@ -78,6 +80,7 @@ def __init__(self, name, init_config, instances):
         # CMD compilation
         if self.collect_sinfo_stats:
             self.sinfo_partition_cmd = self.get_slurm_command('sinfo', SINFO_PARTITION_PARAMS)
+            self.sinfo_partition_info_cmd = self.get_slurm_command('sinfo', SINFO_PARTITION_INFO_PARAMS)
             self.sinfo_collection_level = self.instance.get('sinfo_collection_level', 1)
             if self.sinfo_collection_level > 1:
                 self.sinfo_node_cmd = self.get_slurm_command('sinfo', SINFO_NODE_PARAMS)
@@ -86,7 +89,8 @@ def __init__(self, name, init_config, instances):
                 if self.gpu_stats:
                     self.sinfo_node_cmd[-1] += GPU_PARAMS
             if self.gpu_stats:
-                self.sinfo_partition_cmd[-1] += GPU_PARAMS
+                self.sinfo_partition_cmd[-1] += GPU_TOTAL
+                self.sinfo_partition_info_cmd[-1] += GPU_PARAMS

         if self.collect_squeue_stats:
             self.squeue_cmd = self.get_slurm_command('squeue', SQUEUE_PARAMS)
@@ -124,6 +128,7 @@ def check(self, _):

         if self.collect_sinfo_stats:
             commands.append(('sinfo', self.sinfo_partition_cmd, self.process_sinfo_partition))
+            commands.append(('sinfo', self.sinfo_partition_info_cmd, self.process_sinfo_partition_info))
             if self.sinfo_collection_level > 1:
                 commands.append(('snode', self.sinfo_node_cmd, self.process_sinfo_node))

@@ -159,7 +164,7 @@ def check(self, _):
                 self.log.debug("No output from %s", name)

     def process_sinfo_partition(self, output):
-        # normal*|c1|1|up|1000|N/A|1/0/0/1|allocated|1
+        # test-queue*|N/A|1/2/0/3
         lines = output.strip().split('\n')

         if self.debug_sinfo_stats:
@@ -174,12 +179,37 @@ def process_sinfo_partition(self, output):
             tags = self._process_tags(partition_data, PARTITION_MAP["tags"], tags)

             if self.gpu_stats:
-                gpu_tags = self._process_sinfo_gpu(partition_data[-2], partition_data[-1], "partition", tags)
+                gpu_tag, _ = self._process_sinfo_gpu(partition_data[-1], None, "partition", tags)
+                tags.extend(gpu_tag)
+
+            self._process_sinfo_aiot_state(partition_data[2], "partition", tags)
+
+    def process_sinfo_partition_info(self, output):
+        # test-queue*|N/A|c[1-2]|up|1|972|allocated|10
+        lines = output.strip().split('\n')
+
+        if self.debug_sinfo_stats:
+            self.log.debug("Processing sinfo partition line: %s", lines)
+
+        for line in lines:
+            partition_data = line.split('|')
+
+            tags = []
+            tags.extend(self.tags)
+
+            tags = self._process_tags(partition_data, PARTITION_MAP["tags"], tags)
+
+            if self.gpu_stats:
+                gpu_tags, gpu_info_tags = self._process_sinfo_gpu(
+                    partition_data[-2], partition_data[-1], "partition", tags
+                )
                 tags.extend(gpu_tags)

-            self._process_metrics(partition_data, PARTITION_MAP, tags)
+            tags = self._process_tags(partition_data, PARTITION_MAP["info_tags"], tags)
+            if self.gpu_stats:
+                tags.extend(gpu_info_tags)

-            self._process_sinfo_aiot_state(partition_data[6], "partition", tags)
+            self._process_metrics(partition_data, PARTITION_MAP, tags)
             self.gauge('partition.info', 1, tags)

         self.gauge('sinfo.partition.enabled', 1)
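
For orientation, here is a minimal standalone sketch of the new two-pass partition flow, using the example lines from the code comments above and the index layout from PARTITION_MAP in constants.py. The sample values are illustrative, GPU columns are assumed disabled, and the print calls stand in for the check's gauge submissions.

```python
# Aggregate pass (SINFO_PARTITION_PARAMS): Partition|Cluster|NodeAIOT
aggregate_line = "test-queue*|N/A|1/2/0/3"
# Info pass (SINFO_PARTITION_INFO_PARAMS): Partition|Cluster|NodeList|Avail|CPUs|Memory|State|Nodes
info_line = "test-queue*|N/A|c[1-2]|up|1|972|allocated|10"

# Tags shared by both passes (PARTITION_MAP["tags"]): partition name and cluster only.
partition, cluster, aiot = aggregate_line.split("|")
base_tags = [f"slurm_partition_name:{partition}", f"slurm_cluster_name:{cluster}"]

# Aggregate pass: the A/I/O/T field drives the partition.node.* gauges with only the base tags.
allocated, idle, other, total = aiot.split("/")
print("partition.node.allocated", allocated, base_tags)

# Info pass: the variable fields (PARTITION_MAP["info_tags"]) only decorate partition.info
# and the partition.nodes.count metric emitted from this command.
info = info_line.split("|")
info_tags = base_tags + [
    f"slurm_partition_node_list:{info[2]}",
    f"slurm_partition_availability:{info[3]}",
    f"slurm_partition_cpus_assigned:{info[4]}",
    f"slurm_partition_memory_assigned:{info[5]}",
    f"slurm_partition_state:{info[6]}",
]
print("partition.info", 1, info_tags)
print("partition.nodes.count", info[7], info_tags)
```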
@@ -200,16 +230,21 @@ def process_sinfo_node(self, output):

             tags = self._process_tags(node_data, NODE_MAP["tags"], tags)

-            if self.sinfo_collection_level > 2:
-                tags = self._process_tags(node_data, NODE_MAP["extended_tags"], tags)
-
             if self.gpu_stats:
-                gpu_tags = self._process_sinfo_gpu(node_data[-2], node_data[-1], "node", tags)
+                gpu_tags, gpu_info_tags = self._process_sinfo_gpu(node_data[-2], node_data[-1], "node", tags)
                 tags.extend(gpu_tags)

-            # Submit metrics
             self._process_metrics(node_data, NODE_MAP, tags)
+
             self._process_sinfo_aiot_state(node_data[3], 'node', tags)
+
+            tags = self._process_tags(node_data, NODE_MAP["info_tags"], tags)
+            if self.sinfo_collection_level > 2:
+                tags = self._process_tags(node_data, NODE_MAP["extended_tags"], tags)
+
+            # Submit metrics
+            if self.gpu_stats:
+                tags.extend(gpu_info_tags)
             self.gauge('node.info', 1, tags=tags)

         self.gauge('sinfo.node.enabled', 1)
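
The node path follows the same pattern: NODE_MAP["tags"] stays on every node metric, while NODE_MAP["info_tags"] (and "extended_tags" when sinfo_collection_level > 2) is now only stacked on top for the node.info gauge. A small sketch with a made-up node line in the default column order (Partition|Avail|NodeList|CPUsState|Memory|Cluster):

```python
# Hypothetical node line; the indexes follow NODE_MAP in constants.py.
node = "normal*|up|c1|0/2/0/2|972|N/A".split("|")

base_tags = [  # NODE_MAP["tags"]: applied to every node metric
    f"slurm_partition_name:{node[0]}",
    f"slurm_node_name:{node[2]}",
    f"slurm_cluster_name:{node[5]}",
]
# NODE_MAP["info_tags"] (plus "extended_tags" when the extended columns are collected)
# is now added only after the per-node metrics and the AIOT state have been submitted.
info_tags = base_tags + [
    f"slurm_node_availability:{node[1]}",
    f"slurm_node_memory:{node[4]}",
]

print("node metrics tagged with", base_tags)
print("node.info tagged with", info_tags)
```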
@@ -358,12 +393,12 @@ def _update_sacct_params(self):
         # Update the sacct command with the dynamic SACCT_PARAMS
         self.sacct_cmd = self.get_slurm_command('sacct', sacct_params)

-    def _process_sinfo_aiot_state(self, cpus_state, namespace, tags):
+    def _process_sinfo_aiot_state(self, aiot_state, namespace, tags):
         # "0/2/0/2"
         try:
-            allocated, idle, other, total = cpus_state.split('/')
+            allocated, idle, other, total = aiot_state.split('/')
         except ValueError as e:
-            self.log.debug("Invalid CPU state '%s'. Skipping. Error: %s", cpus_state, e)
+            self.log.debug("Invalid CPU state '%s'. Skipping. Error: %s", aiot_state, e)
             return
         if namespace == "partition":
             self.gauge(f'{namespace}.node.allocated', allocated, tags)
@@ -383,21 +418,19 @@ def _process_sinfo_gpu(self, gres, gres_used, namespace, tags):
         used_gpu_count = None

         try:
-            # gpu:tesla:4(IDX:0-3) -> ["gpu","tesla","4(IDX","0-3)"]
-            gres_used_parts = gres_used.split(':')
-            # gpu:tesla:4 -> ["gpu","tesla","4"]
+            # Always parse total GPU info
             gres_total_parts = gres.split(':')
-
-            # Ensure gres_used_parts has the correct format for GPU usage
-            if len(gres_used_parts) == 4 and gres_used_parts[0] == "gpu":
-                _, gpu_type, used_gpu_count_part, used_gpu_used_idx_part = gres_used_parts
-                used_gpu_count = int(used_gpu_count_part.split('(')[0])
-                used_gpu_used_idx = used_gpu_used_idx_part.rstrip(')')
-
-            # Ensure gres_total_parts has the correct format for total GPUs
             if len(gres_total_parts) == 3 and gres_total_parts[0] == "gpu":
-                _, _, total_gpu_part = gres_total_parts
+                _, gpu_type, total_gpu_part = gres_total_parts
                 total_gpu = int(total_gpu_part)
+
+            # Only parse used GPU info if gres_used is not None
+            if gres_used is not None:
+                gres_used_parts = gres_used.split(':')
+                if len(gres_used_parts) == 4 and gres_used_parts[0] == "gpu":
+                    _, _, used_gpu_count_part, used_gpu_used_idx = gres_used_parts
+                    used_gpu_count = int(used_gpu_count_part.split('(')[0])
+                    used_gpu_used_idx = used_gpu_used_idx.rstrip(')')
         except (ValueError, IndexError) as e:
             self.log.debug(
                 "Invalid GPU data: gres:'%s', gres_used:'%s'. Skipping GPU metric submission. Error: %s",
@@ -406,15 +439,15 @@ def _process_sinfo_gpu(self, gres, gres_used, namespace, tags):
                 e,
             )

-        gpu_tags = [f"slurm_partition_gpu_type:{gpu_type}", f"slurm_partition_gpu_used_idx:{used_gpu_used_idx}"]
-
+        gpu_tags = [f"slurm_{namespace}_gpu_type:{gpu_type}"]
+        gpu_info_tags = [f"slurm_{namespace}_gpu_used_idx:{used_gpu_used_idx}"]
         _tags = tags + gpu_tags
         if total_gpu is not None:
             self.gauge(f'{namespace}.gpu_total', total_gpu, _tags)
-        if used_gpu_count is not None:
+        if used_gpu_count is not None and gres_used is not None:
             self.gauge(f'{namespace}.gpu_used', used_gpu_count, _tags)

-        return gpu_tags
+        return gpu_tags, gpu_info_tags

     def _process_tags(self, data, map, tags):
         for tag_info in map:
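
To summarize the reshaped GPU handling, a hedged standalone sketch (the split_gpu helper and the sample GRES strings are hypothetical; the real _process_sinfo_gpu also logs failures and submits the gauges): the total is always parsed from gres, the used count and index only when gres_used is provided, and the used-index tag now travels in a separate info-only list.

```python
# Hypothetical re-implementation of the parsing shape used by _process_sinfo_gpu.
def split_gpu(gres, gres_used, namespace):
    gpu_type = total_gpu = used_count = used_idx = None

    total_parts = gres.split(":")                 # e.g. "gpu:tesla:4"
    if len(total_parts) == 3 and total_parts[0] == "gpu":
        _, gpu_type, total_part = total_parts
        total_gpu = int(total_part)

    if gres_used is not None:                     # e.g. "gpu:tesla:2(IDX:0-1)"
        used_parts = gres_used.split(":")
        if len(used_parts) == 4 and used_parts[0] == "gpu":
            _, _, count_part, idx_part = used_parts
            used_count = int(count_part.split("(")[0])
            used_idx = idx_part.rstrip(")")

    gpu_tags = [f"slurm_{namespace}_gpu_type:{gpu_type}"]           # on gpu_total / gpu_used
    gpu_info_tags = [f"slurm_{namespace}_gpu_used_idx:{used_idx}"]  # only on the *.info gauge
    return total_gpu, used_count, gpu_tags, gpu_info_tags

print(split_gpu("gpu:tesla:4", "gpu:tesla:2(IDX:0-1)", "node"))
print(split_gpu("gpu:tesla:4", None, "partition"))  # aggregate partition pass: totals only
```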

slurm/datadog_checks/slurm/constants.py

Lines changed: 20 additions & 11 deletions
@@ -3,11 +3,16 @@
 # Licensed under a 3-clause BSD style license (see LICENSE)
 SINFO_PARTITION_PARAMS = [
     "-ahO",
-    "Partition:|,NodeList:|,CPUs:|,Available:|,Memory:|,Cluster:|,NodeAIOT:|,StateLong:|,Nodes:",
+    "Partition:|,Cluster:|,NodeAIOT:",
 ]
-SINFO_NODE_PARAMS = ["-haNO", "PartitionName:|,Available:|,NodeList:|,CPUsState:|,Memory:|,Cluster:"]
+SINFO_PARTITION_INFO_PARAMS = [
+    "-haO",
+    "Partition:|,Cluster:|,NodeList:|,Available:|,CPUs:|,Memory:|,StateLong:|,Nodes:",
+]
+SINFO_NODE_PARAMS = ["-haNO", "Partition:|,Available:|,NodeList:|,CPUsState:|,Memory:|,Cluster:"]
 SINFO_ADDITIONAL_NODE_PARAMS = "|,CPUsLoad:|,FreeMem:|,Disk:|,StateLong:|,Reason:|,Features_act:|,Threads:|,AllocMem:"
-GPU_PARAMS = "|,Gres:|,GresUsed:"
+GPU_TOTAL = "|,Gres:"
+GPU_PARAMS = GPU_TOTAL + "|,GresUsed:"
 SQUEUE_PARAMS = ["-aho", "%A|%u|%j|%T|%N|%C|%R|%m|%P"]
 SSHARE_PARAMS = ["-alnPU"]
 SACCT_PARAMS = [
@@ -20,26 +25,30 @@
 PARTITION_MAP = {
     "tags": [
         {"name": "slurm_partition_name", "index": 0},
-        {"name": "slurm_partition_node_list", "index": 1},
-        {"name": "slurm_partition_cpus_assigned", "index": 2},
+        {"name": "slurm_cluster_name", "index": 1},
+    ],
+    "info_tags": [
+        {"name": "slurm_partition_node_list", "index": 2},
         {"name": "slurm_partition_availability", "index": 3},
-        {"name": "slurm_partition_memory_assigned", "index": 4},
-        {"name": "slurm_cluster_name", "index": 5},
-        {"name": "slurm_partition_state", "index": 7},
+        {"name": "slurm_partition_cpus_assigned", "index": 4},
+        {"name": "slurm_partition_memory_assigned", "index": 5},
+        {"name": "slurm_partition_state", "index": 6},
     ],
     "metrics": [
-        {"name": "partition.nodes.count", "index": 8},
+        {"name": "partition.nodes.count", "index": 7},
     ],
 }

 NODE_MAP = {
     "tags": [
         {"name": "slurm_partition_name", "index": 0},
-        {"name": "slurm_node_availability", "index": 1},
         {"name": "slurm_node_name", "index": 2},
-        {"name": "slurm_node_memory", "index": 4},
         {"name": "slurm_cluster_name", "index": 5},
     ],
+    "info_tags": [
+        {"name": "slurm_node_availability", "index": 1},
+        {"name": "slurm_node_memory", "index": 4},
+    ],
     "extended_tags": [
         {"name": "slurm_node_state", "index": 9},
         {"name": "slurm_node_state_reason", "index": 10},
