13
13
from .config_models import ConfigMixin
14
14
from .constants import (
15
15
GPU_PARAMS ,
16
+ GPU_TOTAL ,
16
17
NODE_MAP ,
17
18
PARTITION_MAP ,
18
19
SACCT_MAP ,
22
23
SDIAG_MAP ,
23
24
SINFO_ADDITIONAL_NODE_PARAMS ,
24
25
SINFO_NODE_PARAMS ,
26
+ SINFO_PARTITION_INFO_PARAMS ,
25
27
SINFO_PARTITION_PARAMS ,
26
28
SINFO_STATE_CODE ,
27
29
SQUEUE_MAP ,
@@ -78,6 +80,7 @@ def __init__(self, name, init_config, instances):
78
80
# CMD compilation
79
81
if self .collect_sinfo_stats :
80
82
self .sinfo_partition_cmd = self .get_slurm_command ('sinfo' , SINFO_PARTITION_PARAMS )
83
+ self .sinfo_partition_info_cmd = self .get_slurm_command ('sinfo' , SINFO_PARTITION_INFO_PARAMS )
81
84
self .sinfo_collection_level = self .instance .get ('sinfo_collection_level' , 1 )
82
85
if self .sinfo_collection_level > 1 :
83
86
self .sinfo_node_cmd = self .get_slurm_command ('sinfo' , SINFO_NODE_PARAMS )
@@ -86,7 +89,8 @@ def __init__(self, name, init_config, instances):
86
89
if self .gpu_stats :
87
90
self .sinfo_node_cmd [- 1 ] += GPU_PARAMS
88
91
if self .gpu_stats :
89
- self .sinfo_partition_cmd [- 1 ] += GPU_PARAMS
92
+ self .sinfo_partition_cmd [- 1 ] += GPU_TOTAL
93
+ self .sinfo_partition_info_cmd [- 1 ] += GPU_PARAMS
90
94
91
95
if self .collect_squeue_stats :
92
96
self .squeue_cmd = self .get_slurm_command ('squeue' , SQUEUE_PARAMS )
@@ -124,6 +128,7 @@ def check(self, _):
124
128
125
129
if self .collect_sinfo_stats :
126
130
commands .append (('sinfo' , self .sinfo_partition_cmd , self .process_sinfo_partition ))
131
+ commands .append (('sinfo' , self .sinfo_partition_info_cmd , self .process_sinfo_partition_info ))
127
132
if self .sinfo_collection_level > 1 :
128
133
commands .append (('snode' , self .sinfo_node_cmd , self .process_sinfo_node ))
129
134
@@ -159,7 +164,7 @@ def check(self, _):
159
164
self .log .debug ("No output from %s" , name )
160
165
161
166
def process_sinfo_partition (self , output ):
162
- # normal*|c1|1|up|1000| N/A|1/0 /0/1|allocated|1
167
+ # test-queue*| N/A|1/2 /0/3
163
168
lines = output .strip ().split ('\n ' )
164
169
165
170
if self .debug_sinfo_stats :
@@ -174,12 +179,37 @@ def process_sinfo_partition(self, output):
174
179
tags = self ._process_tags (partition_data , PARTITION_MAP ["tags" ], tags )
175
180
176
181
if self .gpu_stats :
177
- gpu_tags = self ._process_sinfo_gpu (partition_data [- 2 ], partition_data [- 1 ], "partition" , tags )
182
+ gpu_tag , _ = self ._process_sinfo_gpu (partition_data [- 1 ], None , "partition" , tags )
183
+ tags .extend (gpu_tag )
184
+
185
+ self ._process_sinfo_aiot_state (partition_data [2 ], "partition" , tags )
186
+
187
+ def process_sinfo_partition_info (self , output ):
188
+ # test-queue*|N/A|c[1-2]|up|1|972|allocated|10
189
+ lines = output .strip ().split ('\n ' )
190
+
191
+ if self .debug_sinfo_stats :
192
+ self .log .debug ("Processing sinfo partition line: %s" , lines )
193
+
194
+ for line in lines :
195
+ partition_data = line .split ('|' )
196
+
197
+ tags = []
198
+ tags .extend (self .tags )
199
+
200
+ tags = self ._process_tags (partition_data , PARTITION_MAP ["tags" ], tags )
201
+
202
+ if self .gpu_stats :
203
+ gpu_tags , gpu_info_tags = self ._process_sinfo_gpu (
204
+ partition_data [- 2 ], partition_data [- 1 ], "partition" , tags
205
+ )
178
206
tags .extend (gpu_tags )
179
207
180
- self ._process_metrics (partition_data , PARTITION_MAP , tags )
208
+ tags = self ._process_tags (partition_data , PARTITION_MAP ["info_tags" ], tags )
209
+ if self .gpu_stats :
210
+ tags .extend (gpu_info_tags )
181
211
182
- self ._process_sinfo_aiot_state (partition_data [ 6 ], "partition" , tags )
212
+ self ._process_metrics (partition_data , PARTITION_MAP , tags )
183
213
self .gauge ('partition.info' , 1 , tags )
184
214
185
215
self .gauge ('sinfo.partition.enabled' , 1 )
@@ -200,16 +230,21 @@ def process_sinfo_node(self, output):
200
230
201
231
tags = self ._process_tags (node_data , NODE_MAP ["tags" ], tags )
202
232
203
- if self .sinfo_collection_level > 2 :
204
- tags = self ._process_tags (node_data , NODE_MAP ["extended_tags" ], tags )
205
-
206
233
if self .gpu_stats :
207
- gpu_tags = self ._process_sinfo_gpu (node_data [- 2 ], node_data [- 1 ], "node" , tags )
234
+ gpu_tags , gpu_info_tags = self ._process_sinfo_gpu (node_data [- 2 ], node_data [- 1 ], "node" , tags )
208
235
tags .extend (gpu_tags )
209
236
210
- # Submit metrics
211
237
self ._process_metrics (node_data , NODE_MAP , tags )
238
+
212
239
self ._process_sinfo_aiot_state (node_data [3 ], 'node' , tags )
240
+
241
+ tags = self ._process_tags (node_data , NODE_MAP ["info_tags" ], tags )
242
+ if self .sinfo_collection_level > 2 :
243
+ tags = self ._process_tags (node_data , NODE_MAP ["extended_tags" ], tags )
244
+
245
+ # Submit metrics
246
+ if self .gpu_stats :
247
+ tags .extend (gpu_info_tags )
213
248
self .gauge ('node.info' , 1 , tags = tags )
214
249
215
250
self .gauge ('sinfo.node.enabled' , 1 )
@@ -358,12 +393,12 @@ def _update_sacct_params(self):
358
393
# Update the sacct command with the dynamic SACCT_PARAMS
359
394
self .sacct_cmd = self .get_slurm_command ('sacct' , sacct_params )
360
395
361
- def _process_sinfo_aiot_state (self , cpus_state , namespace , tags ):
396
+ def _process_sinfo_aiot_state (self , aiot_state , namespace , tags ):
362
397
# "0/2/0/2"
363
398
try :
364
- allocated , idle , other , total = cpus_state .split ('/' )
399
+ allocated , idle , other , total = aiot_state .split ('/' )
365
400
except ValueError as e :
366
- self .log .debug ("Invalid CPU state '%s'. Skipping. Error: %s" , cpus_state , e )
401
+ self .log .debug ("Invalid CPU state '%s'. Skipping. Error: %s" , aiot_state , e )
367
402
return
368
403
if namespace == "partition" :
369
404
self .gauge (f'{ namespace } .node.allocated' , allocated , tags )
@@ -383,21 +418,19 @@ def _process_sinfo_gpu(self, gres, gres_used, namespace, tags):
383
418
used_gpu_count = None
384
419
385
420
try :
386
- # gpu:tesla:4(IDX:0-3) -> ["gpu","tesla","4(IDX","0-3)"]
387
- gres_used_parts = gres_used .split (':' )
388
- # gpu:tesla:4 -> ["gpu","tesla","4"]
421
+ # Always parse total GPU info
389
422
gres_total_parts = gres .split (':' )
390
-
391
- # Ensure gres_used_parts has the correct format for GPU usage
392
- if len (gres_used_parts ) == 4 and gres_used_parts [0 ] == "gpu" :
393
- _ , gpu_type , used_gpu_count_part , used_gpu_used_idx_part = gres_used_parts
394
- used_gpu_count = int (used_gpu_count_part .split ('(' )[0 ])
395
- used_gpu_used_idx = used_gpu_used_idx_part .rstrip (')' )
396
-
397
- # Ensure gres_total_parts has the correct format for total GPUs
398
423
if len (gres_total_parts ) == 3 and gres_total_parts [0 ] == "gpu" :
399
- _ , _ , total_gpu_part = gres_total_parts
424
+ _ , gpu_type , total_gpu_part = gres_total_parts
400
425
total_gpu = int (total_gpu_part )
426
+
427
+ # Only parse used GPU info if gres_used is not None
428
+ if gres_used is not None :
429
+ gres_used_parts = gres_used .split (':' )
430
+ if len (gres_used_parts ) == 4 and gres_used_parts [0 ] == "gpu" :
431
+ _ , _ , used_gpu_count_part , used_gpu_used_idx = gres_used_parts
432
+ used_gpu_count = int (used_gpu_count_part .split ('(' )[0 ])
433
+ used_gpu_used_idx = used_gpu_used_idx .rstrip (')' )
401
434
except (ValueError , IndexError ) as e :
402
435
self .log .debug (
403
436
"Invalid GPU data: gres:'%s', gres_used:'%s'. Skipping GPU metric submission. Error: %s" ,
@@ -406,15 +439,15 @@ def _process_sinfo_gpu(self, gres, gres_used, namespace, tags):
406
439
e ,
407
440
)
408
441
409
- gpu_tags = [f"slurm_partition_gpu_type: { gpu_type } " , f"slurm_partition_gpu_used_idx: { used_gpu_used_idx } " ]
410
-
442
+ gpu_tags = [f"slurm_ { namespace } _gpu_type: { gpu_type } " ]
443
+ gpu_info_tags = [ f"slurm_ { namespace } _gpu_used_idx: { used_gpu_used_idx } " ]
411
444
_tags = tags + gpu_tags
412
445
if total_gpu is not None :
413
446
self .gauge (f'{ namespace } .gpu_total' , total_gpu , _tags )
414
- if used_gpu_count is not None :
447
+ if used_gpu_count is not None and gres_used is not None :
415
448
self .gauge (f'{ namespace } .gpu_used' , used_gpu_count , _tags )
416
449
417
- return gpu_tags
450
+ return gpu_tags , gpu_info_tags
418
451
419
452
def _process_tags (self , data , map , tags ):
420
453
for tag_info in map :
0 commit comments