Skip to content

Commit 45ae8f5

Browse files
authored
Add slurm_partition_name to job metrics (#20170)
* Add slurm_partition_name to job metrics
* Add changelog entry
1 parent a6cf7f3 commit 45ae8f5

File tree

5 files changed

+34
-15
lines changed

5 files changed

+34
-15
lines changed

slurm/changelog.d/20170.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add the `slurm_partition_name` tag to job metrics

slurm/datadog_checks/slurm/check.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,8 @@ def process_sinfo_node(self, output):
215215
self.gauge('sinfo.node.enabled', 1)
216216

217217
def process_squeue(self, output):
218-
# JOBID | USER | NAME | STATE | NODELIST | CPUS | NODELIST(REASON) | MIN_MEMORY # noqa: E501
219-
# 31 | root | wrap | PENDING | | 1 | (Resources) | 500M # noqa: E501
218+
# JOBID | USER | NAME | STATE | NODELIST | CPUS | NODELIST(REASON) | MIN_MEMORY | PARTITION # noqa: E501
219+
# 31 | root | wrap | PENDING | | 1 | (Resources) | 500M | foo # noqa: E501
220220
lines = output.strip().split('\n')
221221

222222
if self.debug_squeue_stats:

slurm/datadog_checks/slurm/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
SINFO_NODE_PARAMS = ["-haNO", "PartitionName:|,Available:|,NodeList:|,CPUsState:|,Memory:|,Cluster:"]
99
SINFO_ADDITIONAL_NODE_PARAMS = "|,CPUsLoad:|,FreeMem:|,Disk:|,StateLong:|,Reason:|,features_act:|,Threads:"
1010
GPU_PARAMS = "|,Gres:|,GresUsed:"
11-
SQUEUE_PARAMS = ["-aho", "%A|%u|%j|%T|%N|%C|%R|%m"]
11+
SQUEUE_PARAMS = ["-aho", "%A|%u|%j|%T|%N|%C|%R|%m|%P"]
1212
SSHARE_PARAMS = ["-alnPU"]
1313
SACCT_PARAMS = [
1414
"-anpo",
@@ -75,13 +75,15 @@
7575
{"name": "slurm_job_cpus", "index": 5},
7676
{"name": "slurm_job_reason", "index": 6},
7777
{"name": "slurm_job_tres_per_node", "index": 7},
78+
{"name": "slurm_partition_name", "index": 8},
7879
],
7980
}
8081

8182
SACCT_MAP = {
8283
"tags": [
8384
{"name": "slurm_job_name", "index": 1},
8485
{"name": "slurm_job_partition", "index": 2},
86+
{"name": "slurm_partition_name", "index": 2},
8587
{"name": "slurm_job_account", "index": 3},
8688
{"name": "slurm_job_cpus", "index": 4},
8789
{"name": "slurm_job_tres_per_node", "index": 5},

slurm/tests/common.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -817,8 +817,8 @@ def mock_output(filename):
817817
'value': 1,
818818
'tags': [],
819819
},
820-
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY # noqa: E501
821-
# 42 |root |wrap |RUNNING |c1 | 1 |c1 |300M # noqa: E501
820+
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY | PARTITION # noqa: E501
821+
# 42 |root |wrap |RUNNING |c1 | 1 |c1 |300M | foo # noqa: E501
822822
{
823823
'name': 'slurm.squeue.job.info',
824824
'value': 1,
@@ -831,10 +831,11 @@ def mock_output(filename):
831831
'slurm_job_state:RUNNING',
832832
'slurm_job_tres_per_node:300M',
833833
'slurm_job_user:root',
834+
'slurm_partition_name:foo',
834835
],
835836
},
836-
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY # noqa: E501
837-
# 44 |root |wrap |RUNNING |c2 | 1 |c2 |400M # noqa: E501
837+
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY | PARTITION # noqa: E501
838+
# 44 |root |wrap |RUNNING |c2 | 1 |c2 |400M | foo # noqa: E501
838839
{
839840
'name': 'slurm.squeue.job.info',
840841
'value': 1,
@@ -847,10 +848,11 @@ def mock_output(filename):
847848
'slurm_job_state:RUNNING',
848849
'slurm_job_tres_per_node:400M',
849850
'slurm_job_user:root',
851+
'slurm_partition_name:foo',
850852
],
851853
},
852-
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY # noqa: E501
853-
# 45 |root |test.py |PENDING | | 1 |(Resources) |100M # noqa: E501
854+
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY | PARTITION # noqa: E501
855+
# 45 |root |test.py |PENDING | | 1 |(Resources) |100M | foo # noqa: E501
854856
{
855857
'name': 'slurm.squeue.job.info',
856858
'value': 1,
@@ -863,10 +865,11 @@ def mock_output(filename):
863865
'slurm_job_state:PENDING',
864866
'slurm_job_tres_per_node:100M',
865867
'slurm_job_user:root',
868+
'slurm_partition_name:foo',
866869
],
867870
},
868-
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY # noqa: E501
869-
# 46 |root |test.py |PENDING | | 1 |(Priority) |200M # noqa: E501
871+
# JOBID |USER |NAME |STATE |NODELIST |CPUS |NODELIST(REASON) |MIN_MEMORY | PARTITION # noqa: E501
872+
# 46 |root |test.py |PENDING | | 1 |(Priority) |200M | foo # noqa: E501
870873
{
871874
'name': 'slurm.squeue.job.info',
872875
'value': 1,
@@ -879,6 +882,7 @@ def mock_output(filename):
879882
'slurm_job_state:PENDING',
880883
'slurm_job_tres_per_node:200M',
881884
'slurm_job_user:root',
885+
'slurm_partition_name:foo',
882886
],
883887
},
884888
]
@@ -905,6 +909,7 @@ def mock_output(filename):
905909
'slurm_job_name:wrap',
906910
'slurm_job_node_list:c1',
907911
'slurm_job_partition:normal',
912+
'slurm_partition_name:normal',
908913
'slurm_job_state:COMPLETED',
909914
'slurm_job_tres_per_node:billing=1,cpu=1,mem=500M,node=1',
910915
],
@@ -921,6 +926,7 @@ def mock_output(filename):
921926
'slurm_job_name:wrap',
922927
'slurm_job_node_list:c1',
923928
'slurm_job_partition:normal',
929+
'slurm_partition_name:normal',
924930
'slurm_job_state:COMPLETED',
925931
'slurm_job_tres_per_node:billing=1,cpu=1,mem=500M,node=1',
926932
],
@@ -937,6 +943,7 @@ def mock_output(filename):
937943
'slurm_job_name:wrap',
938944
'slurm_job_node_list:c1',
939945
'slurm_job_partition:normal',
946+
'slurm_partition_name:normal',
940947
'slurm_job_state:COMPLETED',
941948
'slurm_job_tres_per_node:billing=1,cpu=1,mem=500M,node=1',
942949
],
@@ -953,6 +960,7 @@ def mock_output(filename):
953960
'slurm_job_name:wrap',
954961
'slurm_job_node_list:c1',
955962
'slurm_job_partition:normal',
963+
'slurm_partition_name:normal',
956964
'slurm_job_state:COMPLETED',
957965
'slurm_job_tres_per_node:billing=1,cpu=1,mem=500M,node=1',
958966
],
@@ -969,6 +977,7 @@ def mock_output(filename):
969977
'slurm_job_name:wrap',
970978
'slurm_job_node_list:c1',
971979
'slurm_job_partition:normal',
980+
'slurm_partition_name:normal',
972981
'slurm_job_state:COMPLETED',
973982
'slurm_job_tres_per_node:billing=1,cpu=1,mem=500M,node=1',
974983
],
@@ -985,6 +994,7 @@ def mock_output(filename):
985994
'slurm_job_name:wrap',
986995
'slurm_job_node_list:c1',
987996
'slurm_job_partition:normal',
997+
'slurm_partition_name:normal',
988998
'slurm_job_state:COMPLETED',
989999
'slurm_job_tres_per_node:billing=1,cpu=1,mem=500M,node=1',
9901000
],
@@ -1004,6 +1014,7 @@ def mock_output(filename):
10041014
'slurm_job_name:batch',
10051015
'slurm_job_node_list:c1',
10061016
'slurm_job_partition:null',
1017+
'slurm_partition_name:null',
10071018
'slurm_job_state:COMPLETED',
10081019
'slurm_job_tres_per_node:cpu=1,mem=500M,node=1',
10091020
],
@@ -1021,6 +1032,7 @@ def mock_output(filename):
10211032
'slurm_job_name:batch',
10221033
'slurm_job_node_list:c1',
10231034
'slurm_job_partition:null',
1035+
'slurm_partition_name:null',
10241036
'slurm_job_state:COMPLETED',
10251037
'slurm_job_tres_per_node:cpu=1,mem=500M,node=1',
10261038
],
@@ -1038,6 +1050,7 @@ def mock_output(filename):
10381050
'slurm_job_name:batch',
10391051
'slurm_job_node_list:c1',
10401052
'slurm_job_partition:null',
1053+
'slurm_partition_name:null',
10411054
'slurm_job_state:COMPLETED',
10421055
'slurm_job_tres_per_node:cpu=1,mem=500M,node=1',
10431056
],
@@ -1055,6 +1068,7 @@ def mock_output(filename):
10551068
'slurm_job_name:batch',
10561069
'slurm_job_node_list:c1',
10571070
'slurm_job_partition:null',
1071+
'slurm_partition_name:null',
10581072
'slurm_job_state:COMPLETED',
10591073
'slurm_job_tres_per_node:cpu=1,mem=500M,node=1',
10601074
],
@@ -1072,6 +1086,7 @@ def mock_output(filename):
10721086
'slurm_job_name:batch',
10731087
'slurm_job_node_list:c1',
10741088
'slurm_job_partition:null',
1089+
'slurm_partition_name:null',
10751090
'slurm_job_state:COMPLETED',
10761091
'slurm_job_tres_per_node:cpu=1,mem=500M,node=1',
10771092
],
@@ -1089,6 +1104,7 @@ def mock_output(filename):
10891104
'slurm_job_name:batch',
10901105
'slurm_job_node_list:c1',
10911106
'slurm_job_partition:null',
1107+
'slurm_partition_name:null',
10921108
'slurm_job_state:COMPLETED',
10931109
'slurm_job_tres_per_node:cpu=1,mem=500M,node=1',
10941110
],

slurm/tests/fixtures/squeue.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
45|root|test.py|PENDING||1|(Resources)|100M
2-
46|root|test.py|PENDING||1|(Priority)|200M
3-
42|root|wrap|RUNNING|c1|1|c1|300M
4-
44|root|wrap|RUNNING|c2|1|c2|400M
1+
45|root|test.py|PENDING||1|(Resources)|100M|foo
2+
46|root|test.py|PENDING||1|(Priority)|200M|foo
3+
42|root|wrap|RUNNING|c1|1|c1|300M|foo
4+
44|root|wrap|RUNNING|c2|1|c2|400M|foo

0 commit comments

Comments
 (0)