From 4be7653e04a9e17146deb779a8c029ba9a1b3edf Mon Sep 17 00:00:00 2001 From: abhinavDhulipala <46908860+abhinavDhulipala@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:22:11 -0800 Subject: [PATCH 1/4] continue on fallback parse err (#57) --- exporter/jobs.go | 4 +++- exporter/nodes.go | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/exporter/jobs.go b/exporter/jobs.go index a2c6b06..c9fe6c3 100644 --- a/exporter/jobs.go +++ b/exporter/jobs.go @@ -148,7 +148,9 @@ func parseCliFallback(squeue []byte, errorCounter prometheus.Counter) ([]JobMetr } mem, err := MemToFloat(metric.Mem) if err != nil { - return nil, err + slog.Error(fmt.Sprintf("squeue fallback parse error: failed on line %d `%s` with err `%q`", i, line, err)) + errorCounter.Inc() + continue } openapiJobMetric := JobMetric{ Account: metric.Account, diff --git a/exporter/nodes.go b/exporter/nodes.go index f2b66e2..8eab368 100644 --- a/exporter/nodes.go +++ b/exporter/nodes.go @@ -130,7 +130,8 @@ func (cmf *NodeCliFallbackFetcher) fetch() ([]NodeMetric, error) { } if err := json.Unmarshal(line, &metric); err != nil { cmf.errorCounter.Inc() - return nil, fmt.Errorf("sinfo failed to parse line %d: %s, got %q", i, line, err) + slog.Error(fmt.Sprintf("sinfo failed to parse line %d: %s, got %q", i, line, err)) + continue } // convert mem units from MB to Bytes metric.RealMemory *= 1e6 From db67c5357e1d40932bf24c8cb46e4834268a5520 Mon Sep 17 00:00:00 2001 From: abhinavDhulipala <46908860+abhinavDhulipala@users.noreply.github.com> Date: Fri, 2 Feb 2024 00:24:55 -0800 Subject: [PATCH 2/4] fwd release (#58) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index db6f086..c74ad15 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ You can also install the exporter directly with `go install github.com/rivosinc/ ```bash # example installation -$ go install github.com/rivosinc/prometheus-slurm-exporter@v1.0.1 +$ go install 
github.com/rivosinc/prometheus-slurm-exporter@v1.1.1 # or if you like living on the edge $ go install github.com/rivosinc/prometheus-slurm-exporter@latest # if not already added, ensure From e4ac97025d4f4e27e3c676c574ca83e028d4633f Mon Sep 17 00:00:00 2001 From: abhinavDhulipala Date: Sat, 20 Jan 2024 13:54:44 -0800 Subject: [PATCH 3/4] fixtures and partial response struct --- exporter/fixtures/sinfo_dataparser.json | 120 ++++++++++++++++++ .../fixtures/sinfo_dataparser.json.license | 3 + exporter/nodes.go | 32 +++++ 3 files changed, 155 insertions(+) create mode 100644 exporter/fixtures/sinfo_dataparser.json create mode 100644 exporter/fixtures/sinfo_dataparser.json.license diff --git a/exporter/fixtures/sinfo_dataparser.json b/exporter/fixtures/sinfo_dataparser.json new file mode 100644 index 0000000..fc63caa --- /dev/null +++ b/exporter/fixtures/sinfo_dataparser.json @@ -0,0 +1,120 @@ +{ + "meta": { + "plugins": { + "data_parser": "data_parser/v0.0.39", + "accounting_storage": "accounting_storage/slurmdbd" + }, + "command": ["sinfo", "--json"], + "Slurm": { + "version": { "major": 23, "micro": 4, "minor": 2 }, + "release": "23.02.4" + } + }, + "sinfo": [ + { + "port": 6818, + "node": { "state": ["ALLOCATED", "DRAIN"] }, + "nodes": { + "allocated": 10, + "idle": 0, + "other": 0, + "total": 10, + "hostnames": [], + "addresses": [], + "nodes": [ + "cs62", + "cs63", + "cs64", + "cs65", + "cs66", + "cs68", + "cs71", + "cs200", + "cs201", + "cs204" + ] + }, + "cpus": { + "allocated": 114, + "idle": 0, + "other": 526, + "total": 640, + "minimum": 64, + "maximum": 64, + "load": { "minimum": 2, "maximum": 1460 }, + "per_node": { "max": { "set": false, "infinite": true, "number": 0 } } + }, + "sockets": { "minimum": 2, "maximum": 2 }, + "cores": { "minimum": 16, "maximum": 16 }, + "threads": { "minimum": 2, "maximum": 2 }, + "disk": { "minimum": 0, "maximum": 0 }, + "memory": { + "minimum": 770000, + "maximum": 770000, + "free": { + "minimum": { "set": true, "infinite": 
false, "number": 428227 }, + "maximum": { "set": true, "infinite": false, "number": 678817 } + }, + "allocated": 20480 + }, + "weight": { "minimum": 1, "maximum": 1 }, + "features": { + "total": "xeon_6346,mhz_3600", + "active": "xeon_6346,mhz_3600" + }, + "gres": { "total": "", "used": "" }, + "cluster": "", + "comment": "", + "extra": "", + "reason": { + "description": "IT-4461", + "time": 1705613872, + "user": "root" + }, + "reservation": "", + "partition": { + "nodes": { + "allowed_allocation": "", + "configured": "cs[11,20-32,34,36,49-50,53-77,79-90,92-174,178-180,182-184,186-206,208-211,231-298,300-305]", + "total": 243 + }, + "accounts": { "allowed": "", "deny": "" }, + "groups": { "allowed": "" }, + "qos": { "allowed": "", "deny": "", "assigned": "" }, + "alternate": "hw-l", + "tres": { + "billing_weights": "", + "configured": "cpu=17952,mem=271100000M,node=243,billing=17952" + }, + "cluster": "", + "cpus": { "task_binding": 0, "total": 17952 }, + "defaults": { + "memory_per_cpu": -9223372036854771614, + "time": { "set": false, "infinite": false, "number": 0 }, + "job": "" + }, + "grace_time": 0, + "maximums": { + "cpus_per_node": { "set": false, "infinite": true, "number": 0 }, + "cpus_per_socket": { "set": false, "infinite": true, "number": 0 }, + "memory_per_cpu": 0, + "nodes": { "set": false, "infinite": true, "number": 0 }, + "shares": 1, + "time": { "set": true, "infinite": false, "number": 20160 }, + "over_time_limit": { "set": false, "infinite": false, "number": 0 } + }, + "minimums": { "nodes": 0 }, + "name": "hw", + "node_sets": "", + "priority": { "job_factor": 1, "tier": 1 }, + "timeouts": { + "resume": { "set": false, "infinite": false, "number": 0 }, + "suspend": { "set": false, "infinite": false, "number": 0 } + }, + "suspend_time": { "set": false, "infinite": false, "number": 0 } + } + } + ], + "warnings": [], + "errors": [] +} diff --git a/exporter/fixtures/sinfo_dataparser.json.license b/exporter/fixtures/sinfo_dataparser.json.license new 
file mode 100644 index 0000000..d8f6e93 --- /dev/null +++ b/exporter/fixtures/sinfo_dataparser.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2023 Rivos Inc. + +SPDX-License-Identifier: Apache-2.0 diff --git a/exporter/nodes.go b/exporter/nodes.go index 8eab368..8783e73 100644 --- a/exporter/nodes.go +++ b/exporter/nodes.go @@ -32,6 +32,38 @@ type NodeMetric struct { CpuLoad float64 `json:"cpu_load"` } +type sinfoDataParserResponse struct { + Meta struct { + Plugins map[string]string `json:"plugins"` + } `json:"meta"` + SlurmVersion struct { + Version struct { + Major int `json:"major"` + Micro int `json:"micro"` + Minor int `json:"minor"` + } `json:"version"` + Release string `json:"release"` + } `json:"Slurm"` + Sinfo []struct { + Node struct { + State []string `json:"state"` + } `json:"node"` + Nodes struct { + Allocated int `json:"allocated"` + Idle int `json:"idle"` + Other int `json:"other"` + Total int `json:"total"` + Nodes []string `json:"nodes"` + } `json:"nodes"` + Cpus struct { + Allocated int `json:"allocated"` + Idle int `json:"idle"` + Other int `json:"other"` + Total int `json:"total"` + } + } `json:"sinfo"` +} + type sinfoResponse struct { Meta struct { SlurmVersion struct { From 8ed95d17c0e89f56076be7bf01d925b118382099 Mon Sep 17 00:00:00 2001 From: abhinavDhulipala Date: Sun, 10 Mar 2024 22:47:13 -0700 Subject: [PATCH 4/4] -N option --- exporter/fixtures/sinfo_dataparser.json | 13 +---- exporter/nodes.go | 68 +++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/exporter/fixtures/sinfo_dataparser.json b/exporter/fixtures/sinfo_dataparser.json index fc63caa..02269ad 100644 --- a/exporter/fixtures/sinfo_dataparser.json +++ b/exporter/fixtures/sinfo_dataparser.json @@ -21,18 +21,7 @@ "total": 10, "hostnames": [], "addresses": [], - "nodes": [ - "cs62", - "cs63", - "cs64", - "cs65", - "cs66", - "cs68", - "cs71", - "cs200", - "cs201", - "cs204" - ] + "nodes": ["cs62"] }, "cpus": { "allocated": 114, diff --git 
a/exporter/nodes.go b/exporter/nodes.go
index 8783e73..d7d5fb7 100644
--- a/exporter/nodes.go
+++ b/exporter/nodes.go
@@ -35,15 +35,16 @@ type NodeMetric struct {
 type sinfoDataParserResponse struct {
 	Meta struct {
 		Plugins map[string]string `json:"plugins"`
-	} `json:"meta"`
-	SlurmVersion struct {
-		Version struct {
-			Major int `json:"major"`
-			Micro int `json:"micro"`
-			Minor int `json:"minor"`
-		} `json:"version"`
-		Release string `json:"release"`
-	} `json:"Slurm"`
+		// sinfo --json nests the "Slurm" version object inside "meta"
+		SlurmVersion struct {
+			Version struct {
+				Major int `json:"major"`
+				Micro int `json:"micro"`
+				Minor int `json:"minor"`
+			} `json:"version"`
+			Release string `json:"release"`
+		} `json:"Slurm"`
+	} `json:"meta"`
 	Sinfo []struct {
 		Node struct {
 			State []string `json:"state"`
@@ -61,9 +62,87 @@ type sinfoDataParserResponse struct {
 			Other     int `json:"other"`
 			Total     int `json:"total"`
 		}
+		Memory struct {
+			Minimum   int `json:"minimum"`
+			Maximum   int `json:"maximum"`
+			Allocated int `json:"allocated"`
+			Free      struct {
+				Minimum struct {
+					Set    bool `json:"set"`
+					Number int  `json:"number"`
+				} `json:"minimum"`
+				Maximum struct {
+					Set    bool `json:"set"`
+					Number int  `json:"number"`
+				} `json:"maximum"`
+			} `json:"free"`
+		} `json:"memory"`
+		Partition struct {
+			Name      string `json:"name"`
+			Alternate string `json:"alternate"`
+		} `json:"partition"`
 	} `json:"sinfo"`
 }
 
+// DataParserJsonFetcher produces NodeMetrics from the bytes its scraper
+// returns, decoded as a sinfoDataParserResponse (the `sinfo -N --json` layout).
+// NOTE(review): errorCounter and cache are not touched by fetch; confirm they
+// are used elsewhere.
+type DataParserJsonFetcher struct {
+	scraper      SlurmByteScraper
+	errorCounter prometheus.Counter
+	cache        *AtomicThrottledCache[NodeMetric]
+}
+
+// fetch pulls the raw bytes from the scraper, decodes them and emits one
+// NodeMetric per sinfo entry. Every entry has to describe exactly one node,
+// otherwise the whole fetch fails with an error and no metrics are returned.
+func (dpj *DataParserJsonFetcher) fetch() ([]NodeMetric, error) {
+	sinfo := new(sinfoDataParserResponse)
+	cliJson, err := dpj.scraper.FetchRawBytes()
+	if err != nil {
+		return nil, err
+	}
+	if err := json.Unmarshal(cliJson, sinfo); err != nil {
+		return nil, fmt.Errorf("decoding sinfo json: %w", err)
+	}
+	nodeMetrics := make([]NodeMetric, 0, len(sinfo.Sinfo))
+	for _, entry := range sinfo.Sinfo {
+		nodes := entry.Nodes
+		// validate single node parse; the length check also protects the
+		// nodes.Nodes[0] lookup below from an index-out-of-range panic
+		if nodes.Total != 1 || len(nodes.Nodes) != 1 {
+			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
+		}
+		// both free-memory bounds must be flagged as set before they are used
+		if !entry.Memory.Free.Maximum.Set || !entry.Memory.Free.Minimum.Set {
+			return nil, fmt.Errorf("unable to scrape free mem metrics")
+		}
+		// with a single node per entry, min and max must collapse to one value
+		if entry.Memory.Free.Minimum.Number != entry.Memory.Free.Maximum.Number {
+			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
+		}
+		if entry.Memory.Minimum != entry.Memory.Maximum {
+			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
+		}
+		// NOTE(review): NodeCliFallbackFetcher.fetch scales RealMemory by 1e6
+		// (MB to bytes); no scaling happens here, confirm the expected units.
+		metric := NodeMetric{
+			Hostname:   nodes.Nodes[0],
+			Cpus:       float64(entry.Cpus.Total),
+			RealMemory: float64(entry.Memory.Maximum),
+			FreeMemory: float64(entry.Memory.Free.Maximum.Number),
+			State:      strings.Join(entry.Node.State, "&"),
+			Partitions: []string{entry.Partition.Name},
+		}
+		if alt := entry.Partition.Alternate; alt != "" && !slices.Contains(metric.Partitions, alt) {
+			metric.Partitions = append(metric.Partitions, alt)
+		}
+		nodeMetrics = append(nodeMetrics, metric)
+	}
+	return nodeMetrics, nil
+}
+
 type sinfoResponse struct {
 	Meta struct {
 		SlurmVersion struct {