From a80f8a781a1940789be87fe3d6c4c6b6ac5adb06 Mon Sep 17 00:00:00 2001 From: abhinavDhulipala <46908860+abhinavDhulipala@users.noreply.github.com> Date: Tue, 26 Mar 2024 06:00:17 +0000 Subject: [PATCH] change default sinfo json args --- exporter/fixtures/sinfo_dataparser.json | 322 ++++++++++++++++-------- exporter/main_test.go | 2 +- exporter/nodes.go | 24 +- exporter/server.go | 2 +- 4 files changed, 239 insertions(+), 111 deletions(-) diff --git a/exporter/fixtures/sinfo_dataparser.json b/exporter/fixtures/sinfo_dataparser.json index 02269ad..39bb74a 100644 --- a/exporter/fixtures/sinfo_dataparser.json +++ b/exporter/fixtures/sinfo_dataparser.json @@ -1,109 +1,215 @@ { - "meta": { - "plugins": { - "data_parser": "data_parser/v0.0.39", - "accounting_storage": "accounting_storage/slurmdbd" - }, - "command": ["sinfo", "--json"], - "Slurm": { - "version": { "major": 23, "micro": 4, "minor": 2 }, - "release": "23.02.4" - } - }, - "sinfo": [ - { - "port": 6818, - "node": { "state": ["ALLOCATED", "DRAIN"] }, - "nodes": { - "allocated": 10, - "idle": 0, - "other": 0, - "total": 10, - "hostnames": [], - "addresses": [], - "nodes": ["cs62"] - }, - "cpus": { - "allocated": 114, - "idle": 0, - "other": 526, - "total": 640, - "minimum": 64, - "maximum": 64, - "load": { "minimum": 2, "maximum": 1460 }, - "per_node": { "max": { "set": false, "infinite": true, "number": 0 } } - }, - "sockets": { "minimum": 2, "maximum": 2 }, - "cores": { "minimum": 16, "maximum": 16 }, - "threads": { "minimum": 2, "maximum": 2 }, - "disk": { "minimum": 0, "maximum": 0 }, - "memory": { - "minimum": 770000, - "maximum": 770000, - "free": { - "minimum": { "set": true, "infinite": false, "number": 428227 }, - "maximum": { "set": true, "infinite": false, "number": 678817 } - }, - "allocated": 20480 - }, - "weight": { "minimum": 1, "maximum": 1 }, - "features": { - "total": "xeon_6346,mhz_3600", - "active": "xeon_6346,mhz_3600" - }, - "gres": { "total": "", "used": "" }, - "cluster": "", - "comment": "", - "extra": "", - "reason": { - "description": "IT-4461", - "time": 1705613872, - "user": "root" - }, - "reservation": "", - "partition": { - "nodes": { - "allowed_allocation": "", - "configured": "cs[11,20-32,34,36,49-50,53-77,79-90,92-174,178-180,182-184,186-206,208-211,231-298,300-305]", - "total": 243 - }, - "accounts": { "allowed": "", "deny": "" }, - "groups": { "allowed": "" }, - "qos": { "allowed": "", "deny": "", "assigned": "" }, - "alternate": "hw-l", - "tres": { - "billing_weights": "", - "configured": "cpu=17952,mem=271100000M,node=243,billing=17952" - }, - "cluster": "", - "cpus": { "task_binding": 0, "total": 17952 }, - "defaults": { - "memory_per_cpu": -9223372036854771614, - "time": { "set": false, "infinite": false, "number": 0 }, - "job": "" - }, - "grace_time": 0, - "maximums": { - "cpus_per_node": { "set": false, "infinite": true, "number": 0 }, - "cpus_per_socket": { "set": false, "infinite": true, "number": 0 }, - "memory_per_cpu": 0, - "nodes": { "set": false, "infinite": true, "number": 0 }, - "shares": 1, - "time": { "set": true, "infinite": false, "number": 20160 }, - "over_time_limit": { "set": false, "infinite": false, "number": 0 } - }, - "minimums": { "nodes": 0 }, - "name": "hw", - "node_sets": "", - "priority": { "job_factor": 1, "tier": 1 }, - "timeouts": { - "resume": { "set": false, "infinite": false, "number": 0 }, - "suspend": { "set": false, "infinite": false, "number": 0 } - }, - "suspend_time": { "set": false, "infinite": false, "number": 0 } - } - } - ], - "warnings": [], - "errors": [] -} + "meta": { + "plugins": { + "data_parser": "data_parser\/v0.0.39", + "accounting_storage": "accounting_storage\/none" + }, + "command": [ + "sinfo", + "-N", + "--json" + ], + "Slurm": { + "version": { + "major": 23, + "micro": 5, + "minor": 2 + }, + "release": "23.02.5" + } + }, + "sinfo": [ + { + "port": 6818, + "node": { + "state": [ + "ALLOCATED" + ] + }, + "nodes": { + "allocated": 1, + "idle": 0, + "other": 0, + "total": 1, + "hostnames": [ + ], + "addresses": [ + ], + "nodes": [ + "localhost" + ] + }, + "cpus": { + "allocated": 1, + "idle": 0, + "other": 0, + "total": 1, + "minimum": 1, + "maximum": 1, + "load": { + "minimum": 78, + "maximum": 78 + }, + "per_node": { + "max": { + "set": false, + "infinite": true, + "number": 0 + } + } + }, + "sockets": { + "minimum": 1, + "maximum": 1 + }, + "cores": { + "minimum": 1, + "maximum": 1 + }, + "threads": { + "minimum": 1, + "maximum": 1 + }, + "disk": { + "minimum": 0, + "maximum": 0 + }, + "memory": { + "minimum": 1, + "maximum": 1, + "free": { + "minimum": { + "set": true, + "infinite": false, + "number": 227 + }, + "maximum": { + "set": true, + "infinite": false, + "number": 227 + } + }, + "allocated": 1 + }, + "weight": { + "minimum": 1, + "maximum": 1 + }, + "features": { + "total": "", + "active": "" + }, + "gres": { + "total": "", + "used": "" + }, + "cluster": "", + "comment": "", + "extra": "", + "reason": { + "description": "", + "time": 0, + "user": "" + }, + "reservation": "", + "partition": { + "nodes": { + "allowed_allocation": "", + "configured": "localhost", + "total": 1 + }, + "accounts": { + "allowed": "", + "deny": "" + }, + "groups": { + "allowed": "" + }, + "qos": { + "allowed": "", + "deny": "", + "assigned": "" + }, + "alternate": "", + "tres": { + "billing_weights": "", + "configured": "cpu=1,mem=1M,node=1,billing=1" + }, + "cluster": "", + "cpus": { + "task_binding": 0, + "total": 1 + }, + "defaults": { + "memory_per_cpu": 0, + "time": { + "set": false, + "infinite": false, + "number": 0 + }, + "job": "" + }, + "grace_time": 0, + "maximums": { + "cpus_per_node": { + "set": false, + "infinite": true, + "number": 0 + }, + "cpus_per_socket": { + "set": false, + "infinite": true, + "number": 0 + }, + "memory_per_cpu": 0, + "nodes": { + "set": false, + "infinite": true, + "number": 0 + }, + "shares": 1, + "time": { + "set": false, + "infinite": true, + "number": 0 + }, + "over_time_limit": { + "set": false, + "infinite": false, + "number": 0 + } + }, + "minimums": { + "nodes": 0 + }, + "name": "debug", + "node_sets": "", + "priority": { + "job_factor": 1, + "tier": 1 + }, + "timeouts": { + "resume": { + "set": false, + "infinite": false, + "number": 0 + }, + "suspend": { + "set": false, + "infinite": false, + "number": 0 + } + }, + "suspend_time": { + "set": false, + "infinite": false, + "number": 0 + } + } + } + ], + "warnings": [ + ], + "errors": [ + ] + } diff --git a/exporter/main_test.go b/exporter/main_test.go index 8504060..6caeb4e 100644 --- a/exporter/main_test.go +++ b/exporter/main_test.go @@ -63,7 +63,7 @@ func TestNewConfig_Default(t *testing.T) { assert := assert.New(t) config, err := NewConfig(new(CliFlags)) assert.Nil(err) - assert.Equal([]string{"sinfo", "--json"}, config.cliOpts.sinfo) + assert.Equal([]string{"sinfo", "-N", "--json"}, config.cliOpts.sinfo) assert.Equal([]string{"squeue", "--json"}, config.cliOpts.squeue) assert.Equal([]string{"scontrol", "show", "lic", "--json"}, config.cliOpts.lic) assert.Equal(uint64(10), config.TraceConf.rate) diff --git a/exporter/nodes.go b/exporter/nodes.go index d7d5fb7..50f3704 100644 --- a/exporter/nodes.go +++ b/exporter/nodes.go @@ -79,13 +79,14 @@ type sinfoDataParserResponse struct { Partition struct { Name string `json:"name"` Alternate string `json:"alternate"` - } `json:"parittion"` + } `json:"partition"` } `json:"sinfo"` } type DataParserJsonFetcher struct { scraper SlurmByteScraper errorCounter prometheus.Counter + duration time.Duration cache *AtomicThrottledCache[NodeMetric] } @@ -93,9 +94,11 @@ func (dpj *DataParserJsonFetcher) fetch() ([]NodeMetric, error) { squeue := new(sinfoDataParserResponse) cliJson, err := dpj.scraper.FetchRawBytes() if err != nil { + dpj.errorCounter.Inc() return nil, err } if err := json.Unmarshal(cliJson, squeue); err != nil { + dpj.errorCounter.Inc() return nil, err } nodeMetrics := make([]NodeMetric, 0) @@ -103,15 +106,19 @@ func (dpj *DataParserJsonFetcher) fetch() ([]NodeMetric, error) { nodes := entry.Nodes // validate single node parse if nodes.Total != 1 { + dpj.errorCounter.Inc() return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`") } if entry.Memory.Free.Maximum.Set && entry.Memory.Free.Minimum.Set { + dpj.errorCounter.Inc() return nil, fmt.Errorf("unable to scrape free mem metrics") } if entry.Memory.Free.Minimum.Number != entry.Memory.Free.Maximum.Number { + dpj.errorCounter.Inc() return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`") } if entry.Memory.Minimum != entry.Memory.Maximum { + dpj.errorCounter.Inc() return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`") } metric := NodeMetric{ @@ -132,6 +139,21 @@ func (dpj *DataParserJsonFetcher) fetch() ([]NodeMetric, error) { return nodeMetrics, nil } +func (dpj *DataParserJsonFetcher) Fetch() ([]NodeMetric, error) { + t := time.Now() + metrics, err := dpj.cache.FetchOrThrottle(dpj.fetch) + dpj.duration = time.Since(t) + return metrics, err +} + +func (dpj *DataParserJsonFetcher) ScrapeDuration() time.Duration { + return dpj.duration +} + +func (dpj *DataParserJsonFetcher) ScrapeError() prometheus.Counter { + return dpj.errorCounter +} + type sinfoResponse struct { Meta struct { SlurmVersion struct { diff --git a/exporter/server.go b/exporter/server.go index ec0bccd..f752884 100644 --- a/exporter/server.go +++ b/exporter/server.go @@ -69,7 +69,7 @@ func NewConfig(cliFlags *CliFlags) (*Config, error) { // defaults cliOpts := CliOpts{ squeue: []string{"squeue", "--json"}, - sinfo: []string{"sinfo", "--json"}, + sinfo: []string{"sinfo", "-N", "--json"}, lic: []string{"scontrol", "show", "lic", "--json"}, sdiag: []string{"sdiag", "--json"}, licEnabled: cliFlags.SlurmLicEnabled,