From 7c4f76d725f5380fcdc084b445e6a1b016ce9690 Mon Sep 17 00:00:00 2001 From: abhinavDhulipala <46908860+abhinavDhulipala@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:17:38 -0800 Subject: [PATCH] [exporter/diags] make json sdiag response struct 24.05 compatible (#114) * make json sdiag response struct 24.05 compatible * keep testing against slurm 23 --- Dockerfile | 6 +- exporter/diags.go | 89 +++++-- exporter/diags_test.go | 39 +++ exporter/fixtures/sdiag_2405.json | 291 ++++++++++++++++++++++ exporter/fixtures/sdiag_2405.json.license | 3 + exporter/utils.go | 33 ++- 6 files changed, 433 insertions(+), 28 deletions(-) create mode 100644 exporter/fixtures/sdiag_2405.json create mode 100644 exporter/fixtures/sdiag_2405.json.license diff --git a/Dockerfile b/Dockerfile index 607d8eb..275c47c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,9 +40,9 @@ RUN mkdir -p /etc/slurm && \ # install go deps RUN arch=`uname -m` && \ if [ $arch == "aarch64" ]; then arch="arm64"; elif [ "$arch" == "x86_64" ]; then arch="amd64" ;fi && \ - wget "https://go.dev/dl/go1.20.12.linux-${arch}.tar.gz" && \ - tar -C /usr/local -xzf "go1.20.12.linux-${arch}.tar.gz" && \ - rm "go1.20.12.linux-${arch}.tar.gz" && \ + wget "https://go.dev/dl/go1.23.1.linux-${arch}.tar.gz" && \ + tar -C /usr/local -xzf "go1.23.1.linux-${arch}.tar.gz" && \ + rm "go1.23.1.linux-${arch}.tar.gz" && \ mkdir /src # default wrapper deps for e2e tests diff --git a/exporter/diags.go b/exporter/diags.go index 93863e9..37bc6e9 100644 --- a/exporter/diags.go +++ b/exporter/diags.go @@ -8,24 +8,58 @@ import ( "encoding/json" "fmt" - "github.com/prometheus/client_golang/prometheus" "log/slog" + + "github.com/prometheus/client_golang/prometheus" ) +type IntFromOptionalStruct int + +func (ffoo *IntFromOptionalStruct) UnmarshalJSON(data []byte) error { + // in between certain versions of data_parser, certain integer fields + // can be given in actual int or in the form + // {"average_time": {"set": true, "number": 1234, 
"infinite": false}} + // create type to coerce to int + var nativeInt int + if err := json.Unmarshal(data, &nativeInt); err == nil { + *ffoo = IntFromOptionalStruct(nativeInt) + return nil + } + var numStruct struct { + Set bool `json:"set"` + Infinite bool `json:"infinite"` + Number int `json:"number"` + } + err := json.Unmarshal(data, &numStruct) + if err != nil { + return err + } + if !numStruct.Set { + *ffoo = IntFromOptionalStruct(-1) + return fmt.Errorf("avg num not set") + } + if numStruct.Infinite { + *ffoo = IntFromOptionalStruct(-1) + return fmt.Errorf("num set to infinite") + } + *ffoo = IntFromOptionalStruct(numStruct.Number) + return nil +} + type UserRpcInfo struct { - User string `json:"user"` - UserId int `json:"user_id"` - Count int `json:"count"` - AvgTime int `json:"average_time"` - TotalTime int `json:"total_time"` + User string `json:"user"` + UserId int `json:"user_id"` + Count int `json:"count"` + AvgTime IntFromOptionalStruct `json:"average_time"` + TotalTime int `json:"total_time"` } type MessageRpcInfo struct { - MessageType string `json:"message_type"` - TypeId int `json:"type_id"` - Count int `json:"count"` - AvgTime int `json:"average_time"` - TotalTime int `json:"total_time"` + MessageType string `json:"message_type"` + TypeId int `json:"type_id"` + Count int `json:"count"` + AvgTime IntFromOptionalStruct `json:"average_time"` + TotalTime int `json:"total_time"` } type DiagMetric struct { @@ -41,22 +75,29 @@ type DiagMetric struct { } type SdiagResponse struct { + // Response coercible between slurm 23 and 24 data versions Meta struct { - SlurmVersion struct { - Version struct { - Major int `json:"major"` - Micro int `json:"micro"` - Minor int `json:"minor"` - } `json:"version"` - Release string `json:"release"` - } `json:"Slurm"` - Plugins map[string]string + SlurmVersion SlurmVersion `json:"Slurm"` + Plugins map[string]string `json:"plugins"` + Plugin map[string]string `json:"plugin"` } `json:"meta"` Statistics DiagMetric Errors 
[]string `json:"errors"` Warnings []string `json:"warnings"` } +func (sr *SdiagResponse) IsDataParserPlugin() bool { + if sr.Meta.Plugins != nil { + _, ok := sr.Meta.Plugins["data_parser"] + return ok + } + if sr.Meta.Plugin != nil { + _, ok := sr.Meta.Plugin["data_parser"] + return ok + } + return false +} + func parseDiagMetrics(sdiagResp []byte) (*SdiagResponse, error) { sdiag := new(SdiagResponse) err := json.Unmarshal(sdiagResp, sdiag) @@ -138,14 +179,14 @@ func (sc *DiagnosticsCollector) Collect(ch chan<- prometheus.Metric) { } ch <- prometheus.MustNewConstMetric(sc.diagScrapeDuration, prometheus.GaugeValue, float64(sc.fetcher.Duration().Abs().Milliseconds())) sdiagResponse, err := parseDiagMetrics(sdiag) - if _, ok := sdiagResponse.Meta.Plugins["data_parser"]; !ok { + if err != nil { sc.diagScrapeError.Inc() - slog.Error("only the data_parser plugin is supported") + slog.Error(fmt.Sprintf("diag parse error: %q", err)) return } - if err != nil { + if !sdiagResponse.IsDataParserPlugin() { sc.diagScrapeError.Inc() - slog.Error(fmt.Sprintf("diag parse error: %q", err)) + slog.Error("only the data_parser plugin is supported") return } emitNonZero := func(desc *prometheus.Desc, val float64, label string) { diff --git a/exporter/diags_test.go b/exporter/diags_test.go index f1478fb..7635dc5 100644 --- a/exporter/diags_test.go +++ b/exporter/diags_test.go @@ -40,6 +40,25 @@ func TestDiagCollect(t *testing.T) { assert.NotEmpty(metrics) } +func TestDiagCollect_2405(t *testing.T) { + assert := assert.New(t) + config, err := NewConfig(new(CliFlags)) + assert.NoError(err) + dc := NewDiagsCollector(config) + dc.fetcher = &MockScraper{fixture: "fixtures/sdiag_2405.json"} + metricChan := make(chan prometheus.Metric) + go func() { + dc.Collect(metricChan) + close(metricChan) + }() + metrics := make([]prometheus.Metric, 0) + for m, ok := <-metricChan; ok; m, ok = <-metricChan { + metrics = append(metrics, m) + t.Logf("Received metric %s", m.Desc().String()) + } + 
assert.NotEmpty(metrics) +} + func TestDiagDescribe(t *testing.T) { assert := assert.New(t) ch := make(chan *prometheus.Desc) @@ -57,3 +76,23 @@ func TestDiagDescribe(t *testing.T) { } assert.NotEmpty(descs) } + +func TestDataParserVersionDiscovery_Slurm23(t *testing.T) { + assert := assert.New(t) + fetcher := MockScraper{fixture: "fixtures/sdiag.json"} + sdiag, err := fetcher.FetchRawBytes() + assert.NoError(err) + resp, err := parseDiagMetrics(sdiag) + assert.NoError(err) + assert.True(resp.IsDataParserPlugin()) +} + +func TestDataParserVersionDiscovery_Slurm24(t *testing.T) { + assert := assert.New(t) + fetcher := MockScraper{fixture: "fixtures/sdiag_2405.json"} + sdiag, err := fetcher.FetchRawBytes() + assert.NoError(err) + resp, err := parseDiagMetrics(sdiag) + assert.NoError(err) + assert.Truef(resp.IsDataParserPlugin(), "parsed metadata struct %+v", resp.Meta) +} diff --git a/exporter/fixtures/sdiag_2405.json b/exporter/fixtures/sdiag_2405.json new file mode 100644 index 0000000..c4f821d --- /dev/null +++ b/exporter/fixtures/sdiag_2405.json @@ -0,0 +1,291 @@ +{ + "statistics": { + "parts_packed": 1, + "req_time": { + "set": true, + "infinite": false, + "number": 1739832148 + }, + "req_time_start": { + "set": true, + "infinite": false, + "number": 1739822537 + }, + "server_thread_count": 2, + "agent_queue_size": 0, + "agent_count": 0, + "agent_thread_count": 0, + "dbd_agent_queue_size": 0, + "gettimeofday_latency": 33, + "schedule_cycle_max": 1666, + "schedule_cycle_last": 110, + "schedule_cycle_sum": 10791, + "schedule_cycle_total": 162, + "schedule_cycle_mean": 66, + "schedule_cycle_mean_depth": 0, + "schedule_cycle_per_minute": 1, + "schedule_cycle_depth": 0, + "schedule_exit": { + "end_job_queue": 162, + "default_queue_depth": 0, + "max_job_start": 0, + "max_rpc_cnt": 0, + "max_sched_time": 0, + "licenses": 0 + }, + "schedule_queue_length": 0, + "jobs_submitted": 1, + "jobs_started": 1, + "jobs_completed": 1, + "jobs_canceled": 0, + "jobs_failed": 0, + 
"jobs_pending": 0, + "jobs_running": 0, + "job_states_ts": { + "set": true, + "infinite": false, + "number": 1739832137 + }, + "bf_backfilled_jobs": 0, + "bf_last_backfilled_jobs": 0, + "bf_backfilled_het_jobs": 0, + "bf_cycle_counter": 0, + "bf_cycle_mean": 0, + "bf_depth_mean": 0, + "bf_depth_mean_try": 0, + "bf_cycle_sum": 0, + "bf_cycle_last": 0, + "bf_cycle_max": 0, + "bf_exit": { + "end_job_queue": 0, + "bf_max_job_start": 0, + "bf_max_job_test": 0, + "bf_max_time": 0, + "bf_node_space_size": 0, + "state_changed": 0 + }, + "bf_last_depth": 0, + "bf_last_depth_try": 0, + "bf_depth_sum": 0, + "bf_depth_try_sum": 0, + "bf_queue_len": 0, + "bf_queue_len_mean": 0, + "bf_queue_len_sum": 0, + "bf_table_size": 0, + "bf_table_size_sum": 0, + "bf_table_size_mean": 0, + "bf_when_last_cycle": { + "set": true, + "infinite": false, + "number": 0 + }, + "bf_active": false, + "rpcs_by_message_type": [ + { + "type_id": 1002, + "message_type": "MESSAGE_NODE_REGISTRATION_STATUS", + "count": 6, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 4969, + "average_time": { + "set": true, + "infinite": false, + "number": 828 + } + }, + { + "type_id": 4001, + "message_type": "REQUEST_RESOURCE_ALLOCATION", + "count": 1, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 17873, + "average_time": { + "set": true, + "infinite": false, + "number": 17873 + } + }, + { + "type_id": 4019, + "message_type": "REQUEST_JOB_READY", + "count": 1, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 332, + "average_time": { + "set": true, + "infinite": false, + "number": 332 + } + }, + { + "type_id": 5001, + "message_type": "REQUEST_JOB_STEP_CREATE", + "count": 1, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 6331, + "average_time": { + "set": true, + "infinite": false, + "number": 6331 + } + }, + { + "type_id": 5017, + "message_type": 
"REQUEST_COMPLETE_JOB_ALLOCATION", + "count": 1, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 2113, + "average_time": { + "set": true, + "infinite": false, + "number": 2113 + } + }, + { + "type_id": 5016, + "message_type": "REQUEST_STEP_COMPLETE", + "count": 1, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 739, + "average_time": { + "set": true, + "infinite": false, + "number": 739 + } + }, + { + "type_id": 6012, + "message_type": "MESSAGE_EPILOG_COMPLETE", + "count": 1, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 635, + "average_time": { + "set": true, + "infinite": false, + "number": 635 + } + }, + { + "type_id": 2035, + "message_type": "REQUEST_STATS_INFO", + "count": 5, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 1951, + "average_time": { + "set": true, + "infinite": false, + "number": 390 + } + }, + { + "type_id": 2009, + "message_type": "REQUEST_PARTITION_INFO", + "count": 6, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 1391, + "average_time": { + "set": true, + "infinite": false, + "number": 231 + } + }, + { + "type_id": 2003, + "message_type": "REQUEST_JOB_INFO", + "count": 3, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 2975, + "average_time": { + "set": true, + "infinite": false, + "number": 991 + } + }, + { + "type_id": 2007, + "message_type": "REQUEST_NODE_INFO", + "count": 3, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 1051, + "average_time": { + "set": true, + "infinite": false, + "number": 350 + } + } + ], + "rpcs_by_user": [ + { + "user_id": 0, + "user": "root", + "count": 29, + "total_time": 40360, + "average_time": { + "set": true, + "infinite": false, + "number": 1391 + } + } + ], + "pending_rpcs": [], + "pending_rpcs_by_hostlist": [] + }, + "meta": { + "plugin": { + "type": 
"", + "name": "", + "data_parser": "data_parser/v0.0.41", + "accounting_storage": "" + }, + "client": { + "source": "/dev/pts/0", + "user": "root", + "group": "root" + }, + "command": ["sdiag"], + "slurm": { + "version": { + "major": "24", + "micro": "5", + "minor": "05" + }, + "release": "24.05.5", + "cluster": "default-cluster" + } + }, + "errors": [], + "warnings": [] +} diff --git a/exporter/fixtures/sdiag_2405.json.license b/exporter/fixtures/sdiag_2405.json.license new file mode 100644 index 0000000..d8f6e93 --- /dev/null +++ b/exporter/fixtures/sdiag_2405.json.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2023 Rivos Inc. + +SPDX-License-Identifier: Apache-2.0 diff --git a/exporter/utils.go b/exporter/utils.go index ba0b115..12379c6 100644 --- a/exporter/utils.go +++ b/exporter/utils.go @@ -6,6 +6,7 @@ package exporter import ( "bytes" + "encoding/json" "errors" "fmt" "os" @@ -16,14 +17,44 @@ import ( "sync" "time" - "github.com/prometheus/client_golang/prometheus" "log/slog" + + "github.com/prometheus/client_golang/prometheus" ) type SlurmPrimitiveMetric interface { NodeMetric | JobMetric | DiagMetric | LicenseMetric | AccountLimitMetric } +type CoercedInt int + +func (ci *CoercedInt) UnmarshalJSON(data []byte) error { + var nativeInt int + if err := json.Unmarshal(data, &nativeInt); err == nil { + *ci = CoercedInt(nativeInt) + return nil + } + var stringInt string + if err := json.Unmarshal(data, &stringInt); err != nil { + return err + } + convertedInt, err := strconv.ParseInt(stringInt, 10, 64) + if err != nil { + return err + } + *ci = CoercedInt(convertedInt) + return nil +} + +type SlurmVersion struct { + Version struct { + Major CoercedInt `json:"major"` + Micro CoercedInt `json:"micro"` + Minor CoercedInt `json:"minor"` + } `json:"version"` + Release string `json:"release"` +} + // interface for getting data from slurm // used for dep injection/ease of testing & for add slurmrestd support later type SlurmByteScraper interface {