diff --git a/README.md b/README.md
index db6f086..c74ad15 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ You can also install the exporter directly with `go install github.com/rivosinc/
 
 ```bash
 # example installation
-$ go install github.com/rivosinc/prometheus-slurm-exporter@v1.0.1
+$ go install github.com/rivosinc/prometheus-slurm-exporter@v1.1.1
 # or if you like living on the edge
 $ go install github.com/rivosinc/prometheus-slurm-exporter@latest
 # if not already added, ensure
diff --git a/exporter/fixtures/sinfo_dataparser.json b/exporter/fixtures/sinfo_dataparser.json
index fc63caa..02269ad 100644
--- a/exporter/fixtures/sinfo_dataparser.json
+++ b/exporter/fixtures/sinfo_dataparser.json
@@ -21,18 +21,7 @@
         "total": 10,
         "hostnames": [],
         "addresses": [],
-        "nodes": [
-          "cs62",
-          "cs63",
-          "cs64",
-          "cs65",
-          "cs66",
-          "cs68",
-          "cs71",
-          "cs200",
-          "cs201",
-          "cs204"
-        ]
+        "nodes": ["cs62"]
       },
       "cpus": {
         "allocated": 114,
diff --git a/exporter/jobs.go b/exporter/jobs.go
index a2c6b06..c9fe6c3 100644
--- a/exporter/jobs.go
+++ b/exporter/jobs.go
@@ -148,7 +148,9 @@ func parseCliFallback(squeue []byte, errorCounter prometheus.Counter) ([]JobMetr
 		}
 		mem, err := MemToFloat(metric.Mem)
 		if err != nil {
-			return nil, err
+			slog.Error(fmt.Sprintf("squeue fallback parse error: failed on line %d `%s` with err `%q`", i, line, err))
+			errorCounter.Inc()
+			continue
 		}
 		openapiJobMetric := JobMetric{
 			Account: metric.Account,
diff --git a/exporter/nodes.go b/exporter/nodes.go
index fe4220c..d7d5fb7 100644
--- a/exporter/nodes.go
+++ b/exporter/nodes.go
@@ -55,9 +55,91 @@ type sinfoDataParserResponse struct {
 			Total     int      `json:"total"`
 			Nodes     []string `json:"nodes"`
 		} `json:"nodes"`
+		Cpus struct {
+			Allocated int `json:"allocated"`
+			Idle      int `json:"idle"`
+			Other     int `json:"other"`
+			Total     int `json:"total"`
+		} `json:"cpus"`
+		Memory struct {
+			Minimum   int `json:"minimum"`
+			Maximum   int `json:"maximum"`
+			Allocated int `json:"allocated"`
+			Free      struct {
+				Minimum struct {
+					Set    bool `json:"set"`
+					Number int  `json:"number"`
+				} `json:"minimum"`
+				Maximum struct {
+					Set    bool `json:"set"`
+					Number int  `json:"number"`
+				} `json:"maximum"`
+			} `json:"free"`
+		} `json:"memory"`
+		Partition struct {
+			Name      string `json:"name"`
+			Alternate string `json:"alternate"`
+		} `json:"partition"`
 	} `json:"sinfo"`
 }
 
+// DataParserJsonFetcher builds NodeMetrics from the raw bytes returned by its
+// scraper, decoded as a sinfoDataParserResponse.
+type DataParserJsonFetcher struct {
+	scraper      SlurmByteScraper
+	errorCounter prometheus.Counter
+	cache        *AtomicThrottledCache[NodeMetric]
+}
+
+// fetch scrapes the raw sinfo JSON and converts every entry into a NodeMetric.
+// Each entry must describe exactly one node (`sinfo -N --json`) and must report
+// its free memory; otherwise an error is returned and no metrics are produced.
+func (dpj *DataParserJsonFetcher) fetch() ([]NodeMetric, error) {
+	sinfo := new(sinfoDataParserResponse)
+	cliJson, err := dpj.scraper.FetchRawBytes()
+	if err != nil {
+		return nil, err
+	}
+	if err := json.Unmarshal(cliJson, sinfo); err != nil {
+		return nil, err
+	}
+	nodeMetrics := make([]NodeMetric, 0, len(sinfo.Sinfo))
+	for _, entry := range sinfo.Sinfo {
+		nodes := entry.Nodes
+		// validate single node parse; checking the slice length as well keeps
+		// nodes.Nodes[0] below from panicking on an inconsistent payload
+		if nodes.Total != 1 || len(nodes.Nodes) != 1 {
+			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
+		}
+		// free memory is only usable when both bounds are flagged as set
+		if !entry.Memory.Free.Maximum.Set || !entry.Memory.Free.Minimum.Set {
+			return nil, fmt.Errorf("unable to scrape free mem metrics")
+		}
+		// a single node reports identical min/max, so a mismatch implies an aggregated entry
+		if entry.Memory.Free.Minimum.Number != entry.Memory.Free.Maximum.Number {
+			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
+		}
+		if entry.Memory.Minimum != entry.Memory.Maximum {
+			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
+		}
+		metric := NodeMetric{
+			Hostname:   nodes.Nodes[0],
+			Cpus:       float64(entry.Cpus.Total),
+			RealMemory: float64(entry.Memory.Maximum),
+			FreeMemory: float64(entry.Memory.Free.Maximum.Number),
+			State:      strings.Join(entry.Node.State, "&"),
+		}
+		if !slices.Contains(metric.Partitions, entry.Partition.Name) {
+			metric.Partitions = append(metric.Partitions, entry.Partition.Name)
+		}
+		if entry.Partition.Alternate != "" && !slices.Contains(metric.Partitions, entry.Partition.Alternate) {
+			metric.Partitions = append(metric.Partitions, entry.Partition.Alternate)
+		}
+		nodeMetrics = append(nodeMetrics, metric)
+	}
+	return nodeMetrics, nil
+}
+
 type sinfoResponse struct {
 	Meta struct {
 		SlurmVersion struct {
@@ -156,7 +238,8 @@ func (cmf *NodeCliFallbackFetcher) fetch() ([]NodeMetric, error) {
 		}
 		if err := json.Unmarshal(line, &metric); err != nil {
 			cmf.errorCounter.Inc()
-			return nil, fmt.Errorf("sinfo failed to parse line %d: %s, got %q", i, line, err)
+			slog.Error(fmt.Sprintf("sinfo failed to parse line %d: %s, got %q", i, line, err))
+			continue
 		}
 		// convert mem units from MB to Bytes
 		metric.RealMemory *= 1e6