Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[exporter] data parser json support #53

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 215 additions & 0 deletions exporter/fixtures/sinfo_dataparser.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
{
"meta": {
"plugins": {
"data_parser": "data_parser\/v0.0.39",
"accounting_storage": "accounting_storage\/none"
},
"command": [
"sinfo",
"-N",
"--json"
],
"Slurm": {
"version": {
"major": 23,
"micro": 5,
"minor": 2
},
"release": "23.02.5"
}
},
"sinfo": [
{
"port": 6818,
"node": {
"state": [
"ALLOCATED"
]
},
"nodes": {
"allocated": 1,
"idle": 0,
"other": 0,
"total": 1,
"hostnames": [
],
"addresses": [
],
"nodes": [
"localhost"
]
},
"cpus": {
"allocated": 1,
"idle": 0,
"other": 0,
"total": 1,
"minimum": 1,
"maximum": 1,
"load": {
"minimum": 78,
"maximum": 78
},
"per_node": {
"max": {
"set": false,
"infinite": true,
"number": 0
}
}
},
"sockets": {
"minimum": 1,
"maximum": 1
},
"cores": {
"minimum": 1,
"maximum": 1
},
"threads": {
"minimum": 1,
"maximum": 1
},
"disk": {
"minimum": 0,
"maximum": 0
},
"memory": {
"minimum": 1,
"maximum": 1,
"free": {
"minimum": {
"set": true,
"infinite": false,
"number": 227
},
"maximum": {
"set": true,
"infinite": false,
"number": 227
}
},
"allocated": 1
},
"weight": {
"minimum": 1,
"maximum": 1
},
"features": {
"total": "",
"active": ""
},
"gres": {
"total": "",
"used": ""
},
"cluster": "",
"comment": "",
"extra": "",
"reason": {
"description": "",
"time": 0,
"user": ""
},
"reservation": "",
"partition": {
"nodes": {
"allowed_allocation": "",
"configured": "localhost",
"total": 1
},
"accounts": {
"allowed": "",
"deny": ""
},
"groups": {
"allowed": ""
},
"qos": {
"allowed": "",
"deny": "",
"assigned": ""
},
"alternate": "",
"tres": {
"billing_weights": "",
"configured": "cpu=1,mem=1M,node=1,billing=1"
},
"cluster": "",
"cpus": {
"task_binding": 0,
"total": 1
},
"defaults": {
"memory_per_cpu": 0,
"time": {
"set": false,
"infinite": false,
"number": 0
},
"job": ""
},
"grace_time": 0,
"maximums": {
"cpus_per_node": {
"set": false,
"infinite": true,
"number": 0
},
"cpus_per_socket": {
"set": false,
"infinite": true,
"number": 0
},
"memory_per_cpu": 0,
"nodes": {
"set": false,
"infinite": true,
"number": 0
},
"shares": 1,
"time": {
"set": false,
"infinite": true,
"number": 0
},
"over_time_limit": {
"set": false,
"infinite": false,
"number": 0
}
},
"minimums": {
"nodes": 0
},
"name": "debug",
"node_sets": "",
"priority": {
"job_factor": 1,
"tier": 1
},
"timeouts": {
"resume": {
"set": false,
"infinite": false,
"number": 0
},
"suspend": {
"set": false,
"infinite": false,
"number": 0
}
},
"suspend_time": {
"set": false,
"infinite": false,
"number": 0
}
}
}
],
"warnings": [
],
"errors": [
]
}
3 changes: 3 additions & 0 deletions exporter/fixtures/sinfo_dataparser.json.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SPDX-FileCopyrightText: 2023 Rivos Inc.

SPDX-License-Identifier: Apache-2.0
2 changes: 1 addition & 1 deletion exporter/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func TestNewConfig_Default(t *testing.T) {
assert := assert.New(t)
config, err := NewConfig(new(CliFlags))
assert.Nil(err)
assert.Equal([]string{"sinfo", "--json"}, config.cliOpts.sinfo)
assert.Equal([]string{"sinfo", "-N", "--json"}, config.cliOpts.sinfo)
assert.Equal([]string{"squeue", "--json"}, config.cliOpts.squeue)
assert.Equal([]string{"scontrol", "show", "lic", "--json"}, config.cliOpts.lic)
assert.Equal(uint64(10), config.TraceConf.rate)
Expand Down
119 changes: 119 additions & 0 deletions exporter/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,125 @@ type NodeMetric struct {
Weight float64 `json:"weight"`
}

// sinfoDataParserResponse mirrors the subset of the `sinfo -N --json` payload
// (data_parser plugin format) that the node fetcher reads. Fields that are not
// declared here are ignored by encoding/json during decoding.
type sinfoDataParserResponse struct {
	// Meta carries information about the Slurm installation that produced
	// the payload.
	Meta struct {
		// Plugins maps a plugin kind (e.g. "data_parser") to the plugin in use.
		Plugins map[string]string `json:"plugins"`
		// SlurmVersion is emitted under "meta" -> "Slurm" in the payload, so it
		// has to live inside Meta to be populated when decoding.
		SlurmVersion struct {
			Version struct {
				Major int `json:"major"`
				Micro int `json:"micro"`
				Minor int `json:"minor"`
			} `json:"version"`
			Release string `json:"release"`
		} `json:"Slurm"`
	} `json:"meta"`
	// Sinfo holds one element per reported entry; with `sinfo -N` each entry
	// is expected to describe a single node.
	Sinfo []struct {
		// Node carries the list of state flags reported for the entry.
		Node struct {
			State []string `json:"state"`
		} `json:"node"`
		// Nodes carries node counts and the names of the nodes in the entry.
		Nodes struct {
			Allocated int      `json:"allocated"`
			Idle      int      `json:"idle"`
			Other     int      `json:"other"`
			Total     int      `json:"total"`
			Nodes     []string `json:"nodes"`
		} `json:"nodes"`
		// Cpus carries CPU counts for the entry.
		Cpus struct {
			Allocated int `json:"allocated"`
			Idle      int `json:"idle"`
			Other     int `json:"other"`
			Total     int `json:"total"`
		} `json:"cpus"`
		// Memory carries memory figures for the entry. Free values are
		// wrapped in a {set, number} object; Number is only meaningful when
		// Set is true.
		Memory struct {
			Minimum   int `json:"minimum"`
			Maximum   int `json:"maximum"`
			Allocated int `json:"allocated"`
			Free      struct {
				Minimum struct {
					Set    bool `json:"set"`
					Number int  `json:"number"`
				} `json:"minimum"`
				Maximum struct {
					Set    bool `json:"set"`
					Number int  `json:"number"`
				} `json:"maximum"`
			} `json:"free"`
		} `json:"memory"`
		// Partition identifies the partition the entry belongs to and its
		// alternate partition, if any.
		Partition struct {
			Name      string `json:"name"`
			Alternate string `json:"alternate"`
		} `json:"partition"`
	} `json:"sinfo"`
}

// DataParserJsonFetcher produces NodeMetric values from sinfo output in the
// data_parser JSON format.
type DataParserJsonFetcher struct {
	// scraper supplies the raw sinfo JSON bytes via FetchRawBytes.
	scraper SlurmByteScraper
	// errorCounter is incremented whenever a scrape or its validation fails.
	errorCounter prometheus.Counter
	// duration is the wall-clock time taken by the most recent FetchMetrics call.
	// NOTE(review): written without synchronization in FetchMetrics — confirm
	// callers never invoke FetchMetrics and ScrapeDuration concurrently.
	duration time.Duration
	// cache wraps fetch through FetchOrThrottle; presumably it limits how
	// often the underlying scrape actually runs — verify against
	// AtomicThrottledCache.
	cache *AtomicThrottledCache[NodeMetric]
}

// fetch performs one scrape and converts every entry of the sinfo data_parser
// JSON response into a NodeMetric.
//
// It increments errorCounter and returns a nil slice with an error when the
// scraper fails, the payload cannot be decoded, or an entry does not describe
// exactly one node (total != 1, no node name listed, or differing
// minimum/maximum memory). A mismatch between the minimum and maximum free
// memory is logged and counted as an error, but the entry is still emitted
// using the maximum value.
func (dpj *DataParserJsonFetcher) fetch() ([]NodeMetric, error) {
	sinfo := new(sinfoDataParserResponse)
	cliJson, err := dpj.scraper.FetchRawBytes()
	if err != nil {
		dpj.errorCounter.Inc()
		return nil, err
	}
	if err := json.Unmarshal(cliJson, sinfo); err != nil {
		dpj.errorCounter.Inc()
		return nil, err
	}
	// One metric per entry, so the final length is known up front.
	nodeMetrics := make([]NodeMetric, 0, len(sinfo.Sinfo))
	for _, entry := range sinfo.Sinfo {
		nodes := entry.Nodes
		// validate single node parse
		if nodes.Total != 1 {
			dpj.errorCounter.Inc()
			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
		}
		// The count and the name list are independent fields in the payload;
		// guard the index below so a malformed entry yields an error instead
		// of an index-out-of-range panic.
		if len(nodes.Nodes) == 0 {
			dpj.errorCounter.Inc()
			return nil, fmt.Errorf("sinfo entry reports 1 node but lists no node name")
		}
		freeMemSet := entry.Memory.Free.Maximum.Set && entry.Memory.Free.Minimum.Set
		if freeMemSet && entry.Memory.Free.Minimum.Number != entry.Memory.Free.Maximum.Number {
			dpj.errorCounter.Inc()
			slog.Error("unable to scrape free mem set")
		}
		if entry.Memory.Minimum != entry.Memory.Maximum {
			dpj.errorCounter.Inc()
			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
		}
		// NOTE(review): entry.Memory.Allocated and the allocated/idle CPU
		// counts are decoded but not copied into the metric — confirm whether
		// NodeMetric has fields that should receive them.
		metric := NodeMetric{
			Hostname:   nodes.Nodes[0],
			Cpus:       float64(entry.Cpus.Total),
			RealMemory: float64(entry.Memory.Maximum),
			FreeMemory: float64(entry.Memory.Free.Maximum.Number),
			State:      strings.Join(entry.Node.State, "&"),
			// The metric is freshly built, so the partition name can never
			// already be present.
			Partitions: []string{entry.Partition.Name},
		}
		// Add the alternate partition only when it is set and differs from
		// the primary one.
		if entry.Partition.Alternate != "" && !slices.Contains(metric.Partitions, entry.Partition.Alternate) {
			metric.Partitions = append(metric.Partitions, entry.Partition.Alternate)
		}
		nodeMetrics = append(nodeMetrics, metric)
	}
	return nodeMetrics, nil
}

// FetchMetrics returns node metrics obtained by running fetch through the
// cache's FetchOrThrottle, and records how long the call took so that
// ScrapeDuration can report it.
func (dpj *DataParserJsonFetcher) FetchMetrics() ([]NodeMetric, error) {
	began := time.Now()
	nodeMetrics, fetchErr := dpj.cache.FetchOrThrottle(dpj.fetch)
	// Measured after the call returns, whether or not it succeeded.
	dpj.duration = time.Since(began)
	return nodeMetrics, fetchErr
}

// ScrapeDuration returns the duration recorded by the most recent
// FetchMetrics call; it is the zero duration before the first call.
func (dpj *DataParserJsonFetcher) ScrapeDuration() time.Duration {
return dpj.duration
}

// ScrapeError returns the counter that fetch increments on scrape, decode,
// and validation failures.
func (dpj *DataParserJsonFetcher) ScrapeError() prometheus.Counter {
return dpj.errorCounter
}

type sinfoResponse struct {
Meta struct {
SlurmVersion struct {
Expand Down
13 changes: 13 additions & 0 deletions exporter/nodes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

// MockNodeInfoScraper is a mock scraper bound to fixtures/sinfo_out.json.
var MockNodeInfoScraper = &MockScraper{fixture: "fixtures/sinfo_out.json"}

// MockNodeInfoDataParserScraper is a mock scraper bound to
// fixtures/sinfo_dataparser.json.
var MockNodeInfoDataParserScraper = &MockScraper{fixture: "fixtures/sinfo_dataparser.json"}

// MockNodeDataParserScraper is a mock scraper bound to
// fixtures/sinfo_dataparser.json.
// NOTE(review): this points at the same fixture as
// MockNodeInfoDataParserScraper — confirm whether both are needed.
var MockNodeDataParserScraper = &MockScraper{fixture: "fixtures/sinfo_dataparser.json"}

func TestNewNodeCollector(t *testing.T) {
assert := assert.New(t)
Expand Down Expand Up @@ -92,6 +93,18 @@ func TestNodeSummaryMemoryMetrics(t *testing.T) {
assert.Equal(2e+06, metrics.RealMemory)
}

// TestDataParserNodeSummaryMemoryMetrics checks that memory totals computed
// from the data_parser fixture match the values in
// fixtures/sinfo_dataparser.json, which describes a single node with
// memory.maximum = 1 and memory.free = 227.
func TestDataParserNodeSummaryMemoryMetrics(t *testing.T) {
	assert := assert.New(t)
	fetcher := DataParserJsonFetcher{
		scraper:      MockNodeDataParserScraper,
		errorCounter: prometheus.NewCounter(prometheus.CounterOpts{}),
		cache:        NewAtomicThrottledCache[NodeMetric](1),
	}
	nodeMetrics, err := fetcher.FetchMetrics()
	assert.Nil(err)
	// Presumably fetchNodeTotalMemMetrics sums the per-node values; with one
	// node in the fixture the totals equal that node's figures.
	metrics := fetchNodeTotalMemMetrics(nodeMetrics)
	// NOTE(review): the fetcher does not populate allocated memory from the
	// data_parser payload, so no allocated-memory expectation is asserted
	// here — add one (fixture value: 1) once it does.
	assert.Equal(227., metrics.FreeMemory)
	assert.Equal(1., metrics.RealMemory)
}

func TestNodeCollector(t *testing.T) {
assert := assert.New(t)
config, err := NewConfig(new(CliFlags))
Expand Down
2 changes: 1 addition & 1 deletion exporter/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func NewConfig(cliFlags *CliFlags) (*Config, error) {
// defaults
cliOpts := CliOpts{
squeue: []string{"squeue", "--json"},
sinfo: []string{"sinfo", "--json"},
sinfo: []string{"sinfo", "-N", "--json"},
lic: []string{"scontrol", "show", "lic", "--json"},
sdiag: []string{"sdiag", "--json"},
licEnabled: cliFlags.SlurmLicEnabled,
Expand Down
Loading
Loading