Skip to content

Commit

Permalink
Merge branch 'data-parser-support' of https://github.com/rivosinc/prometheus-slurm-exporter into data-parser-support
Browse files Browse the repository at this point in the history
  • Loading branch information
abhinavDhulipala committed Mar 26, 2024
2 parents 2a85465 + 8ed95d1 commit a12e27f
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ You can also install the exporter directly with `go install github.com/rivosinc/

```bash
# example installation
$ go install github.com/rivosinc/prometheus-slurm-exporter@v1.0.1
$ go install github.com/rivosinc/prometheus-slurm-exporter@v1.1.1
# or if you like living on the edge
$ go install github.com/rivosinc/prometheus-slurm-exporter@latest
# if not already added, ensure
Expand Down
13 changes: 1 addition & 12 deletions exporter/fixtures/sinfo_dataparser.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,7 @@
"total": 10,
"hostnames": [],
"addresses": [],
"nodes": [
"cs62",
"cs63",
"cs64",
"cs65",
"cs66",
"cs68",
"cs71",
"cs200",
"cs201",
"cs204"
]
"nodes": ["cs62"]
},
"cpus": {
"allocated": 114,
Expand Down
4 changes: 3 additions & 1 deletion exporter/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ func parseCliFallback(squeue []byte, errorCounter prometheus.Counter) ([]JobMetr
}
mem, err := MemToFloat(metric.Mem)
if err != nil {
return nil, err
slog.Error(fmt.Sprintf("squeue fallback parse error: failed on line %d `%s` with err `%q`", i, line, err))
errorCounter.Inc()
continue
}
openapiJobMetric := JobMetric{
Account: metric.Account,
Expand Down
77 changes: 76 additions & 1 deletion exporter/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,83 @@ type sinfoDataParserResponse struct {
Total int `json:"total"`
Nodes []string `json:"nodes"`
} `json:"nodes"`
Cpus struct {
Allocated int `json:"allocated"`
Idle int `json:"idle"`
Other int `json:"other"`
Total int `json:"total"`
}
Memory struct {
Minimum int `json:"minimum"`
Maximum int `json:"maximum"`
Allocated int `json:"allocated"`
Free struct {
Minimum struct {
Set bool `json:"set"`
Number int `json:"number"`
} `json:"minimum"`
Maximum struct {
Set bool `json:"set"`
Number int `json:"number"`
} `json:"maximum"`
} `json:"free"`
}
Partition struct {
Name string `json:"name"`
Alternate string `json:"alternate"`
} `json:"parittion"`
} `json:"sinfo"`
}

// DataParserJsonFetcher scrapes node metrics from `sinfo --json` output
// produced by slurm's data_parser plugin (see sinfoDataParserResponse for
// the expected payload shape).
type DataParserJsonFetcher struct {
	// scraper supplies the raw `sinfo --json` bytes (CLI invocation or file).
	scraper SlurmByteScraper
	// errorCounter tracks scrape/parse failures for prometheus exposition.
	errorCounter prometheus.Counter
	// cache throttles repeated fetches of the parsed node metrics.
	cache *AtomicThrottledCache[NodeMetric]
}

// fetch parses `sinfo --json` (data_parser plugin) output into one NodeMetric
// per sinfo entry. It requires exactly one node per entry (i.e. `sinfo -N
// --json`): with a single node, min == max for both real and free memory, so
// any mismatch means the entry aggregates several hosts and is rejected.
//
// Returns the parsed metrics, or an error if the payload cannot be
// unmarshalled or fails the single-node validation above.
func (dpj *DataParserJsonFetcher) fetch() ([]NodeMetric, error) {
	squeue := new(sinfoDataParserResponse)
	cliJson, err := dpj.scraper.FetchRawBytes()
	if err != nil {
		return nil, err
	}
	if err := json.Unmarshal(cliJson, squeue); err != nil {
		return nil, err
	}
	nodeMetrics := make([]NodeMetric, 0)
	for _, entry := range squeue.Sinfo {
		nodes := entry.Nodes
		// validate single node parse; also guard len(nodes.Nodes) so the
		// Hostname lookup below cannot panic on a Total/Nodes mismatch
		if nodes.Total != 1 || len(nodes.Nodes) != 1 {
			return nil, fmt.Errorf("must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
		}
		// FIX: condition was inverted — it previously errored when both
		// free-memory bounds WERE set (i.e. on every well-formed response).
		// Both bounds must be set for the .Number reads below to be valid.
		if !entry.Memory.Free.Minimum.Set || !entry.Memory.Free.Maximum.Set {
			return nil, fmt.Errorf("unable to scrape free mem metrics")
		}
		// FIX: these two checks previously reused the "1 node per entry"
		// message verbatim, which hid which validation actually failed.
		if entry.Memory.Free.Minimum.Number != entry.Memory.Free.Maximum.Number {
			return nil, fmt.Errorf("free memory min/max differ; must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
		}
		if entry.Memory.Minimum != entry.Memory.Maximum {
			return nil, fmt.Errorf("real memory min/max differ; must contain only 1 node per entry, please use the -N option exp. `sinfo -N --json`")
		}
		metric := NodeMetric{
			Hostname:   nodes.Nodes[0],
			Cpus:       float64(entry.Cpus.Total),
			RealMemory: float64(entry.Memory.Maximum),
			FreeMemory: float64(entry.Memory.Free.Maximum.Number),
			State:      strings.Join(entry.Node.State, "&"),
		}
		// NOTE(review): sinfoDataParserResponse tags Partition with
		// json:"parittion" (typo) — if real sinfo output uses "partition",
		// Name/Alternate will always be empty here; confirm and fix the tag.
		if !slices.Contains(metric.Partitions, entry.Partition.Name) {
			metric.Partitions = append(metric.Partitions, entry.Partition.Name)
		}
		if entry.Partition.Alternate != "" && !slices.Contains(metric.Partitions, entry.Partition.Alternate) {
			metric.Partitions = append(metric.Partitions, entry.Partition.Alternate)
		}
		nodeMetrics = append(nodeMetrics, metric)
	}
	return nodeMetrics, nil
}

type sinfoResponse struct {
Meta struct {
SlurmVersion struct {
Expand Down Expand Up @@ -156,7 +230,8 @@ func (cmf *NodeCliFallbackFetcher) fetch() ([]NodeMetric, error) {
}
if err := json.Unmarshal(line, &metric); err != nil {
cmf.errorCounter.Inc()
return nil, fmt.Errorf("sinfo failed to parse line %d: %s, got %q", i, line, err)
slog.Error(fmt.Sprintf("sinfo failed to parse line %d: %s, got %q", i, line, err))
continue
}
// convert mem units from MB to Bytes
metric.RealMemory *= 1e6
Expand Down

0 comments on commit a12e27f

Please sign in to comment.