Skip to content

Commit

Permalink
[exporter/jobs] consolidate node not avail reason (#106)
Browse files Browse the repository at this point in the history
* consolidate node not avail reason

* test node avail and new no err
  • Loading branch information
abhinavDhulipala authored Nov 17, 2024
1 parent 2dd03c9 commit ac18f46
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 3 deletions.
2 changes: 2 additions & 0 deletions exporter/fixtures/squeue_fallback.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
{"a": "account1", "id": 26515966, "end_time": "2023-09-21T00:21:42", "state": "RUNNING", "p": "hw-h", "cpu": 1, "mem": "128G", "array_id": "N/A", "r": "cs10"}
{"a": "account1", "id": 50580016, "end_time": "2023-09-21T14:31:11", "state": "RUNNING", "p": "hw-l", "cpu": 1, "mem": "62.50G", "array_id": "N/A", "r": "cs10"}
{"a": "account1", "id": 51447051, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A", "r": "(Dependency)"}
{"a": "account1", "id": 51447052, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A", "r": "((ReqNodeNotAvail, UnavailableNodes:cs[100,101,102]))"}
{"a": "account1", "id": 51447053, "end_time": "N/A", "state": "PENDING", "p": "hw-h", "cpu": 1, "mem": "40000M", "array_id": "N/A", "r": "(Nodes required for job are DOWN, DRAINED or reserved for jobs in higher priority partitions)"}
{"a": "account1", "id": 18804, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": 24, "mem": "118G", "array_id": "N/A", "r": "(Priority)"}
# test counter inc with faulty inputs
{"a": "account1", "id": 18805, "end_time": "NONE", "state": "PENDING", "p": "magma", "cpu": xx, "mem": "118G", "array_id": "N/A"}
Expand Down
17 changes: 14 additions & 3 deletions exporter/jobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ import (
"golang.org/x/exp/slog"
)

// reqNodeNotAvailReason is the text that marks a pending job whose requested
// node is unavailable. It is matched as a substring because the reported
// reason can carry a node list after it, e.g.
// "ReqNodeNotAvail, UnavailableNodes:cs[100,101,102]".
const reqNodeNotAvailReason string = "ReqNodeNotAvail, UnavailableNodes"

// NodeResource holds a single memory figure for a node and is encoded in JSON
// under the "memory" key.
// NOTE(review): the unit of Mem is not visible in this hunk — confirm against
// the code that populates it.
type NodeResource struct {
	Mem float64 `json:"memory"`
}
Expand Down Expand Up @@ -132,7 +135,7 @@ func (jcf *JobCliFallbackFetcher) fetch() ([]JobMetric, error) {
jcf.errCounter.Inc()
continue
}
re := regexp.MustCompile(`^\((?P<reason>(\w)+)\)$`)
re := regexp.MustCompile(`^\((?P<reason>(.+))\)$`)
if metric.JobState == "PENDING" {
if matches := re.FindStringSubmatch(metric.StateReason); matches != nil {
metric.StateReason = matches[re.SubexpIndex("reason")]
Expand Down Expand Up @@ -279,9 +282,17 @@ func parseStateReasonMetric(jobs []JobMetric) *StateReasonMetric {
}

for _, job := range jobs {
if job.JobState == "PENDING" {
metric.pendingStateCount[job.StateReason]++
if job.JobState != "PENDING" {
continue
}
reason := job.StateReason
if strings.Contains(reason, reqNodeNotAvailReason) {
// Consolidate the pending "node not available" reason so it is node agnostic, i.e.
// from (ReqNodeNotAvail, UnavailableNodes:cs[100,...])
// to (ReqNodeNotAvail, UnavailableNodes)
reason = fmt.Sprintf("(%s)", reqNodeNotAvailReason)
}
metric.pendingStateCount[reason]++
}
return &metric
}
Expand Down
8 changes: 8 additions & 0 deletions exporter/jobs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package exporter

import (
"strings"
"testing"
"time"

Expand Down Expand Up @@ -74,6 +75,13 @@ func TestParseCliFallback(t *testing.T) {
metrics, err := cliFallbackFetcher.fetch()
assert.Nil(err)
assert.NotEmpty(metrics)
nodeAvailMetricsCount := 0
for _, metric := range metrics {
if strings.Contains(metric.StateReason, reqNodeNotAvailReason) {
nodeAvailMetricsCount++
}
}
assert.Equal(1, nodeAvailMetricsCount)
assert.Equal(2., CollectCounterValue(cliFallbackFetcher.errCounter))
}

Expand Down

0 comments on commit ac18f46

Please sign in to comment.