Skip to content

Commit

Permalink
Output age of the youngest snapshot, instead of the oldest one
Browse files Browse the repository at this point in the history
  • Loading branch information
dsh2dsh committed Apr 26, 2024
1 parent a625213 commit 81e9a2e
Showing 1 changed file with 54 additions and 42 deletions.
96 changes: 54 additions & 42 deletions client/monitorcmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
"github.com/zrepl/zrepl/zfs"
)

// snapshotsOkMsg is the format string for the status message of the snapshots
// check. Its verbs take, in order: the job name (%q), the snapshot type (%s)
// and the snapshot age (%v).
const snapshotsOkMsg = "job %q: %s snapshot: %v"

var MonitorCmd = &cli.Subcommand{
Use: "monitor",
Short: "Icinga/Nagios health checks",
Expand All @@ -36,14 +38,14 @@ func newMonitorAliveCmd() *cli.Subcommand {
}

func newMonitorSnapshotsCmd() *cli.Subcommand {
runner := monitorSnapshots{}
runner := newMonitorSnapshots()
return &cli.Subcommand{
Use: "snapshots",
Short: "check snapshots age",
SetupSubcommands: func() []*cli.Subcommand {
return []*cli.Subcommand{
newLatestSnapshotsCmd(&runner),
newOldestSnapshotsCmd(&runner),
newLatestSnapshotsCmd(runner),
newOldestSnapshotsCmd(runner),
}
},
SetupCobra: func(c *cobra.Command) {
Expand All @@ -63,7 +65,11 @@ func newLatestSnapshotsCmd(runner *monitorSnapshots) *cli.Subcommand {
return &cli.Subcommand{
Use: "latest",
Short: "check latest snapshots are not too old, according to rules",
Run: runner.run,
Run: func(ctx context.Context, subcmd *cli.Subcommand, args []string,
) error {
runner.outputAndExit(runner.run(ctx, subcmd, args))
return nil
},
}
}

Expand All @@ -73,24 +79,42 @@ func newOldestSnapshotsCmd(runner *monitorSnapshots) *cli.Subcommand {
Short: "check oldest snapshots are not too old, according to rules",
Run: func(ctx context.Context, subcmd *cli.Subcommand, args []string,
) error {
runner.oldest = true
return runner.run(ctx, subcmd, args)
runner.outputAndExit(runner.withOldest(true).run(ctx, subcmd, args))
return nil
},
}
}

// newMonitorSnapshots allocates a zero-valued monitorSnapshots and runs it
// through applyOptions before returning it to the caller.
func newMonitorSnapshots() *monitorSnapshots {
	return (&monitorSnapshots{}).applyOptions()
}

type monitorSnapshots struct {
job string
oldest bool
prefix string
critical time.Duration
warning time.Duration

age time.Duration
resp *monitoringplugin.Response
age time.Duration
}

// applyOptions fills in fields that were left unset and returns the receiver
// so the call can be chained. At present this only creates resp from
// snapshotsOkMsg when it is nil; an already assigned resp is left untouched.
func (self *monitorSnapshots) applyOptions() *monitorSnapshots {
	if self.resp != nil {
		return self
	}
	self.resp = monitoringplugin.NewResponse(snapshotsOkMsg)
	return self
}

// withOldest stores v in the oldest field and returns the receiver, so the
// call can be chained (e.g. runner.withOldest(true).run(...)).
func (self *monitorSnapshots) withOldest(v bool) *monitorSnapshots {
self.oldest = v
return self
}

func (self *monitorSnapshots) run(
ctx context.Context, subcmd *cli.Subcommand, args []string,
ctx context.Context, subcmd *cli.Subcommand, _ []string,
) error {
jobConfig, err := subcmd.Config().Job(self.job)
if err != nil {
Expand All @@ -103,9 +127,7 @@ func (self *monitorSnapshots) run(
} else if rules, err = self.overrideRules(rules); err != nil {
return err
}
self.outputAndExit(self.checkSnapshots(ctx, datasets, rules))

return nil
return self.checkSnapshots(ctx, datasets, rules)
}

func (self *monitorSnapshots) overrideRules(
Expand Down Expand Up @@ -251,25 +273,27 @@ func (self *monitorSnapshots) checkDataset(

latest := self.groupSnapshots(snaps, rules)
for i, rule := range rules {
const tooOldFmt = "%s %q too old: %q > %q"
d := time.Since(latest[i].Creation).Truncate(time.Second)
const tooOldFmt = "%s %q too old: %q > %q"
switch {
case rule.Prefix == "" && latest[i].Creation.IsZero():
case latest[i].Creation.IsZero():
err = newMonitorCriticalf(
"%q has no snapshots with prefix %q", name, rule.Prefix)
case time.Since(latest[i].Creation) >= rule.Critical:
err = newMonitorCriticalf(tooOldFmt, self.snapshotType(),
latest[i].FullPath(name), d, rule.Critical)
case rule.Warning > 0 && time.Since(latest[i].Creation) >= rule.Warning:
err = newMonitorWarningf(tooOldFmt, self.snapshotType(),
latest[i].FullPath(name), d, rule.Warning)
case d > self.age:
self.resp.UpdateStatus(monitoringplugin.CRITICAL, fmt.Sprintf(
"%q has no snapshots with prefix %q", name, rule.Prefix))
return nil
case d >= rule.Critical:
self.resp.UpdateStatus(monitoringplugin.CRITICAL, fmt.Sprintf(
tooOldFmt,
self.snapshotType(), latest[i].FullPath(name), d, rule.Critical))
return nil
case rule.Warning > 0 && d >= rule.Warning:
self.resp.UpdateStatus(monitoringplugin.WARNING, fmt.Sprintf(
tooOldFmt,
self.snapshotType(), latest[i].FullPath(name), d, rule.Warning))
return nil
case self.age == 0 || d < self.age:
self.age = d
}
if err != nil {
return err
}
}
return nil
}
Expand Down Expand Up @@ -309,26 +333,14 @@ func (self *monitorSnapshots) cmpSnapshots(
}

func (self *monitorSnapshots) outputAndExit(err error) {
resp := monitoringplugin.NewResponse(fmt.Sprintf("job %q: %s snapshot: %v",
self.job, self.snapshotType(), self.age))

if err != nil {
status := fmt.Sprintf("job %q: %s", self.job, err)
var checkResult monitorCheckResult
if errors.As(err, &checkResult) {
switch {
case checkResult.critical:
resp.UpdateStatus(monitoringplugin.CRITICAL, status)
case checkResult.warning:
resp.UpdateStatus(monitoringplugin.WARNING, status)
default:
resp.UpdateStatus(monitoringplugin.UNKNOWN, status)
}
} else {
resp.UpdateStatus(monitoringplugin.UNKNOWN, status)
}
self.resp.UpdateStatusOnError(fmt.Errorf("job %q: %w", self.job, err),
monitoringplugin.UNKNOWN, "", true)
} else {
self.resp.WithDefaultOkMessage(fmt.Sprintf(snapshotsOkMsg,
self.job, self.snapshotType(), self.age))
}
resp.OutputAndExit()
self.resp.OutputAndExit()
}

func (self *monitorSnapshots) snapshotType() string {
Expand Down

0 comments on commit 81e9a2e

Please sign in to comment.