From dfaeab72567424eef1a70071af3ee1fec9f6accd Mon Sep 17 00:00:00 2001
From: Denis Shaposhnikov <993498+dsh2dsh@users.noreply.github.com>
Date: Thu, 26 Sep 2024 21:36:37 +0200
Subject: [PATCH] Implement "zrepl monitor snapshots count", which checks
 snapshot counts against configured rules.

```
$ zrepl monitor snapshots count
OK: monitor snapshots
job "zdisk": all snapshots count: 2451
job "zroot-to-zdisk": all snapshots count: 60
```

It expects a configuration like:

``` yaml
monitor:
  count:
    - prefix: "zrepl_frequently_"
      warning: 20
      critical: 30
    - prefix: "zrepl_hourly_"
      warning: 31
      critical: 50
    - prefix: "zrepl_daily_"
      warning: 91
      critical: 92
    - prefix: "zrepl_monthly_"
      warning: 13
      critical: 14
    - prefix: ""  # everything else
      warning: 2
      critical: 10
```
---
 README.md                   |  20 ++-
 client/monitor/cmd.go       |  70 +++++++++--
 client/monitor/snapshots.go | 240 +++++++++++++++++++++++++-----------
 config/config.go            |  18 ++-
 4 files changed, 259 insertions(+), 89 deletions(-)

diff --git a/README.md b/README.md
index 54d1df6e..64db48de 100644
--- a/README.md
+++ b/README.md
@@ -116,12 +116,28 @@ pkg install zrepl-dsh2dsh
       - [ "unzstd" ]
     ```
 
-  * Added Icinga/Nagios checks for checking the daemon is alive, latest or
-    oldest snapshots are not too old. See
+  * Added Icinga/Nagios checks verifying that the daemon is alive, the snapshot
+    count is within limits, and the latest or oldest snapshots are not too old. See
     [#765](https://github.com/zrepl/zrepl/pull/765). Configuration example:
 
     ``` yaml
     monitor:
+      count:
+        - prefix: "zrepl_frequently_"
+          warning: 20
+          critical: 30
+        - prefix: "zrepl_hourly_"
+          warning: 31
+          critical: 50
+        - prefix: "zrepl_daily_"
+          warning: 91
+          critical: 92
+        - prefix: "zrepl_monthly_"
+          warning: 13
+          critical: 14
+        - prefix: ""  # everything else
+          warning: 2
+          critical: 10
       latest:
         - prefix: "zrepl_frequently_"
           critical: "48h"  # 2d
diff --git a/client/monitor/cmd.go b/client/monitor/cmd.go
index 5ad1564d..c368c93d 100644
--- a/client/monitor/cmd.go
+++ b/client/monitor/cmd.go
@@ -58,10 +58,10 @@ var aliveCmd = &cli.Subcommand{
 
 var snapshotsCmd = &cli.Subcommand{
 	Use:   "snapshots",
-	Short: "check snapshots age",
+	Short: "check snapshots according to rules",
 
 	SetupSubcommands: func() []*cli.Subcommand {
-		return []*cli.Subcommand{latestCmd, oldestCmd}
+		return []*cli.Subcommand{countsCmd, latestCmd, oldestCmd}
 	},
 
 	SetupCobra: func(c *cobra.Command) {
@@ -76,7 +76,27 @@ var snapshotsCmd = &cli.Subcommand{
 
 	Run: func(ctx context.Context, cmd *cli.Subcommand, args []string,
 	) error {
-		return withJobConfig(cmd, checkSnapshots)
+		return withJobConfig(cmd, checkSnapshots,
+			func(m *config.MonitorSnapshots) bool {
+				return m.Valid()
+			})
 	},
 }
+
+var countsCmd = &cli.Subcommand{
+	Use:   "count",
+	Short: "check snapshots count according to rules",
+
+	SetupCobra: func(c *cobra.Command) {
+		c.Args = cobra.ExactArgs(0)
+	},
+
+	Run: func(ctx context.Context, cmd *cli.Subcommand, args []string,
+	) error {
+		return withJobConfig(cmd, checkCounts,
+			func(m *config.MonitorSnapshots) bool {
+				return len(m.Count) > 0
+			})
+	},
+}
 
@@ -90,7 +110,10 @@ var latestCmd = &cli.Subcommand{
 
 	Run: func(ctx context.Context, cmd *cli.Subcommand, args []string,
 	) error {
-		return withJobConfig(cmd, checkLatest)
+		return withJobConfig(cmd, checkLatest,
+			func(m *config.MonitorSnapshots) bool {
+				return len(m.Latest) > 0
+			})
 	},
 }
 
@@ -104,7 +127,10 @@
 
 	Run: func(ctx context.Context, cmd *cli.Subcommand, args []string,
 	) error {
-		return withJobConfig(cmd, checkOldest)
+		return withJobConfig(cmd, checkOldest,
+			func(m *config.MonitorSnapshots) bool {
+				return len(m.Oldest) > 0
+			})
 	},
 }
 
@@ -120,11 +146,12 @@ func withStatusClient(cmd *cli.Subcommand, fn func(c *status.Client) error,
 
 func withJobConfig(cmd *cli.Subcommand,
 	fn func(j *config.JobEnum, resp *monitoringplugin.Response) error,
+	filterJob func(m *config.MonitorSnapshots) bool,
 ) (err error) {
 	resp := monitoringplugin.NewResponse("monitor snapshots")
 
 	var foundJob bool
-	for j := range jobs(cmd.Config(), snapJob) {
+	for j := range jobs(cmd.Config(), snapJob, filterJob) {
 		foundJob = true
 		if err = fn(j, resp); err != nil {
 			err = fmt.Errorf("job %q: %w", j.Name(), err)
@@ -143,11 +170,14 @@ func withJobConfig(cmd *cli.Subcommand,
 	return nil
 }
 
-func jobs(c *config.Config, jobName string) iter.Seq[*config.JobEnum] {
+func jobs(c *config.Config, jobName string,
+	filterJob func(m *config.MonitorSnapshots) bool,
+) iter.Seq[*config.JobEnum] {
 	fn := func(yield func(j *config.JobEnum) bool) {
 		for i := range c.Jobs {
 			j := &c.Jobs[i]
-			ok := (jobName == "" && j.MonitorSnapshots().Valid()) ||
+			m := j.MonitorSnapshots()
+			ok := (jobName == "" && filterJob(&m)) ||
 				(jobName != "" && j.Name() == jobName)
 			if ok && !yield(j) {
 				break
@@ -161,10 +191,28 @@ func checkSnapshots(j *config.JobEnum, resp *monitoringplugin.Response) error {
 	check := NewSnapCheck(resp).
 		WithPrefix(snapPrefix).
 		WithThresholds(snapWarn, snapCrit)
-	if err := check.UpdateStatus(j); err != nil {
-		return err
+	m := j.MonitorSnapshots()
+
+	if len(m.Count) > 0 {
+		if err := check.WithCounts(true).UpdateStatus(j); err != nil {
+			return err
+		}
+	}
+
+	if len(m.Latest) > 0 {
+		if err := check.Reset().WithCounts(false).UpdateStatus(j); err != nil {
+			return err
+		}
 	}
-	return check.Reset().WithOldest(true).UpdateStatus(j)
+
+	if len(m.Oldest) > 0 {
+		return check.Reset().WithOldest(true).UpdateStatus(j)
+	}
+	return nil
+}
+
+func checkCounts(j *config.JobEnum, resp *monitoringplugin.Response) error {
+	return NewSnapCheck(resp).WithCounts(true).UpdateStatus(j)
 }
 
 func checkLatest(j *config.JobEnum, resp *monitoringplugin.Response) error {
diff --git a/client/monitor/snapshots.go b/client/monitor/snapshots.go
index 0c6471ee..20357528 100644
--- a/client/monitor/snapshots.go
+++ b/client/monitor/snapshots.go
@@ -19,7 +19,9 @@ func NewSnapCheck(resp *monitoringplugin.Response) *SnapCheck {
 }
 
 type SnapCheck struct {
+	counts bool
 	oldest bool
+
 	job    string
 	prefix string
 	warn   time.Duration
@@ -27,8 +29,10 @@ type SnapCheck struct {
 
 	resp *monitoringplugin.Response
 
-	age      time.Duration
-	snapName string
+	age       time.Duration
+	snapCount uint
+	snapName  string
+	failed    bool
 
 	datasets        map[string][]zfs.FilesystemVersion
 	orderedDatasets []string
@@ -56,56 +60,44 @@ func (self *SnapCheck) WithResponse(resp *monitoringplugin.Response,
 	return self
 }
 
-func (self *SnapCheck) UpdateStatus(jobConfig *config.JobEnum) error {
-	if err := self.Run(context.Background(), jobConfig); err != nil {
-		return err
-	} else if self.resp.GetStatusCode() == monitoringplugin.OK {
-		self.resp.UpdateStatus(monitoringplugin.OK, self.statusf(
-			"%s %q: %v",
-			self.snapshotType(), self.snapName, self.age))
-	}
-	return nil
+func (self *SnapCheck) WithCounts(v bool) *SnapCheck {
+	self.counts = v
+	return self
 }
 
-func (self *SnapCheck) Run(ctx context.Context, jobConfig *config.JobEnum,
-) error {
-	self.job = jobConfig.Name()
-	datasets, rules, err := self.datasetRules(ctx, jobConfig)
-	if err != nil {
-		return err
-	} else if rules, err = self.overrideRules(rules); err != nil {
+func (self *SnapCheck) UpdateStatus(jobConfig *config.JobEnum) error {
+	if err := self.Run(context.Background(), jobConfig); err != nil {
 		return err
 	}
 
-	for _, dataset := range datasets {
-		if err := self.checkDataset(ctx, dataset, rules); err != nil {
-			return err
-		}
+	switch {
+	case self.failed:
+	case self.counts:
+		self.updateStatus(monitoringplugin.OK,
+			"all snapshots count: %d", self.snapCount)
+	default:
+		self.updateStatus(monitoringplugin.OK, "%s %q: %v",
+			self.snapshotType(), self.snapName, self.age)
 	}
 	return nil
 }
 
-func (self *SnapCheck) overrideRules(rules []config.MonitorSnapshot,
-) ([]config.MonitorSnapshot, error) {
-	if self.prefix != "" {
-		rules = []config.MonitorSnapshot{
-			{
-				Prefix:   self.prefix,
-				Warning:  self.warn,
-				Critical: self.crit,
-			},
-		}
+func (self *SnapCheck) Run(ctx context.Context, j *config.JobEnum) error {
+	self.job = j.Name()
+	datasets, err := self.jobDatasets(ctx, j)
+	if err != nil {
+		return err
 	}
 
-	if len(rules) == 0 {
-		return nil, errors.New("no monitor rules or cli args defined")
+	if self.counts {
+		return self.checkCounts(ctx, j, datasets)
 	}
-	return rules, nil
+	return self.checkCreation(ctx, j, datasets)
 }
 
-func (self *SnapCheck) datasetRules(
+func (self *SnapCheck) jobDatasets(
 	ctx context.Context, jobConfig *config.JobEnum,
-) (datasets []string, rules []config.MonitorSnapshot, err error) {
+) (datasets []string, err error) {
 	switch j := jobConfig.Ret.(type) {
 	case *config.PushJob:
 		datasets, err = self.datasetsFromFilter(ctx, j.Filesystems)
@@ -121,18 +113,9 @@ func (self *SnapCheck) datasetRules(
 		err = fmt.Errorf("unknown job type %T", j)
 	}
 
-	if err != nil {
-		return
+	if err == nil {
+		self.datasets = make(map[string][]zfs.FilesystemVersion, len(datasets))
 	}
-
-	cfg := jobConfig.MonitorSnapshots()
-	if self.oldest {
-		rules = cfg.Oldest
-	} else {
-		rules = cfg.Latest
-	}
-
-	self.datasets = make(map[string][]zfs.FilesystemVersion, len(datasets))
 	return
 }
 
@@ -206,18 +189,34 @@ func (self *SnapCheck) datasetsFromRootFs(
 	return filtered, nil
 }
 
-func (self *SnapCheck) checkDataset(
-	ctx context.Context, fsName string, rules []config.MonitorSnapshot,
+func (self *SnapCheck) checkCounts(ctx context.Context, j *config.JobEnum,
+	datasets []string,
+) error {
+	rules := j.MonitorSnapshots().Count
+	if len(rules) == 0 {
+		return errors.New("no monitor rules defined")
+	}
+
+	for _, dataset := range datasets {
+		if err := self.checkSnapsCounts(ctx, dataset, rules); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (self *SnapCheck) checkSnapsCounts(ctx context.Context, fsName string,
+	rules []config.MonitorCount,
 ) error {
 	snaps, err := self.snapshots(ctx, fsName)
 	if err != nil {
 		return err
 	}
-	latest := self.byCreation(snaps, rules)
+	grouped := self.byCount(snaps, rules)
 	for i := range rules {
-		if !self.applyRule(&rules[i], latest[i], fsName) {
-			return nil
+		if !self.applyCountRule(&rules[i], fsName, grouped[i]) {
+			break
 		}
 	}
 	return nil
 }
@@ -243,8 +242,112 @@ func (self *SnapCheck) snapshots(ctx context.Context, fsName string,
 	return snaps, err
 }
 
+func (self *SnapCheck) byCount(snaps []zfs.FilesystemVersion,
+	rules []config.MonitorCount,
+) []uint {
+	grouped := make([]uint, len(rules))
+	for i := range snaps {
+		s := &snaps[i]
+		for j := range rules {
+			r := &rules[j]
+			if r.Prefix == "" || strings.HasPrefix(s.Name, r.Prefix) {
+				grouped[j]++
+				break
+			}
+		}
+	}
+	return grouped
+}
+
+func (self *SnapCheck) applyCountRule(rule *config.MonitorCount, fsName string,
+	cnt uint,
+) bool {
+	if cnt == 0 && rule.Prefix == "" {
+		return true
+	} else if cnt == 0 {
+		self.resp.UpdateStatus(monitoringplugin.CRITICAL, fmt.Sprintf(
+			"%q has no snapshots with prefix %q", fsName, rule.Prefix))
+		return false
+	}
+
+	const msg = "%s: %q snapshots count: %d (%d)"
+	switch {
+	case cnt >= rule.Critical:
+		self.updateStatus(monitoringplugin.CRITICAL, msg,
+			fsName, rule.Prefix, cnt, rule.Critical)
+		return false
+	case rule.Warning > 0 && cnt >= rule.Warning:
+		self.updateStatus(monitoringplugin.WARNING, msg,
+			fsName, rule.Prefix, cnt, rule.Warning)
+		return false
+	default:
+		self.snapCount += cnt
+	}
+	return true
+}
+
+func (self *SnapCheck) checkCreation(ctx context.Context, j *config.JobEnum,
+	datasets []string,
+) error {
+	rules, err := self.overrideRules(self.rulesByCreation(j))
+	if err != nil {
+		return err
+	}
+
+	for _, dataset := range datasets {
+		if err := self.checkSnapsCreation(ctx, dataset, rules); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (self *SnapCheck) overrideRules(rules []config.MonitorCreation,
+) ([]config.MonitorCreation, error) {
+	if self.prefix != "" {
+		rules = []config.MonitorCreation{
+			{
+				Prefix:   self.prefix,
+				Warning:  self.warn,
+				Critical: self.crit,
+			},
+		}
+	}
+
+	if len(rules) == 0 {
+		return nil, errors.New("no monitor rules or cli args defined")
+	}
+	return rules, nil
+}
+
+func (self *SnapCheck) rulesByCreation(j *config.JobEnum,
+) []config.MonitorCreation {
+	cfg := j.MonitorSnapshots()
+	if self.oldest {
+		return cfg.Oldest
+	}
+	return cfg.Latest
+}
+
+func (self *SnapCheck) checkSnapsCreation(
+	ctx context.Context, fsName string, rules []config.MonitorCreation,
+) error {
+	snaps, err := self.snapshots(ctx, fsName)
+	if err != nil {
+		return err
+	}
+
+	latest := self.byCreation(snaps, rules)
+	for i := range rules {
+		if !self.applyCreationRule(&rules[i], latest[i], fsName) {
+			return nil
+		}
+	}
+	return nil
+}
+
 func (self *SnapCheck) byCreation(snaps []zfs.FilesystemVersion,
-	rules []config.MonitorSnapshot,
+	rules []config.MonitorCreation,
 ) []*zfs.FilesystemVersion {
 	grouped := make([]*zfs.FilesystemVersion, len(rules))
 	for i := range snaps {
@@ -278,7 +381,7 @@ func (self *SnapCheck) snapshotType() string {
 	return "latest"
 }
 
-func (self *SnapCheck) applyRule(rule *config.MonitorSnapshot,
+func (self *SnapCheck) applyCreationRule(rule *config.MonitorCreation,
 	snap *zfs.FilesystemVersion, fsName string,
 ) bool {
 	if snap == nil && rule.Prefix == "" {
@@ -294,14 +397,12 @@
 
 	switch {
 	case d >= rule.Critical:
-		self.resp.UpdateStatus(monitoringplugin.CRITICAL, self.statusf(
-			tooOldFmt,
-			self.snapshotType(), snap.FullPath(fsName), d, rule.Critical))
+		self.updateStatus(monitoringplugin.CRITICAL, tooOldFmt,
+			self.snapshotType(), snap.FullPath(fsName), d, rule.Critical)
 		return false
 	case rule.Warning > 0 && d >= rule.Warning:
-		self.resp.UpdateStatus(monitoringplugin.WARNING, self.statusf(
-			tooOldFmt,
-			self.snapshotType(), snap.FullPath(fsName), d, rule.Warning))
+		self.updateStatus(monitoringplugin.WARNING, tooOldFmt,
+			self.snapshotType(), snap.FullPath(fsName), d, rule.Warning)
 		return false
 	case self.age == 0:
 		fallthrough
@@ -314,20 +415,17 @@
 	return true
 }
 
-func (self *SnapCheck) statusf(format string, a ...any) string {
-	return self.status(fmt.Sprintf(format, a...))
-}
-
-func (self *SnapCheck) status(s string) string {
-	prefix := fmt.Sprintf("job %q", self.job)
-	if s == "" {
-		return prefix
-	}
-	return prefix + ": " + s
+func (self *SnapCheck) updateStatus(statusCode int, format string, a ...any) {
+	self.failed = self.failed || statusCode != monitoringplugin.OK
+	statusMessage := fmt.Sprintf("job %q: ", self.job) +
+		fmt.Sprintf(format, a...)
+	self.resp.UpdateStatus(statusCode, statusMessage)
 }
 
 func (self *SnapCheck) Reset() *SnapCheck {
 	self.age = 0
+	self.snapCount = 0
 	self.snapName = ""
+	self.failed = false
 	return self
 }
diff --git a/config/config.go b/config/config.go
index a18f96ed..4bc2d801 100644
--- a/config/config.go
+++ b/config/config.go
@@ -115,18 +115,26 @@ type ConflictResolution struct {
 }
 
 type MonitorSnapshots struct {
-	Latest []MonitorSnapshot `yaml:"latest" validate:"dive"`
-	Oldest []MonitorSnapshot `yaml:"oldest" validate:"dive"`
+	Count  []MonitorCount    `yaml:"count" validate:"dive"`
+	Latest []MonitorCreation `yaml:"latest" validate:"dive"`
+	Oldest []MonitorCreation `yaml:"oldest" validate:"dive"`
 }
 
-type MonitorSnapshot struct {
+type MonitorCount struct {
+	Prefix   string `yaml:"prefix"`
+	Warning  uint   `yaml:"warning"`
+	Critical uint   `yaml:"critical" validate:"required"`
+}
+
+type MonitorCreation struct {
 	Prefix   string        `yaml:"prefix"`
 	Warning  time.Duration `yaml:"warning"`
 	Critical time.Duration `yaml:"critical" validate:"required"`
 }
 
-func (self MonitorSnapshots) Valid() bool {
-	return len(self.Latest) > 0 || len(self.Oldest) > 0
+func (self *MonitorSnapshots) Valid() bool {
+	return len(self.Count) > 0 || len(self.Latest) > 0 ||
+		len(self.Oldest) > 0
 }
 
 type PassiveJob struct {
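The `count` rules are evaluated per job (the checker reads them via `j.MonitorSnapshots()`), so in a full config they sit under a job's `monitor` section. Below is a minimal sketch of one snap job carrying the new rules; the job name, filesystem filter, snapshotting and pruning settings are hypothetical, and only the `monitor.count` block reflects what this patch adds:

``` yaml
jobs:
  - name: "zdisk"              # hypothetical job name
    type: "snap"
    filesystems:
      "zdisk<": true
    snapshotting:
      type: "periodic"
      interval: "15m"
      prefix: "zrepl_frequently_"
    pruning:
      keep:
        - type: "last_n"
          count: 20
    monitor:
      count:
        - prefix: "zrepl_frequently_"
          warning: 20
          critical: 30
        - prefix: ""           # catch-all; keep it last
          warning: 2
          critical: 10
```

Because each snapshot is attributed to the first rule whose prefix matches, the empty-prefix catch-all should stay at the end of the list, otherwise it would absorb every snapshot.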
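With the new subcommand wired in, `zrepl monitor snapshots count` runs only the count check, while the bare `zrepl monitor snapshots` runs every check a job has configured (count, latest and oldest), skipping jobs whose `monitor` section is empty. The process exit code comes from the monitoringplugin response and should follow the usual Nagios convention (0 OK, 1 WARNING, 2 CRITICAL), which is what an Icinga/Nagios check command consumes; the invocations below are illustrative:

```
# only the snapshot count rules
$ zrepl monitor snapshots count

# all configured snapshot checks (count, latest, oldest)
$ zrepl monitor snapshots
```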
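For readers skimming the diff, the grouping semantics of `byCount`/`applyCountRule` can be illustrated with a standalone sketch: every snapshot is counted towards the first rule whose prefix matches its name, an empty prefix acts as the catch-all, and a rule turns WARNING/CRITICAL once its count reaches the configured thresholds. The snapshot names and thresholds below are hypothetical and the code is not part of the patch:

``` go
package main

import (
	"fmt"
	"strings"
)

// countRule mirrors config.MonitorCount for the purpose of this sketch.
type countRule struct {
	Prefix   string
	Warning  uint
	Critical uint
}

func main() {
	// Hypothetical snapshot names on a single dataset.
	snaps := []string{
		"zrepl_frequently_20240926_120000_000",
		"zrepl_frequently_20240926_121500_000",
		"zrepl_hourly_20240926_120000_000",
		"manual_before_upgrade",
		"manual_after_upgrade",
	}
	rules := []countRule{
		{Prefix: "zrepl_frequently_", Warning: 20, Critical: 30},
		{Prefix: "zrepl_hourly_", Warning: 31, Critical: 50},
		{Prefix: "", Warning: 2, Critical: 10}, // everything else
	}

	// Attribute each snapshot to the first matching rule (first-match-wins),
	// like SnapCheck.byCount does.
	counts := make([]uint, len(rules))
	for _, name := range snaps {
		for i, r := range rules {
			if r.Prefix == "" || strings.HasPrefix(name, r.Prefix) {
				counts[i]++
				break
			}
		}
	}

	// Classify each rule's count against its thresholds, like applyCountRule.
	for i, r := range rules {
		status := "OK"
		switch {
		case counts[i] >= r.Critical:
			status = "CRITICAL"
		case r.Warning > 0 && counts[i] >= r.Warning:
			status = "WARNING"
		}
		fmt.Printf("prefix %q: count=%d -> %s\n", r.Prefix, counts[i], status)
	}
}
```

Running it prints OK for the two zrepl prefixes and WARNING for the catch-all, since the two "manual" snapshots reach its warning threshold of 2.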