Skip to content

Commit

Permalink
Implement "zrepl monitor snapshots count",
Browse files Browse the repository at this point in the history
which checks the snapshot count of each job against the configured rules.

```
$ zrepl monitor snapshots count
OK: monitor snapshots
job "zdisk": all snapshots count: 2451
job "zroot-to-zdisk": all snapshots count: 60
```

It expects configuration like:

``` yaml
monitor:
  count:
    - prefix: "zrepl_frequently_"
      warning: 20
      critical: 30
    - prefix: "zrepl_hourly_"
      warning: 31
      critical: 50
    - prefix: "zrepl_daily_"
      warning: 91
      critical: 92
    - prefix: "zrepl_monthly_"
      warning: 13
      critical: 14
    - prefix: ""  # everything else
      warning: 2
      critical: 10
```
  • Loading branch information
dsh2dsh committed Sep 26, 2024
1 parent 4c0ae81 commit dfaeab7
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 89 deletions.
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,28 @@ pkg install zrepl-dsh2dsh
- [ "unzstd" ]
```

* Added Icinga/Nagios checks for checking the daemon is alive, latest or
oldest snapshots are not too old. See
* Added Icinga/Nagios checks that verify the daemon is alive, the snapshot
count is OK, and the latest or oldest snapshots are not too old. See
[#765](https://github.com/zrepl/zrepl/pull/765). Configuration example:

``` yaml
monitor:
count:
- prefix: "zrepl_frequently_"
warning: 20
critical: 30
- prefix: "zrepl_hourly_"
warning: 31
critical: 50
- prefix: "zrepl_daily_"
warning: 91
critical: 92
- prefix: "zrepl_monthly_"
warning: 13
critical: 14
- prefix: "" # everything else
warning: 2
critical: 10
latest:
- prefix: "zrepl_frequently_"
critical: "48h" # 2d
Expand Down
70 changes: 59 additions & 11 deletions client/monitor/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ var aliveCmd = &cli.Subcommand{

var snapshotsCmd = &cli.Subcommand{
Use: "snapshots",
Short: "check snapshots age",
Short: "check snapshots according to rules",

SetupSubcommands: func() []*cli.Subcommand {
return []*cli.Subcommand{latestCmd, oldestCmd}
return []*cli.Subcommand{countsCmd, latestCmd, oldestCmd}
},

SetupCobra: func(c *cobra.Command) {
Expand All @@ -76,7 +76,27 @@ var snapshotsCmd = &cli.Subcommand{

Run: func(ctx context.Context, cmd *cli.Subcommand, args []string,
) error {
return withJobConfig(cmd, checkSnapshots)
return withJobConfig(cmd, checkSnapshots,
func(m *config.MonitorSnapshots) bool {
return m.Valid()
})
},
}

// countsCmd is the "count" subcommand. It takes no positional arguments and
// runs checkCounts through withJobConfig, passing a filter that accepts a
// job's monitor configuration only when it has at least one Count rule.
var countsCmd = &cli.Subcommand{
	Use:   "count",
	Short: "check snapshots count according to rules",

	SetupCobra: func(cobraCmd *cobra.Command) {
		// Reject any positional arguments.
		cobraCmd.Args = cobra.ExactArgs(0)
	},

	Run: func(_ context.Context, cmd *cli.Subcommand, _ []string) error {
		hasCountRules := func(snaps *config.MonitorSnapshots) bool {
			return len(snaps.Count) != 0
		}
		return withJobConfig(cmd, checkCounts, hasCountRules)
	},
}

Expand All @@ -90,7 +110,10 @@ var latestCmd = &cli.Subcommand{

Run: func(ctx context.Context, cmd *cli.Subcommand, args []string,
) error {
return withJobConfig(cmd, checkLatest)
return withJobConfig(cmd, checkLatest,
func(m *config.MonitorSnapshots) bool {
return len(m.Latest) > 0
})
},
}

Expand All @@ -104,7 +127,10 @@ var oldestCmd = &cli.Subcommand{

Run: func(ctx context.Context, cmd *cli.Subcommand, args []string,
) error {
return withJobConfig(cmd, checkOldest)
return withJobConfig(cmd, checkOldest,
func(m *config.MonitorSnapshots) bool {
return len(m.Oldest) > 0
})
},
}

Expand All @@ -120,11 +146,12 @@ func withStatusClient(cmd *cli.Subcommand, fn func(c *status.Client) error,

func withJobConfig(cmd *cli.Subcommand,
fn func(j *config.JobEnum, resp *monitoringplugin.Response) error,
filterJob func(m *config.MonitorSnapshots) bool,
) (err error) {
resp := monitoringplugin.NewResponse("monitor snapshots")

var foundJob bool
for j := range jobs(cmd.Config(), snapJob) {
for j := range jobs(cmd.Config(), snapJob, filterJob) {
foundJob = true
if err = fn(j, resp); err != nil {
err = fmt.Errorf("job %q: %w", j.Name(), err)
Expand All @@ -143,11 +170,14 @@ func withJobConfig(cmd *cli.Subcommand,
return nil
}

func jobs(c *config.Config, jobName string) iter.Seq[*config.JobEnum] {
func jobs(c *config.Config, jobName string,
filterJob func(m *config.MonitorSnapshots) bool,
) iter.Seq[*config.JobEnum] {
fn := func(yield func(j *config.JobEnum) bool) {
for i := range c.Jobs {
j := &c.Jobs[i]
ok := (jobName == "" && j.MonitorSnapshots().Valid()) ||
m := j.MonitorSnapshots()
ok := (jobName == "" && filterJob(&m)) ||
(jobName != "" && j.Name() == jobName)
if ok && !yield(j) {
break
Expand All @@ -161,10 +191,28 @@ func checkSnapshots(j *config.JobEnum, resp *monitoringplugin.Response) error {
check := NewSnapCheck(resp).
WithPrefix(snapPrefix).
WithThresholds(snapWarn, snapCrit)
if err := check.UpdateStatus(j); err != nil {
return err
m := j.MonitorSnapshots()

if len(m.Count) > 0 {
if err := check.WithCounts(true).UpdateStatus(j); err != nil {
return err
}
}

if len(m.Latest) > 0 {
if err := check.Reset().WithCounts(false).UpdateStatus(j); err != nil {
return err
}
}
return check.Reset().WithOldest(true).UpdateStatus(j)

if len(m.Oldest) > 0 {
return check.Reset().WithOldest(true).UpdateStatus(j)
}
return nil
}

// checkCounts creates a snapshot check that reports into resp, switches it
// on with WithCounts(true), and updates its status for job j. Any error
// returned by UpdateStatus is passed back to the caller unchanged.
func checkCounts(j *config.JobEnum, resp *monitoringplugin.Response) error {
	countCheck := NewSnapCheck(resp).WithCounts(true)
	return countCheck.UpdateStatus(j)
}

func checkLatest(j *config.JobEnum, resp *monitoringplugin.Response) error {
Expand Down
Loading

0 comments on commit dfaeab7

Please sign in to comment.