Add ZFS replication metrics #243

Merged · 12 commits · Apr 27, 2024
21 changes: 21 additions & 0 deletions README.rst
@@ -52,6 +52,7 @@ Usage
[--collector.cluster | --no-collector.cluster]
[--collector.resources | --no-collector.resources]
[--collector.config | --no-collector.config]
[--collector.replication | --no-collector.replication]
[--config.file CONFIG_FILE]
[--web.listen-address WEB_LISTEN_ADDRESS]
[--server.keyfile SERVER_KEYFILE]
@@ -90,6 +91,8 @@ Usage

--collector.config, --no-collector.config
Exposes PVE onboot status
--collector.replication, --no-collector.replication
Exposes PVE replication info


Use `[::]` in the `--web.listen-address` flag in order to bind to both IPv6 and
@@ -191,6 +194,24 @@ Here's an example of the metrics exported.
# HELP pve_version_info Proxmox VE version info
# TYPE pve_version_info gauge
pve_version_info{release="7.1",repoid="6fe299a0",version="7.1-5"} 1.0
# HELP pve_replication_duration_seconds Proxmox vm replication duration
# TYPE pve_replication_duration_seconds gauge
pve_replication_duration_seconds{id="1-0"} 7.73584
# HELP pve_replication_last_sync_timestamp_seconds Proxmox vm replication last_sync
# TYPE pve_replication_last_sync_timestamp_seconds gauge
pve_replication_last_sync_timestamp_seconds{id="1-0"} 1.713382503e+09
# HELP pve_replication_last_try_timestamp_seconds Proxmox vm replication last_try
# TYPE pve_replication_last_try_timestamp_seconds gauge
pve_replication_last_try_timestamp_seconds{id="1-0"} 1.713382503e+09
# HELP pve_replication_next_sync_timestamp_seconds Proxmox vm replication next_sync
# TYPE pve_replication_next_sync_timestamp_seconds gauge
pve_replication_next_sync_timestamp_seconds{id="1-0"} 1.7134689e+09
# HELP pve_replication_failed_syncs Proxmox vm replication fail_count
# TYPE pve_replication_failed_syncs gauge
pve_replication_failed_syncs{id="1-0"} 0.0
# HELP pve_replication_info Proxmox vm replication info
# TYPE pve_replication_info gauge
pve_replication_info{guest="qemu/1",id="1-0",source="node/proxmox1",target="node/proxmox2",type="local"} 1.0
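
Since `last_sync`, `last_try` and `next_sync` are Unix timestamps, replication lag is a simple subtraction against the current time. Here is a minimal sketch that parses the exporter's output with the Prometheus client library and prints the lag per job; the endpoint URL, port and the use of `requests` are assumptions for illustration, not part of this change:

import time

import requests
from prometheus_client.parser import text_string_to_metric_families

# Assumed exporter endpoint; adjust host, port and target to your deployment.
body = requests.get("http://localhost:9221/pve?target=proxmox1").text

for family in text_string_to_metric_families(body):
    if family.name == "pve_replication_last_sync_timestamp_seconds":
        for sample in family.samples:
            lag = time.time() - sample.value
            print(f"job {sample.labels['id']}: {lag:.0f}s since last sync")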

Authentication
--------------
7 changes: 6 additions & 1 deletion src/pve_exporter/cli.py
@@ -46,6 +46,10 @@ def main():
action=BooleanOptionalAction, default=True,
help='Exposes PVE onboot status')

nodeflags.add_argument('--collector.replication', dest='collector_replication',
action=BooleanOptionalAction, default=True,
help='Exposes PVE replication info')

parser.add_argument('--config.file', type=pathlib.Path,
dest="config_file", default='/etc/prometheus/pve.yml',
help='Path to config file (/etc/prometheus/pve.yml)')
@@ -69,7 +73,8 @@ def main():
node=params.collector_node,
cluster=params.collector_cluster,
resources=params.collector_resources,
config=params.collector_config
config=params.collector_config,
replication=params.collector_replication
)

# Load configuration.
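
The `--collector.replication` / `--no-collector.replication` pair above comes from a single declaration: argparse's `BooleanOptionalAction` (Python 3.9+) synthesizes the `--no-…` variant automatically. A self-contained sketch of the same pattern:

import argparse

parser = argparse.ArgumentParser()
# One declaration registers both --collector.replication and
# --no-collector.replication; the default is True (collector enabled).
parser.add_argument('--collector.replication', dest='collector_replication',
                    action=argparse.BooleanOptionalAction, default=True,
                    help='Exposes PVE replication info')

print(parser.parse_args([]).collector_replication)                              # True
print(parser.parse_args(['--no-collector.replication']).collector_replication)  # False
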
8 changes: 7 additions & 1 deletion src/pve_exporter/collector/__init__.py
@@ -14,7 +14,10 @@
VersionCollector,
ClusterInfoCollector
)
from pve_exporter.collector.node import NodeConfigCollector
from pve_exporter.collector.node import (
NodeConfigCollector,
NodeReplicationCollector
)

CollectorsOptions = collections.namedtuple('CollectorsOptions', [
'status',
@@ -23,6 +26,7 @@
'cluster',
'resources',
'config',
'replication'
])


@@ -44,5 +48,7 @@ def collect_pve(config, host, cluster, node, options: CollectorsOptions):
registry.register(VersionCollector(pve))
if node and options.config:
registry.register(NodeConfigCollector(pve))
if node and options.replication:
registry.register(NodeReplicationCollector(pve))

return generate_latest(registry)
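
The `registry.register(...)` gating above relies on prometheus_client's duck-typed collector protocol: any object with a `collect()` method that yields metric families can be registered. A minimal stand-in, using only the public prometheus_client API:

from prometheus_client import generate_latest
from prometheus_client.core import CollectorRegistry, GaugeMetricFamily

class DemoCollector:
    """Stand-in collector: any object exposing collect() works."""
    def collect(self):
        gauge = GaugeMetricFamily('demo_gauge', 'Example gauge', labels=['id'])
        gauge.add_metric(['1-0'], 1.0)
        yield gauge

registry = CollectorRegistry()
registry.register(DemoCollector())
print(generate_latest(registry).decode())
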
69 changes: 69 additions & 0 deletions src/pve_exporter/collector/node.py
@@ -4,6 +4,7 @@
# pylint: disable=too-few-public-methods

import logging
import itertools

from prometheus_client.core import GaugeMetricFamily

@@ -57,3 +58,71 @@ def collect(self): # pylint: disable=missing-docstring
metrics[key].add_metric(label_values, metric_value)

return metrics.values()

class NodeReplicationCollector:
"""
    Collects Proxmox VE replication job information from each job's status
    endpoint, i.e. replication duration, last_sync, last_try, next_sync and
    fail_count.
    For a manual check: "pvesh get /nodes/<node>/replication/<id>/status"
"""

def __init__(self, pve):
self._pve = pve

def collect(self): # pylint: disable=missing-docstring

info_metrics = {
'info': GaugeMetricFamily(
'pve_replication_info',
'Proxmox vm replication info',
labels=['id', 'type', 'source', 'target', 'guest'])
}

metrics = {
'duration': GaugeMetricFamily(
'pve_replication_duration_seconds',
'Proxmox vm replication duration',
labels=['id']),
'last_sync': GaugeMetricFamily(
'pve_replication_last_sync_timestamp_seconds',
'Proxmox vm replication last_sync',
labels=['id']),
'last_try': GaugeMetricFamily(
'pve_replication_last_try_timestamp_seconds',
'Proxmox vm replication last_try',
labels=['id']),
'next_sync': GaugeMetricFamily(
'pve_replication_next_sync_timestamp_seconds',
'Proxmox vm replication next_sync',
labels=['id']),
'fail_count': GaugeMetricFamily(
'pve_replication_failed_syncs',
'Proxmox vm replication fail_count',
labels=['id']),
}

        # Find the local node's name first: the replication endpoints are node-scoped.
        node = None
for entry in self._pve.cluster.status.get():
if entry['type'] == 'node' and entry['local']:
node = entry['name']
break

for jobdata in self._pve.nodes(node).replication.get():
# Add info metric
label_values = [
str(jobdata['id']),
str(jobdata['type']),
f"node/{jobdata['source']}",
f"node/{jobdata['target']}",
f"{jobdata['vmtype']}/{jobdata['guest']}",
]
info_metrics['info'].add_metric(label_values, 1)

# Add metrics
label_values = [str(jobdata['id'])]
status = self._pve.nodes(node).replication(jobdata['id']).status.get()
for key, metric_value in status.items():
if key in metrics:
metrics[key].add_metric(label_values, metric_value)

return itertools.chain(metrics.values(), info_metrics.values())
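
A quick way to exercise `NodeReplicationCollector` without a live cluster is to stand in for the proxmoxer client with `unittest.mock`; the payloads below are illustrative, shaped after the fields `collect()` reads, not captured from a real node:

from unittest import mock

from pve_exporter.collector.node import NodeReplicationCollector

pve = mock.MagicMock()
# cluster/status: collect() picks the entry flagged as the local node.
pve.cluster.status.get.return_value = [
    {'type': 'node', 'local': 1, 'name': 'proxmox1'},
]
# nodes/<node>/replication: one replication job definition.
pve.nodes.return_value.replication.get.return_value = [
    {'id': '1-0', 'type': 'local', 'source': 'proxmox1',
     'target': 'proxmox2', 'vmtype': 'qemu', 'guest': 1},
]
# nodes/<node>/replication/<id>/status: per-job status values.
pve.nodes.return_value.replication.return_value.status.get.return_value = {
    'duration': 7.73584, 'last_sync': 1713382503, 'fail_count': 0,
}

for family in NodeReplicationCollector(pve).collect():
    for sample in family.samples:
        print(sample.name, sample.labels, sample.value)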