diff --git a/README.rst b/README.rst index 6111421..ff909c1 100644 --- a/README.rst +++ b/README.rst @@ -52,6 +52,7 @@ Usage [--collector.cluster | --no-collector.cluster] [--collector.resources | --no-collector.resources] [--collector.config | --no-collector.config] + [--collector.replication | --no-collector.replication] [--config.file CONFIG_FILE] [--web.listen-address WEB_LISTEN_ADDRESS] [--server.keyfile SERVER_KEYFILE] @@ -90,6 +91,8 @@ Usage --collector.config, --no-collector.config Exposes PVE onboot status + --collector.replication, --no-collector.replication + Exposes PVE replication info Use `[::]` in the `--web.listen-address` flag in order to bind to both IPv6 and @@ -191,6 +194,24 @@ Here's an example of the metrics exported. # HELP pve_version_info Proxmox VE version info # TYPE pve_version_info gauge pve_version_info{release="7.1",repoid="6fe299a0",version="7.1-5"} 1.0 + # HELP pve_replication_duration_seconds Proxmox vm replication duration + # TYPE pve_replication_duration_seconds gauge + pve_replication_duration_seconds{id="1-0"} 7.73584 + # HELP pve_replication_last_sync_timestamp_seconds Proxmox vm replication last_sync + # TYPE pve_replication_last_sync_timestamp_seconds gauge + pve_replication_last_sync_timestamp_seconds{id="1-0"} 1.713382503e+09 + # HELP pve_replication_last_try_timestamp_seconds Proxmox vm replication last_try + # TYPE pve_replication_last_try_timestamp_seconds gauge + pve_replication_last_try_timestamp_seconds{id="1-0"} 1.713382503e+09 + # HELP pve_replication_next_sync_timestamp_seconds Proxmox vm replication next_sync + # TYPE pve_replication_next_sync_timestamp_seconds gauge + pve_replication_next_sync_timestamp_seconds{id="1-0"} 1.7134689e+09 + # HELP pve_replication_failed_syncs Proxmox vm replication fail_count + # TYPE pve_replication_failed_syncs gauge + pve_replication_failed_syncs{id="1-0"} 0.0 + # HELP pve_replication_info Proxmox vm replication info + # TYPE pve_replication_info gauge + 
pve_replication_info{guest="qemu/1",id="1-0",source="node/proxmox1",target="node/proxmox2",type="local"} 1.0 Authentication -------------- diff --git a/src/pve_exporter/cli.py b/src/pve_exporter/cli.py index 5cf9ca8..8d7f60a 100755 --- a/src/pve_exporter/cli.py +++ b/src/pve_exporter/cli.py @@ -46,6 +46,10 @@ def main(): action=BooleanOptionalAction, default=True, help='Exposes PVE onboot status') + nodeflags.add_argument('--collector.replication', dest='collector_replication', + action=BooleanOptionalAction, default=True, + help='Exposes PVE replication info') + parser.add_argument('--config.file', type=pathlib.Path, dest="config_file", default='/etc/prometheus/pve.yml', help='Path to config file (/etc/prometheus/pve.yml)') @@ -69,7 +73,8 @@ def main(): node=params.collector_node, cluster=params.collector_cluster, resources=params.collector_resources, - config=params.collector_config + config=params.collector_config, + replication=params.collector_replication ) # Load configuration. diff --git a/src/pve_exporter/collector/__init__.py b/src/pve_exporter/collector/__init__.py index 16eec94..cf73b63 100644 --- a/src/pve_exporter/collector/__init__.py +++ b/src/pve_exporter/collector/__init__.py @@ -14,7 +14,10 @@ VersionCollector, ClusterInfoCollector ) -from pve_exporter.collector.node import NodeConfigCollector +from pve_exporter.collector.node import ( + NodeConfigCollector, + NodeReplicationCollector +) CollectorsOptions = collections.namedtuple('CollectorsOptions', [ 'status', @@ -23,6 +26,7 @@ 'cluster', 'resources', 'config', + 'replication' ]) @@ -44,5 +48,7 @@ def collect_pve(config, host, cluster, node, options: CollectorsOptions): registry.register(VersionCollector(pve)) if node and options.config: registry.register(NodeConfigCollector(pve)) + if node and options.replication: + registry.register(NodeReplicationCollector(pve)) return generate_latest(registry) diff --git a/src/pve_exporter/collector/node.py b/src/pve_exporter/collector/node.py index 
# NOTE: this class relies on the module-level imports of
# src/pve_exporter/collector/node.py: `itertools`, `logging` and
# `GaugeMetricFamily` (from prometheus_client.core).
class NodeReplicationCollector:
    """
    Collects Proxmox VE replication information for the local node.

    Exposes one ``pve_replication_info`` sample per replication job and the
    following fields read directly from the job status: duration, last_sync,
    last_try, next_sync, fail_count.

    For manual test: "pvesh get /nodes/<node>/replication/<id>/status"
    """

    def __init__(self, pve):
        self._pve = pve

    @staticmethod
    def _local_node_name(cluster_status):
        """
        Return the name of the first entry of type 'node' flagged as local.

        Returns None when no such entry exists. Entries lacking the 'type' or
        'local' key are skipped instead of raising KeyError.
        """
        for entry in cluster_status:
            if entry.get('type') == 'node' and entry.get('local'):
                return entry['name']
        return None

    @staticmethod
    def _info_label_values(jobdata):
        """
        Build the label values of the info metric for one replication job.

        The order matches the label names declared for pve_replication_info:
        id, type, source, target, guest.
        """
        return [
            str(jobdata['id']),
            str(jobdata['type']),
            f"node/{jobdata['source']}",
            f"node/{jobdata['target']}",
            f"{jobdata['vmtype']}/{jobdata['guest']}",
        ]

    def collect(self):  # pylint: disable=missing-docstring

        info_metrics = {
            'info': GaugeMetricFamily(
                'pve_replication_info',
                'Proxmox vm replication info',
                labels=['id', 'type', 'source', 'target', 'guest'])
        }

        # Keys are the field names of the replication status response; any
        # other field in the response is ignored.
        metrics = {
            'duration': GaugeMetricFamily(
                'pve_replication_duration_seconds',
                'Proxmox vm replication duration',
                labels=['id']),
            'last_sync': GaugeMetricFamily(
                'pve_replication_last_sync_timestamp_seconds',
                'Proxmox vm replication last_sync',
                labels=['id']),
            'last_try': GaugeMetricFamily(
                'pve_replication_last_try_timestamp_seconds',
                'Proxmox vm replication last_try',
                labels=['id']),
            'next_sync': GaugeMetricFamily(
                'pve_replication_next_sync_timestamp_seconds',
                'Proxmox vm replication next_sync',
                labels=['id']),
            'fail_count': GaugeMetricFamily(
                'pve_replication_failed_syncs',
                'Proxmox vm replication fail_count',
                labels=['id']),
        }

        node = self._local_node_name(self._pve.cluster.status.get())
        if node is None:
            # Without a node name the replication endpoint cannot be
            # addressed. Expose the (empty) metric families rather than
            # querying a bogus "nodes/None" path and failing the scrape.
            logging.getLogger(__name__).warning(
                'No local node found in cluster status, '
                'skipping replication metrics')
            return itertools.chain(metrics.values(), info_metrics.values())

        node_api = self._pve.nodes(node)
        for jobdata in node_api.replication.get():
            # Add info metric
            info_metrics['info'].add_metric(self._info_label_values(jobdata), 1)

            # Add metrics
            label_values = [str(jobdata['id'])]
            status = node_api.replication(jobdata['id']).status.get()
            for key, metric_value in status.items():
                if key in metrics:
                    metrics[key].add_metric(label_values, metric_value)

        return itertools.chain(metrics.values(), info_metrics.values())