Skip to content

Commit 198a720

Browse files
authored
Send Teleport service check as a metric (#17441)
* update description 'cluster' -> 'instance'
* send count metric instead of service check
* raise on exception
* format
* fix integration tests
* remove obsolete field version in docker-compose.yml
* fix msg arg to self.count and add teleport_status tag to health.up metric
* fix integration tests typo
* fix e2e tests
* log error message on exception
* lint
* add teleport.health.up to metadata.csv
* add assertion for unreachable health state metric tag
* apply suggestion
1 parent 9bfdc06 commit 198a720

File tree

6 files changed

+18
-11
lines changed

6 files changed

+18
-11
lines changed

teleport/datadog_checks/teleport/check.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,18 @@ def __init__(self, name, init_config, instances):
2020

2121
def check(self, _):
2222
try:
23-
response = self.http.get("{}/healthz".format(self.diag_addr))
23+
health_endpoint = f"{self.diag_addr}/healthz"
24+
response = self.http.get(health_endpoint)
2425
response.raise_for_status()
25-
self.service_check("health.up", self.OK)
26+
self.count("health.up", 1, tags=["teleport_status:ok"])
2627
except Exception as e:
27-
self.service_check("health.up", self.CRITICAL, message=str(e))
28+
self.log.error(
29+
"Cannot connect to Teleport HTTP diagnostic health endpoint '%s': %s.\nPlease make sure to enable Teleport's diagnostic HTTP endpoints.", # noqa: E501
30+
health_endpoint,
31+
str(e),
32+
) # noqa: E501
33+
self.count("health.up", 0, tags=["teleport_status:unreachable"])
34+
raise
2835

2936
super().check(_)
3037

teleport/metadata.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,4 @@ teleport.db.messages_from_client.count,count,,,,Time to fetch TLS configuration
128128
teleport.db.messages_from_server.count,gauge,,,,Number of reverse SSH tunnels connected to the Teleport Proxy Service by Teleport instances,0,teleport,,
129129
teleport.db.method_call_count.count,gauge,,,,Number of outbound connections to leaf clusters,0,teleport,,
130130
teleport.db.method_call_latency_seconds.bucket,count,,,,Number of times a user exceeded their max concurrent ssh connections,0,teleport,,
131+
teleport.health.up,count,,,,Status of the Teleport instance,0,teleport,,

teleport/tests/docker/docker-compose.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
version: '3'
21
services:
32
teleport:
43
image: public.ecr.aws/gravitational/teleport:14.3

teleport/tests/test_common.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,14 @@
1111
pytestmark = [pytest.mark.unit]
1212

1313

14-
def test_connect_exception(dd_run_check):
15-
with pytest.raises(Exception):
14+
def test_connect_exception(dd_run_check, aggregator, caplog):
15+
with pytest.raises(Exception, match="Failed to resolve 'invalid-hostname'"):
1616
check = TeleportCheck("teleport", {}, [BAD_HOSTNAME_INSTANCE])
1717
dd_run_check(check)
1818

19+
aggregator.assert_metric("teleport.health.up", value=0, count=1, tags=["teleport_status:unreachable"])
20+
assert "Cannot connect to Teleport HTTP diagnostic health endpoint" in caplog.text
21+
1922

2023
def test_common_teleport_metrics(dd_run_check, aggregator, instance, mock_http_response, metrics_path):
2124
mock_http_response(file_path=metrics_path)

teleport/tests/test_e2e.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44

55
import pytest
66

7-
from datadog_checks.teleport import TeleportCheck
8-
97
from .common import COMMON_METRICS, INSTANCE
108

119
pytestmark = pytest.mark.e2e
@@ -18,5 +16,5 @@
1816

1917
def test_teleport_e2e(dd_agent_check):
2018
aggregator = dd_agent_check(CONFIG)
21-
aggregator.assert_service_check('teleport.health.up', status=TeleportCheck.OK, count=1)
19+
aggregator.assert_metric("teleport.health.up", value=1, count=1, tags=["teleport_status:ok"])
2220
aggregator.assert_metric(f"teleport.{COMMON_METRICS[0]}")

teleport/tests/test_integration.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@
1414
def test_connect_ok(aggregator, instance, dd_run_check):
1515
check = TeleportCheck("teleport", {}, [instance])
1616
dd_run_check(check)
17-
aggregator.assert_service_check("teleport.health.up", status=TeleportCheck.OK, count=1)
18-
aggregator.assert_service_check("teleport.health.up", status=TeleportCheck.CRITICAL, count=0)
17+
aggregator.assert_metric("teleport.health.up", value=1, count=1, tags=["teleport_status:ok"])
1918

2019

2120
def test_check_collects_teleport_common_metrics(aggregator, instance, dd_run_check):

0 commit comments

Comments
 (0)