Skip to content

Commit 7222db4

Browse files
authored
Add state and phys_state metrics to InfiniBand Integration (#20070)
* add state and phys_state parsing
* add test and tweak check
* changelog
* lint
* fix unit
1 parent bfe0e22 commit 7222db4

File tree

7 files changed

+56
-1
lines changed

7 files changed

+56
-1
lines changed

infiniband/changelog.d/20070.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add state and phys_state metrics

infiniband/datadog_checks/infiniband/check.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from datadog_checks.base import AgentCheck # noqa: F401
99

10-
from .metrics import IB_COUNTERS, RDMA_COUNTERS
10+
from .metrics import IB_COUNTERS, RDMA_COUNTERS, STATUS_COUNTERS
1111

1212

1313
class InfinibandCheck(AgentCheck):
@@ -27,6 +27,7 @@ def __init__(self, name, init_config, instances):
2727
# Allow for specific counters to be excluded if configured
2828
self.exclude_counters = set(self.instance.get('exclude_counters', []))
2929
self.exclude_hw_counters = set(self.instance.get('exclude_hw_counters', []))
30+
self.exclude_status_counters = set(self.instance.get('exclude_status_counters', []))
3031

3132
# Allow for specific devices to be excluded if configured
3233
self.exclude_devices = set(self.instance.get('exclude_devices', []))
@@ -67,6 +68,7 @@ def _collect_counters(self, device, port):
6768

6869
self._collect_counter_metrics(port_path, tags)
6970
self._collect_hw_counter_metrics(port_path, tags)
71+
self._collect_status_metrics(port_path, tags)
7072

7173
def _collect_counter_metrics(self, port_path, tags):
7274
counters_path = os.path.join(port_path, "counters")
@@ -94,6 +96,31 @@ def _collect_hw_counter_metrics(self, port_path, tags):
9496
) and filename not in self.exclude_hw_counters:
9597
self._submit_counter_metric(file, f"rdma.{filename}", tags)
9698

99+
def _collect_status_metrics(self, port_path, tags):
    """Submit the port status files listed in ``STATUS_COUNTERS`` as metrics.

    Each status file is read from ``port_path``. Its content is expected to
    look like ``"4: ACTIVE"``: the integer before the colon is the metric
    value, and the text after the colon is attached to the metric as a
    ``port_<file>:<text>`` tag.

    A file is skipped when it is named in ``self.exclude_status_counters``,
    when it does not exist under ``port_path``, or when its content does not
    start with an integer. The last case is logged and skipped instead of
    raising, so one unreadable value does not abort the remaining files.

    :param port_path: directory holding the status files of one port.
    :param tags: base tags for the port; copied per metric, never mutated.
    """
    for status_file in STATUS_COUNTERS:
        if status_file in self.exclude_status_counters:
            self.log.debug("Skipping status counter %s as it is in the exclude list", status_file)
            continue

        file_path = os.path.join(port_path, status_file)
        if not os.path.exists(file_path):
            continue

        with open(file_path, "r") as f:
            content = f.read().strip()

        # "4: ACTIVE" -> code "4", label " ACTIVE"; `separator` is empty when
        # the content has no colon, in which case no state tag is added.
        code, separator, label = content.partition(":")
        try:
            value = int(code.strip())
        except ValueError:
            self.log.warning("Skipping status counter %s: unexpected content %r", status_file, content)
            continue

        metric_tags = list(tags)
        if separator:
            metric_tags.append(f"port_{status_file}:{label.strip()}")

        if self.collection_type in {'gauge', 'both'}:
            self.gauge(f"port_{status_file}", value, metric_tags)

        if self.collection_type in {'monotonic_count', 'both'}:
            self.monotonic_count(f"port_{status_file}.count", value, metric_tags)
123+
97124
def _submit_counter_metric(self, file_path, metric_name, tags):
98125
with open(file_path, "r") as f:
99126
value = int(f.read().strip())

infiniband/datadog_checks/infiniband/metrics.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,5 @@
9999
"rdma_read_resp_bytes",
100100
"rdma_read_bytes",
101101
}
102+
103+
STATUS_COUNTERS = {"state", "phys_state"}  # port status files; content looks like "<code>: <name>", e.g. state "4: ACTIVE", phys_state "5: LinkUp"

infiniband/metadata.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ infiniband.multicast_rcv_packets,gauge,,packet,,"Number of multicast packets,inc
1313
infiniband.multicast_rcv_packets.count,count,,packet,,Number of new multicast packets received since the last metric submission (legacy),0,infiniband,,,
1414
infiniband.multicast_xmit_packets,gauge,,packet,,Number of multicast packets transmitted on all VLs from the port (legacy),0,infiniband,,,
1515
infiniband.multicast_xmit_packets.count,count,,packet,,Number of new multicast packets transmitted since the last metric submission (legacy),0,infiniband,,,
16+
infiniband.port_phys_state,gauge,,,,Physical link state of the port as a numeric code (e.g. 5 for LinkUp),0,infiniband,,,
1617
infiniband.port_multicast_rcv_packets,gauge,,packet,,Number of multicast packets received,0,infiniband,,,
1718
infiniband.port_multicast_rcv_packets.count,count,,packet,,Number of new multicast packets received since the last metric submission,0,infiniband,,,
1819
infiniband.port_multicast_xmit_packets,gauge,,packet,,Number of multicast packets transmitted on all VLs from the port,0,infiniband,,,
@@ -175,6 +176,7 @@ infiniband.rdma.tx_vport_unicast_bytes,gauge,,byte,,Number of unicast bytes tran
175176
infiniband.rdma.tx_vport_unicast_bytes.count,count,,byte,,Number of new unicast bytes transmitted on virtual port since the last metric submission,0,infiniband,,,
176177
infiniband.rdma.tx_vport_unicast_packets,gauge,,packet,,Number of unicast packets transmitted on virtual port,0,infiniband,,,
177178
infiniband.rdma.tx_vport_unicast_packets.count,count,,packet,,Number of new unicast packets transmitted on virtual port since the last metric submission,0,infiniband,,,
179+
infiniband.port_state,gauge,,,,Port state as a numeric code (e.g. 4 for ACTIVE),0,infiniband,,,
178180
infiniband.symbol_error,gauge,,error,,Number of minor link errors detected on one or more physical lanes,0,infiniband,,,
179181
infiniband.symbol_error.count,count,,error,,Number of new minor link errors detected since the last metric submission,0,infiniband,,,
180182
infiniband.unicast_rcv_packets,gauge,,packet,,"Number of unicast packets,including unicast packets containing errors (legacy)",0,infiniband,,,

infiniband/tests/common.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,10 @@
1414
'rx_bytes': '3000',
1515
}
1616

17+
MOCK_STATUS_DATA = {
18+
'state': '4: ACTIVE',
19+
'phys_state': '5: LinkUp',
20+
}
21+
1722
MOCK_DEVICE = 'mlx5_0'
1823
MOCK_PORT = '1'

infiniband/tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
MOCK_IB_COUNTER_DATA,
1212
MOCK_PORT,
1313
MOCK_RDMA_COUNTER_DATA,
14+
MOCK_STATUS_DATA,
1415
)
1516

1617

@@ -56,6 +57,8 @@ def _get_file_content(filename):
5657
return MOCK_IB_COUNTER_DATA[counter]
5758
elif counter in MOCK_RDMA_COUNTER_DATA:
5859
return MOCK_RDMA_COUNTER_DATA[counter]
60+
elif counter in MOCK_STATUS_DATA:
61+
return MOCK_STATUS_DATA[counter]
5962
return '0'
6063

6164

infiniband/tests/test_unit.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
MOCK_IB_COUNTER_DATA,
1212
MOCK_PORT,
1313
MOCK_RDMA_COUNTER_DATA,
14+
MOCK_STATUS_DATA,
1415
)
1516

1617

@@ -44,6 +45,20 @@ def test_check(aggregator, instance, mock_fs):
4445
_assert_metrics(aggregator, MOCK_IB_COUNTER_DATA, 'infiniband', tags)
4546
_assert_metrics(aggregator, MOCK_RDMA_COUNTER_DATA, 'infiniband.rdma', tags)
4647

48+
for status_name, status_value in MOCK_STATUS_DATA.items():
49+
value, state_name = status_value.split(':', 1)
50+
value = int(value.strip())
51+
state_name = state_name.strip()
52+
53+
expected_tags = tags + [f'port_{status_name}:{state_name}']
54+
aggregator.assert_metric(
55+
f'infiniband.port_{status_name}',
56+
metric_type=aggregator.GAUGE,
57+
value=value,
58+
tags=expected_tags,
59+
count=1,
60+
)
61+
4762

4863
@pytest.mark.parametrize(
4964
"collection_type,m_type,count",

0 commit comments

Comments
 (0)