Skip to content

Commit fb7123d

Browse files
authored
collect some counters as gauges (#19459)
* collect some counters as gauges * changelog * lint and comment * lint again
1 parent 22c0eeb commit fb7123d

File tree

4 files changed

+51
-14
lines changed

4 files changed

+51
-14
lines changed

dcgm/changelog.d/19459.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add the gauge/total version of some monotonic counter metrics

dcgm/datadog_checks/dcgm/metrics.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,26 +10,47 @@
1010
'DCGM_FI_DEV_FB_USED': 'frame_buffer.used',
1111
'DCGM_FI_DEV_GPU_TEMP': 'temperature',
1212
'DCGM_FI_DEV_GPU_UTIL': 'gpu_utilization',
13-
'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL': 'nvlink_bandwidth',
14-
'DCGM_FI_DEV_PCIE_REPLAY_COUNTER': 'pcie_replay', # becomes pcie_replay.count
13+
'DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL': {
14+
'name': 'nvlink_bandwidth',
15+
'type': 'counter_gauge',
16+
}, # becomes nvlink_bandwidth.total and nvlink_bandwidth.count
17+
'DCGM_FI_DEV_PCIE_REPLAY_COUNTER': {
18+
'name': 'pcie_replay',
19+
'type': 'counter_gauge',
20+
}, # becomes pcie_replay.total and pcie_replay.count
1521
'DCGM_FI_DEV_POWER_USAGE': 'power_usage',
1622
'DCGM_FI_DEV_SM_CLOCK': 'sm_clock',
17-
'DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION': 'total_energy_consumption',
23+
'DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION': {
24+
'name': 'total_energy_consumption',
25+
'type': 'counter_gauge',
26+
}, # becomes total_energy_consumption.total and total_energy_consumption.count
1827
'DCGM_FI_DEV_VGPU_LICENSE_STATUS': 'vgpu_license_status',
1928
'DCGM_FI_DEV_XID_ERRORS': 'xid_errors',
2029
# Metrics related to memory get grouped together because there are more of them available.
2130
'DCGM_FI_DEV_MEM_CLOCK': 'mem.clock',
2231
'DCGM_FI_DEV_MEM_COPY_UTIL': 'mem.copy_utilization',
2332
'DCGM_FI_DEV_MEMORY_TEMP': 'mem.temperature',
2433
# NVML Specific Missing Metrics (5)
25-
'DCGM_FI_DEV_COUNT': 'device', # becomes device.count
34+
'DCGM_FI_DEV_COUNT': {
35+
'name': 'device',
36+
'type': 'counter_gauge',
37+
}, # becomes device.total and device.count
2638
'DCGM_FI_DEV_FAN_SPEED': 'fan_speed',
27-
'DCGM_FI_PROF_PCIE_RX_BYTES': 'pcie_rx_throughput',
28-
'DCGM_FI_PROF_PCIE_TX_BYTES': 'pcie_tx_throughput',
39+
'DCGM_FI_PROF_PCIE_RX_BYTES': {
40+
'name': 'pcie_rx_throughput',
41+
'type': 'counter_gauge',
42+
},
43+
'DCGM_FI_PROF_PCIE_TX_BYTES': {
44+
'name': 'pcie_tx_throughput',
45+
'type': 'counter_gauge',
46+
}, # becomes pcie_tx_throughput.total and pcie_tx_throughput.count
2947
# Others from default-counters.csv
3048
'DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS': 'correctable_remapped_rows',
3149
'DCGM_FI_DEV_ROW_REMAP_FAILURE': 'row_remap_failure',
32-
'DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS': 'uncorrectable_remapped_rows',
50+
'DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS': {
51+
'name': 'uncorrectable_remapped_rows',
52+
'type': 'counter_gauge',
53+
}, # becomes uncorrectable_remapped_rows.total and uncorrectable_remapped_rows.count
3354
# More recommended metrics
3455
'DCGM_FI_DEV_CLOCK_THROTTLE_REASONS': 'clock_throttle_reasons',
3556
'DCGM_FI_DEV_FB_RESERVED': 'frame_buffer.reserved',

dcgm/metadata.csv

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation
22
dcgm.clock_throttle_reasons,gauge,,,,Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*),0,dcgm,,
33
dcgm.correctable_remapped_rows.count,count,,row,,Number of remapped rows for correctable errors.,0,dcgm,,
44
dcgm.dec_utilization,gauge,,percent,,Decoder utilization (in %).,0,dcgm,,
5-
dcgm.device.count,count,,device,,Number of Devices on the node.,0,dcgm,,
5+
dcgm.device.count,count,,device,,Change in number of Devices on the node.,0,dcgm,,
6+
dcgm.device.total,gauge,,,,Number of Devices on the node.,0,dcgm,,
67
dcgm.dram.active,gauge,,fraction,,Ratio of cycles the device memory interface is active sending or receiving data (in %).,0,dcgm,,
78
dcgm.enc_utilization,gauge,,percent,,Encoder utilization (in %).,0,dcgm,,
89
dcgm.fan_speed,gauge,,percent,,Fan speed for the device in percent 0-100.,0,dcgm,,
@@ -16,10 +17,14 @@ dcgm.gr_engine_active,gauge,,fraction,,Ratio of time the graphics engine is acti
1617
dcgm.mem.clock,gauge,,megahertz,,Memory clock frequency (in MHz).,0,dcgm,,
1718
dcgm.mem.copy_utilization,gauge,,percent,,Memory utilization (in %).,0,dcgm,,
1819
dcgm.mem.temperature,gauge,,degree celsius,,Memory temperature (in C).,0,dcgm,,
19-
dcgm.nvlink_bandwidth.count,count,,,,Total number of NVLink bandwidth counters for all lanes,0,dcgm,,
20-
dcgm.pcie_replay.count,count,,,,Total number of PCIe retries.,0,dcgm,,
21-
dcgm.pcie_rx_throughput.count,count,,,,PCIe Rx utilization information.,0,dcgm,,
22-
dcgm.pcie_tx_throughput.count,count,,,,PCIe Tx utilization information.,0,dcgm,,
20+
dcgm.nvlink_bandwidth.count,count,,,,Change in number of NVLink bandwidth counters for all lanes,0,dcgm,,
21+
dcgm.nvlink_bandwidth.total,gauge,,,,Total number of NVLink bandwidth counters for all lanes,0,dcgm,,
22+
dcgm.pcie_replay.count,count,,,,Change in number of PCIe retries.,0,dcgm,,
23+
dcgm.pcie_replay.total,gauge,,,,Total number of PCIe retries.,0,dcgm,,
24+
dcgm.pcie_rx_throughput.count,count,,,,Change in PCIe Rx utilization information.,0,dcgm,,
25+
dcgm.pcie_rx_throughput.total,gauge,,,,PCIe Rx utilization information.,0,dcgm,,
26+
dcgm.pcie_tx_throughput.count,count,,,,Change in PCIe Tx utilization information.,0,dcgm,,
27+
dcgm.pcie_tx_throughput.total,gauge,,,,PCIe Tx utilization information.,0,dcgm,,
2328
dcgm.pipe.fp16_active,gauge,,fraction,,Ratio of cycles the fp16 pipes are active (in %).,0,dcgm,,
2429
dcgm.pipe.fp32_active,gauge,,fraction,,Ratio of cycles the fp32 pipes are active (in %).,0,dcgm,,
2530
dcgm.pipe.fp64_active,gauge,,fraction,,Ratio of cycles the fp64 pipes are active (in %).,0,dcgm,,
@@ -33,7 +38,9 @@ dcgm.sm_active,gauge,,fraction,,The ratio of cycles an SM has at least 1 warp as
3338
dcgm.sm_clock,gauge,,megahertz,,SM clock frequency (in MHz).,0,dcgm,,
3439
dcgm.sm_occupancy,gauge,,fraction,,The ratio of number of warps resident on an SM (in %).,0,dcgm,,
3540
dcgm.temperature,gauge,,degree celsius,,GPU temperature (in C).,0,dcgm,,
36-
dcgm.total_energy_consumption.count,count,,millijoule,,Total energy consumption since boot (in mJ).,0,dcgm,,
37-
dcgm.uncorrectable_remapped_rows.count,count,,row,,Number of remapped rows for uncorrectable errors.,0,dcgm,,
41+
dcgm.total_energy_consumption.count,count,,millijoule,,Change in energy consumption (in mJ).,0,dcgm,,
42+
dcgm.total_energy_consumption.total,gauge,,,,Total energy consumption since boot (in mJ),0,dcgm,,
43+
dcgm.uncorrectable_remapped_rows.count,count,,row,,Change in number of remapped rows for uncorrectable errors.,0,dcgm,,
44+
dcgm.uncorrectable_remapped_rows.total,gauge,,,,Total number of remapped rows for uncorrectable errors.,0,dcgm,,
3845
dcgm.vgpu_license_status,gauge,,,,vGPU License status,0,dcgm,,
3946
dcgm.xid_errors,gauge,,,,Value of the last XID error encountered.,0,dcgm,,

dcgm/tests/common.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
'correctable_remapped_rows.count',
2020
'dec_utilization',
2121
'device.count',
22+
'device.total',
2223
'dram.active',
2324
'enc_utilization',
2425
'fan_speed',
@@ -33,9 +34,13 @@
3334
'mem.copy_utilization',
3435
'mem.temperature',
3536
'nvlink_bandwidth.count',
37+
'nvlink_bandwidth.total',
3638
'pcie_replay.count',
39+
'pcie_replay.total',
3740
'pcie_rx_throughput.count',
41+
'pcie_rx_throughput.total',
3842
'pcie_tx_throughput.count',
43+
'pcie_tx_throughput.total',
3944
'pipe.fp16_active',
4045
'pipe.fp32_active',
4146
'pipe.fp64_active',
@@ -50,9 +55,12 @@
5055
'sm_occupancy',
5156
'temperature',
5257
'total_energy_consumption.count',
58+
'total_energy_consumption.total',
5359
'uncorrectable_remapped_rows.count',
60+
'uncorrectable_remapped_rows.total',
5461
'vgpu_license_status',
5562
'xid_errors',
5663
]
64+
5765
EXPECTED_METRICS = [f'dcgm.{m}' for m in EXPECTED_METRICS]
5866
assert sorted(EXPECTED_METRICS) == EXPECTED_METRICS, 'Please keep this list in alphabetic order!'

0 commit comments

Comments
 (0)