@@ -2,7 +2,8 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation
2
2
dcgm.clock_throttle_reasons,gauge,,,,Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*),0,dcgm,,
3
3
dcgm.correctable_remapped_rows.count,count,,row,,Number of remapped rows for correctable errors.,0,dcgm,,
4
4
dcgm.dec_utilization,gauge,,percent,,Decoder utilization (in %).,0,dcgm,,
5
- dcgm.device.count,count,,device,,Number of Devices on the node.,0,dcgm,,
5
+ dcgm.device.count,count,,device,,Change in number of Devices on the node.,0,dcgm,,
6
+ dcgm.device.total,gauge,,,,Number of Devices on the node.,0,dcgm,,
6
7
dcgm.dram.active,gauge,,fraction,,Ratio of cycles the device memory interface is active sending or receiving data (in %).,0,dcgm,,
7
8
dcgm.enc_utilization,gauge,,percent,,Encoder utilization (in %).,0,dcgm,,
8
9
dcgm.fan_speed,gauge,,percent,,Fan speed for the device in percent 0-100.,0,dcgm,,
@@ -16,10 +17,14 @@ dcgm.gr_engine_active,gauge,,fraction,,Ratio of time the graphics engine is acti
16
17
dcgm.mem.clock,gauge,,megahertz,,Memory clock frequency (in MHz).,0,dcgm,,
17
18
dcgm.mem.copy_utilization,gauge,,percent,,Memory utilization (in %).,0,dcgm,,
18
19
dcgm.mem.temperature,gauge,,degree celsius,,Memory temperature (in C).,0,dcgm,,
19
- dcgm.nvlink_bandwidth.count,count,,,,Total number of NVLink bandwidth counters for all lanes,0,dcgm,,
20
- dcgm.pcie_replay.count,count,,,,Total number of PCIe retries.,0,dcgm,,
21
- dcgm.pcie_rx_throughput.count,count,,,,PCIe Rx utilization information.,0,dcgm,,
22
- dcgm.pcie_tx_throughput.count,count,,,,PCIe Tx utilization information.,0,dcgm,,
20
+ dcgm.nvlink_bandwidth.count,count,,,,Change in number of NVLink bandwidth counters for all lanes,0,dcgm,,
21
+ dcgm.nvlink_bandwidth.total,gauge,,,,Total number of NVLink bandwidth counters for all lanes,0,dcgm,,
22
+ dcgm.pcie_replay.count,count,,,,Change in number of PCIe retries.,0,dcgm,,
23
+ dcgm.pcie_replay.total,gauge,,,,Total number of PCIe retries.,0,dcgm,,
24
+ dcgm.pcie_rx_throughput.count,count,,,,Change in PCIe Rx utilization information.,0,dcgm,,
25
+ dcgm.pcie_rx_throughput.total,gauge,,,,PCIe Rx utilization information.,0,dcgm,,
26
+ dcgm.pcie_tx_throughput.count,count,,,,Change in PCIe Tx utilization information.,0,dcgm,,
27
+ dcgm.pcie_tx_throughput.total,gauge,,,,PCIe Tx utilization information.,0,dcgm,,
23
28
dcgm.pipe.fp16_active,gauge,,fraction,,Ratio of cycles the fp16 pipes are active (in %).,0,dcgm,,
24
29
dcgm.pipe.fp32_active,gauge,,fraction,,Ratio of cycles the fp32 pipes are active (in %).,0,dcgm,,
25
30
dcgm.pipe.fp64_active,gauge,,fraction,,Ratio of cycles the fp64 pipes are active (in %).,0,dcgm,,
@@ -33,7 +38,9 @@ dcgm.sm_active,gauge,,fraction,,The ratio of cycles an SM has at least 1 warp as
33
38
dcgm.sm_clock,gauge,,megahertz,,SM clock frequency (in MHz).,0,dcgm,,
34
39
dcgm.sm_occupancy,gauge,,fraction,,The ratio of number of warps resident on an SM (in %).,0,dcgm,,
35
40
dcgm.temperature,gauge,,degree celsius,,GPU temperature (in C).,0,dcgm,,
36
- dcgm.total_energy_consumption.count,count,,millijoule,,Total energy consumption since boot (in mJ).,0,dcgm,,
37
- dcgm.uncorrectable_remapped_rows.count,count,,row,,Number of remapped rows for uncorrectable errors.,0,dcgm,,
41
+ dcgm.total_energy_consumption.count,count,,millijoule,,Change in energy consumption (in mJ).,0,dcgm,,
42
+ dcgm.total_energy_consumption.total,gauge,,,,Total energy consumption since boot (in mJ).,0,dcgm,,
43
+ dcgm.uncorrectable_remapped_rows.count,count,,row,,Change in number of remapped rows for uncorrectable errors.,0,dcgm,,
44
+ dcgm.uncorrectable_remapped_rows.total,gauge,,,,Total number of remapped rows for uncorrectable errors.,0,dcgm,,
38
45
dcgm.vgpu_license_status,gauge,,,,vGPU License status,0,dcgm,,
39
46
dcgm.xid_errors,gauge,,,,Value of the last XID error encountered.,0,dcgm,,
0 commit comments