Skip to content

Commit 0214a39

Browse files
authored
InfiniBand recommended monitors (#20159)
* ib recommended monitors * manifest * Update manifest.json * path change * path change * Update manifest.json
1 parent 27ab362 commit 0214a39

File tree

3 files changed

+72
-0
lines changed

3 files changed

+72
-0
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"version": 2,
3+
"created_at": "2025-04-27",
4+
"last_updated_at": "2025-04-27",
5+
"title": "InfiniBand Port Physical State Alert",
6+
"description": "Alerts when an InfiniBand port reports a disabled or link error recovery physical state for 5 minutes or longer",
7+
"definition": {
8+
"id": 170517673,
9+
"name": "[InfiniBand] Port Physical State Alert",
10+
"type": "query alert",
11+
"query": "min(last_5m):avg:infiniband.port_phys_state{port_phys_state:disabled OR port_phys_state:link_error_recovery} by {port,device,host,port_phys_state} >= 1",
12+
"message": "{{#is_alert}}\nPort:{{port.name}} of device:{{device.name}} on host:{{host.name}} is reporting a {{port_phys_state.name}} physical state.\n{{/is_alert}}",
13+
"tags": [
14+
"integration:infiniband"
15+
],
16+
"options": {
17+
"thresholds": {
18+
"critical": 1
19+
},
20+
"notify_audit": false,
21+
"on_missing_data": "resolve",
22+
"include_tags": true,
23+
"new_group_delay": 60,
24+
"silenced": {}
25+
},
26+
"priority": null,
27+
"restriction_policy": {
28+
"bindings": []
29+
}
30+
},
31+
"tags": [
32+
"integration:infiniband"
33+
]
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"version": 2,
3+
"created_at": "2025-04-27",
4+
"last_updated_at": "2025-04-27",
5+
"title": "InfiniBand Port State Alert",
6+
"description": "Alerts when an InfiniBand port reports a down state for 5 minutes or longer",
7+
"definition": {
8+
"id": 170517681,
9+
"name": "[InfiniBand] Port State Alert",
10+
"type": "query alert",
11+
"query": "min(last_5m):count:infiniband.port_state{port_state:down} by {port,device,host,port_state} >= 1",
12+
"message": "{{#is_alert}}\nPort:{{port.name}} of device:{{device.name}} on host:{{host.name}} is reporting a {{port_state.name}} state.\n{{/is_alert}}",
13+
"tags": [
14+
"integration:infiniband"
15+
],
16+
"options": {
17+
"thresholds": {
18+
"critical": 1
19+
},
20+
"notify_audit": false,
21+
"on_missing_data": "resolve",
22+
"include_tags": true,
23+
"new_group_delay": 60,
24+
"silenced": {}
25+
},
26+
"priority": null,
27+
"restriction_policy": {
28+
"bindings": []
29+
}
30+
},
31+
"tags": [
32+
"integration:infiniband"
33+
]
34+
}

infiniband/manifest.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
"metadata_path": "metadata.csv"
3737
}
3838
},
39+
"monitors": {
40+
"port_state": "assets/monitors/port_state.json",
41+
"physical_state": "assets/monitors/physical_state.json"
42+
},
3943
"dashboards": {
4044
"InfiniBand Overview": "assets/dashboards/infiniband_overview.json"
4145
}

0 commit comments

Comments
 (0)