Skip to content

Commit 7bc4cf2

Browse files
authored
Collect Cisco ACI faults as logs. (#19836)
* Collect Cisco ACI faults as logs. * Add faults unit test.
1 parent ffb0a71 commit 7bc4cf2

17 files changed

+450
-2
lines changed

cisco_aci/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,16 @@ To configure this check for an Agent running on a host:
5858
## Set to `true` to enable Network Device Monitoring metadata (for devices and interfaces) to be sent.
5959
#
6060
# send_ndm_metadata: false
61+
62+
## @param send_faultinst_faults - boolean - optional - default: false
63+
## Set to `true` to enable collection of Cisco ACI faultInst faults as logs.
64+
#
65+
# send_faultinst_faults: false
66+
67+
## @param send_faultdelegate_faults - boolean - optional - default: false
68+
## Set to `true` to enable collection of Cisco ACI faultDelegate faults as logs.
69+
#
70+
# send_faultdelegate_faults: false
6171
```
6272

6373
*NOTE*: Be sure to specify any tenants for the integration to collect metrics on applications, EPG, etc.

cisco_aci/assets/configuration/spec.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,20 @@ files:
106106
type: boolean
107107
example: False
108108
display_default: False
109+
- name: send_faultinst_faults
110+
description: |
111+
Set to `true` to enable collection of Cisco ACI faultInst faults as logs.
112+
value:
113+
type: boolean
114+
example: False
115+
display_default: False
116+
- name: send_faultdelegate_faults
117+
description: |
118+
Set to `true` to enable collection of Cisco ACI faultDelegate faults as logs.
119+
value:
120+
type: boolean
121+
example: False
122+
display_default: False
109123
- template: instances/http
110124
overrides:
111125
username.display_priority: 9

cisco_aci/changelog.d/19836.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Collect Cisco ACI faults as logs.

cisco_aci/datadog_checks/cisco_aci/api.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,20 @@ def get_apic_capacity_metrics(self, capacity_metric, query=None):
337337
response = self.make_request(path)
338338
return self._parse_response(response)
339339

340+
def get_faultinst_faults(self, afterTimestamp):
    """
    Query the APIC for ``faultInst`` objects.

    When ``afterTimestamp`` is not ``None`` a ``query-target-filter`` is appended so
    that only faults whose ``lastTransition`` is greater than that value are requested.
    The result of ``_parse_response`` on the raw response is returned.
    """
    endpoint = "/api/node/class/faultInst.json"
    only_newer = "?query-target-filter=and(gt(faultInst.lastTransition,\"{}\"))"
    if afterTimestamp is not None:
        endpoint = endpoint + only_newer.format(afterTimestamp)
    return self._parse_response(self.make_request(endpoint))
346+
347+
def get_faultdelegate_faults(self, afterTimestamp):
    """
    Query the APIC for ``faultDelegate`` objects.

    When ``afterTimestamp`` is not ``None`` a ``query-target-filter`` is appended so
    that only faults whose ``lastTransition`` is greater than that value are requested.
    The result of ``_parse_response`` on the raw response is returned.
    """
    endpoint = "/api/node/class/faultDelegate.json"
    only_newer = "?query-target-filter=and(gt(faultDelegate.lastTransition,\"{}\"))"
    if afterTimestamp is not None:
        endpoint = endpoint + only_newer.format(afterTimestamp)
    return self._parse_response(self.make_request(endpoint))
353+
340354
def _parse_response(self, response):
341355
try:
342356
return response.get('imdata')

cisco_aci/datadog_checks/cisco_aci/cisco.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from datadog_checks.cisco_aci.api import Api
1111
from datadog_checks.cisco_aci.capacity import Capacity
1212
from datadog_checks.cisco_aci.fabric import Fabric
13+
from datadog_checks.cisco_aci.faults import Faults
1314
from datadog_checks.cisco_aci.tags import CiscoTags
1415
from datadog_checks.cisco_aci.tenant import Tenant
1516

@@ -137,6 +138,20 @@ def check(self, _):
137138
api.close()
138139
raise
139140

141+
try:
142+
faults = Faults(self, api, self.instance, self.instance.get('namespace', 'default'))
143+
faults.collect()
144+
except Exception as e:
145+
self.log.error('faults collection failed: %s', e)
146+
self.service_check(
147+
SERVICE_CHECK_NAME,
148+
AgentCheck.CRITICAL,
149+
message="aci faults operations failed, returning a status of {}".format(e),
150+
tags=service_check_tags,
151+
)
152+
api.close()
153+
raise
154+
140155
self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)
141156

142157
self.set_external_tags(self.get_external_host_tags())

cisco_aci/datadog_checks/cisco_aci/config_models/defaults.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,14 @@ def instance_request_size():
7272
return 16
7373

7474

75+
def instance_send_faultdelegate_faults():
76+
return False
77+
78+
79+
def instance_send_faultinst_faults():
80+
return False
81+
82+
7583
def instance_send_ndm_metadata():
7684
return False
7785

cisco_aci/datadog_checks/cisco_aci/config_models/instance.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ class InstanceConfig(BaseModel):
9090
pwd: Optional[str] = None
9191
read_timeout: Optional[float] = None
9292
request_size: Optional[float] = None
93+
send_faultdelegate_faults: Optional[bool] = None
94+
send_faultinst_faults: Optional[bool] = None
9395
send_ndm_metadata: Optional[bool] = None
9496
service: Optional[str] = None
9597
skip_proxy: Optional[bool] = None

cisco_aci/datadog_checks/cisco_aci/data/conf.yaml.example

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,16 @@ instances:
135135
#
136136
# send_ndm_metadata: false
137137

138+
## @param send_faultinst_faults - boolean - optional - default: false
139+
## Set to `true` to enable collection of Cisco ACI faultInst faults as logs.
140+
#
141+
# send_faultinst_faults: false
142+
143+
## @param send_faultdelegate_faults - boolean - optional - default: false
144+
## Set to `true` to enable collection of Cisco ACI faultDelegate faults as logs.
145+
#
146+
# send_faultdelegate_faults: false
147+
138148
## @param proxy - mapping - optional
139149
## This overrides the `proxy` setting in `init_config`.
140150
##
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# (C) Datadog, Inc. 2025-present
2+
# All rights reserved
3+
# Licensed under a 3-clause BSD style license (see LICENSE)
4+
import datetime
5+
6+
from datadog_checks.base.utils.serialization import from_json, to_json
7+
from datadog_checks.base.utils.time import get_timestamp
8+
9+
10+
class Faults:
    """
    Collect faults from the APIC and submit them as logs.

    Two fault classes are handled, each gated by its own instance option:
    ``faultInst`` (``send_faultinst_faults``) and ``faultDelegate``
    (``send_faultdelegate_faults``). For every class, the largest ``lastTransition``
    seen in a batch is written to the check's persistent cache and handed to the
    API on the next run so that only newer faults are requested.
    """

    # Custom facets need to be namespaced because facets must have unique paths and a path cannot
    # be shared between different facet groups. Most Cisco ACI fault fields are used as facets
    # so all fields will be moved under the namespace here rather than creating a remapper in the
    # pipeline for each field.
    ATTR_NAMESPACE = "cisco_aci"

    FAULTINST_KEY = "faultInst"
    FAULTDELEGATE_KEY = "faultDelegate"

    def __init__(self, check, api, instance, namespace):
        """
        :param check: owning check; supplies ``log``, ``send_log`` and the persistent cache accessors
        :param api: API client exposing ``get_faultinst_faults`` / ``get_faultdelegate_faults``
        :param instance: instance configuration mapping
        :param namespace: namespace value passed by the caller (stored, not otherwise used here)
        """
        self.check = check
        self.api = api
        self.instance = instance
        self.namespace = namespace

        self.log = check.log
        self.read_persistent_cache = check.read_persistent_cache
        self.send_log = check.send_log
        self.write_persistent_cache = check.write_persistent_cache

        # Config for submitting faultInst faults as logs
        self.send_faultinst_faults = self.instance.get('send_faultinst_faults', False)
        # Config for submitting faultDelegate faults as logs
        self.send_faultdelegate_faults = self.instance.get('send_faultdelegate_faults', False)

    def faultinst_faults_enabled(self):
        """Return the configured value of ``send_faultinst_faults``."""
        return self.send_faultinst_faults

    def faultdelegate_faults_enabled(self):
        """Return the configured value of ``send_faultdelegate_faults``."""
        return self.send_faultdelegate_faults

    def collect(self):
        """Fetch and submit faults for every fault class that is enabled."""
        if self.faultinst_faults_enabled():
            self._collect_category(Faults.FAULTINST_KEY, self.api.get_faultinst_faults)
        if self.faultdelegate_faults_enabled():
            self._collect_category(Faults.FAULTDELEGATE_KEY, self.api.get_faultdelegate_faults)

    @staticmethod
    def _cache_key(fault_category):
        """Return the persistent-cache key holding the cursor of ``fault_category``."""
        return "max_timestamp_{}".format(fault_category)

    def _collect_category(self, fault_category, fetch_faults):
        """Read the cached cursor, fetch faults newer than it and submit them."""
        data = self.read_persistent_cache(self._cache_key(fault_category))
        max_timestamp = from_json(data) if data else None
        self.submit_faults(fault_category, fetch_faults(max_timestamp))

    def submit_faults(self, faultCategory, faults):
        """
        Send each fault as a log and advance the cached ``lastTransition`` cursor.

        :param faultCategory: key expected in every fault entry (``faultInst`` or ``faultDelegate``)
        :param faults: iterable of API entries shaped ``{faultCategory: {"attributes": {...}}}``;
            ``None`` or an empty collection is a no-op
        """
        if not faults:
            return

        # Stays None until a fault carrying "lastTransition" is seen, so a batch without any
        # usable timestamp never overwrites the cursor already stored in the cache.
        max_timestamp = None
        num_skipped = 0
        for fault in faults:
            if faultCategory not in fault:
                num_skipped += 1
                self.log.debug("skipping fault that does not contain %s: %s", faultCategory, fault)
                continue

            # Copy so that the API response handed to us is left untouched.
            attributes = dict(fault[faultCategory]["attributes"])
            attributes["faultCategory"] = faultCategory

            if "lastTransition" in attributes:
                transition = get_timestamp(datetime.datetime.fromisoformat(attributes["lastTransition"]))
                if max_timestamp is None or transition > max_timestamp:
                    max_timestamp = transition
            self.send_log({Faults.ATTR_NAMESPACE: attributes})

        if max_timestamp is not None:
            self.write_persistent_cache(self._cache_key(faultCategory), to_json(max_timestamp))

        if num_skipped > 0:
            self.log.warning("skipped %d faults that did not contain %s", num_skipped, faultCategory)

cisco_aci/manifest.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"Supported OS::Linux",
1616
"Supported OS::macOS",
1717
"Supported OS::Windows",
18+
"Category::Log Collection",
1819
"Category::Network",
1920
"Offering::Integration"
2021
]
@@ -52,6 +53,9 @@
5253
"CPU usage is high for Cisco ACI device": "assets/monitors/cpu_high.json",
5354
"Health score of device is critical": "assets/monitors/critical_health_score.json",
5455
"Interface for a Cisco ACI device is down": "assets/monitors/interface_down.json"
56+
},
57+
"logs": {
58+
"source": "cisco-aci"
5559
}
5660
}
57-
}
61+
}

cisco_aci/tests/common.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
CAPACITY_FIXTURES_DIR = os.path.join(FIXTURES_DIR, 'capacity')
1919
FABRIC_FIXTURES_DIR = os.path.join(FIXTURES_DIR, 'fabric')
2020
TENANT_FIXTURES_DIR = os.path.join(FIXTURES_DIR, 'tenant')
21-
ALL_FIXTURE_DIR = [FIXTURES_DIR, CAPACITY_FIXTURES_DIR, FABRIC_FIXTURES_DIR, TENANT_FIXTURES_DIR]
21+
FAULTS_FIXTURES_DIR = os.path.join(FIXTURES_DIR, 'faults')
22+
ALL_FIXTURE_DIR = [FIXTURES_DIR, CAPACITY_FIXTURES_DIR, FABRIC_FIXTURES_DIR, TENANT_FIXTURES_DIR, FAULTS_FIXTURES_DIR]
2223

2324
USERNAME = 'datadog'
2425
PASSWORD = 'datadog'
@@ -33,6 +34,8 @@
3334
'tenant': ['DataDog'],
3435
"tags": ["project:cisco_aci"],
3536
"send_ndm_metadata": True,
37+
"send_faultinst_faults": True,
38+
"send_faultdelegate_faults": True,
3639
}
3740

3841
# list of fixture names
@@ -183,6 +186,14 @@
183186
# 9ec9c2e1bcd513274516713bc3f68724 - Api.get_eth_list_and_stats
184187
'_api_node_class_topology_pod_1_node_101_l1PhysIf_json_rsp_subtree_children_rsp_subtree_include_stats_rsp_subtree_class_ethpmPhysIf_eqptEgrTotal5min_eqptIngrTotal5min_eqptEgrDropPkts5min_eqptEgrBytes5min_eqptIngrBytes5min',
185188
# 9bd6720132f1eef5ae8ec7d6438d9c6b - Api.get_eth_list_and_stats
189+
'_api_node_class_faultInst_json',
190+
# 431f5593e6349e09f0097d3f23dea75c - Api.get_faultinst_faults
191+
'_api_node_class_faultDelegate_json',
192+
# 5d56882ce9d312af184fedfe77bc08df - Api.get_faultdelegate_faults
193+
'_api_node_class_faultInst_json_query_target_filter_and_gt_faultInst_lastTransition__1741816605_365___',
194+
# 5058ab43747e6423c036a89104ec05dc - Api.get_faultinst_faults
195+
'_api_node_class_faultDelegate_json_query_target_filter_and_gt_faultDelegate_lastTransition__1741791186_041___',
196+
# 8bf03ccb4d494c0f8d05f7e8d9115dfd - Api.get_faultdelegate_faults
186197
]
187198

188199
# The map will contain the md5 hash to the fixture
@@ -243,3 +254,7 @@ class FakeTenantSessionWrapper(FakeSessionWrapper):
243254

244255
class FakeFabricSessionWrapper(FakeSessionWrapper):
245256
fixture_dirs = [FABRIC_FIXTURES_DIR]
257+
258+
259+
class FakeFaultsSessionWrapper(FakeSessionWrapper):
    """Fake session wrapper whose fixture directories are limited to the faults fixtures."""

    fixture_dirs = [FAULTS_FIXTURES_DIR]

cisco_aci/tests/fixtures/faults.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# (C) Datadog, Inc. 2025-present
2+
# All rights reserved
3+
# Licensed under a 3-clause BSD style license (see LICENSE)
4+
5+
EXPECTED_FAULT_LOGS = [
6+
{
7+
"cisco_aci": {
8+
"ack": "no",
9+
"alert": "no",
10+
"cause": "threshold-crossed",
11+
"changeSet": "crcLast:0",
12+
"childAction": "",
13+
"code": "F381328",
14+
"created": "2025-03-12T05:00:05.650+00:00",
15+
"delegated": "no",
16+
"descr": "TCA: CRC Align Errors current value(eqptIngrErrPkts5min:crcLast) value 0% fell below threshold 1%", # noqa: E501
17+
"dn": "topology/pod-1/node-102/sys/phys-[eth1/1]/fault-F381328",
18+
"domain": "infra",
19+
"highestSeverity": "warning",
20+
"lastTransition": "2025-03-12T21:56:45.365+00:00",
21+
"lc": "retaining",
22+
"occur": "892",
23+
"origSeverity": "warning",
24+
"prevSeverity": "warning",
25+
"rule": "tca-eqpt-ingr-err-pkts5min-crc-last",
26+
"severity": "cleared",
27+
"status": "",
28+
"subject": "counter",
29+
"title": "",
30+
"type": "operational",
31+
"faultCategory": "faultInst",
32+
},
33+
"ddtags": "project:cisco_aci",
34+
},
35+
{
36+
"cisco_aci": {
37+
"ack": "no",
38+
"affected": "resPolCont/rtdOutCont/rtdOutDef-[uni/tn-Legado/out-PANDORA_to_NELS.l3out]/node-101/stpathatt-[N5KFRE-VPC]/nwissues", # noqa: E501
39+
"cause": "configuration-failed",
40+
"changeSet": "configQual:invalid-path, configSt:failed-to-apply, debugMessage:invalid-path: Interface does not exist;, temporaryError:no", # noqa: E501
41+
"childAction": "",
42+
"code": "F0467",
43+
"created": "2025-03-12T14:50:49.955+00:00",
44+
"descr": "Fault delegate: Configuration failed for uni/tn-Legado/out-PANDORA_to_NELS.l3out node 101 N5KFRE-VPC due to Invalid Path Configuration, debug message: invalid-path: Interface does not exist;", # noqa: E501
45+
"dn": "uni/tn-Legado/out-PANDORA_to_NELS.l3out/fd-[resPolCont/rtdOutCont/rtdOutDef-[uni/tn-Legado/out-PANDORA_to_NELS.l3out]/node-101/stpathatt-[N5KFRE-VPC]/nwissues]-fault-F0467", # noqa: E501
46+
"domain": "tenant",
47+
"highestSeverity": "minor",
48+
"lastTransition": "2025-03-12T14:53:06.041+00:00",
49+
"lc": "raised",
50+
"occur": "1",
51+
"origSeverity": "minor",
52+
"prevSeverity": "minor",
53+
"rule": "fv-nw-issues-config-failed",
54+
"severity": "minor",
55+
"status": "",
56+
"subject": "management",
57+
"type": "config",
58+
"faultCategory": "faultDelegate",
59+
},
60+
"ddtags": "project:cisco_aci",
61+
},
62+
{
63+
"cisco_aci": {
64+
"ack": "no",
65+
"alert": "no",
66+
"cause": "threshold-crossed",
67+
"changeSet": "errorRate:11",
68+
"childAction": "",
69+
"code": "F96976",
70+
"created": "2025-03-12T05:01:08.717+00:00",
71+
"delegated": "no",
72+
"descr": "TCA: Egress Error Drop Packets rate(eqptEgrDropPkts5min:errorRate) value 11 raised above threshold 10", # noqa: E501
73+
"dn": "topology/pod-1/node-102/sys/phys-[eth1/1]/fault-F96976",
74+
"domain": "infra",
75+
"highestSeverity": "warning",
76+
"lastTransition": "2025-03-12T21:56:45.365+00:00",
77+
"lc": "raised",
78+
"occur": "53",
79+
"origSeverity": "warning",
80+
"prevSeverity": "cleared",
81+
"rule": "tca-eqpt-egr-drop-pkts5min-error-rate",
82+
"severity": "warning",
83+
"status": "",
84+
"subject": "counter",
85+
"title": "",
86+
"type": "operational",
87+
"faultCategory": "faultInst",
88+
},
89+
"ddtags": "project:cisco_aci",
90+
},
91+
{
92+
"cisco_aci": {
93+
"ack": "no",
94+
"affected": "uni/epp/fv-[uni/tn-Tenant888/ap-app_demo_ap/epg-dev_epg]/node-1101/polDelSt",
95+
"cause": "configuration-failed",
96+
"changeSet": "deploymentState:not-registered-for-atg",
97+
"childAction": "",
98+
"code": "F1298",
99+
"created": "2025-03-12T09:39:32.189+00:00",
100+
"descr": "Fault delegate: For tenant Tenant888, application profile app_demo_ap, deployment of application EPG dev_epg failed on node 1101. Reason Node Cannot Deploy EPG", # noqa: E501
101+
"dn": "uni/tn-Tenant888/ap-app_demo_ap/epg-dev_epg/fd-[uni/epp/fv-[uni/tn-Tenant888/ap-app_demo_ap/epg-dev_epg]/node-1101/polDelSt]-fault-F1298", # noqa: E501
102+
"domain": "tenant",
103+
"highestSeverity": "minor",
104+
"lastTransition": "2025-03-12T14:53:06.041+00:00",
105+
"lc": "raised",
106+
"occur": "1",
107+
"origSeverity": "minor",
108+
"prevSeverity": "minor",
109+
"rule": "fv-pol-delivery-status-configuration-failed",
110+
"severity": "minor",
111+
"status": "",
112+
"subject": "epg",
113+
"type": "config",
114+
"faultCategory": "faultDelegate",
115+
},
116+
"ddtags": "project:cisco_aci",
117+
},
118+
]

0 commit comments

Comments
 (0)