Skip to content

[NDM] Add NDM metadata support for Cisco ACI #17735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
eece332
Add support for sending device metadata
zoedt Jun 4, 2024
2dc6b77
Add unit test for device metadata, update fixture
zoedt Jun 4, 2024
0fbd388
Add license header, changelogs
zoedt Jun 4, 2024
308381e
Lint
zoedt Jun 4, 2024
aaaf282
First pass at submitting interface metadata, cleanup for test fixtures
zoedt Jun 5, 2024
d7e3d76
Fix for py2.7 support
zoedt Jun 6, 2024
df7ccfa
Try to fix imports
zoedt Jun 6, 2024
2874add
Merge branch 'master' of github.com:DataDog/integrations-core into zo…
zoedt Jun 7, 2024
79954aa
Deal with pydantic stuff py2.7
zoedt Jun 7, 2024
3c84f1c
Merge branch 'master' of github.com:DataDog/integrations-core into zo…
zoedt Jun 7, 2024
8dd93c7
Merge branch 'master' of github.com:DataDog/integrations-core into zo…
zoedt Jun 10, 2024
7d7c5a4
Allow namespace for Cisco ACI devices, static var for vendor
zoedt Jun 11, 2024
ac9fe08
Update device metadata to use the correct fieldname, add pydantic mod…
zoedt Jun 11, 2024
7848a1c
Sync the conf.yaml example
zoedt Jun 11, 2024
3457360
Add device type and integration to device metadata, fix ID field name
zoedt Jun 12, 2024
3f3cd66
Update interface statuses
zoedt Jun 12, 2024
2b4ee06
Deal with device status (use fabricSt)
zoedt Jun 12, 2024
c6c20f9
Update get_eth_list to get operStatus, update all tests and fixtures
zoedt Jun 13, 2024
6137d1f
Amend docs for namespace
zoedt Jun 18, 2024
502e682
Batch events sent to EvP
zoedt Jun 20, 2024
f51307d
Add interface status metric
zoedt Jun 20, 2024
ff002e5
Only add to list for >py3.0
zoedt Jun 20, 2024
eb0ec6b
Update default value for vendor, yield for batch events, use device t…
zoedt Jun 24, 2024
63e2f08
Add source field to device metadata tags
zoedt Jun 24, 2024
6d8c377
Add enums for interface status
zoedt Jun 24, 2024
04a4d2e
Use correct track type for NDM metadata
zoedt Jul 1, 2024
a93dfa7
Amend device id tag, collect timestamp ms -> s
zoedt Jul 2, 2024
86c4fea
Add interface integration field
zoedt Jul 3, 2024
ddcd8d7
More generic method to send EvP event
zoedt Jul 3, 2024
aeacd2e
Add docstring for the EvP method
zoedt Jul 5, 2024
0c58f52
Update interface tagging, remove system_ip tag
zoedt Jul 9, 2024
2a8f97e
Fix linting for submit event platform event
zoedt Jul 9, 2024
c9f81a4
Use interface ID tags
zoedt Jul 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions cisco_aci/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ files:
value:
type: boolean
example: False
- name: namespace
description: |
Namespace for differentiating between devices that share the same IP.
If not specified, the namespace will be 'default'.
value:
type: string
example: default
- template: instances/http
overrides:
username.display_priority: 9
Expand Down
1 change: 1 addition & 0 deletions cisco_aci/changelog.d/17735.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[NDM] Add NDM metadata support for Cisco ACI
2 changes: 1 addition & 1 deletion cisco_aci/datadog_checks/cisco_aci/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
# Licensed under a 3-clause BSD style license (see LICENSE)

from .__about__ import __version__
from .cisco import CiscoACICheck
from datadog_checks.cisco_aci.cisco import CiscoACICheck

__all__ = ['__version__', 'CiscoACICheck']
4 changes: 2 additions & 2 deletions cisco_aci/datadog_checks/cisco_aci/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,8 @@ def get_spine_proc_metrics(self, pod, node):
return self._parse_response(response)

def get_eth_list(self, pod, node):
    """
    Fetch the l1PhysIf objects of a node, each with its ethpmPhysIf children.

    :param pod: pod id, interpolated into the request path
    :param node: node id, interpolated into the request path
    :return: whatever `_parse_response` produces for the API response
    """
    # rsp-subtree=children with rsp-subtree-class=ethpmPhysIf requests the ethpmPhysIf
    # child objects in the same response as each l1PhysIf.
    query = 'rsp-subtree=children&rsp-subtree-class=ethpmPhysIf'
    path = '/api/node/class/topology/pod-{}/node-{}/l1PhysIf.json?{}'.format(pod, node, query)
    response = self.make_request(path)
    return self._parse_response(response)

Expand Down
17 changes: 8 additions & 9 deletions cisco_aci/datadog_checks/cisco_aci/cisco.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@
from datadog_checks.base import AgentCheck, ConfigurationError
from datadog_checks.base.config import _is_affirmative
from datadog_checks.base.utils.containers import hash_mutable

from . import aci_metrics
from .api import Api
from .capacity import Capacity
from .fabric import Fabric
from .tags import CiscoTags
from .tenant import Tenant
from datadog_checks.cisco_aci.aci_metrics import make_tenant_metrics
from datadog_checks.cisco_aci.api import Api
from datadog_checks.cisco_aci.capacity import Capacity
from datadog_checks.cisco_aci.fabric import Fabric
from datadog_checks.cisco_aci.tags import CiscoTags
from datadog_checks.cisco_aci.tenant import Tenant

SOURCE_TYPE = 'cisco_aci'

Expand All @@ -25,7 +24,7 @@ class CiscoACICheck(AgentCheck):

def __init__(self, name, init_config, instances):
super(CiscoACICheck, self).__init__(name, init_config, instances)
self.tenant_metrics = aci_metrics.make_tenant_metrics()
self.tenant_metrics = make_tenant_metrics()
self.last_events_ts = {}
self.external_host_tags = {}
self._api_cache = {}
Expand Down Expand Up @@ -109,7 +108,7 @@ def check(self, _):
raise

try:
fabric = Fabric(self, api, self.instance)
fabric = Fabric(self, api, self.instance, self.instance.get('namespace', 'default'))
fabric.collect()
except Exception as e:
self.log.error('fabric collection failed: %s', e)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ def instance_min_collection_interval():
return 15


def instance_namespace():
    # Default for the `namespace` instance option: 'default'.
    return 'default'


def instance_persist_connections():
    # Returns False; presumably the default for the `persist_connections` instance option - TODO confirm.
    return False

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class InstanceConfig(BaseModel):
log_requests: Optional[bool] = None
metric_patterns: Optional[MetricPatterns] = None
min_collection_interval: Optional[float] = None
namespace: Optional[str] = None
ntlm_domain: Optional[str] = None
password: Optional[str] = None
persist_connections: Optional[bool] = None
Expand Down
6 changes: 6 additions & 0 deletions cisco_aci/datadog_checks/cisco_aci/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,12 @@ instances:
#
# appcenter: false

## @param namespace - string - optional - default: default
## Namespace for differentiating between devices that share the same IP.
## If not specified, the namespace will be 'default'.
#
# namespace: default

## @param proxy - mapping - optional
## This overrides the `proxy` setting in `init_config`.
##
Expand Down
112 changes: 107 additions & 5 deletions cisco_aci/datadog_checks/cisco_aci/fabric.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,38 @@
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

from six import iteritems
from six import PY3, iteritems

from datadog_checks.base.utils.serialization import json

if PY3:
import time

from datadog_checks.cisco_aci.models import DeviceMetadata, InterfaceMetadata, NetworkDevicesMetadata, Node, PhysIf

else:
DeviceMetadata = None
Eth = None
InterfaceMetadata = None
Node = None

from . import aci_metrics, exceptions, helpers

VENDOR_CISCO = 'cisco'
PAYLOAD_METADATA_BATCH_SIZE = 100


class Fabric:
"""
Collect fabric metrics from the APIC
"""

def __init__(self, check, api, instance):
def __init__(self, check, api, instance, namespace):
self.check = check
self.api = api
self.instance = instance
self.check_tags = check.check_tags
self.namespace = namespace

# grab some functions from the check
self.gauge = check.gauge
Expand All @@ -25,13 +42,19 @@ def __init__(self, check, api, instance):
self.submit_metrics = check.submit_metrics
self.tagger = self.check.tagger
self.external_host_tags = self.check.external_host_tags
self.ndm_metadata = check.ndm_metadata

def collect(self):
    """
    Collect pod and node health from the APIC and, on Python 3, send NDM metadata.

    Pods and nodes are fetched from the API, their health is submitted, and the
    device/interface metadata gathered while processing the nodes is sent in
    batches through `ndm_metadata`.
    """
    fabric_pods = self.api.get_fabric_pods()
    fabric_nodes = self.api.get_fabric_nodes()
    # Argument order matches the placeholders in the message: pods first, then nodes.
    self.log.info("%s pods and %s nodes computed", len(fabric_pods), len(fabric_nodes))
    pods = self.submit_pod_health(fabric_pods)
    devices, interfaces = self.submit_nodes_health_and_metadata(fabric_nodes, pods)
    if PY3:
        # The metadata models are only imported on Python 3, so nothing is sent on Python 2.
        # NOTE(review): this timestamp is in milliseconds - confirm the unit the intake expects.
        collect_timestamp = time.time() * 1000
        for batch in self.batch_payloads(devices, interfaces, collect_timestamp):
            self.ndm_metadata(json.dumps(batch.model_dump(exclude_none=True)))

def submit_pod_health(self, pods):
pods_dict = {}
Expand All @@ -53,7 +76,9 @@ def submit_pod_health(self, pods):

return pods_dict

def submit_nodes_health(self, nodes, pods):
def submit_nodes_health_and_metadata(self, nodes, pods):
device_metadata = []
interface_metadata = []
for n in nodes:
hostname = helpers.get_fabric_hostname(n)

Expand All @@ -70,17 +95,22 @@ def submit_nodes_health(self, nodes, pods):
continue
self.log.info("processing node %s on pod %s", node_id, pod_id)
try:
if PY3:
device_metadata.append(self.submit_node_metadata(node_attrs, tags))
self.submit_process_metric(n, tags + self.check_tags + user_tags, hostname=hostname)
except (exceptions.APIConnectionException, exceptions.APIParsingException):
pass
if node_attrs.get('role') != "controller":
try:
stats = self.api.get_node_stats(pod_id, node_id)
self.submit_fabric_metric(stats, tags, 'fabricNode', hostname=hostname)
self.process_eth(node_attrs)
eth_metadata = self.process_eth(node_attrs)
if PY3:
interface_metadata.extend(eth_metadata)
except (exceptions.APIConnectionException, exceptions.APIParsingException):
pass
self.log.info("finished processing node %s", node_id)
return device_metadata, interface_metadata

def process_eth(self, node):
self.log.info("processing ethernet ports for %s", node.get('id'))
Expand All @@ -90,16 +120,20 @@ def process_eth(self, node):
eth_list = self.api.get_eth_list(pod_id, node['id'])
except (exceptions.APIConnectionException, exceptions.APIParsingException):
pass
interfaces = []
for e in eth_list:
eth_attrs = helpers.get_attributes(e)
eth_id = eth_attrs['id']
tags = self.tagger.get_fabric_tags(e, 'l1PhysIf')
if PY3:
interfaces.append(self.create_interface_metadata(e, node['address'], tags, hostname))
try:
stats = self.api.get_eth_stats(pod_id, node['id'], eth_id)
self.submit_fabric_metric(stats, tags, 'l1PhysIf', hostname=hostname)
except (exceptions.APIConnectionException, exceptions.APIParsingException):
pass
self.log.info("finished processing ethernet ports for %s", node['id'])
return interfaces

def submit_fabric_metric(self, stats, tags, obj_type, hostname=None):
for s in stats:
Expand Down Expand Up @@ -209,3 +243,71 @@ def get_fabric_type(self, obj_type):
return 'pod'
if obj_type == 'l1PhysIf':
return 'port'

def batch_payloads(self, devices, interfaces, collect_ts):
for device in devices:
yield NetworkDevicesMetadata(namespace=self.namespace, devices=[device], collect_timestamp=collect_ts)

payloads = []
for interface in interfaces:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 I can't remember if our backend really supports receiving multiple interfaces in the same batch when they come from different devices. Do you know how Meraki does it? We might want to check other examples just to be safe

Copy link
Contributor Author

@zoedt zoedt Jun 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's a reference to the Meraki NDM ingestion - the crawler runs on a per-org basis, so I'd assume multiple devices/interfaces may overlap at some point? 🤔

if len(payloads) == PAYLOAD_METADATA_BATCH_SIZE:
yield NetworkDevicesMetadata(
namespace=self.namespace, interfaces=payloads, collect_timestamp=collect_ts
)
payloads = []
payloads.append(interface)
if payloads:
yield NetworkDevicesMetadata(namespace=self.namespace, interfaces=payloads, collect_timestamp=collect_ts)

def submit_node_metadata(self, node_attrs, tags):
node = Node(attributes=node_attrs)
id_tags = ['namespace:{}'.format(self.namespace), 'system_ip:{}'.format(node.attributes.address)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
id_tags = ['namespace:{}'.format(self.namespace), 'system_ip:{}'.format(node.attributes.address)]
id_tags = ['device_namespace:{}'.format(self.namespace), 'device_ip:{}'.format(node.attributes.address)]

Do we add the device_namespace and device_ip tags to all Cisco ACI metrics?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a separate ticket for that work! I'm doing it in the background, but if you'd prefer it all in one PR, I can amend this one!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good for me to add it separately 👍 i think here we just want to ensure that the id tags can be used to filter metrics down to a specific device (so device_namespace:aaa, device_ip:bbb should work once we add common tags)

device_tags = [
'device_vendor:{}'.format(VENDOR_CISCO),
'device_namespace:{}'.format(self.namespace),
'device_hostname:{}'.format(node.attributes.dn),
'hostname:{}'.format(node.attributes.dn),
'system_ip:{}'.format(node.attributes.address),
'device_ip:{}'.format(node.attributes.address),
'id:{}:{}'.format(self.namespace, node.attributes.address),
"source:cisco-aci",
]
device = DeviceMetadata(
id='{}:{}'.format(self.namespace, node.attributes.address),
id_tags=id_tags,
tags=device_tags + tags,
name=node.attributes.dn,
ip_address=node.attributes.address,
model=node.attributes.model,
fabric_st=node.attributes.fabric_st,
vendor=VENDOR_CISCO,
version=node.attributes.version,
serial_number=node.attributes.serial,
device_type=node.attributes.device_type,
)
return device.model_dump(exclude_none=True)

def create_interface_metadata(self, phys_if, address, tags, hostname):
    """
    Build the NDM metadata for one physical interface and submit its status metric.

    :param phys_if: raw API object for the interface; its 'l1PhysIf' entry is parsed into a PhysIf model
    :param address: address of the device owning the interface, used in the device id and metric tags
    :param tags: interface tags, used as the metadata id_tags and as the base tags of the status metric
    :param hostname: hostname the status gauge is submitted with
    :return: the interface metadata as a dict, with None-valued fields omitted
    """
    eth = PhysIf(**phys_if.get('l1PhysIf', {}))
    interface = InterfaceMetadata(
        device_id='{}:{}'.format(self.namespace, address),
        id_tags=tags,
        index=eth.attributes.id,
        name=eth.attributes.name,
        description=eth.attributes.desc,
        mac_address=eth.attributes.router_mac,
        admin_status=eth.attributes.admin_st,
    )
    if eth.ethpm_phys_if:
        # The operational status comes from the ethpmPhysIf child, when the API returned one.
        interface.oper_status = eth.ethpm_phys_if.attributes.oper_st
    if interface.status:
        # Build a new list so the caller's tags (also used as id_tags above) are not mutated.
        status_tags = list(tags) + [
            "device_ip:{}".format(address),
            "device_namespace:{}".format(self.namespace),
            "interface.status:{}".format(interface.status),
        ]
        # Submit with the enriched tags; the base tags alone would drop the interface.status tag.
        self.gauge('cisco_aci.fabric.node.interface.status', 1, tags=status_tags, hostname=hostname)
    return interface.model_dump(exclude_none=True)
Loading
Loading