Skip to content

Commit 4ae50af

Browse files
[Karpenter Integration]: add missing metrics based on the Karpenter v1.4 documentation (#20110)
* add missing Karpenter v1.4 metrics * add changelog * fix wrong PR number in changelog * fix linting rules * fix tests * fix * add csv metadata and add tests * sort the csv metadata file * Update karpenter/changelog.d/20110.added Co-authored-by: Kyle Neale <kyle.a.neale@gmail.com> --------- Co-authored-by: Kyle Neale <kyle.a.neale@gmail.com>
1 parent b01f93a commit 4ae50af

File tree

5 files changed

+280
-1
lines changed

5 files changed

+280
-1
lines changed

karpenter/changelog.d/20110.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add Karpenter v1.4 metrics

karpenter/datadog_checks/karpenter/metrics.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,20 @@
99
# https://karpenter.sh/docs/reference/metrics/
1010

1111
METRIC_MAP = {
12+
'aws_sdk_go_request': 'aws.sdk_go.request',
13+
'aws_sdk_go_request_retry': 'aws.sdk_go.request.retry',
14+
'aws_sdk_go_request_duration_seconds': 'aws.sdk_go.request.duration_seconds',
15+
'aws_sdk_go_request_attempt': 'aws.sdk_go.request_attempt',
16+
'aws_sdk_go_request_attempt_duration_seconds': 'aws.sdk_go.request_attempt.duration_seconds',
1217
'certwatcher_read_certificate': 'certwatcher.read.certificate',
1318
'certwatcher_read_certificate_errors': 'certwatcher.read.certificate.errors',
1419
'controller_runtime_active_workers': 'controller.runtime.active_workers',
1520
'controller_runtime_max_concurrent_reconciles': 'controller.runtime.max.concurrent_reconciles',
1621
'controller_runtime_reconcile': 'controller.runtime.reconcile',
1722
'controller_runtime_reconcile_errors': 'controller.runtime.reconcile_errors',
23+
'controller_runtime_reconcile_panics': 'controller.runtime.reconcile_panics',
1824
'controller_runtime_reconcile_time_seconds': 'controller.runtime.reconcile.time_seconds',
25+
'controller_runtime_terminal_reconcile_errors': 'controller.runtime.terminal.reconcile.errors',
1926
'go_gc_duration_seconds': 'go.gc.duration_seconds',
2027
'go_goroutines': 'go_goroutines',
2128
'go_info': 'go_info',
@@ -69,6 +76,7 @@
6976
'karpenter_disruption_queue_depth': 'disruption.queue_depth',
7077
'karpenter_disruption_pods_disrupted': 'disruption.pods.disrupted',
7178
'karpenter_disruption_nodes_disrupted': 'disruption.nodes.disrupted',
79+
'karpenter_ignored_pod_count': 'ignored_pod_count',
7280
'karpenter_interruption_actions_performed': 'interruption.actions_performed',
7381
'karpenter_interruption_deleted_messages': 'interruption.deleted_messages',
7482
'karpenter_interruption_message_latency_time_seconds': 'interruption.message.latency.time_seconds',
@@ -84,9 +92,11 @@
8492
'karpenter_nodeclaims_disrupted': 'nodeclaims_disrupted',
8593
'karpenter_nodeclaims_drifted': 'nodeclaims_drifted',
8694
'karpenter_nodeclaims_initialized': 'nodeclaims_initialized',
95+
'karpenter_nodeclaims_instance_termination_duration_seconds': 'nodeclaims_instance_termination.duration_seconds',
8796
'karpenter_nodeclaims_launched': 'nodeclaims_launched',
8897
'karpenter_nodeclaims_registered': 'nodeclaims_registered',
8998
'karpenter_nodeclaims_terminated': 'nodeclaims_terminated',
99+
'karpenter_nodeclaims_termination_duration_seconds': 'nodeclaims_termination.duration_seconds',
90100
'karpenter_nodepool_limit': 'nodepool_limit',
91101
'karpenter_nodepool_usage': 'nodepool_usage',
92102
'karpenter_nodes_allocatable': 'nodes.allocatable',
@@ -109,8 +119,11 @@
109119
'karpenter_provisioner_usage': 'provisioner.usage',
110120
'karpenter_provisioner_usage_pct': 'provisioner.usage.pct',
111121
'karpenter_cluster_state_synced': 'cluster_state.synced',
122+
'karpenter_cluster_state_unsynced_time_seconds': 'cluster_state.unsynced.time_seconds',
112123
'karpenter_cluster_state_node_count': 'cluster_state.node_count',
124+
'karpenter_cluster_utilization_percent': 'cluster.utilization.percent',
113125
'leader_election_master_status': 'leader_election.master_status',
126+
'leader_election_slowpath': 'leader_election.slowpath',
114127
'process_cpu_seconds': 'process.cpu_seconds',
115128
'process_max_fds': 'process.max_fds',
116129
'process_open_fds': 'process.open_fds',
@@ -140,6 +153,23 @@
140153
'karpenter_interruption_message_queue_duration_seconds': 'interruption.message.latency.time_seconds',
141154
'karpenter_nodepools_usage': 'nodepool_usage',
142155
'karpenter_nodepools_limit': 'nodepool_limit',
156+
'operator_ec2nodeclass_status_condition_transitions': 'operator.ec2nodeclass.status_condition.transitions',
157+
'operator_ec2nodeclass_status_condition_current_status_seconds': 'operator.ec2nodeclass.status_condition.current_status.seconds',
158+
'operator_ec2nodeclass_status_condition_count': 'operator.ec2nodeclass.status_condition_count',
159+
'operator_node_event_count': 'operator.node.event_count',
160+
'operator_node_status_condition_transitions': 'operator.node.status_condition.transitions',
161+
'operator_node_status_condition_transition_seconds': 'operator.node.status_condition.transitions.seconds',
162+
'operator_node_status_condition_current_status_seconds': 'operator.node.status_condition.current_status.seconds',
163+
'operator_node_status_condition_count': 'operator.node.status_condition_count',
164+
'operator_node_termination_duration_seconds': 'operator.node.termination.duration_seconds',
165+
'operator_nodeclaim_status_condition_transitions': 'operator.nodeclaim.status_condition.transitions',
166+
'operator_nodeclaim_status_condition_transition_seconds': 'operator.nodeclaim.status_condition.transitions.seconds',
167+
'operator_nodeclaim_status_condition_current_status_seconds': 'operator.nodeclaim.status_condition.current_status.seconds',
168+
'operator_nodeclaim_status_condition_count': 'operator.nodeclaim.status_condition_count',
169+
'operator_nodeclaim_termination_duration_seconds': 'operator.nodeclaim.termination.duration_seconds',
170+
'operator_nodepool_status_condition_transitions': 'operator.nodepool.status_condition.transitions',
171+
'operator_nodepool_status_condition_current_status_seconds': 'operator.nodepool.status_condition.current_status.seconds',
172+
'operator_nodepool_status_condition_count': 'operator.nodepool.status_condition_count',
143173
}
144174

145175
RENAME_LABELS_MAP = {

karpenter/metadata.csv

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
11
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags
2+
karpenter.aws.sdk_go.request.count,count,,,,The total number of AWS SDK Go requests,0,karpenter,,,
3+
karpenter.aws.sdk_go.request.duration_seconds.bucket,count,,,,Latency of AWS SDK Go requests histogram buckets,0,karpenter,,,
4+
karpenter.aws.sdk_go.request.duration_seconds.count,count,,,,Count of AWS SDK Go request durations,0,karpenter,,,
5+
karpenter.aws.sdk_go.request.duration_seconds.sum,count,,second,,Sum of AWS SDK Go request durations,0,karpenter,,,
6+
karpenter.aws.sdk_go.request_attempt.count,count,,,,The total number of AWS SDK Go request attempts,0,karpenter,,,
7+
karpenter.aws.sdk_go.request_attempt.duration_seconds.bucket,count,,second,,Latency of AWS SDK Go request attempts histogram buckets,0,karpenter,,,
8+
karpenter.aws.sdk_go.request_attempt.duration_seconds.count,count,,request,,Count of AWS SDK Go request attempt durations,0,karpenter,,,
9+
karpenter.aws.sdk_go.request_attempt.duration_seconds.sum,count,,second,,Sum of AWS SDK Go request attempt durations,0,karpenter,,,
210
karpenter.build_info,gauge,,,,A metric with a constant '1' value labeled by version from which Karpenter was built.,0,karpenter,,,
311
karpenter.certwatcher.read.certificate.count,count,,read,,The count of certificate reads,0,karpenter,,,
412
karpenter.certwatcher.read.certificate.errors.count,count,,error,,The count of certificate read errors,0,karpenter,,,
@@ -16,8 +24,10 @@ karpenter.cloudprovider.instance.type.cpu_cores,gauge,,core,,VCPUs cores for a g
1624
karpenter.cloudprovider.instance.type.memory_bytes,gauge,,byte,,"Memory, in bytes, for a given instance type",0,karpenter,,,
1725
karpenter.cloudprovider.instance.type.offering_available,gauge,,,,"Instance type offering availability, based on instance type, capacity type, and zone",0,karpenter,,,
1826
karpenter.cloudprovider.instance.type.price_estimate,gauge,,,,Estimated hourly price used when making informed decisions on node cost calculation. This is updated once on startup and then every 12 hours,0,karpenter,,,
27+
karpenter.cluster.utilization.percent,gauge,,percent,,Utilization of allocatable resources by pod requests,0,karpenter,,,
1928
karpenter.cluster_state.node_count,gauge,,node,,Current count of nodes in cluster state.,0,karpenter,,,
2029
karpenter.cluster_state.synced,gauge,,,,Returns 1 if cluster state is synced and 0 otherwise. Synced checks that nodeclaims and nodes that are stored in the APIServer have the same representation as Karpenter's cluster state,0,karpenter,,,
30+
karpenter.cluster_state.unsynced.time_seconds,gauge,,second,,The time for which cluster state is not synced,0,karpenter,,,
2131
karpenter.consistency.errors,gauge,,error,,Number of consistency checks that have failed,0,karpenter,,,
2232
karpenter.controller.runtime.active_workers,gauge,,worker,,Number of currently used workers per controller,0,karpenter,,,
2333
karpenter.controller.runtime.max.concurrent_reconciles,gauge,,,,Maximum number of concurrent reconciles per controller,0,karpenter,,,
@@ -26,6 +36,8 @@ karpenter.controller.runtime.reconcile.time_seconds.bucket,count,,,,The count of
2636
karpenter.controller.runtime.reconcile.time_seconds.count,count,,,,The count of observations in the reconciliation per controller histogram,0,karpenter,,,
2737
karpenter.controller.runtime.reconcile.time_seconds.sum,count,,second,,The sum of time per reconciliation per controller,0,karpenter,,,
2838
karpenter.controller.runtime.reconcile_errors.count,count,,error,,The count of reconciliation errors per controller,0,karpenter,,,
39+
karpenter.controller.runtime.reconcile_panics.count,count,,,,Total number of reconciliation panics per controller,0,karpenter,,,
40+
karpenter.controller.runtime.terminal.reconcile.errors.count,count,,,,Total number of terminal reconciliation errors per controller,0,karpenter,,,
2941
karpenter.deprovisioning.actions_performed.count,count,,execution,,The count of deprovisioning actions performed. Labeled by deprovisioner,0,karpenter,,,
3042
karpenter.deprovisioning.consolidation_timeouts,gauge,,timeout,,Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type,0,karpenter,,,
3143
karpenter.deprovisioning.eligible_machines,gauge,,,,Number of machines eligible for deprovisioning by Karpenter. Labeled by deprovisioner,0,karpenter,,,
@@ -97,9 +109,15 @@ karpenter.nodeclaims_created,gauge,,,,Number of nodeclaims created in total by K
97109
karpenter.nodeclaims_disrupted,gauge,,,,Number of nodeclaims disrupted in total by Karpenter. Labeled by disruption type of the nodeclaim and the owning nodepool,0,karpenter,,,
98110
karpenter.nodeclaims_drifted,gauge,,,,Number of nodeclaims drifted reasons in total by Karpenter. Labeled by drift type of the nodeclaim and the owning nodepool,0,karpenter,,,
99111
karpenter.nodeclaims_initialized,gauge,,,,Number of nodeclaims initialized in total by Karpenter. Labeled by the owning nodepool,0,karpenter,,,
112+
karpenter.nodeclaims_instance_termination.duration_seconds.bucket,count,,,,Histogram buckets for CloudProvider Instance termination duration,0,karpenter,,,
113+
karpenter.nodeclaims_instance_termination.duration_seconds.count,count,,,,Count of CloudProvider Instance termination observations,0,karpenter,,,
114+
karpenter.nodeclaims_instance_termination.duration_seconds.sum,count,,second,,Sum of CloudProvider Instance termination durations,0,karpenter,,,
100115
karpenter.nodeclaims_launched,gauge,,,,Number of nodeclaims launched in total by Karpenter. Labeled by the owning nodepool,0,karpenter,,,
101116
karpenter.nodeclaims_registered,gauge,,,,Number of nodeclaims registered in total by Karpenter. Labeled by the owning nodepool,0,karpenter,,,
102117
karpenter.nodeclaims_terminated,gauge,,,,Number of nodeclaims terminated in total by Karpenter. Labeled by reason the nodeclaim was terminated and the owning nodepool,0,karpenter,,,
118+
karpenter.nodeclaims_termination.duration_seconds.bucket,count,,,,Histogram buckets for NodeClaim termination duration,0,karpenter,,,
119+
karpenter.nodeclaims_termination.duration_seconds.count,count,,,,Count of NodeClaim termination duration observations,0,karpenter,,,
120+
karpenter.nodeclaims_termination.duration_seconds.sum,count,,second,,Sum of NodeClaim termination durations,0,karpenter,,,
103121
karpenter.nodepool_limit,gauge,,,,The nodepool limits are the limits specified on the provisioner that restrict the quantity of resources provisioned. Labeled by nodepool name and resource type,0,karpenter,,,
104122
karpenter.nodepool_usage,gauge,,,,The nodepool usage is the amount of resources that have been provisioned by a particular nodepool. Labeled by nodepool name and resource type,0,karpenter,,,
105123
karpenter.nodes.allocatable,gauge,,,,The amount of resources allocatable by nodes,0,karpenter,,,
@@ -115,6 +133,22 @@ karpenter.nodes.total.daemon_limits,gauge,,,,Total resources specified by Daemon
115133
karpenter.nodes.total.daemon_requests,gauge,,,,Total resources requested by DaemonSet pods,0,karpenter,,,
116134
karpenter.nodes.total.pod_limits,gauge,,,,Total pod resources specified by non-DaemonSet pod limits,0,karpenter,,,
117135
karpenter.nodes.total.pod_requests,gauge,,,,Total pod resources requested by non-DaemonSet pods bound,0,karpenter,,,
136+
karpenter.operator.ec2nodeclass.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for ec2nodeclass,0,karpenter,,,
137+
karpenter.operator.ec2nodeclass.status_condition.transitions.count,count,,,,Count of status condition transitions for ec2nodeclass,0,karpenter,,,
138+
karpenter.operator.ec2nodeclass.status_condition_count,gauge,,,,Number of conditions for ec2nodeclass,0,karpenter,,,
139+
karpenter.operator.node.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for node,0,karpenter,,,
140+
karpenter.operator.node.status_condition.transitions.count,count,,,,Count of status condition transitions for node,0,karpenter,,,
141+
karpenter.operator.node.status_condition.transitions.seconds.bucket,count,,second,,Histogram of condition state durations for node,0,karpenter,,,
142+
karpenter.operator.node.status_condition_count,gauge,,,,Number of conditions for node,0,karpenter,,,
143+
karpenter.operator.node.termination.duration_seconds.bucket,count,,second,,Histogram buckets for node termination durations,0,karpenter,,,
144+
karpenter.operator.nodeclaim.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for nodeclaim,0,karpenter,,,
145+
karpenter.operator.nodeclaim.status_condition.transitions.count,count,,,,Count of status condition transitions for nodeclaim,0,karpenter,,,
146+
karpenter.operator.nodeclaim.status_condition.transitions.seconds.bucket,count,,second,,Histogram of condition state durations for nodeclaim,0,karpenter,,,
147+
karpenter.operator.nodeclaim.status_condition_count,gauge,,,,Number of conditions for nodeclaim,0,karpenter,,,
148+
karpenter.operator.nodeclaim.termination.duration_seconds.bucket,count,,,,Histogram buckets for nodeclaim termination durations,0,karpenter,,,
149+
karpenter.operator.nodepool.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for nodepool,0,karpenter,,,
150+
karpenter.operator.nodepool.status_condition.transitions.count,count,,,,Count of status condition transitions for nodepool,0,karpenter,,,
151+
karpenter.operator.nodepool.status_condition_count,gauge,,,,Number of conditions for nodepool,0,karpenter,,,
118152
karpenter.pods.startup.time_seconds.count,count,,,,The count of the observations in the pod startup summary,0,karpenter,,,
119153
karpenter.pods.startup.time_seconds.quantile,gauge,,,,The time taken between pod creation and the pod being in a running state by `quantile`,0,karpenter,,,
120154
karpenter.pods.startup.time_seconds.sum,count,,second,,The sum of the time from pod creation and the pod being in a running state,0,karpenter,,,

karpenter/tests/common.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,40 @@ def get_fixture_path(filename):
179179
'karpenter.interruption.message.latency.time_seconds.bucket',
180180
'karpenter.nodepool_usage',
181181
'karpenter.nodepool_limit',
182+
'karpenter.aws.sdk_go.request.count',
183+
'karpenter.aws.sdk_go.request.duration_seconds.bucket',
184+
'karpenter.aws.sdk_go.request.duration_seconds.count',
185+
'karpenter.aws.sdk_go.request.duration_seconds.sum',
186+
'karpenter.aws.sdk_go.request_attempt.count',
187+
'karpenter.aws.sdk_go.request_attempt.duration_seconds.bucket',
188+
'karpenter.aws.sdk_go.request_attempt.duration_seconds.count',
189+
'karpenter.aws.sdk_go.request_attempt.duration_seconds.sum',
190+
'karpenter.cluster.utilization.percent',
191+
'karpenter.cluster_state.unsynced.time_seconds',
192+
'karpenter.controller.runtime.reconcile_panics.count',
193+
'karpenter.controller.runtime.terminal.reconcile.errors.count',
194+
'karpenter.nodeclaims_instance_termination.duration_seconds.bucket',
195+
'karpenter.nodeclaims_instance_termination.duration_seconds.count',
196+
'karpenter.nodeclaims_instance_termination.duration_seconds.sum',
197+
'karpenter.nodeclaims_termination.duration_seconds.bucket',
198+
'karpenter.nodeclaims_termination.duration_seconds.count',
199+
'karpenter.nodeclaims_termination.duration_seconds.sum',
200+
'karpenter.operator.ec2nodeclass.status_condition.current_status.seconds',
201+
'karpenter.operator.ec2nodeclass.status_condition.transitions.count',
202+
'karpenter.operator.ec2nodeclass.status_condition_count',
203+
'karpenter.operator.node.status_condition.current_status.seconds',
204+
'karpenter.operator.node.status_condition.transitions.count',
205+
'karpenter.operator.node.status_condition.transitions.seconds.bucket',
206+
'karpenter.operator.node.status_condition_count',
207+
'karpenter.operator.node.termination.duration_seconds.bucket',
208+
'karpenter.operator.nodeclaim.status_condition.current_status.seconds',
209+
'karpenter.operator.nodeclaim.status_condition.transitions.count',
210+
'karpenter.operator.nodeclaim.status_condition.transitions.seconds.bucket',
211+
'karpenter.operator.nodeclaim.status_condition_count',
212+
'karpenter.operator.nodeclaim.termination.duration_seconds.bucket',
213+
'karpenter.operator.nodepool.status_condition.current_status.seconds',
214+
'karpenter.operator.nodepool.status_condition.transitions.count',
215+
'karpenter.operator.nodepool.status_condition_count',
182216
]
183217
RENAMED_LABELS = [
184218
'go_version:go1.20.6',

0 commit comments

Comments
 (0)