Skip to content

Commit 0ee23b6

Browse files
authored
Add collection of metrics for buffer cache usage (#17960)
The pg_buffercache extension provides a pg_buffercache view that gives a detailed report of shared buffer usage: which relation is using each buffer, whether it is dirty, and how many backends are pinning it. This patch adds the capability to collect data from pg_buffercache to provide visibility into shared buffer usage.
1 parent 3f7689e commit 0ee23b6

File tree

12 files changed

+109
-2
lines changed

12 files changed

+109
-2
lines changed

postgres/assets/configuration/spec.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,13 @@ files:
256256
example:
257257
- application_name
258258
default: []
259+
- name: collect_buffercache_metrics
260+
description: |
261+
If set to true, collects metrics regarding buffer cache usage from pg_buffercache.
262+
pg_buffercache extension must be installed.
263+
value:
264+
type: boolean
265+
example: false
259266
- name: collect_database_size_metrics
260267
description: Collect database size metrics.
261268
value:

postgres/changelog.d/17960.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add collection of metrics for buffer cache usage

postgres/datadog_checks/postgres/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def __init__(self, instance, init_config):
8181
self.ssl_key = instance.get('ssl_key', None)
8282
self.ssl_password = instance.get('ssl_password', None)
8383
self.table_count_limit = instance.get('table_count_limit', TABLE_COUNT_LIMIT)
84+
self.collect_buffercache_metrics = is_affirmative(instance.get('collect_buffercache_metrics', False))
8485
self.collect_function_metrics = is_affirmative(instance.get('collect_function_metrics', False))
8586
# Default value for `count_metrics` is True for backward compatibility
8687
self.collect_count_metrics = is_affirmative(instance.get('collect_count_metrics', True))

postgres/datadog_checks/postgres/config_models/defaults.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ def instance_collect_bloat_metrics():
2828
return False
2929

3030

31+
def instance_collect_buffercache_metrics():
32+
return False
33+
34+
3135
def instance_collect_checksum_metrics():
3236
return False
3337

postgres/datadog_checks/postgres/config_models/instance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ class InstanceConfig(BaseModel):
200200
azure: Optional[Azure] = None
201201
collect_activity_metrics: Optional[bool] = None
202202
collect_bloat_metrics: Optional[bool] = None
203+
collect_buffercache_metrics: Optional[bool] = None
203204
collect_checksum_metrics: Optional[bool] = None
204205
collect_count_metrics: Optional[bool] = None
205206
collect_database_size_metrics: Optional[bool] = None

postgres/datadog_checks/postgres/data/conf.yaml.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,12 @@ instances:
197197
# activity_metrics_excluded_aggregations:
198198
# - application_name
199199

200+
## @param collect_buffercache_metrics - boolean - optional - default: false
201+
## If set to true, collects metrics regarding buffer cache usage from pg_buffercache.
202+
## pg_buffercache extension must be installed.
203+
#
204+
# collect_buffercache_metrics: false
205+
200206
## @param collect_database_size_metrics - boolean - optional - default: true
201207
## Collect database size metrics.
202208
#

postgres/datadog_checks/postgres/postgres.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
ANALYZE_PROGRESS_METRICS,
4242
AWS_RDS_HOSTNAME_SUFFIX,
4343
AZURE_DEPLOYMENT_TYPE_TO_RESOURCE_TYPE,
44+
BUFFERCACHE_METRICS,
4445
CLUSTER_VACUUM_PROGRESS_METRICS,
4546
CONNECTION_METRICS,
4647
COUNT_METRICS,
@@ -301,6 +302,8 @@ def dynamic_queries(self):
301302
if self._config.collect_wal_metrics is not False:
302303
# collect wal metrics for pg >= 10 only if the user has not explicitly disabled it
303304
queries.append(WAL_FILE_METRICS)
305+
if self._config.collect_buffercache_metrics:
306+
queries.append(BUFFERCACHE_METRICS)
304307
queries.append(QUERY_PG_REPLICATION_SLOTS)
305308
queries.append(VACUUM_PROGRESS_METRICS)
306309
queries.append(STAT_SUBSCRIPTION_METRICS)

postgres/datadog_checks/postgres/util.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,52 @@ def get_list_chunks(lst, n):
695695
"max(age(backend_xmin))",
696696
]
697697

698+
# pg_buffercache is implemented with a function scan. Thus, the planner doesn't
699+
# have a reliable estimate of the number of rows returned by pg_buffercache.
700+
# The function's pgproc.prorows is used and 1000 is used as a default value.
701+
# On top of that, the function is volatile, preventing possible inlining and
702+
# optimisation.
703+
# It is very likely that we have way more buffers than relations: 16GB of shared_buffers
704+
# will have 2097152 buffers returned by pg_buffercache while pg_class will mostly be
705+
# around thousands of rows. Therefore, we write the query as a CTE aggregating on reldatabase
706+
# and relfilenode. Given that the function is volatile, this will force the CTE to be
707+
# materialized, and the output should have at most the same cardinality as pg_class's
708+
# rows.
709+
# This is more efficient than the cte-less version which will rely on a merge join and thus
710+
# sort the output of pg_buffercache.
711+
BUFFERCACHE_METRICS = {
712+
'name': 'buffercache_metrics',
713+
'query': """
714+
WITH buffer_by_relfilenode AS (
715+
SELECT reldatabase, relfilenode,
716+
NULLIF(COUNT(CASE WHEN relfilenode IS NOT NULL THEN 1 END), 0) as used,
717+
COUNT(CASE WHEN relfilenode IS NULL THEN 1 END) as unused,
718+
SUM(usagecount) as sum_usagecount,
719+
NULLIF(SUM(isdirty::int), 0) as sum_dirty,
720+
NULLIF(SUM(pinning_backends), 0) as sum_pinning
721+
FROM pg_buffercache
722+
GROUP BY reldatabase, relfilenode
723+
)
724+
SELECT COALESCE(d.datname, 'shared'), n.nspname, c.relname,
725+
used, unused, sum_usagecount, sum_dirty, sum_pinning
726+
FROM buffer_by_relfilenode b
727+
LEFT JOIN pg_database d ON b.reldatabase = d.oid
728+
LEFT JOIN pg_class c ON b.relfilenode = pg_relation_filenode(c.oid)
729+
LEFT JOIN pg_namespace n ON n.oid = c.relnamespace;
730+
""",
731+
'columns': [
732+
{'name': 'db', 'type': 'tag'},
733+
{'name': 'schema', 'type': 'tag_not_null'},
734+
{'name': 'relation', 'type': 'tag_not_null'},
735+
{'name': 'used_buffers', 'type': 'gauge'},
736+
{'name': 'unused_buffers', 'type': 'gauge'},
737+
{'name': 'usage_count', 'type': 'gauge'},
738+
{'name': 'dirty_buffers', 'type': 'gauge'},
739+
{'name': 'pinning_backends', 'type': 'gauge'},
740+
],
741+
'metric_prefix': 'postgresql.buffercache',
742+
}
743+
698744
# The metrics we retrieve from pg_stat_activity when the postgres version >= 9.6
699745
ACTIVITY_METRICS_9_6 = [
700746
"SUM(CASE WHEN xact_start IS NOT NULL THEN 1 ELSE 0 END)",

postgres/metadata.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ postgresql.bgwriter.maxwritten_clean,count,,,,The number of times the background
2525
postgresql.bgwriter.sync_time,count,,millisecond,,The total amount of checkpoint processing time spent synchronizing files to disk.,0,postgres,bgw sync time,
2626
postgresql.bgwriter.write_time,count,,millisecond,,The total amount of checkpoint processing time spent writing files to disk.,0,postgres,bgw wrt time,
2727
postgresql.buffer_hit,gauge,,hit,second,"The number of times disk blocks were found in the buffer cache, preventing the need to read from the database. This metric is tagged with db.",1,postgres,buff hit,
28+
postgresql.buffercache.dirty_buffers,gauge,,buffer,,"Number of dirty shared buffers. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache dirty buffers,
29+
postgresql.buffercache.pinning_backends,gauge,,,,"Number of backends pinning shared buffers. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache pinning backends,
30+
postgresql.buffercache.unused_buffers,gauge,,buffer,,"Number of unused shared buffers. pg_buffercache extension needs to be installed.",0,postgres,buffercache unused buffers,
31+
postgresql.buffercache.usage_count,gauge,,,,"Sum of shared buffers' usage_count. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache usage count,
32+
postgresql.buffercache.used_buffers,gauge,,buffer,,"Number of used shared buffers. pg_buffercache extension needs to be installed. This metric is tagged by db, schema and relation.",0,postgres,buffercache buffers,
2833
postgresql.checksums.checksum_failures,count,,,,"The number of checksum failures in this database. This metric is tagged with db.",0,postgres,checksums,
2934
postgresql.checksums.enabled,count,,,,"Whether database checksums are enabled. Value is always 1 and tagged with enabled:true or enabled:false. This metric is tagged with db.",0,postgres,checksums.enabled,
3035
postgresql.cluster_vacuum.heap_blks_scanned,gauge,,block,,"Number of heap blocks scanned. This counter only advances when the phase is seq scanning heap. Only available with PostgreSQL 12 and newer. This metric is tagged with db, table, command, phase, index.",0,postgres,postgres cluster blk_scanned,

postgres/tests/common.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,13 @@
119119

120120
def _iterate_metric_name(query):
121121
if 'columns' in query:
122+
metric_prefix = ''
123+
if 'metric_prefix' in query:
124+
metric_prefix = f'{query["metric_prefix"]}.'
122125
for column in query['columns']:
123126
if column['type'].startswith('tag'):
124127
continue
125-
yield column['name']
128+
yield f'{metric_prefix}{column["name"]}'
126129
else:
127130
for metric in query['metrics'].values():
128131
yield metric[0]

postgres/tests/compose/resources/02_setup.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ EOSQL
1212
fi
1313

1414
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" "datadog_test" <<-'EOSQL'
15+
CREATE EXTENSION pg_buffercache SCHEMA public;
1516
CREATE EXTENSION pg_stat_statements SCHEMA public;
1617
GRANT SELECT ON pg_stat_statements TO datadog;
1718

postgres/tests/test_pg_integration.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from datadog_checks.base.stubs import datadog_agent
1515
from datadog_checks.postgres import PostgreSql
1616
from datadog_checks.postgres.__about__ import __version__
17-
from datadog_checks.postgres.util import DatabaseHealthCheckError, PartialFormatter, fmt
17+
from datadog_checks.postgres.util import BUFFERCACHE_METRICS, DatabaseHealthCheckError, PartialFormatter, fmt
1818

1919
from .common import (
2020
COMMON_METRICS,
@@ -25,6 +25,7 @@
2525
POSTGRES_VERSION,
2626
USER_ADMIN,
2727
_get_expected_tags,
28+
_iterate_metric_name,
2829
assert_metric_at_least,
2930
check_activity_metrics,
3031
check_bgw_metrics,
@@ -339,6 +340,34 @@ def test_connections_metrics(aggregator, integration_check, pg_instance):
339340
aggregator.assert_metric('postgresql.connections', count=1, tags=expected_tags)
340341

341342

343+
@requires_over_10
344+
def test_buffercache_metrics(aggregator, integration_check, pg_instance):
345+
pg_instance['collect_buffercache_metrics'] = True
346+
check = integration_check(pg_instance)
347+
348+
with _get_superconn(pg_instance) as conn:
349+
with conn.cursor() as cur:
350+
# Generate some usage on persons relation
351+
cur.execute('select * FROM persons;')
352+
353+
check.check(pg_instance)
354+
base_tags = _get_expected_tags(check, pg_instance)
355+
356+
# Check specific persons relation
357+
persons_tags = base_tags + ['relation:persons', 'db:datadog_test', 'schema:public']
358+
metrics_not_emitted_if_zero = ['postgresql.buffercache.pinning_backends', 'postgresql.buffercache.dirty_buffers']
359+
for metric in _iterate_metric_name(BUFFERCACHE_METRICS):
360+
if metric in metrics_not_emitted_if_zero:
361+
aggregator.assert_metric(metric, count=0, tags=persons_tags)
362+
else:
363+
aggregator.assert_metric(metric, count=1, tags=persons_tags)
364+
365+
# Check metric reported for unused buffers
366+
unused_buffers_tags = base_tags + ['db:shared']
367+
unused_metric = 'postgresql.buffercache.unused_buffers'
368+
aggregator.assert_metric(unused_metric, count=1, tags=unused_buffers_tags)
369+
370+
342371
def test_locks_metrics_no_relations(aggregator, integration_check, pg_instance):
343372
"""
344373
Since 4.0.0, to prevent tag explosion, lock metrics are not collected anymore unless relations are specified

0 commit comments

Comments
 (0)