fix: allow EXPLAIN on multi-statement SQL beginning with SET (#20106)

jasonmp85 · web-flow · commit 9b43811bf841 · 2025-04-25T17:19:18.000Z
* feat: allow EXPLAIN on multi-statement SQL beginning with SET

SQL strings containing multiple statements cause problems with the
existing explain logic. This change adds a regex to strip an arbitrary
number of `SET` commands from the head of all incoming SQL, such that
the eventual `EXPLAIN` will be performed on just the trailing SQL.

This doesn't fully address all multi-statement SQL strings: in
particular, clients might send e.g. both a SELECT and an UPDATE in the
same string. As before, this would be passed as-is to the EXPLAIN
machinery.

Refs: DBMON-2626

* Add changelog entry for new EXPLAIN behavior

* Simplified regex

* Address code review feedback

* This is more of a fix than feature

* Add integration tests, fix logic

* Refactor

* Formatting fix

* Handle leading comment
diff --git a/postgres/changelog.d/20106.fixed b/postgres/changelog.d/20106.fixed
@@ -0,0 +1 @@
+Allow EXPLAIN on multi-statement SQL where one or more SET commands appear before another supported statement type
diff --git a/postgres/datadog_checks/postgres/statement_samples.py b/postgres/datadog_checks/postgres/statement_samples.py
@@ -32,7 +32,7 @@
 from datadog_checks.base.utils.tracking import tracked_method
 from datadog_checks.postgres.explain_parameterized_queries import ExplainParameterizedQueries
 
-from .util import DatabaseConfigurationError, DBExplainError, warning_with_tags
+from .util import DatabaseConfigurationError, DBExplainError, trim_leading_set_stmts, warning_with_tags
 from .version_utils import V9_6, V10
 
 # according to https://unicodebook.readthedocs.io/unicode_encodings.html, the max supported size of a UTF-8 encoded
@@ -749,15 +749,22 @@ def _run_and_track_explain(self, dbname, statement, obfuscated_statement, query_
     @tracked_method(agent_check_getter=agent_check_getter)
     def _run_explain_safe(self, dbname, statement, obfuscated_statement, query_signature):
         # type: (str, str, str, str) -> Tuple[Optional[Dict], Optional[DBExplainError], Optional[str]]
+
+        orig_statement = statement
+
+        # remove leading SET statements from our SQL
+        if obfuscated_statement[:3].lower() == "set":
+            statement = trim_leading_set_stmts(statement)
+            obfuscated_statement = trim_leading_set_stmts(obfuscated_statement)
+
         if not self._can_explain_statement(obfuscated_statement):
             return None, DBExplainError.no_plans_possible, None
 
         track_activity_query_size = self._get_track_activity_query_size()
 
-        if (
-            self._get_truncation_state(track_activity_query_size, statement, query_signature)
-            == StatementTruncationState.truncated
-        ):
+        # truncation check is on the original query, not the trimmed version
+        stmt_trunc = self._get_truncation_state(track_activity_query_size, orig_statement, query_signature)
+        if stmt_trunc == StatementTruncationState.truncated:
             return (
                 None,
                 DBExplainError.query_truncated,
diff --git a/postgres/datadog_checks/postgres/util.py b/postgres/datadog_checks/postgres/util.py
@@ -1,6 +1,7 @@
 # (C) Datadog, Inc. 2019-present
 # All rights reserved
 # Licensed under Simplified BSD License (see LICENSE)
+import re
 import string
 from enum import Enum
 from typing import Any, List, Tuple  # noqa: F401
@@ -130,6 +131,43 @@ def get_list_chunks(lst, n):
         yield lst[i : i + n]
 
 
+# Matches one or more leading `SET ...;` statements (each optionally preceded
+# by a single /* ... */ comment) and captures whatever SQL follows them.
+#
+# Every sub-pattern is written so that a given input character can be consumed
+# in exactly one way. Nesting lazy/greedy quantifiers here would make a failed
+# match (e.g. a lone SET with no terminating semicolon) backtrack
+# exponentially. DOTALL lets comments and the trailing statement span lines.
+SET_TRIM_PATTERN = re.compile(
+    r"""
+    ^(?:
+        # match one leading comment; the body stops at the first */
+        (?:
+            \s*
+            /\*
+            (?:[^*]|\*(?!/))*
+            \*/
+        )?
+        # match leading SET commands
+        \s*SET\b
+        (?:
+            [^';] | # keywords, integer literals, etc.
+            '[^']*' # single-quoted strings
+        )*
+        ;
+    )+
+    \s*(.+?)$ # actual non-SET cmds
+    """,
+    flags=(re.I | re.X | re.S),
+)
+
+
+def trim_leading_set_stmts(sql):
+    # type: (str) -> str
+    """
+    Strip leading SET statements from a string of one or more SQL statements.
+
+    If the string begins with any SET statements (each optionally preceded by
+    a comment), they are removed and the rest of the string is returned.
+    Otherwise, the string is returned as it was received.
+    """
+    match = SET_TRIM_PATTERN.match(sql)
+    return match.group(1) if match else sql
+
+
 fmt = PartialFormatter()
 
 AWS_RDS_HOSTNAME_SUFFIX = ".rds.amazonaws.com"
diff --git a/postgres/tests/test_statements.py b/postgres/tests/test_statements.py
@@ -583,6 +583,44 @@ def test_get_db_explain_setup_state(integration_check, dbm_instance, dbname, exp
 failed_explain_test_repeat_count = 5
 
 
+@pytest.mark.parametrize(
+    "query",
+    [
+        "SELECT * FROM pg_class",
+        "SET LOCAL datestyle TO postgres; SELECT * FROM pg_class",
+    ],
+)
+def test_successful_explain(integration_check, dbm_instance, aggregator, query):
+    """A plan is produced both for a plain SELECT and for one preceded by a SET."""
+    # query metrics and parameterized-query explain are switched off for this test
+    dbm_instance['query_metrics']['enabled'] = False
+    dbm_instance['query_samples']['explain_parameterized_queries'] = False
+
+    check = integration_check(dbm_instance)
+    check._connect()
+
+    # one full run so the check's internal state is initialized
+    run_one_check(check)
+
+    # discard whatever the warm-up run submitted to the aggregator
+    aggregator.reset()
+
+    target_db = "datadog_test"
+
+    # explain setup must be healthy before attempting the explain itself
+    setup_error, setup_exc = check.statement_samples._get_db_explain_setup_state(target_db)
+    assert setup_error is None
+    assert setup_exc is None
+
+    # the raw query is reused for every query-related argument
+    explain_plan, *_ = check.statement_samples._run_and_track_explain(target_db, query, query, query)
+    assert explain_plan is not None
+
+    root_node = explain_plan['Plan']
+    assert root_node['Node Type'] == 'Seq Scan'
+    assert root_node['Relation Name'] == 'pg_class'
+
+
 @pytest.mark.parametrize(
     "query,expected_error_tag,explain_function_override,expected_fail_count,skip_on_versions",
     [
diff --git a/postgres/tests/test_unit.py b/postgres/tests/test_unit.py
@@ -201,3 +201,28 @@ def test_database_identifier(pg_instance, template, expected, tags):
     pg_instance['tags'] = tags
     check = PostgreSql('postgres', {}, [pg_instance])
     assert check.database_identifier == expected
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize(
+    "query,expected_trimmed_query",
+    [
+        # no leading SET: returned as-is
+        ("SELECT * FROM pg_settings WHERE name = $1", "SELECT * FROM pg_settings WHERE name = $1"),
+        ("SELECT * FROM pg_settings; DELETE FROM pg_settings;", "SELECT * FROM pg_settings; DELETE FROM pg_settings;"),
+        # one or more leading SETs, including quoted values
+        ("SET search_path TO 'my_schema', public; SELECT * FROM pg_settings", "SELECT * FROM pg_settings"),
+        ("SET TIME ZONE 'Europe/Rome'; SELECT * FROM pg_settings", "SELECT * FROM pg_settings"),
+        (
+            "SET LOCAL request_id = 1234; SET LOCAL hostname TO 'Bob''s Laptop'; SELECT * FROM pg_settings",
+            "SELECT * FROM pg_settings",
+        ),
+        # long inputs
+        ("SET LONG;" * 1024 + "SELECT *;", "SELECT *;"),
+        ("SET " + "'quotable'" * 1024 + "; SELECT *;", "SELECT *;"),
+        ("SET 'l" + "o" * 1024 + "ng'; SELECT *;", "SELECT *;"),
+        # leading comment before the SET
+        (" /** pl/pgsql **/ SET 'comment'; SELECT *;", "SELECT *;"),
+        # not SQL / empty input
+        ("this isn't SQL", "this isn't SQL"),
+        ("", ""),
+    ],
+)
+def test_trim_set_stmts(query, expected_trimmed_query):
+    """Leading SET statements are stripped; inputs without them come back unchanged."""
+    assert util.trim_leading_set_stmts(query) == expected_trimmed_query

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Allow EXPLAIN on multi-statement SQL where one or more SET commands appear before another supported statement type`