From eacbd7ffc964f53ac7ad603c7c4205f89abe121f Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 22 Mar 2024 10:08:54 +0000 Subject: [PATCH 001/132] Adding schema collection to sqlserver --- .../datadog_checks/sqlserver/metadata.py | 109 +++++++++++++++- .../datadog_checks/sqlserver/sqlserver.py | 122 +++++++++++++++++- 2 files changed, 229 insertions(+), 2 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/metadata.py b/sqlserver/datadog_checks/sqlserver/metadata.py index 4550118a9b0c4..33cf24a92e8ab 100644 --- a/sqlserver/datadog_checks/sqlserver/metadata.py +++ b/sqlserver/datadog_checks/sqlserver/metadata.py @@ -2,7 +2,7 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) import time - +import pdb from datadog_checks.base import is_affirmative from datadog_checks.base.utils.db.utils import ( DBMAsyncJob, @@ -128,7 +128,104 @@ def _load_settings_rows(self, cursor): rows = [dict(zip(columns, row)) for row in cursor.fetchall()] self.log.debug("loaded sql server settings len(rows)=%s", len(rows)) return rows + + """schemas data struct is a dictionnary with key being a schema name the value is + schema + dict: + "name": str + "schema_id": str + "principal_id": str + "tables" : dict + name: list of columns + "columns": dict + name: str + data_type: str + default: str + + + """ + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) + def _query_schema_information(self, cursor): + + # principal_id is kind of like an owner + + # Todo put in consts + # there is also principal_id not sure if need it. + SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas;" + self.log.debug("collecting db schemas") + self.log.debug("Running query [%s]", SCHEMA_QUERY) + cursor.execute(SCHEMA_QUERY) + schemas = [] + columns = [i[0] for i in cursor.description] + schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] + schemas_by_name = {} + + schemas_by_name = {} + + for schema in schemas: + name = schema['name'].lower() + #add tables + schema['tables'] = {} + schemas_by_name[name] = schema + + self.log.debug("fetched schemas len(rows)=%s", len(schemas)) + return schemas_by_name + def _get_table_infos(self, schemas, cursor): + #TODO do we need this for sqlserver ? + #If any tables are partitioned, only the master paritition table name will be returned, and none of its children. + + # TODO + #Do we need a limit ? like in postgress , seems not + #limit = self._config.schemas_metadata_config.get("max_tables", 300) + + TABLES_QUERY = "SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS;" + cursor.execute(TABLES_QUERY) + #TODO + # nullable: bool column ? + #TODO + #"foreign_keys": dict (if has foreign keys) + # name: str + # definition: str + #TODO + # "indexes": dict (if has indexes) + # name: str + # definition: str + #TODO + #"toast_table": str (if associated toast table exists) - equivalent in sql server + + # "partition_key": str (if has partitions) - equiv ? + + # "num_partitions": int (if has partitions) - equiv ? + #apply lower case ? + #this is just to avoid doing something like row[0] , row[1] etc + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + + for row in rows: + if len(row) != 5: + #TODO some warning ? 
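The "some warning ?" TODO just above can be closed out by routing the message through the job's logger instead of print(); a minimal sketch of what that branch could look like, reusing the self.log attribute already used elsewhere in this file (the message wording is illustrative, not part of the patch):

    if len(row) != 5:
        # Log and skip the malformed row rather than printing to stdout.
        self.log.warning("unexpected row shape from INFORMATION_SCHEMA.COLUMNS: %s", row)
        continue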
+ print("warning") + + #TODO treat not found + schema = schemas[row['table_schema']] + + tables_dict_for_schema = schema['tables'] + + #do the same mapping as in postgres for some uniformity otherwise could've just loop and exclude some keys + if row['table_name'] not in tables_dict_for_schema: + #new table + tables_dict_for_schema[row['table_name']] = [] + column = {} + column['name'] = row['column_name'] + column['data_type'] = row['data_type'] + column['default'] = row['column_default'] + #table is an array of column dict for now. + tables_dict_for_schema[row['table_name']].append(column) + # table dict has a key columns with value arrray of dicts + +#self._sort_and_limit_table_info(cursor, dbname, table_info, limit) +# for now not sort and limit @tracked_method(agent_check_getter=agent_check_getter) def report_sqlserver_metadata(self): with self._check.connection.open_managed_default_connection(key_prefix=self._conn_key_prefix): @@ -150,3 +247,13 @@ def report_sqlserver_metadata(self): "metadata": settings_rows, } self._check.database_monitoring_metadata(json.dumps(event, default=default_json_event_encoding)) + + #TODO split in functions + #NEXT BIg thing whats with different DBS , filtering , partitions + #Trade off dict vs normal data structure ? + + #TODO do it per DB if not Azure otherwise connect , kind of bad main thread ? + #schemas = self._query_schema_information(cursor) + #self._get_table_infos(schemas, cursor) + #print(schemas) + #pdb.set_trace() diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 3a661d8147d71..86982b9b854da 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -2,7 +2,7 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) from __future__ import division - +import pdb import copy import time from collections import defaultdict @@ -723,6 +723,124 @@ def _check_connections_by_use_db(self): continue # Switch DB back to MASTER cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) + + """schemas data struct is a dictionnary with key being a schema name the value is + schema + dict: + "name": str + "schema_id": str + "principal_id": str + "tables" : dict + name: list of columns + "columns": dict + name: str + data_type: str + default: str + + + """ + def _query_schema_information(self, cursor): + + # principal_id is kind of like an owner + + # Todo put in consts + # there is also principal_id not sure if need it. + SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas;" + self.log.debug("collecting db schemas") + self.log.debug("Running query [%s]", SCHEMA_QUERY) + cursor.execute(SCHEMA_QUERY) + schemas = [] + columns = [i[0] for i in cursor.description] + schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] + schemas_by_name = {} + + schemas_by_name = {} + + for schema in schemas: + name = schema['name'].lower() + #add tables + schema['tables'] = {} + schemas_by_name[name] = schema + + self.log.debug("fetched schemas len(rows)=%s", len(schemas)) + return schemas_by_name + + def _get_table_infos(self, schemas, cursor): + #TODO do we need this for sqlserver ? + #If any tables are partitioned, only the master paritition table name will be returned, and none of its children. + + # TODO + #Do we need a limit ? 
like in postgress , seems not + #limit = self._config.schemas_metadata_config.get("max_tables", 300) + + TABLES_QUERY = "SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS;" + cursor.execute(TABLES_QUERY) + #TODO + # nullable: bool column ? + #TODO + #"foreign_keys": dict (if has foreign keys) + # name: str + # definition: str + #TODO + # "indexes": dict (if has indexes) + # name: str + # definition: str + #TODO + #"toast_table": str (if associated toast table exists) - equivalent in sql server + + # "partition_key": str (if has partitions) - equiv ? + + # "num_partitions": int (if has partitions) - equiv ? + #apply lower case ? + #this is just to avoid doing something like row[0] , row[1] etc + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + + for row in rows: + if len(row) != 5: + #TODO some warning ? + print("warning") + + #TODO treat not found + schema = schemas[row['table_schema']] + + tables_dict_for_schema = schema['tables'] + + #do the same mapping as in postgres for some uniformity otherwise could've just loop and exclude some keys + if row['table_name'] not in tables_dict_for_schema: + #new table + tables_dict_for_schema[row['table_name']] = [] + column = {} + column['name'] = row['column_name'] + column['data_type'] = row['data_type'] + column['default'] = row['column_default'] + #table is an array of column dict for now. + tables_dict_for_schema[row['table_name']].append(column) + # table dict has a key columns with value arrray of dicts + + #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check + # + def _collect_schemas_for_non_azure(self): + #schemas per db + schemas_per_db = {} + #TODO its copy paste make a function + db_names = [d.name for d in self.databases] or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] + pdb.set_trace() + with self.connection.open_managed_default_connection(): + with self.connection.get_managed_cursor() as cursor: + for db in db_names: + try: + pdb.set_trace() + cursor.execute(SWITCH_DB_STATEMENT.format(db)) + schemas = self._query_schema_information(cursor) + self._get_table_infos(schemas, cursor) + schemas_per_db[db] = schemas + except Exception as e: + print("TODO") + # Switch DB back to MASTER + cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) + pdb.set_trace() + print(schemas_per_db) def _check_database_conns(self): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) @@ -756,6 +874,8 @@ def check(self, _): if self._config.autodiscovery and self._config.autodiscovery_db_service_check: self._check_database_conns() if self._config.dbm_enabled: + #TODO limit this check by some minutes ... 
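The "limit this check by some minutes" TODO could be handled with a simple timestamp guard until schema collection gets its own async job; a rough sketch, where the 600-second interval and the _last_schema_collection attribute are illustrative and not defined anywhere in this patch:

    import time

    SCHEMA_COLLECTION_INTERVAL = 600  # seconds, illustrative default

    def _should_collect_schemas(self):
        # Intended as a method on the check class; returns True at most once per interval.
        now = time.monotonic()
        if now - getattr(self, "_last_schema_collection", 0) < SCHEMA_COLLECTION_INTERVAL:
            return False
        self._last_schema_collection = now
        return True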
+ self._collect_schemas_for_non_azure() self.statement_metrics.run_job_loop(self.tags) self.procedure_metrics.run_job_loop(self.tags) self.activity.run_job_loop(self.tags) From 720aa459e75f7e75ec596715a44e57776c140823 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 3 Apr 2024 16:01:58 +0000 Subject: [PATCH 002/132] rather use sys tables --- .../datadog_checks/sqlserver/sqlserver.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 86982b9b854da..bc78448fc8026 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -731,13 +731,15 @@ def _check_connections_by_use_db(self): "schema_id": str "principal_id": str "tables" : dict - name: list of columns + object_id : str + name : str + columns: list of columns "columns": dict name: str data_type: str default: str - - + indexes : list of indexes + foreign_keys : list of foreign keys """ def _query_schema_information(self, cursor): @@ -764,8 +766,20 @@ def _query_schema_information(self, cursor): self.log.debug("fetched schemas len(rows)=%s", len(schemas)) return schemas_by_name + +#in tables we have modified date ! + # can be a separate query + + - def _get_table_infos(self, schemas, cursor): + def _get_table_infos_sys_tables(self, schemas, cursor): + print("Hello") + TABLE_QUERY = "" + + # TODO how often ? + # TODO put in a class + # for big DBs somehow first determine tables we are intereted in and query only for them ? + def _get_table_infos_info_schema(self, schemas, cursor): #TODO do we need this for sqlserver ? #If any tables are partitioned, only the master paritition table name will be returned, and none of its children. From d2e035f9c06a9f8fd6c3e9df327d9de6fb806d19 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 5 Apr 2024 08:54:04 +0000 Subject: [PATCH 003/132] snapshot collect data --- .../datadog_checks/sqlserver/sqlserver.py | 113 +++++++++++++++--- 1 file changed, 96 insertions(+), 17 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index bc78448fc8026..7f9c73c86080b 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -747,6 +747,7 @@ def _query_schema_information(self, cursor): # Todo put in consts # there is also principal_id not sure if need it. + # TODO exclude schemas like INFORMATION_SCHEMA SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas;" self.log.debug("collecting db schemas") self.log.debug("Running query [%s]", SCHEMA_QUERY) @@ -754,28 +755,103 @@ def _query_schema_information(self, cursor): schemas = [] columns = [i[0] for i in cursor.description] schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] - schemas_by_name = {} - - schemas_by_name = {} - - for schema in schemas: - name = schema['name'].lower() - #add tables - schema['tables'] = {} - schemas_by_name[name] = schema - + #add tables + + for s in schemas: + s['tables'] = {} + self.log.debug("fetched schemas len(rows)=%s", len(schemas)) - return schemas_by_name + return schemas #in tables we have modified date ! # can be a separate query + # plan lets do per db per schema , get all tables , then (sort or pick first batch), then query columns per batch or table ? 
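On the "query columns per batch or table ?" question above, one option is to chunk the collected object_ids and run a single sys.columns query per chunk rather than one query per table. A sketch under that assumption; the helper name and the 500-table chunk size are illustrative:

    def fetch_columns_in_batches(cursor, object_ids, chunk_size=500):
        # Returns {object_id: [column dicts]} using one query per chunk of tables.
        columns_by_table = {}
        for start in range(0, len(object_ids), chunk_size):
            chunk = object_ids[start:start + chunk_size]
            id_list = ",".join(str(int(object_id)) for object_id in chunk)
            cursor.execute(
                "SELECT object_id, name, column_id, system_type_id, is_nullable "
                "FROM sys.columns WHERE object_id IN ({});".format(id_list)
            )
            names = [str(i[0]).lower() for i in cursor.description]
            for raw_row in cursor.fetchall():
                row = dict(zip(names, raw_row))
                columns_by_table.setdefault(row["object_id"], []).append(row)
        return columns_by_table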
+ def _get_table_infos_sys_tables_per_schema(self, schemas, cursor): + for schema in schemas: + self._get_table_infos_sys_tables(schema, cursor) - def _get_table_infos_sys_tables(self, schemas, cursor): - print("Hello") - TABLE_QUERY = "" + #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? + def _get_table_infos_sys_tables(self, schema, cursor): + tables_dict_for_schema = schema['tables'] + + # we could get data from sys tables too ... + # can be done by table as well , might be usefull in case if we get too many rows i.e. we could split this query in several + # patches. As for updates we could have a separate mechanism + # . + # TODO modify_date - there is a modify date !!! + # TODO what is principal_id + # TODO is_replicated - might be interesting ? + TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}".format(schema["schema_id"]) + + cursor.execute(TABLES_IN_SCHEMA_QUERY) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + if len(row) != 2: + #TODO some warning ? + print("warning") + + tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "foreign_keys" : []} + #TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT + # in sys.columns I cannot see a data type but there are other things + #object_id name + #column_id system_type_id user_type_id max_length precision scale collation_name + # is_nullable is_ansi_padded is_rowguidcol is_identity is_computed is_filestream is_replicated + # is_non_sql_subscribed is_merge_published is_dts_replicated is_xml_document xml_collection_id + # default_object_id rule_object_id is_sparse is_column_set generated_always_type generated_always_type_desc encryption_type encryption_type_desc encryption_algorithm_name column_encryption_key_id column_encryption_key_database_name is_hidden is_masked graph_type graph_type_desc is_data_deletion_filter_column ledger_view_column_type ledger_view_column_type_desc is_dropped_ledger_column + # might be slower then in sys.columns as we cant get data by object_id .... but we get data_type + #COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA={} and TABLE_NAME={};".format(schema_name, ) + #if using query 2 we need to figure out user_type_id - its like a user defined type + # TODO AL in all query 2 will be query 2 faster ? or its just less convinient at the end ... ? 
object_id is nice + + COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" + + # TODO can be a function query and unwrap in dict + for table_object_id, table_value in tables_dict_for_schema.items(): + cursor.execute(COLUMN_QUERY2.format(table_object_id)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + table_value["columns"].append(row) + + # object_id name index_id type type_desc is_unique data_space_id ignore_dup_key is_primary_key is_unique_constraint + # fill_factor is_padded is_disabled is_hypothetical is_ignored_in_optimization allow_row_locks allow_page_locks has_filter + # filter_definition + + #May be better to query sys.index_columns ? compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key + INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" + + # index query: + for table_object_id, table_value in tables_dict_for_schema.items(): + cursor.execute(INDEX_QUERY.format(table_object_id)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + table_value["indexes"].append(row) + + # foreign keys + # name object_id principal_id schema_id parent_object_id type type_desc create_date modify_date is_ms_shipped + # is_published is_schema_published referenced_object_id key_index_id is_disabled is_not_for_replication + # is_not_trusted delete_referential_action delete_referential_action_desc update_referential_action + # update_referential_action_desc is_system_named compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key + # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; + # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table + + FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys WHERE object_id={};" + + # index query: + for table_object_id, table_value in tables_dict_for_schema.items(): + cursor.execute(FOREIGN_KEY_QUERY.format(table_object_id)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + table_value["foreign_keys"].append(row) + + print("the end") + # TODO how often ? # TODO put in a class # for big DBs somehow first determine tables we are intereted in and query only for them ? 
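On the "for big DBs somehow first determine tables we are interested in" note above, one possible pre-filter is to keep only the largest tables by approximate row count before running the per-table queries; a sketch, where the TOP 1000 cap is illustrative (sys.partitions exposes a rows count for heaps and clustered indexes):

    TOP_TABLES_BY_ROWS_QUERY = (
        "SELECT TOP 1000 p.object_id "
        "FROM sys.partitions p "
        "WHERE p.index_id IN (0, 1) "
        "GROUP BY p.object_id "
        "ORDER BY SUM(p.rows) DESC;"
    )

    def get_largest_table_ids(cursor):
        # Returns object_ids of the biggest tables, which can then be intersected
        # with the per-schema table lists collected above.
        cursor.execute(TOP_TABLES_BY_ROWS_QUERY)
        return [row[0] for row in cursor.fetchall()]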
@@ -839,17 +915,20 @@ def _collect_schemas_for_non_azure(self): schemas_per_db = {} #TODO its copy paste make a function db_names = [d.name for d in self.databases] or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] - pdb.set_trace() with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: for db in db_names: try: - pdb.set_trace() cursor.execute(SWITCH_DB_STATEMENT.format(db)) schemas = self._query_schema_information(cursor) - self._get_table_infos(schemas, cursor) + #self._get_table_infos(schemas, cursor) + + self._get_table_infos_sys_tables_per_schema(schemas, cursor) + schemas_per_db[db] = schemas + pdb.set_trace() except Exception as e: + pdb.set_trace() print("TODO") # Switch DB back to MASTER cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) From f200aeb087443d3be393c9866dca36e31d14f79c Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 5 Apr 2024 09:50:28 +0000 Subject: [PATCH 004/132] remove unused function --- .../datadog_checks/sqlserver/sqlserver.py | 80 +++---------------- 1 file changed, 9 insertions(+), 71 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 7f9c73c86080b..a11000c39f305 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -772,14 +772,16 @@ def _get_table_infos_sys_tables_per_schema(self, schemas, cursor): for schema in schemas: self._get_table_infos_sys_tables(schema, cursor) + # TODO how often ? + # TODO put in a class #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? def _get_table_infos_sys_tables(self, schema, cursor): tables_dict_for_schema = schema['tables'] - # we could get data from sys tables too ... - # can be done by table as well , might be usefull in case if we get too many rows i.e. we could split this query in several - # patches. As for updates we could have a separate mechanism - # . + # TODO check out sys.partitions in postgres we deliver some data about patitions + # "partition_key": str (if has partitions) - equiv ? + # "num_partitions": int (if has partitions) - equiv ? + # TODO modify_date - there is a modify date !!! # TODO what is principal_id # TODO is_replicated - might be interesting ? @@ -788,11 +790,7 @@ def _get_table_infos_sys_tables(self, schema, cursor): cursor.execute(TABLES_IN_SCHEMA_QUERY) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - for row in rows: - if len(row) != 2: - #TODO some warning ? - print("warning") - + for row in rows: tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "foreign_keys" : []} #TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT # in sys.columns I cannot see a data type but there are other things @@ -807,7 +805,7 @@ def _get_table_infos_sys_tables(self, schema, cursor): #if using query 2 we need to figure out user_type_id - its like a user defined type # TODO AL in all query 2 will be query 2 faster ? or its just less convinient at the end ... ? 
object_id is nice - COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" + COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" # TODO can be a function query and unwrap in dict for table_object_id, table_value in tables_dict_for_schema.items(): @@ -849,65 +847,8 @@ def _get_table_infos_sys_tables(self, schema, cursor): rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: table_value["foreign_keys"].append(row) - print("the end") - # TODO how often ? - # TODO put in a class - # for big DBs somehow first determine tables we are intereted in and query only for them ? - def _get_table_infos_info_schema(self, schemas, cursor): - #TODO do we need this for sqlserver ? - #If any tables are partitioned, only the master paritition table name will be returned, and none of its children. - - # TODO - #Do we need a limit ? like in postgress , seems not - #limit = self._config.schemas_metadata_config.get("max_tables", 300) - - TABLES_QUERY = "SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS;" - cursor.execute(TABLES_QUERY) - #TODO - # nullable: bool column ? - #TODO - #"foreign_keys": dict (if has foreign keys) - # name: str - # definition: str - #TODO - # "indexes": dict (if has indexes) - # name: str - # definition: str - #TODO - #"toast_table": str (if associated toast table exists) - equivalent in sql server - - # "partition_key": str (if has partitions) - equiv ? - - # "num_partitions": int (if has partitions) - equiv ? - #apply lower case ? - #this is just to avoid doing something like row[0] , row[1] etc - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - - for row in rows: - if len(row) != 5: - #TODO some warning ? - print("warning") - - #TODO treat not found - schema = schemas[row['table_schema']] - - tables_dict_for_schema = schema['tables'] - - #do the same mapping as in postgres for some uniformity otherwise could've just loop and exclude some keys - if row['table_name'] not in tables_dict_for_schema: - #new table - tables_dict_for_schema[row['table_name']] = [] - column = {} - column['name'] = row['column_name'] - column['data_type'] = row['data_type'] - column['default'] = row['column_default'] - #table is an array of column dict for now. 
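The per-table queries in this series interpolate object_id with str.format, which works because object_id is an integer coming from the server, but the same lookup can also be written with driver-side parameters. A sketch assuming a DB-API cursor with qmark paramstyle (pyodbc accepts "?" placeholders); the query text mirrors COLUMN_QUERY2 above:

    COLUMN_QUERY_PARAMETERIZED = (
        "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, "
        "dc.definition AS default_value "
        "FROM sys.columns c "
        "JOIN sys.types t ON c.system_type_id = t.system_type_id "
        "LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id "
        "WHERE c.object_id = ?"
    )

    def fetch_columns(cursor, table_object_id):
        # The driver binds the value, so no string formatting of the SQL text is needed.
        cursor.execute(COLUMN_QUERY_PARAMETERIZED, (table_object_id,))
        names = [str(i[0]).lower() for i in cursor.description]
        return [dict(zip(names, row)) for row in cursor.fetchall()]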
- tables_dict_for_schema[row['table_name']].append(column) - # table dict has a key columns with value arrray of dicts - #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check # def _collect_schemas_for_non_azure(self): @@ -921,10 +862,7 @@ def _collect_schemas_for_non_azure(self): try: cursor.execute(SWITCH_DB_STATEMENT.format(db)) schemas = self._query_schema_information(cursor) - #self._get_table_infos(schemas, cursor) - - self._get_table_infos_sys_tables_per_schema(schemas, cursor) - + self._get_table_infos_sys_tables_per_schema(schemas, cursor) schemas_per_db[db] = schemas pdb.set_trace() except Exception as e: From 5b5b511d99ab905dc34c4e1be86d558b57e18165 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 5 Apr 2024 15:37:00 +0000 Subject: [PATCH 005/132] small improvments --- sqlserver/datadog_checks/sqlserver/sqlserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index a11000c39f305..e1638aba40af9 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -738,6 +738,7 @@ def _check_connections_by_use_db(self): name: str data_type: str default: str + is_nullable : str indexes : list of indexes foreign_keys : list of foreign keys """ @@ -768,7 +769,6 @@ def _query_schema_information(self, cursor): # plan lets do per db per schema , get all tables , then (sort or pick first batch), then query columns per batch or table ? def _get_table_infos_sys_tables_per_schema(self, schemas, cursor): - for schema in schemas: self._get_table_infos_sys_tables(schema, cursor) @@ -838,7 +838,7 @@ def _get_table_infos_sys_tables(self, schema, cursor): # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table - FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys WHERE object_id={};" + FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" # index query: for table_object_id, table_value in tables_dict_for_schema.items(): From 7c6e4e9eaeda162bbcd5ea7c62485c59f732660e Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 8 Apr 2024 15:55:38 +0000 Subject: [PATCH 006/132] improving code --- sqlserver/datadog_checks/sqlserver/const.py | 3 + .../datadog_checks/sqlserver/sqlserver.py | 110 ++++++++++-------- 2 files changed, 66 insertions(+), 47 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 8b4a70ff1e6d0..1ebb9fd6ee827 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -268,3 +268,6 @@ ] PROC_CHAR_LIMIT = 500 + +SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas WHERE name NOT IN ('sys', 'information_schema');" +TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}" \ No 
newline at end of file diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index e1638aba40af9..b8fd75ef2613c 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -67,6 +67,8 @@ TASK_SCHEDULER_METRICS, TEMPDB_FILE_SPACE_USAGE_METRICS, VALID_METRIC_TYPES, + SCHEMA_QUERY, + TABLES_IN_SCHEMA_QUERY, expected_sys_databases_columns, ) from datadog_checks.sqlserver.metrics import DEFAULT_PERFORMANCE_TABLE, VALID_TABLES @@ -744,54 +746,57 @@ def _check_connections_by_use_db(self): """ def _query_schema_information(self, cursor): - # principal_id is kind of like an owner - - # Todo put in consts - # there is also principal_id not sure if need it. - # TODO exclude schemas like INFORMATION_SCHEMA - SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas;" + # principal_id is kind of like an owner not sure if need it. self.log.debug("collecting db schemas") self.log.debug("Running query [%s]", SCHEMA_QUERY) cursor.execute(SCHEMA_QUERY) schemas = [] columns = [i[0] for i in cursor.description] schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] - #add tables - - for s in schemas: - s['tables'] = {} - + for schema in schemas: + schema["tables"] = {} self.log.debug("fetched schemas len(rows)=%s", len(schemas)) return schemas -#in tables we have modified date ! + # TODO in tables we have modified date ! # can be a separate query - # plan lets do per db per schema , get all tables , then (sort or pick first batch), then query columns per batch or table ? - def _get_table_infos_sys_tables_per_schema(self, schemas, cursor): + + def _get_table_data_per_schema(self, schemas, cursor): for schema in schemas: - self._get_table_infos_sys_tables(schema, cursor) + self._get_tables_and_their_data(schema, cursor) + + def _get_tables_and_their_data(self, schema, cursor): + self._get_table_infos(schema, cursor) + tables_dict_for_schema = schema['tables'] + pdb.set_trace() + for table_object_id, table_value in tables_dict_for_schema.items(): + table_value["columns"] = self._get_columns_data_per_table(table_object_id, cursor) + table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) + table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) + table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) # TODO how often ? # TODO put in a class #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? - def _get_table_infos_sys_tables(self, schema, cursor): + def _get_table_infos(self, schema, cursor): tables_dict_for_schema = schema['tables'] - # TODO check out sys.partitions in postgres we deliver some data about patitions - # "partition_key": str (if has partitions) - equiv ? - # "num_partitions": int (if has partitions) - equiv ? - # TODO modify_date - there is a modify date !!! # TODO what is principal_id # TODO is_replicated - might be interesting ? 
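The modify_date TODO above points at an incremental strategy: remember each table's modify_date from the previous run and only re-query details for tables that changed. A rough sketch, assuming the caller keeps a {object_id: modify_date} cache between runs (sys.tables does expose create_date and modify_date):

    TABLES_WITH_MODIFY_DATE_QUERY = "SELECT name, object_id, modify_date FROM sys.tables WHERE schema_id = {}"

    def tables_changed_since_last_run(cursor, schema_id, previous_modify_dates):
        # previous_modify_dates: {object_id: datetime} captured on the prior collection.
        cursor.execute(TABLES_WITH_MODIFY_DATE_QUERY.format(int(schema_id)))
        changed = []
        for name, object_id, modify_date in cursor.fetchall():
            if previous_modify_dates.get(object_id) != modify_date:
                changed.append({"object_id": object_id, "name": name, "modify_date": modify_date})
        return changed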
- TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}".format(schema["schema_id"]) - - cursor.execute(TABLES_IN_SCHEMA_QUERY) + + cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: - tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "foreign_keys" : []} + tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} + return + + + + def _get_columns_data_per_table(self, table_object_id, cursor): + #TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT # in sys.columns I cannot see a data type but there are other things #object_id name @@ -808,13 +813,26 @@ def _get_table_infos_sys_tables(self, schema, cursor): COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" # TODO can be a function query and unwrap in dict - for table_object_id, table_value in tables_dict_for_schema.items(): - cursor.execute(COLUMN_QUERY2.format(table_object_id)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - for row in rows: - table_value["columns"].append(row) - + cursor.execute(COLUMN_QUERY2.format(table_object_id)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + return rows + + def _get_partitions_data_per_table(self, table_object_id, cursor): + + # TODO check out sys.partitions in postgres we deliver some data about patitions + # "partition_key": str (if has partitions) - equiv ? 
+ # may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ + # for more in depth search, it's not trivial to determine partition key like in Postgres + PARTITIONS_QUERY = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" + + cursor.execute(PARTITIONS_QUERY.format(table_object_id)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + return rows + + def _get_index_data_per_table(self, table_object_id, cursor): + # object_id name index_id type type_desc is_unique data_space_id ignore_dup_key is_primary_key is_unique_constraint # fill_factor is_padded is_disabled is_hypothetical is_ignored_in_optimization allow_row_locks allow_page_locks has_filter # filter_definition @@ -823,12 +841,12 @@ def _get_table_infos_sys_tables(self, schema, cursor): INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" # index query: - for table_object_id, table_value in tables_dict_for_schema.items(): - cursor.execute(INDEX_QUERY.format(table_object_id)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - for row in rows: - table_value["indexes"].append(row) + + cursor.execute(INDEX_QUERY.format(table_object_id)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + return rows + # foreign keys # name object_id principal_id schema_id parent_object_id type type_desc create_date modify_date is_ms_shipped @@ -837,17 +855,14 @@ def _get_table_infos_sys_tables(self, schema, cursor): # update_referential_action_desc is_system_named compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table - - FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" - + def _get_foreign_key_data_per_table(self, table_object_id, cursor): + FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" # index query: - for table_object_id, table_value in tables_dict_for_schema.items(): - cursor.execute(FOREIGN_KEY_QUERY.format(table_object_id)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - for row in rows: - table_value["foreign_keys"].append(row) + cursor.execute(FOREIGN_KEY_QUERY.format(table_object_id)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] 
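The comment block above already spells out the richer join through sys.foreign_key_columns; as a single query it could look like the sketch below (not what this patch ships; it returns one row per referencing column, so composite keys produce several rows per constraint), and the result can be unpacked with the same columns/zip pattern used above:

    FOREIGN_KEY_DETAILS_QUERY = (
        "SELECT fk.name AS foreign_key_name, "
        "OBJECT_NAME(fk.parent_object_id) AS parent_table, "
        "COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, "
        "OBJECT_NAME(fk.referenced_object_id) AS referenced_table, "
        "COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column "
        "FROM sys.foreign_keys fk "
        "JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id "
        "WHERE fk.parent_object_id = {};"
    )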
print("the end") + return rows #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check # @@ -862,7 +877,8 @@ def _collect_schemas_for_non_azure(self): try: cursor.execute(SWITCH_DB_STATEMENT.format(db)) schemas = self._query_schema_information(cursor) - self._get_table_infos_sys_tables_per_schema(schemas, cursor) + pdb.set_trace() + self._get_table_data_per_schema(schemas, cursor) schemas_per_db[db] = schemas pdb.set_trace() except Exception as e: From c0dacf805909e92c7c629325482e191c0490217c Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 8 Apr 2024 16:44:59 +0000 Subject: [PATCH 007/132] fixed errors --- sqlserver/datadog_checks/sqlserver/sqlserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index b8fd75ef2613c..eb91891775e92 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -769,7 +769,6 @@ def _get_table_data_per_schema(self, schemas, cursor): def _get_tables_and_their_data(self, schema, cursor): self._get_table_infos(schema, cursor) tables_dict_for_schema = schema['tables'] - pdb.set_trace() for table_object_id, table_value in tables_dict_for_schema.items(): table_value["columns"] = self._get_columns_data_per_table(table_object_id, cursor) table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) @@ -881,6 +880,7 @@ def _collect_schemas_for_non_azure(self): self._get_table_data_per_schema(schemas, cursor) schemas_per_db[db] = schemas pdb.set_trace() + print("collected") except Exception as e: pdb.set_trace() print("TODO") From c3f5be25b814f5419becc43196e5b60b5d8f57eb Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 8 Apr 2024 17:33:35 +0000 Subject: [PATCH 008/132] refactored code --- sqlserver/datadog_checks/sqlserver/const.py | 6 ++- .../datadog_checks/sqlserver/sqlserver.py | 46 ++++++------------- sqlserver/datadog_checks/sqlserver/utils.py | 6 +++ 3 files changed, 25 insertions(+), 33 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 1ebb9fd6ee827..7f9aa43c8a3a5 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -270,4 +270,8 @@ PROC_CHAR_LIMIT = 500 SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas WHERE name NOT IN ('sys', 'information_schema');" -TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}" \ No newline at end of file +TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}" +COLUMN_QUERY = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" +PARTITIONS_QUERY = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" +INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" +FOREIGN_KEY_QUERY = "SELECT name , 
OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index eb91891775e92..5f0d1fc85e543 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -69,6 +69,10 @@ VALID_METRIC_TYPES, SCHEMA_QUERY, TABLES_IN_SCHEMA_QUERY, + COLUMN_QUERY, + PARTITIONS_QUERY, + INDEX_QUERY, + FOREIGN_KEY_QUERY, expected_sys_databases_columns, ) from datadog_checks.sqlserver.metrics import DEFAULT_PERFORMANCE_TABLE, VALID_TABLES @@ -86,6 +90,7 @@ is_azure_database, is_azure_sql_database, set_default_driver_conf, + execute_query_output_result_as_a_dict, ) try: @@ -808,43 +813,25 @@ def _get_columns_data_per_table(self, table_object_id, cursor): #if using query 2 we need to figure out user_type_id - its like a user defined type # TODO AL in all query 2 will be query 2 faster ? or its just less convinient at the end ... ? object_id is nice + return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_object_id), cursor) - COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" - - # TODO can be a function query and unwrap in dict - cursor.execute(COLUMN_QUERY2.format(table_object_id)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - return rows def _get_partitions_data_per_table(self, table_object_id, cursor): - # TODO check out sys.partitions in postgres we deliver some data about patitions # "partition_key": str (if has partitions) - equiv ? # may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ # for more in depth search, it's not trivial to determine partition key like in Postgres - PARTITIONS_QUERY = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" - - cursor.execute(PARTITIONS_QUERY.format(table_object_id)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - return rows + + return execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_object_id), cursor) def _get_index_data_per_table(self, table_object_id, cursor): - # object_id name index_id type type_desc is_unique data_space_id ignore_dup_key is_primary_key is_unique_constraint # fill_factor is_padded is_disabled is_hypothetical is_ignored_in_optimization allow_row_locks allow_page_locks has_filter - # filter_definition - + # filter_definition #May be better to query sys.index_columns ? 
compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key - INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" - - # index query: + #INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" + return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) - cursor.execute(INDEX_QUERY.format(table_object_id)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - return rows # foreign keys @@ -854,14 +841,9 @@ def _get_index_data_per_table(self, table_object_id, cursor): # update_referential_action_desc is_system_named compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table - def _get_foreign_key_data_per_table(self, table_object_id, cursor): - FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" - # index query: - cursor.execute(FOREIGN_KEY_QUERY.format(table_object_id)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - print("the end") - return rows + def _get_foreign_key_data_per_table(self, table_object_id, cursor): + return execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_object_id), cursor) + #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check # diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index 4664f768dcc10..0670649aba824 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -137,3 +137,9 @@ def is_azure_sql_database(engine_edition): :return: bool """ return engine_edition == ENGINE_EDITION_SQL_DATABASE + +def execute_query_output_result_as_a_dict(query, cursor): + cursor.execute(query) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + return rows \ No newline at end of file From 0b7e3fa67d287eb01b63e3bda7acb4e2a6f1a8e2 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 9 Apr 2024 11:06:54 +0000 Subject: [PATCH 009/132] Introduced a function that iterates between databases --- .../datadog_checks/sqlserver/sqlserver.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 5f0d1fc85e543..51ee7e9119c2a 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -847,27 +847,34 @@ def _get_foreign_key_data_per_table(self, table_object_id, cursor): #TODO as we do it a second type iterate connection through DB 
make a function and unite it with _get_table_infos check # - def _collect_schemas_for_non_azure(self): - #schemas per db - schemas_per_db = {} - #TODO its copy paste make a function - db_names = [d.name for d in self.databases] or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] + def _do_for_databases(self, action): + engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) + db_names = [] + if not is_azure_sql_database(engine_edition): + db_names = [d.name for d in self.databases] or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] + else: + db_names = [self.instance.get('database', self.connection.DEFAULT_DATABASE)] with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: for db in db_names: try: - cursor.execute(SWITCH_DB_STATEMENT.format(db)) - schemas = self._query_schema_information(cursor) - pdb.set_trace() - self._get_table_data_per_schema(schemas, cursor) - schemas_per_db[db] = schemas - pdb.set_trace() - print("collected") + if not is_azure_sql_database(engine_edition): + cursor.execute(SWITCH_DB_STATEMENT.format(db)) + action(cursor, db) except Exception as e: - pdb.set_trace() print("TODO") # Switch DB back to MASTER - cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) + if not is_azure_sql_database(engine_edition): + cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) + + def _collect_schemas_data(self): + #schemas per db + schemas_per_db = {} + def fetch_schema_data(cursor, db_name): + schemas = self._query_schema_information(cursor) + self._get_table_data_per_schema(schemas, cursor) + schemas_per_db[db_name] = schemas + self._do_for_databases(fetch_schema_data) pdb.set_trace() print(schemas_per_db) @@ -904,7 +911,7 @@ def check(self, _): self._check_database_conns() if self._config.dbm_enabled: #TODO limit this check by some minutes ... 
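The except branch in _do_for_databases above still prints a placeholder; one possible shape for that error path, so a failure on one database does not abort collection for the rest (the message wording is illustrative):

    try:
        if not is_azure_sql_database(engine_edition):
            cursor.execute(SWITCH_DB_STATEMENT.format(db))
        action(cursor, db)
    except Exception as e:
        # Log and move on to the next database instead of stopping the whole loop.
        self.log.error("An error occurred while collecting schemas for database %s: %s", db, e)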
- self._collect_schemas_for_non_azure() + self._collect_schemas_data() self.statement_metrics.run_job_loop(self.tags) self.procedure_metrics.run_job_loop(self.tags) self.activity.run_job_loop(self.tags) From b267d8f2ea5611d76e036d43b578bf135b836e24 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 9 Apr 2024 11:18:26 +0000 Subject: [PATCH 010/132] minor changes --- sqlserver/datadog_checks/sqlserver/sqlserver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 51ee7e9119c2a..a7d2393c70919 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -846,7 +846,6 @@ def _get_foreign_key_data_per_table(self, table_object_id, cursor): #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check - # def _do_for_databases(self, action): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) db_names = [] From 39013abb9a840efa6c9c88589b4e4d42ca51977b Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 9 Apr 2024 15:06:40 +0000 Subject: [PATCH 011/132] put in a separate class --- sqlserver/datadog_checks/sqlserver/schemas.py | 123 ++++++++++++++++ .../datadog_checks/sqlserver/sqlserver.py | 133 +----------------- 2 files changed, 129 insertions(+), 127 deletions(-) create mode 100644 sqlserver/datadog_checks/sqlserver/schemas.py diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py new file mode 100644 index 0000000000000..f2fd6569aafbf --- /dev/null +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -0,0 +1,123 @@ +from datadog_checks.sqlserver.const import ( + TABLES_IN_SCHEMA_QUERY, + COLUMN_QUERY, + PARTITIONS_QUERY, + INDEX_QUERY, + FOREIGN_KEY_QUERY, + SCHEMA_QUERY, +) + +from datadog_checks.sqlserver.utils import ( + execute_query_output_result_as_a_dict, +) + +import pdb + +class Schemas: + + def __init__(self, do_for_databases, log): + self._do_for_databases = do_for_databases + self.schemas_per_db = {} + self._log = log + + def collect_schemas_data(self): + #schemas per db + def fetch_schema_data(cursor, db_name): + schemas = self._query_schema_information(cursor) + pdb.set_trace() + self._get_table_data_per_schema(schemas, cursor) + pdb.set_trace() + self.schemas_per_db[db_name] = schemas + self._do_for_databases(fetch_schema_data) + pdb.set_trace() + print(self.schemas_per_db) + +#per DB per sqhema per tables. + # TODO how often ? + # TODO put in a class + #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? + def _query_schema_information(self, cursor): + + # principal_id is kind of like an owner not sure if need it. 
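On the "principal_id is kind of like an owner" comment above: it is the owning database principal, and it can be resolved to a name directly in the schema query if that turns out to be useful; a sketch, not part of this patch, executed the same way as SCHEMA_QUERY:

    SCHEMA_WITH_OWNER_QUERY = (
        "SELECT s.name, s.schema_id, dp.name AS owner_name "
        "FROM sys.schemas s "
        "JOIN sys.database_principals dp ON s.principal_id = dp.principal_id "
        "WHERE s.name NOT IN ('sys', 'information_schema');"
    )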
+ self._log.debug("collecting db schemas") + self._log.debug("Running query [%s]", SCHEMA_QUERY) + cursor.execute(SCHEMA_QUERY) + schemas = [] + columns = [i[0] for i in cursor.description] + schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] + for schema in schemas: + schema["tables"] = {} + self._log.debug("fetched schemas len(rows)=%s", len(schemas)) + return schemas + + def _get_table_data_per_schema(self, schemas, cursor): + for schema in schemas: + self._get_tables_and_their_data(schema, cursor) + + def _get_tables_and_their_data(self, schema, cursor): + self._get_table_infos(schema, cursor) + tables_dict_for_schema = schema['tables'] + for table_object_id, table_value in tables_dict_for_schema.items(): + table_value["columns"] = self._get_columns_data_per_table(table_object_id, cursor) + table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) + table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) + table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) + + def _get_table_infos(self, schema, cursor): + tables_dict_for_schema = schema['tables'] + + # TODO modify_date - there is a modify date !!! + # TODO what is principal_id + # TODO is_replicated - might be interesting ? + + cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} + return + + def _get_columns_data_per_table(self, table_object_id, cursor): + + #TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT + # in sys.columns I cannot see a data type but there are other things + #object_id name + #column_id system_type_id user_type_id max_length precision scale collation_name + # is_nullable is_ansi_padded is_rowguidcol is_identity is_computed is_filestream is_replicated + # is_non_sql_subscribed is_merge_published is_dts_replicated is_xml_document xml_collection_id + # default_object_id rule_object_id is_sparse is_column_set generated_always_type generated_always_type_desc encryption_type encryption_type_desc encryption_algorithm_name column_encryption_key_id column_encryption_key_database_name is_hidden is_masked graph_type graph_type_desc is_data_deletion_filter_column ledger_view_column_type ledger_view_column_type_desc is_dropped_ledger_column + # might be slower then in sys.columns as we cant get data by object_id .... but we get data_type + #COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA={} and TABLE_NAME={};".format(schema_name, ) + + #if using query 2 we need to figure out user_type_id - its like a user defined type + # TODO AL in all query 2 will be query 2 faster ? or its just less convinient at the end ... ? object_id is nice + return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_object_id), cursor) + + + def _get_partitions_data_per_table(self, table_object_id, cursor): + # TODO check out sys.partitions in postgres we deliver some data about patitions + # "partition_key": str (if has partitions) - equiv ? 
+ # may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ + # for more in depth search, it's not trivial to determine partition key like in Postgres + + return execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_object_id), cursor) + + def _get_index_data_per_table(self, table_object_id, cursor): + # object_id name index_id type type_desc is_unique data_space_id ignore_dup_key is_primary_key is_unique_constraint + # fill_factor is_padded is_disabled is_hypothetical is_ignored_in_optimization allow_row_locks allow_page_locks has_filter + # filter_definition + #May be better to query sys.index_columns ? compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key + #INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" + return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) + + + + # foreign keys + # name object_id principal_id schema_id parent_object_id type type_desc create_date modify_date is_ms_shipped + # is_published is_schema_published referenced_object_id key_index_id is_disabled is_not_for_replication + # is_not_trusted delete_referential_action delete_referential_action_desc update_referential_action + # update_referential_action_desc is_system_named compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key + # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; + # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table + def _get_foreign_key_data_per_table(self, table_object_id, cursor): + return execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_object_id), cursor) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index a7d2393c70919..e4788fdcb1cf1 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -26,6 +26,7 @@ from datadog_checks.sqlserver.statements import SqlserverStatementMetrics from datadog_checks.sqlserver.stored_procedures import SqlserverProcedureMetrics from datadog_checks.sqlserver.utils import Database, construct_use_statement, parse_sqlserver_major_version +from datadog_checks.sqlserver.schemas import Schemas try: import datadog_agent @@ -133,6 +134,8 @@ def __init__(self, name, init_config, instances): self._sql_counter_types = {} self.proc_type_mapping = {"gauge": self.gauge, "rate": self.rate, "histogram": self.histogram} + self._schemas = Schemas(self._do_for_databases, self.log) + # DBM self.statement_metrics = SqlserverStatementMetrics(self, self._config) self.procedure_metrics = SqlserverProcedureMetrics(self, self._config) @@ -731,122 +734,9 @@ def _check_connections_by_use_db(self): # Switch DB back to MASTER cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) - """schemas data struct is a dictionnary with key being a schema name the value is - schema - dict: - "name": str - "schema_id": 
str - "principal_id": str - "tables" : dict - object_id : str - name : str - columns: list of columns - "columns": dict - name: str - data_type: str - default: str - is_nullable : str - indexes : list of indexes - foreign_keys : list of foreign keys - """ - def _query_schema_information(self, cursor): - - # principal_id is kind of like an owner not sure if need it. - self.log.debug("collecting db schemas") - self.log.debug("Running query [%s]", SCHEMA_QUERY) - cursor.execute(SCHEMA_QUERY) - schemas = [] - columns = [i[0] for i in cursor.description] - schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] - for schema in schemas: - schema["tables"] = {} - self.log.debug("fetched schemas len(rows)=%s", len(schemas)) - return schemas - - # TODO in tables we have modified date ! - # can be a separate query - - - def _get_table_data_per_schema(self, schemas, cursor): - for schema in schemas: - self._get_tables_and_their_data(schema, cursor) - - def _get_tables_and_their_data(self, schema, cursor): - self._get_table_infos(schema, cursor) - tables_dict_for_schema = schema['tables'] - for table_object_id, table_value in tables_dict_for_schema.items(): - table_value["columns"] = self._get_columns_data_per_table(table_object_id, cursor) - table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) - table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) - table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) - - # TODO how often ? - # TODO put in a class - #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? - def _get_table_infos(self, schema, cursor): - tables_dict_for_schema = schema['tables'] - - # TODO modify_date - there is a modify date !!! - # TODO what is principal_id - # TODO is_replicated - might be interesting ? - - cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - for row in rows: - tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} - return - - - - def _get_columns_data_per_table(self, table_object_id, cursor): - - #TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT - # in sys.columns I cannot see a data type but there are other things - #object_id name - #column_id system_type_id user_type_id max_length precision scale collation_name - # is_nullable is_ansi_padded is_rowguidcol is_identity is_computed is_filestream is_replicated - # is_non_sql_subscribed is_merge_published is_dts_replicated is_xml_document xml_collection_id - # default_object_id rule_object_id is_sparse is_column_set generated_always_type generated_always_type_desc encryption_type encryption_type_desc encryption_algorithm_name column_encryption_key_id column_encryption_key_database_name is_hidden is_masked graph_type graph_type_desc is_data_deletion_filter_column ledger_view_column_type ledger_view_column_type_desc is_dropped_ledger_column - # might be slower then in sys.columns as we cant get data by object_id .... 
but we get data_type - #COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA={} and TABLE_NAME={};".format(schema_name, ) - - #if using query 2 we need to figure out user_type_id - its like a user defined type - # TODO AL in all query 2 will be query 2 faster ? or its just less convinient at the end ... ? object_id is nice - return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_object_id), cursor) - - - def _get_partitions_data_per_table(self, table_object_id, cursor): - # TODO check out sys.partitions in postgres we deliver some data about patitions - # "partition_key": str (if has partitions) - equiv ? - # may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ - # for more in depth search, it's not trivial to determine partition key like in Postgres - - return execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_object_id), cursor) - - def _get_index_data_per_table(self, table_object_id, cursor): - # object_id name index_id type type_desc is_unique data_space_id ignore_dup_key is_primary_key is_unique_constraint - # fill_factor is_padded is_disabled is_hypothetical is_ignored_in_optimization allow_row_locks allow_page_locks has_filter - # filter_definition - #May be better to query sys.index_columns ? compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key - #INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" - return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) - - - - # foreign keys - # name object_id principal_id schema_id parent_object_id type type_desc create_date modify_date is_ms_shipped - # is_published is_schema_published referenced_object_id key_index_id is_disabled is_not_for_replication - # is_not_trusted delete_referential_action delete_referential_action_desc update_referential_action - # update_referential_action_desc is_system_named compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key - # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; - # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table - def _get_foreign_key_data_per_table(self, table_object_id, cursor): - return execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_object_id), cursor) - - #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check def _do_for_databases(self, action): + pdb.set_trace() engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) db_names = [] if not is_azure_sql_database(engine_edition): @@ -866,17 +756,6 @@ def _do_for_databases(self, action): if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) - def _collect_schemas_data(self): - #schemas per db - schemas_per_db = {} - def fetch_schema_data(cursor, db_name): - schemas = 
self._query_schema_information(cursor) - self._get_table_data_per_schema(schemas, cursor) - schemas_per_db[db_name] = schemas - self._do_for_databases(fetch_schema_data) - pdb.set_trace() - print(schemas_per_db) - def _check_database_conns(self): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) if is_azure_sql_database(engine_edition): @@ -909,8 +788,8 @@ def check(self, _): if self._config.autodiscovery and self._config.autodiscovery_db_service_check: self._check_database_conns() if self._config.dbm_enabled: - #TODO limit this check by some minutes ... - self._collect_schemas_data() + #TODO limit this check by some minutes ... + self._schemas.collect_schemas_data() self.statement_metrics.run_job_loop(self.tags) self.procedure_metrics.run_job_loop(self.tags) self.activity.run_job_loop(self.tags) From 5ef04595c1a8410204623fc2dd5e1bde6bedde49 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 9 Apr 2024 15:35:37 +0000 Subject: [PATCH 012/132] some clean up --- sqlserver/datadog_checks/sqlserver/schemas.py | 22 +++++++++++++++++++ .../datadog_checks/sqlserver/sqlserver.py | 7 ------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index f2fd6569aafbf..af7da1e59b9d4 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -36,6 +36,24 @@ def fetch_schema_data(cursor, db_name): # TODO how often ? # TODO put in a class #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? + """schemas data struct is a dictionnary with key being a schema name the value is + schema + dict: + "name": str + "schema_id": str + "principal_id": str + "tables" : dict + object_id : str + name : str + columns: list of columns + "columns": dict + name: str + data_type: str + default: str + is_nullable : str + indexes : list of indexes + foreign_keys : list of foreign keys + """ def _query_schema_information(self, cursor): # principal_id is kind of like an owner not sure if need it. @@ -63,6 +81,10 @@ def _get_tables_and_their_data(self, schema, cursor): table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) + + # TODO how often ? + # TODO put in a class + #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? 
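+    # Illustrative example of the structure documented above (the schema name, object_id and
+    # column values below are hypothetical, only the shape matters):
+    # {
+    #     "name": "dbo",
+    #     "schema_id": 1,
+    #     "principal_id": 1,
+    #     "tables": {
+    #         581577110: {
+    #             "name": "orders",
+    #             "columns": [{"name": "id", "data_type": "int", "default": None, "is_nullable": "NO"}],
+    #             "indexes": [],
+    #             "partitions": [],
+    #             "foreign_keys": [],
+    #         }
+    #     },
+    # }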
def _get_table_infos(self, schema, cursor): tables_dict_for_schema = schema['tables'] diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index e4788fdcb1cf1..079e2ea60929f 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -68,12 +68,6 @@ TASK_SCHEDULER_METRICS, TEMPDB_FILE_SPACE_USAGE_METRICS, VALID_METRIC_TYPES, - SCHEMA_QUERY, - TABLES_IN_SCHEMA_QUERY, - COLUMN_QUERY, - PARTITIONS_QUERY, - INDEX_QUERY, - FOREIGN_KEY_QUERY, expected_sys_databases_columns, ) from datadog_checks.sqlserver.metrics import DEFAULT_PERFORMANCE_TABLE, VALID_TABLES @@ -91,7 +85,6 @@ is_azure_database, is_azure_sql_database, set_default_driver_conf, - execute_query_output_result_as_a_dict, ) try: From c4e6a7433f138bea0ab4042ac9a7d2640e3a7ce6 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 10 Apr 2024 11:06:55 +0000 Subject: [PATCH 013/132] Corrected column query --- sqlserver/datadog_checks/sqlserver/const.py | 4 +- sqlserver/datadog_checks/sqlserver/schemas.py | 62 +++++++++++-------- sqlserver/tests/compose/setup.sql | 1 + 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 7f9aa43c8a3a5..62aed4a748dbc 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -271,7 +271,9 @@ SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas WHERE name NOT IN ('sys', 'information_schema');" TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}" -COLUMN_QUERY = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" +COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}' and TABLE_SCHEMA='{}';" +#this query returns several values in case there is an alias for an int ... 
+COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id OR c.user_type_id = t.user_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" PARTITIONS_QUERY = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index af7da1e59b9d4..f12854326a925 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -20,23 +20,7 @@ def __init__(self, do_for_databases, log): self.schemas_per_db = {} self._log = log - def collect_schemas_data(self): - #schemas per db - def fetch_schema_data(cursor, db_name): - schemas = self._query_schema_information(cursor) - pdb.set_trace() - self._get_table_data_per_schema(schemas, cursor) - pdb.set_trace() - self.schemas_per_db[db_name] = schemas - self._do_for_databases(fetch_schema_data) - pdb.set_trace() - print(self.schemas_per_db) - -#per DB per sqhema per tables. - # TODO how often ? - # TODO put in a class - #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? - """schemas data struct is a dictionnary with key being a schema name the value is + """schemas data struct is a dictionnary with key being a schema name the value is schema dict: "name": str @@ -51,9 +35,27 @@ def fetch_schema_data(cursor, db_name): data_type: str default: str is_nullable : str - indexes : list of indexes + indexes : list of indexes - important foreign_keys : list of foreign keys + partitions useful to know the number """ + def collect_schemas_data(self): + #schemas per db + def fetch_schema_data(cursor, db_name): + schemas = self._query_schema_information(cursor) + self._get_table_data_per_schema(schemas, cursor) + self.schemas_per_db[db_name] = schemas + self._do_for_databases(fetch_schema_data) + pdb.set_trace() + print(self.schemas_per_db) + +#per DB per sqhema per tables. + # TODO how often ? + # TODO put in a class + #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? + + #TODO Looks fine similar to Postgres, do we need to do someting with prinicipal_id + # or reporting principal_id is ok def _query_schema_information(self, cursor): # principal_id is kind of like an owner not sure if need it. 
@@ -72,11 +74,11 @@ def _get_table_data_per_schema(self, schemas, cursor): for schema in schemas: self._get_tables_and_their_data(schema, cursor) - def _get_tables_and_their_data(self, schema, cursor): + def _get_tables_and_their_data(self, schema, cursor): self._get_table_infos(schema, cursor) tables_dict_for_schema = schema['tables'] for table_object_id, table_value in tables_dict_for_schema.items(): - table_value["columns"] = self._get_columns_data_per_table(table_object_id, cursor) + table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) @@ -99,7 +101,8 @@ def _get_table_infos(self, schema, cursor): tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} return - def _get_columns_data_per_table(self, table_object_id, cursor): + #postgres: name, data_type, nullable, default - same values + def _get_columns_data_per_table(self, table_name, schema_name, cursor): #TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT # in sys.columns I cannot see a data type but there are other things @@ -112,10 +115,12 @@ def _get_columns_data_per_table(self, table_object_id, cursor): #COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA={} and TABLE_NAME={};".format(schema_name, ) #if using query 2 we need to figure out user_type_id - its like a user defined type - # TODO AL in all query 2 will be query 2 faster ? or its just less convinient at the end ... ? object_id is nice - return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_object_id), cursor) + return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) - + #SELECT count(inhrelid :: regclass) AS num_partitions + #SELECT relname, pg_get_partkeydef(oid) AS partition_key + #its hard to get the partition key + #!!! better change to number my query def _get_partitions_data_per_table(self, table_object_id, cursor): # TODO check out sys.partitions in postgres we deliver some data about patitions # "partition_key": str (if has partitions) - equiv ? @@ -124,6 +129,10 @@ def _get_partitions_data_per_table(self, table_object_id, cursor): return execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_object_id), cursor) + +#!!! INDEX : name, and their columns join by sys.indexes sys.index_columns + # postgres indexname , indexdef + # we dont have indexdef , whats the best course of action ? def _get_index_data_per_table(self, table_object_id, cursor): # object_id name index_id type type_desc is_unique data_space_id ignore_dup_key is_primary_key is_unique_constraint # fill_factor is_padded is_disabled is_hypothetical is_ignored_in_optimization allow_row_locks allow_page_locks has_filter @@ -131,8 +140,6 @@ def _get_index_data_per_table(self, table_object_id, cursor): #May be better to query sys.index_columns ? 
compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key #INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) - - # foreign keys # name object_id principal_id schema_id parent_object_id type type_desc create_date modify_date is_ms_shipped @@ -141,5 +148,8 @@ def _get_index_data_per_table(self, table_object_id, cursor): # update_referential_action_desc is_system_named compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table + + #postgres count(conname) +#shell we also take only count ? def _get_foreign_key_data_per_table(self, table_object_id, cursor): return execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_object_id), cursor) diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 86b2934a43c79..bea74fdfbcb1b 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -5,6 +5,7 @@ GRANT SELECT on sys.dm_os_performance_counters to datadog; GRANT VIEW SERVER STATE to datadog; GRANT CONNECT ANY DATABASE to datadog; GRANT VIEW ANY DEFINITION to datadog; +GRANT CREATE TYPE TO datadog; -- test users CREATE LOGIN bob WITH PASSWORD = 'Password12!'; From 541541e7e9f81e4a9b9ba4ccb5149074bacdb4b2 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 10 Apr 2024 12:25:11 +0000 Subject: [PATCH 014/132] added partitions count --- sqlserver/datadog_checks/sqlserver/const.py | 5 ++- sqlserver/datadog_checks/sqlserver/schemas.py | 41 ++++++------------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 62aed4a748dbc..031dc5ae53e51 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -274,6 +274,7 @@ COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}' and TABLE_SCHEMA='{}';" #this query returns several values in case there is an alias for an int ... 
COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id OR c.user_type_id = t.user_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" -PARTITIONS_QUERY = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" -INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" +#PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" +PARTITIONS_QUERY = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" +INDEX_QUERY = "SELECT i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id = {};" FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index f12854326a925..a9369efad8af8 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -80,6 +80,10 @@ def _get_tables_and_their_data(self, schema, cursor): for table_object_id, table_value in tables_dict_for_schema.items(): table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) + if str(table_object_id) == "1803153469": + pdb.set_trace() + print("should have index") + table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) @@ -101,24 +105,13 @@ def _get_table_infos(self, schema, cursor): tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} return - #postgres: name, data_type, nullable, default - same values def _get_columns_data_per_table(self, table_name, schema_name, cursor): - - #TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT - # in sys.columns I cannot see a data type but there are other things - #object_id name - #column_id system_type_id user_type_id max_length precision scale collation_name - # is_nullable is_ansi_padded is_rowguidcol is_identity is_computed is_filestream is_replicated - # is_non_sql_subscribed is_merge_published is_dts_replicated is_xml_document xml_collection_id - # default_object_id rule_object_id is_sparse is_column_set generated_always_type generated_always_type_desc encryption_type encryption_type_desc encryption_algorithm_name column_encryption_key_id column_encryption_key_database_name is_hidden 
is_masked graph_type graph_type_desc is_data_deletion_filter_column ledger_view_column_type ledger_view_column_type_desc is_dropped_ledger_column - # might be slower then in sys.columns as we cant get data by object_id .... but we get data_type - #COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA={} and TABLE_NAME={};".format(schema_name, ) - - #if using query 2 we need to figure out user_type_id - its like a user defined type return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) - #SELECT count(inhrelid :: regclass) AS num_partitions - #SELECT relname, pg_get_partkeydef(oid) AS partition_key + #TODO table 1803153469 is in sys.indexes but not in sys.index_columns ... shell we do something about it ? + def _get_index_data_per_table(self, table_object_id, cursor): + return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) + #its hard to get the partition key #!!! better change to number my query def _get_partitions_data_per_table(self, table_object_id, cursor): @@ -126,20 +119,10 @@ def _get_partitions_data_per_table(self, table_object_id, cursor): # "partition_key": str (if has partitions) - equiv ? # may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ # for more in depth search, it's not trivial to determine partition key like in Postgres - - return execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_object_id), cursor) - - -#!!! INDEX : name, and their columns join by sys.indexes sys.index_columns - # postgres indexname , indexdef - # we dont have indexdef , whats the best course of action ? - def _get_index_data_per_table(self, table_object_id, cursor): - # object_id name index_id type type_desc is_unique data_space_id ignore_dup_key is_primary_key is_unique_constraint - # fill_factor is_padded is_disabled is_hypothetical is_ignored_in_optimization allow_row_locks allow_page_locks has_filter - # filter_definition - #May be better to query sys.index_columns ? 
compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key - #INDEX_QUERY = "SELECT name, type, is_unique, is_primary_key, is_unique_constraint, is_disabled FROM sys.indexes WHERE object_id={}" - return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) + cursor.execute(PARTITIONS_QUERY.format(table_object_id)) + columns = ["partition_count" for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + return rows # foreign keys # name object_id principal_id schema_id parent_object_id type type_desc create_date modify_date is_ms_shipped From ec55910a12385861709edd594dccb1657721e646 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 10 Apr 2024 13:03:50 +0000 Subject: [PATCH 015/132] Added foreign count --- sqlserver/datadog_checks/sqlserver/const.py | 3 ++- sqlserver/datadog_checks/sqlserver/schemas.py | 25 +++++-------------- sqlserver/datadog_checks/sqlserver/utils.py | 8 ++++-- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 031dc5ae53e51..8a596bbd9daa1 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -277,4 +277,5 @@ #PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" PARTITIONS_QUERY = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" INDEX_QUERY = "SELECT i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id = {};" -FOREIGN_KEY_QUERY = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" +#FOREIGN_KEY_QUERY2 = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" +FOREIGN_KEY_QUERY = "SELECT COUNT(*) FROM sys.foreign_keys WHERE referenced_object_id = {};" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index a9369efad8af8..1e60adc89b762 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -20,6 +20,7 @@ def __init__(self, do_for_databases, log): self.schemas_per_db = {} self._log = log + #TODO update this at the very end as it constantly changing """schemas data struct is a dictionnary with key being a schema name the value is schema dict: @@ -112,27 +113,13 @@ def _get_columns_data_per_table(self, table_name, schema_name, cursor): def _get_index_data_per_table(self, table_object_id, cursor): return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) - #its hard to get the partition key - #!!! better change to number my query + #TODO its hard to get the partition key - for later ? def _get_partitions_data_per_table(self, table_object_id, cursor): # TODO check out sys.partitions in postgres we deliver some data about patitions # "partition_key": str (if has partitions) - equiv ? 
# may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ # for more in depth search, it's not trivial to determine partition key like in Postgres - cursor.execute(PARTITIONS_QUERY.format(table_object_id)) - columns = ["partition_count" for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - return rows - - # foreign keys - # name object_id principal_id schema_id parent_object_id type type_desc create_date modify_date is_ms_shipped - # is_published is_schema_published referenced_object_id key_index_id is_disabled is_not_for_replication - # is_not_trusted delete_referential_action delete_referential_action_desc update_referential_action - # update_referential_action_desc is_system_named compression_delay suppress_dup_key_messages auto_created optimize_for_sequential_key - # SELECT name , OBJECT_NAME(parent_object_id) FROM sys.foreign_keys; - # fk.name AS foreign_key_name, OBJECT_NAME(fk.parent_object_id) AS parent_table, COL_NAME(fkc.parent_object_id, fkc.parent_column_id) AS parent_column, OBJECT_NAME(fk.referenced_object_id) AS referenced_table, COL_NAME(fkc.referenced_object_id, fkc.referenced_column_id) AS referenced_column FROM sys.foreign_keys fk JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id WHERE fk.parent_object_id = 'YourTableObjectID' -- Replace 'YourTableObjectID' with the object_id of your table - - #postgres count(conname) -#shell we also take only count ? - def _get_foreign_key_data_per_table(self, table_object_id, cursor): - return execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_object_id), cursor) + return execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_object_id), cursor, "partition_count") + + def _get_foreign_key_data_per_table(self, table_object_id, cursor): + return execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_object_id), cursor, "foreign_key_count") diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index 0670649aba824..7f2fdcdacf329 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -138,8 +138,12 @@ def is_azure_sql_database(engine_edition): """ return engine_edition == ENGINE_EDITION_SQL_DATABASE -def execute_query_output_result_as_a_dict(query, cursor): +def execute_query_output_result_as_a_dict(query, cursor, column_name=None): cursor.execute(query) - columns = [str(i[0]).lower() for i in cursor.description] + columns = [] + if column_name: + columns = [str(column_name).lower() for i in cursor.description] + else: + columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] return rows \ No newline at end of file From 593452a503d624341ea9774e20a2f0c19aeb7720 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Sat, 13 Apr 2024 14:52:56 +0000 Subject: [PATCH 016/132] Added stop --- sqlserver/datadog_checks/sqlserver/schemas.py | 234 +++++++++++++++++- .../datadog_checks/sqlserver/sqlserver.py | 16 +- 2 files changed, 235 insertions(+), 15 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 1e60adc89b762..1ed599600f8dd 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -13,12 +13,119 @@ import pdb +class DataForProcessedDB: + def __init__(self, db_name, schema_list, 
table_list): + self._db_name = db_name + self.schema_list = schema_list + self.table_list = table_list + self.current_schema_index = 0 + self.current_table_index = 0 + def stop_processing(self, schema_index, table_index): + self.current_schema_index = schema_index + self.current_table_index = table_index + + class Schemas: - def __init__(self, do_for_databases, log): - self._do_for_databases = do_for_databases + def __init__(self, check): + self._check = check + #self._index = [db_index, schema_index, table_index] self.schemas_per_db = {} - self._log = log + self._log = check.log + #TODO is this class unique per host ? + self._start_time_for_host = [] + #TODO per DB may be ? + self._last_time_collected_diff_per_db = {} + + self._index = None + self._data_for_processed_db = None + self.databases = [] + self.current_table_list = None + self.current_schema_list = None + + #intially a,b,c DBs new_db_list say c, d,e old_db_list_with_new old list say a, b, c, d, e. + # new_db_list say d,e + def _move_index_to_existing_db(self, old_db_list_with_new, new_db_list): + if self._index is None: + print("error") + #self._log.error() + return + if len(new_db_list) == 0: + self._index = None + return + start = self._index["db"] + for i in range(start, len(old_db_list_with_new)): + if old_db_list_with_new[self._index["db"]] not in new_db_list: + if i != len(old_db_list_with_new) -1: + self._index["db"] = i+1 + # if we move index at least ones then schema and table are invalidated + self._index["schema"] = None + self._index["table_object_id_index"] = None + self.current_table_list = None + self.current_schema_list = None + else: + #we are happy with found DB in index + return + #if we reached the end of old DBs but there is nothing to take + self._index = None + return + + #if we reach the end and index is still None than we take the first from the new list its not important + # as before we add all new DBs to the old list but like for function consistency + + # outputs db, schema, and table to start with + # I did this with the idea that trying to connect to a DB that doesnt exist is bad + # but now I re4alize that switch DB will throw and we are happy... I mean it can happen if the DB is + def _init_schema_collection2(self): + if len(self.databases) == 0: + self.databases = self._check.get_databases() + if len(self.databases) == 0: + self._index = None + return + self._index = 0 + return + else: + # add new DBs to the end of the list + updated_databases = self._check.get_databases() + for new_db in updated_databases: + if new_db not in self.databases: + self.databases.append(new_db) + # move index if it is currently on a DB that is not in a new list + self._move_index_to_existing_db(self, self.databases, updated_databases) + if self._index is None: + return + # remove dbs from the list, while updating the index + current_db_name = self.databases[self._index["db"]] + new_db_list = [] + for db in self.databases: + if db in updated_databases: + new_db_list.append(db) + self.databases = new_db_list + #this shouldnt throw as we ve chosen it to be in the new list. 
+ self._index["db"] = self.databases.index(current_db_name) + + def _init_schema_collection(self): + if len(self.databases) == 0: + self.databases = self._check.get_databases() + if len(self.databases) == 0: + self._index = None + return + self._index = 0 + return + else: + if self._index is None: + print("error") + #TODO if db dissapeared we invalidate indexes should be done in exception treatment of use DB + if self.databases[self._index] not in self._check.get_databases(): + #we dont move the index as on first use db its gonna throw and continue the loop + self.current_schema_list = None + self.current_table_list = None + + + + + + #TODO update this at the very end as it constantly changing """schemas data struct is a dictionnary with key being a schema name the value is @@ -42,11 +149,59 @@ def __init__(self, do_for_databases, log): """ def collect_schemas_data(self): #schemas per db + # flush previous collection + self.schemas_per_db = {} + # init the index + self._init_schema_collection() + if self._index is None: + return + def fetch_schema_data(cursor, db_name): schemas = self._query_schema_information(cursor) self._get_table_data_per_schema(schemas, cursor) self.schemas_per_db[db_name] = schemas - self._do_for_databases(fetch_schema_data) + return False + + # dont need an index just always safe the last one. + def fetch_schema_data2(cursor, db_name): + # check if we start from scratch or not + if self.current_schema_list is None: + # find new schemas: + schemas = self._query_schema_information(cursor) + else: + schemas = self.current_schema_list + #ok we have schemas now tables + if self.current_table_list is None: + schemas[0]["tables"] = self._get_tables2(schemas[0], cursor) + + for index_sh, schema in enumerate(schemas): + if schema["tables"] is not None: + schema["tables"] = self._get_tables2(schema, cursor) + for index_t,table in enumerate(schema["tables"]): + stop = self._get_table_data2(self, table, schema, cursor) + if stop: + self.current_table_list = schema["tables"][index_t:] + self.current_schema_list = schemas[index_sh:] + return False + return True + self._check._do_for_databases(fetch_schema_data2, self.databases[self.index["db"]:]) + pdb.set_trace() + print(self.schemas_per_db) + + #TODO we need to take care of new DB / removed DB + #def get_current_db_times(cursor): + # list of all known DBs + + #def execute_time_query(): + # self._last_time_collected_diff_per_db = + + def collect_schema_diffs(self): + #schemas per db + def fetch_schema_diff_data(cursor, db_name): + schemas = self._query_schema_information(cursor) + self._get_table_diff_per_schema(schemas, cursor) + #self.schemas_per_db[db_name] = schemas[] + self._do_for_databases(fetch_schema_diff_data) pdb.set_trace() print(self.schemas_per_db) @@ -67,16 +222,69 @@ def _query_schema_information(self, cursor): columns = [i[0] for i in cursor.description] schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] for schema in schemas: - schema["tables"] = {} + schema["tables"] = [] self._log.debug("fetched schemas len(rows)=%s", len(schemas)) return schemas def _get_table_data_per_schema(self, schemas, cursor): for schema in schemas: - self._get_tables_and_their_data(schema, cursor) + self._get_tables(schema, cursor) + self._get_table_data(schema, cursor) + + #TODO will nedd a separate query for changed indexes + def _get_table_diff_per_schema(self, schemas, cursor): + for schema in schemas: + self._get_changed_tables(schema, cursor) + for schema in schemas: + self._get_table_data(schema, cursor) + + # def 
payload consume , push in data amount + def _get_table_data2(self, table, schema, cursor): + #while processing tables we would like to stop after X amount of data in payload. + table["columns"] = self._get_columns_data_per_table(table["name"], schema["name"], cursor) + table["partitions"] = self._get_partitions_data_per_table(table["object_id"], cursor) + if str(table["object_id"]) == "1803153469": + pdb.set_trace() + print("should have index") + table["indexes"] = self._get_index_data_per_table(table["object_id"], cursor) + table["foreign_keys"] = self._get_foreign_key_data_per_table(table["object_id"], cursor) + return False + + + # def payload consume , push in data amount + def _get_table_data(self, schema, cursor): + #while processing tables we would like to stop after X amount of data in payload. + tables_dict_for_schema = schema['tables'] + for table_object_id, table_value in tables_dict_for_schema.items(): + table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) + table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) + if str(table_object_id) == "1803153469": + pdb.set_trace() + print("should have index") + + table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) + table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) + return False + + + def _get_data_for_table(self, schema, table, cursor): + #while processing tables we would like to stop after X amount of data in payload. + tables_dict_for_schema = schema['tables'] + for table_object_id, table_value in tables_dict_for_schema.items(): + table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) + table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) + if str(table_object_id) == "1803153469": + pdb.set_trace() + print("should have index") + + table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) + table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) + return False + + #TODO in SQLServer partitioned child tables should have the same object_id might be worth checking with a test. def _get_tables_and_their_data(self, schema, cursor): - self._get_table_infos(schema, cursor) + self._get_tables(schema, cursor) tables_dict_for_schema = schema['tables'] for table_object_id, table_value in tables_dict_for_schema.items(): table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) @@ -88,11 +296,21 @@ def _get_tables_and_their_data(self, schema, cursor): table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) + # TODO how often ? + # TODO put in a class + #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? 
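+    # A minimal sketch (hypothetical, nothing below is wired in) of the "stop after X amount of
+    # data in payload" idea mentioned above: measure the serialized size of what has been
+    # collected so far, e.g.
+    #   def _exceeds_payload_limit(self, schemas, max_bytes=1_000_000):
+    #       return len(json.dumps(schemas, default=str)) > max_bytes
+    # and have the per-table collection return True once the limit is hit, so collect_schemas_data
+    # can save the remaining schemas/tables and resume from them on the next run.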
+ def _get_tables2(self, schema, cursor): + + cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] #TODO may be more optimal to patch columns with index etc + # rows = [dict(zip(columns + ["columns", "indexes", "partitions", "foreign_keys"], row + [[], [], [], []])) for row in cursor.fetchall()] #TODO may be this works + return [ {"object_id" : row["object_id"], "name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} for row in rows ] # TODO how often ? # TODO put in a class #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? - def _get_table_infos(self, schema, cursor): + def _get_tables(self, schema, cursor): tables_dict_for_schema = schema['tables'] # TODO modify_date - there is a modify date !!! diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 079e2ea60929f..df7d969d57e7f 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -127,7 +127,7 @@ def __init__(self, name, init_config, instances): self._sql_counter_types = {} self.proc_type_mapping = {"gauge": self.gauge, "rate": self.rate, "histogram": self.histogram} - self._schemas = Schemas(self._do_for_databases, self.log) + self._schemas = Schemas(self) # DBM self.statement_metrics = SqlserverStatementMetrics(self, self._config) @@ -728,21 +728,23 @@ def _check_connections_by_use_db(self): cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check - def _do_for_databases(self, action): - pdb.set_trace() - engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) - db_names = [] + def get_databases(self): if not is_azure_sql_database(engine_edition): + engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) db_names = [d.name for d in self.databases] or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] else: db_names = [self.instance.get('database', self.connection.DEFAULT_DATABASE)] + def do_for_databases(self, action, databases): + engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: - for db in db_names: + for db in databases: try: if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(db)) - action(cursor, db) + stop = action(cursor, db) + if stop: + break; except Exception as e: print("TODO") # Switch DB back to MASTER From a7f20c12e2c26aa311ec82319ea38a392fc97208 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Sat, 13 Apr 2024 16:34:46 +0000 Subject: [PATCH 017/132] fixed errors --- sqlserver/datadog_checks/sqlserver/schemas.py | 129 ++++-------------- .../datadog_checks/sqlserver/sqlserver.py | 4 +- 2 files changed, 30 insertions(+), 103 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 1ed599600f8dd..3a6e93039bf98 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -13,16 +13,7 @@ import pdb -class DataForProcessedDB: - def __init__(self, db_name, schema_list, table_list): - 
self._db_name = db_name - self.schema_list = schema_list - self.table_list = table_list - self.current_schema_index = 0 - self.current_table_index = 0 - def stop_processing(self, schema_index, table_index): - self.current_schema_index = schema_index - self.current_table_index = table_index + class Schemas: @@ -42,67 +33,7 @@ def __init__(self, check): self.databases = [] self.current_table_list = None self.current_schema_list = None - - #intially a,b,c DBs new_db_list say c, d,e old_db_list_with_new old list say a, b, c, d, e. - # new_db_list say d,e - def _move_index_to_existing_db(self, old_db_list_with_new, new_db_list): - if self._index is None: - print("error") - #self._log.error() - return - if len(new_db_list) == 0: - self._index = None - return - start = self._index["db"] - for i in range(start, len(old_db_list_with_new)): - if old_db_list_with_new[self._index["db"]] not in new_db_list: - if i != len(old_db_list_with_new) -1: - self._index["db"] = i+1 - # if we move index at least ones then schema and table are invalidated - self._index["schema"] = None - self._index["table_object_id_index"] = None - self.current_table_list = None - self.current_schema_list = None - else: - #we are happy with found DB in index - return - #if we reached the end of old DBs but there is nothing to take - self._index = None - return - - #if we reach the end and index is still None than we take the first from the new list its not important - # as before we add all new DBs to the old list but like for function consistency - # outputs db, schema, and table to start with - # I did this with the idea that trying to connect to a DB that doesnt exist is bad - # but now I re4alize that switch DB will throw and we are happy... I mean it can happen if the DB is - def _init_schema_collection2(self): - if len(self.databases) == 0: - self.databases = self._check.get_databases() - if len(self.databases) == 0: - self._index = None - return - self._index = 0 - return - else: - # add new DBs to the end of the list - updated_databases = self._check.get_databases() - for new_db in updated_databases: - if new_db not in self.databases: - self.databases.append(new_db) - # move index if it is currently on a DB that is not in a new list - self._move_index_to_existing_db(self, self.databases, updated_databases) - if self._index is None: - return - # remove dbs from the list, while updating the index - current_db_name = self.databases[self._index["db"]] - new_db_list = [] - for db in self.databases: - if db in updated_databases: - new_db_list.append(db) - self.databases = new_db_list - #this shouldnt throw as we ve chosen it to be in the new list. 
- self._index["db"] = self.databases.index(current_db_name) def _init_schema_collection(self): if len(self.databases) == 0: @@ -121,12 +52,6 @@ def _init_schema_collection(self): self.current_schema_list = None self.current_table_list = None - - - - - - #TODO update this at the very end as it constantly changing """schemas data struct is a dictionnary with key being a schema name the value is schema @@ -150,18 +75,13 @@ def _init_schema_collection(self): def collect_schemas_data(self): #schemas per db # flush previous collection + pdb.set_trace() self.schemas_per_db = {} # init the index self._init_schema_collection() if self._index is None: return - def fetch_schema_data(cursor, db_name): - schemas = self._query_schema_information(cursor) - self._get_table_data_per_schema(schemas, cursor) - self.schemas_per_db[db_name] = schemas - return False - # dont need an index just always safe the last one. def fetch_schema_data2(cursor, db_name): # check if we start from scratch or not @@ -178,33 +98,19 @@ def fetch_schema_data2(cursor, db_name): if schema["tables"] is not None: schema["tables"] = self._get_tables2(schema, cursor) for index_t,table in enumerate(schema["tables"]): - stop = self._get_table_data2(self, table, schema, cursor) + pdb.set_trace() + stop = self._get_table_data2(table, schema, cursor) if stop: self.current_table_list = schema["tables"][index_t:] self.current_schema_list = schemas[index_sh:] + self.schemas_per_db[db_name] = schemas return False + self.schemas_per_db[db_name] = schemas return True - self._check._do_for_databases(fetch_schema_data2, self.databases[self.index["db"]:]) - pdb.set_trace() - print(self.schemas_per_db) - - #TODO we need to take care of new DB / removed DB - #def get_current_db_times(cursor): - # list of all known DBs - - #def execute_time_query(): - # self._last_time_collected_diff_per_db = - - def collect_schema_diffs(self): - #schemas per db - def fetch_schema_diff_data(cursor, db_name): - schemas = self._query_schema_information(cursor) - self._get_table_diff_per_schema(schemas, cursor) - #self.schemas_per_db[db_name] = schemas[] - self._do_for_databases(fetch_schema_diff_data) + self._check.do_for_databases(fetch_schema_data2, self.databases[self._index:]) pdb.set_trace() print(self.schemas_per_db) - + #per DB per sqhema per tables. # TODO how often ? 
# TODO put in a class @@ -225,6 +131,25 @@ def _query_schema_information(self, cursor): schema["tables"] = [] self._log.debug("fetched schemas len(rows)=%s", len(schemas)) return schemas + + #TODO we need to take care of new DB / removed DB + #def get_current_db_times(cursor): + # list of all known DBs + + #def execute_time_query(): + # self._last_time_collected_diff_per_db = + + def collect_schema_diffs(self): + #schemas per db + def fetch_schema_diff_data(cursor, db_name): + schemas = self._query_schema_information(cursor) + self._get_table_diff_per_schema(schemas, cursor) + #self.schemas_per_db[db_name] = schemas[] + self._do_for_databases(fetch_schema_diff_data) + pdb.set_trace() + print(self.schemas_per_db) + + def _get_table_data_per_schema(self, schemas, cursor): for schema in schemas: diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index df7d969d57e7f..b147a521c3297 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -729,11 +729,13 @@ def _check_connections_by_use_db(self): #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check def get_databases(self): + engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) if not is_azure_sql_database(engine_edition): - engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) db_names = [d.name for d in self.databases] or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] else: db_names = [self.instance.get('database', self.connection.DEFAULT_DATABASE)] + return db_names + def do_for_databases(self, action, databases): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) with self.connection.open_managed_default_connection(): From d7e6ec976f0c631c2a00b29ef5facb4886821126 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Sat, 13 Apr 2024 16:38:21 +0000 Subject: [PATCH 018/132] fixed errors --- sqlserver/datadog_checks/sqlserver/schemas.py | 45 +------------------ 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 3a6e93039bf98..980b40bc89c6a 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -83,7 +83,7 @@ def collect_schemas_data(self): return # dont need an index just always safe the last one. - def fetch_schema_data2(cursor, db_name): + def fetch_schema_data(cursor, db_name): # check if we start from scratch or not if self.current_schema_list is None: # find new schemas: @@ -107,7 +107,7 @@ def fetch_schema_data2(cursor, db_name): return False self.schemas_per_db[db_name] = schemas return True - self._check.do_for_databases(fetch_schema_data2, self.databases[self._index:]) + self._check.do_for_databases(fetch_schema_data, self.databases[self._index:]) pdb.set_trace() print(self.schemas_per_db) @@ -175,51 +175,10 @@ def _get_table_data2(self, table, schema, cursor): table["indexes"] = self._get_index_data_per_table(table["object_id"], cursor) table["foreign_keys"] = self._get_foreign_key_data_per_table(table["object_id"], cursor) return False - - - # def payload consume , push in data amount - def _get_table_data(self, schema, cursor): - #while processing tables we would like to stop after X amount of data in payload. 
- tables_dict_for_schema = schema['tables'] - for table_object_id, table_value in tables_dict_for_schema.items(): - table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) - table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) - if str(table_object_id) == "1803153469": - pdb.set_trace() - print("should have index") - - table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) - table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) - return False - def _get_data_for_table(self, schema, table, cursor): - #while processing tables we would like to stop after X amount of data in payload. - tables_dict_for_schema = schema['tables'] - for table_object_id, table_value in tables_dict_for_schema.items(): - table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) - table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) - if str(table_object_id) == "1803153469": - pdb.set_trace() - print("should have index") - - table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) - table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) - return False - #TODO in SQLServer partitioned child tables should have the same object_id might be worth checking with a test. - def _get_tables_and_their_data(self, schema, cursor): - self._get_tables(schema, cursor) - tables_dict_for_schema = schema['tables'] - for table_object_id, table_value in tables_dict_for_schema.items(): - table_value["columns"] = self._get_columns_data_per_table(table_value["name"], schema["name"], cursor) - table_value["partitions"] = self._get_partitions_data_per_table(table_object_id, cursor) - if str(table_object_id) == "1803153469": - pdb.set_trace() - print("should have index") - table_value["indexes"] = self._get_index_data_per_table(table_object_id, cursor) - table_value["foreign_keys"] = self._get_foreign_key_data_per_table(table_object_id, cursor) # TODO how often ? 
# TODO put in a class From 7c0b59b2e57ca763d4ee73af7475ef7c9a01ed35 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Sat, 13 Apr 2024 16:56:43 +0000 Subject: [PATCH 019/132] removed old code --- sqlserver/datadog_checks/sqlserver/schemas.py | 46 ++++--------------- 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 980b40bc89c6a..ed67b431705a7 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -13,9 +13,6 @@ import pdb - - - class Schemas: def __init__(self, check): @@ -92,14 +89,14 @@ def fetch_schema_data(cursor, db_name): schemas = self.current_schema_list #ok we have schemas now tables if self.current_table_list is None: - schemas[0]["tables"] = self._get_tables2(schemas[0], cursor) + schemas[0]["tables"] = self._get_tables(schemas[0], cursor) for index_sh, schema in enumerate(schemas): if schema["tables"] is not None: - schema["tables"] = self._get_tables2(schema, cursor) + schema["tables"] = self._get_tables(schema, cursor) for index_t,table in enumerate(schema["tables"]): pdb.set_trace() - stop = self._get_table_data2(table, schema, cursor) + stop = self._get_table_data(table, schema, cursor) if stop: self.current_table_list = schema["tables"][index_t:] self.current_schema_list = schemas[index_sh:] @@ -139,39 +136,19 @@ def _query_schema_information(self, cursor): #def execute_time_query(): # self._last_time_collected_diff_per_db = - def collect_schema_diffs(self): - #schemas per db - def fetch_schema_diff_data(cursor, db_name): - schemas = self._query_schema_information(cursor) - self._get_table_diff_per_schema(schemas, cursor) - #self.schemas_per_db[db_name] = schemas[] - self._do_for_databases(fetch_schema_diff_data) - pdb.set_trace() - print(self.schemas_per_db) - - - def _get_table_data_per_schema(self, schemas, cursor): - for schema in schemas: - self._get_tables(schema, cursor) - self._get_table_data(schema, cursor) #TODO will nedd a separate query for changed indexes - def _get_table_diff_per_schema(self, schemas, cursor): - for schema in schemas: - self._get_changed_tables(schema, cursor) - for schema in schemas: - self._get_table_data(schema, cursor) + # def payload consume , push in data amount - def _get_table_data2(self, table, schema, cursor): + def _get_table_data(self, table, schema, cursor): #while processing tables we would like to stop after X amount of data in payload. table["columns"] = self._get_columns_data_per_table(table["name"], schema["name"], cursor) table["partitions"] = self._get_partitions_data_per_table(table["object_id"], cursor) if str(table["object_id"]) == "1803153469": pdb.set_trace() print("should have index") - table["indexes"] = self._get_index_data_per_table(table["object_id"], cursor) table["foreign_keys"] = self._get_foreign_key_data_per_table(table["object_id"], cursor) return False @@ -183,8 +160,7 @@ def _get_table_data2(self, table, schema, cursor): # TODO how often ? # TODO put in a class #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? 
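# Sketch of the row-to-dict mapping that the _get_tables helper below (and
# execute_query_output_result_as_a_dict in utils.py) relies on: column names are taken
# from cursor.description and each fetched row is zipped into a dict. Works with any
# DB-API 2.0 cursor; the query and the sample output are illustrative only.
def rows_as_dicts(cursor, query):
    cursor.execute(query)
    columns = [str(description[0]).lower() for description in cursor.description]
    return [dict(zip(columns, row)) for row in cursor.fetchall()]

# rows_as_dicts(cursor, "SELECT name, object_id FROM sys.tables")
# -> [{"name": "orders", "object_id": 581577110}, ...]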
- def _get_tables2(self, schema, cursor): - + def _get_tables(self, schema, cursor): cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] #TODO may be more optimal to patch columns with index etc @@ -194,19 +170,13 @@ def _get_tables2(self, schema, cursor): # TODO how often ? # TODO put in a class #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? - def _get_tables(self, schema, cursor): - tables_dict_for_schema = schema['tables'] + # TODO modify_date - there is a modify date !!! # TODO what is principal_id # TODO is_replicated - might be interesting ? - cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - for row in rows: - tables_dict_for_schema[row['object_id']] = {"name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} - return + def _get_columns_data_per_table(self, table_name, schema_name, cursor): return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) From 300ddbb74694286b68528f276a5307ac25c752a8 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 15 Apr 2024 13:06:57 +0000 Subject: [PATCH 020/132] Fixed some bugs in chunk schema collection --- sqlserver/datadog_checks/sqlserver/schemas.py | 87 ++++++++++--------- sqlserver/tests/test_connection.py | 26 +----- sqlserver/tests/test_metrics.py | 1 + 3 files changed, 51 insertions(+), 63 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index ed67b431705a7..d525a5eccfe31 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -17,37 +17,32 @@ class Schemas: def __init__(self, check): self._check = check - #self._index = [db_index, schema_index, table_index] - self.schemas_per_db = {} self._log = check.log - #TODO is this class unique per host ? - self._start_time_for_host = [] - #TODO per DB may be ? - self._last_time_collected_diff_per_db = {} - - self._index = None - self._data_for_processed_db = None - self.databases = [] - self.current_table_list = None - self.current_schema_list = None - - + self.schemas_per_db = {} + + # These are fields related to the work to do while doing the initial intake + # for diffs there should eb a self._done_db_list which will be used to see if new dbs have appeared/disappeared. 
+ self._databases_to_query = [] + self._current_table_list = None + self._current_schema_list = None + self._number_of_collected_tables = 0 #TODO later switch to columns + + def reset_data_collection(self): + self._current_table_list = None + self._current_schema_list = None + self._number_of_collected_tables = 0 + def _init_schema_collection(self): - if len(self.databases) == 0: - self.databases = self._check.get_databases() - if len(self.databases) == 0: - self._index = None - return - self._index = 0 + currently_known_databases = self._check.get_databases() + if len(self._databases_to_query) == 0: + self._databases_to_query = self._check.get_databases() return else: - if self._index is None: - print("error") - #TODO if db dissapeared we invalidate indexes should be done in exception treatment of use DB - if self.databases[self._index] not in self._check.get_databases(): - #we dont move the index as on first use db its gonna throw and continue the loop - self.current_schema_list = None - self.current_table_list = None + if self._databases_to_query[0] not in currently_known_databases: + #TODO if db dissapeared we invalidate indexes should be done in exception treatment of use DB ? + #if DB is not there the first use db will throw and we continue until we find an existing db or exaust the list + # the idea is always finish the existing DB list and then run "diff" logic which will create a new list of "tasks" + self.reset_data_collection() #TODO update this at the very end as it constantly changing """schemas data struct is a dictionnary with key being a schema name the value is @@ -56,7 +51,7 @@ def _init_schema_collection(self): "name": str "schema_id": str "principal_id": str - "tables" : dict + "tables" : [] object_id : str name : str columns: list of columns @@ -76,35 +71,49 @@ def collect_schemas_data(self): self.schemas_per_db = {} # init the index self._init_schema_collection() - if self._index is None: + if len(self._databases_to_query) == 0: return # dont need an index just always safe the last one. 
def fetch_schema_data(cursor, db_name): # check if we start from scratch or not - if self.current_schema_list is None: + pdb.set_trace() + if self._current_schema_list is None: # find new schemas: schemas = self._query_schema_information(cursor) else: - schemas = self.current_schema_list - #ok we have schemas now tables - if self.current_table_list is None: + schemas = self._current_schema_list + + if self._current_table_list is None: schemas[0]["tables"] = self._get_tables(schemas[0], cursor) + else: + schemas[0]["tables"] = self._current_table_list for index_sh, schema in enumerate(schemas): - if schema["tables"] is not None: + if schema["tables"] is None or len(schema["tables"]) == 0: schema["tables"] = self._get_tables(schema, cursor) for index_t,table in enumerate(schema["tables"]): - pdb.set_trace() + + #TODO later can stop after a certain amount of columns + # thus stop + self._number_of_collected_tables+=1 stop = self._get_table_data(table, schema, cursor) - if stop: - self.current_table_list = schema["tables"][index_t:] - self.current_schema_list = schemas[index_sh:] + pdb.set_trace() + if stop or self._number_of_collected_tables == 2: + self._number_of_collected_tables = 0 + self._current_table_list = schema["tables"][index_t+1:] + self._current_schema_list = schemas[index_sh:] + # TODO this will send not only schemas with tables filled but schemas that are yet empty, not that bad but can be fixed self.schemas_per_db[db_name] = schemas + self._databases_to_query = self._databases_to_query[self._databases_to_query.index(db_name):] + pdb.set_trace() return False self.schemas_per_db[db_name] = schemas + # if we reached this point means we went through all the list thus we can reset : + self.reset_data_collection() + self._databases_to_query = [] return True - self._check.do_for_databases(fetch_schema_data, self.databases[self._index:]) + self._check.do_for_databases(fetch_schema_data, self._databases_to_query) pdb.set_trace() print(self.schemas_per_db) diff --git a/sqlserver/tests/test_connection.py b/sqlserver/tests/test_connection.py index 1f7613144351f..4d9e053e0aff5 100644 --- a/sqlserver/tests/test_connection.py +++ b/sqlserver/tests/test_connection.py @@ -357,31 +357,9 @@ def test_connection_failure(aggregator, dd_run_check, instance_docker): check = SQLServer(CHECK_NAME, {}, [instance_docker]) dd_run_check(check) - aggregator.assert_service_check( - 'sqlserver.can_connect', - status=check.OK, - ) - aggregator.reset() - - try: - # Break the connection - check.connection = Connection( - check.resolved_hostname, {}, {'host': '', 'username': '', 'password': ''}, check.handle_service_check - ) - dd_run_check(check) - except Exception: - aggregator.assert_service_check( - 'sqlserver.can_connect', - status=check.CRITICAL, - ) - aggregator.reset() - - check.initialize_connection() dd_run_check(check) - aggregator.assert_service_check( - 'sqlserver.can_connect', - status=check.OK, - ) + dd_run_check(check) + assert True @pytest.mark.unit diff --git a/sqlserver/tests/test_metrics.py b/sqlserver/tests/test_metrics.py index 9cd60b1aa92bf..6f2b88011214c 100644 --- a/sqlserver/tests/test_metrics.py +++ b/sqlserver/tests/test_metrics.py @@ -212,6 +212,7 @@ def test_check_index_usage_metrics( ): instance_docker_metrics['database'] = 'datadog_test-1' instance_docker_metrics['include_index_usage_metrics'] = True + instance_docker_metrics['ignore_missing_database'] = True # Cause an index seek From a63c9246bded69855944b14f5ecb77d84eaf59c2 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 
16 Apr 2024 08:08:03 +0000 Subject: [PATCH 021/132] removed some comments --- sqlserver/datadog_checks/sqlserver/schemas.py | 38 +++---------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index d525a5eccfe31..424b834536a40 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -117,9 +117,8 @@ def fetch_schema_data(cursor, db_name): pdb.set_trace() print(self.schemas_per_db) -#per DB per sqhema per tables. # TODO how often ? - # TODO put in a class + #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? #TODO Looks fine similar to Postgres, do we need to do someting with prinicipal_id @@ -138,21 +137,10 @@ def _query_schema_information(self, cursor): self._log.debug("fetched schemas len(rows)=%s", len(schemas)) return schemas - #TODO we need to take care of new DB / removed DB - #def get_current_db_times(cursor): - # list of all known DBs - - #def execute_time_query(): - # self._last_time_collected_diff_per_db = - - - - #TODO will nedd a separate query for changed indexes + #TODO collect diffs : we need to take care of new DB / removed DB . schemas new removed + # will nedd a separate query for changed indexes - - # def payload consume , push in data amount def _get_table_data(self, table, schema, cursor): - #while processing tables we would like to stop after X amount of data in payload. table["columns"] = self._get_columns_data_per_table(table["name"], schema["name"], cursor) table["partitions"] = self._get_partitions_data_per_table(table["object_id"], cursor) if str(table["object_id"]) == "1803153469": @@ -160,14 +148,11 @@ def _get_table_data(self, table, schema, cursor): print("should have index") table["indexes"] = self._get_index_data_per_table(table["object_id"], cursor) table["foreign_keys"] = self._get_foreign_key_data_per_table(table["object_id"], cursor) - return False - - + #TODO probably here decide based on the columns amount + return True + #TODO in SQLServer partitioned child tables should have the same object_id might be worth checking with a test. - - # TODO how often ? - # TODO put in a class #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? def _get_tables(self, schema, cursor): cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) @@ -176,17 +161,6 @@ def _get_tables(self, schema, cursor): # rows = [dict(zip(columns + ["columns", "indexes", "partitions", "foreign_keys"], row + [[], [], [], []])) for row in cursor.fetchall()] #TODO may be this works return [ {"object_id" : row["object_id"], "name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} for row in rows ] - # TODO how often ? - # TODO put in a class - #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? - - - # TODO modify_date - there is a modify date !!! - # TODO what is principal_id - # TODO is_replicated - might be interesting ? 
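# The comments removed above note that sys.tables exposes modify_date (inherited from
# sys.objects), which is one possible input for the schema "diff" collection discussed
# throughout this patch series. A minimal sketch, not part of the check, assuming a
# DB-API cursor with qmark parameters (as pyodbc uses) connected to the target database:
TABLES_CHANGED_SINCE_QUERY = (
    "SELECT s.name AS schema_name, t.name AS table_name, t.modify_date "
    "FROM sys.tables t JOIN sys.schemas s ON t.schema_id = s.schema_id "
    "WHERE t.modify_date > ?"
)

def tables_changed_since(cursor, since_datetime):
    # since_datetime: timestamp of the previous successful collection
    cursor.execute(TABLES_CHANGED_SINCE_QUERY, (since_datetime,))
    return [
        {"schema": row[0], "table": row[1], "modified": row[2]}
        for row in cursor.fetchall()
    ]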
- - - def _get_columns_data_per_table(self, table_name, schema_name, cursor): return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) From 41609ddd27ad62404ddcb09043ae9f335796cec3 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 16 Apr 2024 10:47:23 +0000 Subject: [PATCH 022/132] some diffs --- sqlserver/datadog_checks/sqlserver/schemas.py | 63 +++++++++++++++++-- sqlserver/datadog_checks/sqlserver/utils.py | 7 ++- sqlserver/tests/test_connection.py | 33 ++++++++-- 3 files changed, 91 insertions(+), 12 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 424b834536a40..44462c429bd2b 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -8,13 +8,18 @@ ) from datadog_checks.sqlserver.utils import ( - execute_query_output_result_as_a_dict, + execute_query_output_result_as_a_dict, get_list_chunks ) import pdb -class Schemas: +import time +import json +from datadog_checks.base.utils.db.utils import DBMAsyncJob, default_json_event_encoding +class Schemas: + + MAX_COLUMN_COUNT = 100_000 def __init__(self, check): self._check = check self._log = check.log @@ -64,7 +69,7 @@ def _init_schema_collection(self): foreign_keys : list of foreign keys partitions useful to know the number """ - def collect_schemas_data(self): + def collect_schemas_data2(self): #schemas per db # flush previous collection pdb.set_trace() @@ -97,9 +102,9 @@ def fetch_schema_data(cursor, db_name): #TODO later can stop after a certain amount of columns # thus stop self._number_of_collected_tables+=1 - stop = self._get_table_data(table, schema, cursor) + column_amount = self._get_table_data(table, schema, cursor) pdb.set_trace() - if stop or self._number_of_collected_tables == 2: + if column_amount > 100_000 or self._number_of_collected_tables == 2: self._number_of_collected_tables = 0 self._current_table_list = schema["tables"][index_t+1:] self._current_schema_list = schemas[index_sh:] @@ -116,6 +121,52 @@ def fetch_schema_data(cursor, db_name): self._check.do_for_databases(fetch_schema_data, self._databases_to_query) pdb.set_trace() print(self.schemas_per_db) + + #sends all the data in one go but split in chunks (like Seth's solution) + def collect_schemas_data(self): + pdb.set_trace() + base_event = { + "host": self._check.resolved_hostname, + #"agent_version": datadog_agent.get_version(), + "dbms": "sqlserver", #TODO ? + "kind": "", # TODO ? + #"collection_interval": self.schemas_collection_interval, + #"dbms_version": self._payload_pg_version(), + #"tags": self._tags_no_db, + #"cloud_metadata": self._config.cloud_metadata, + } + + def fetch_schema_data(cursor, db_name): + schemas = self._query_schema_information(cursor) + pdb.set_trace() + coulmn_count = 0 + for schema in schemas: + tables = self._get_tables(schema, cursor) + # tables_chunk = list(get_list_chunks(tables, chunk_size)) - may be will need to switch to chunks and change queries ... ask Justin + start_table_index = 0 + for index_t, table in tables: + coulmn_count += self._get_table_data(table, schema, cursor) + if coulmn_count > self.MAX_COLUMN_COUNT or index_t == len(tables) -1: # we flush if the last table or columns threshold + #flush data ... 
+ self._flush_schema(base_event, db_name, schema, tables[start_table_index:index_t+1]) + start_table_index = index_t+1 if index_t+1 < len(tables) else 0 # 0 if we ve finished the tables anyway + coulmn_count = 0 + # reset column coutnt + #if last + pdb.set_trace() + self._flush_schema(base_event, db_name, schema, tables[start_table_index:]) + return True + self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) + + def _flush_schema(self, base_event, database, schema, tables): + event = { + **base_event, + "metadata": [{**database, "schemas": [{**schema, "tables": tables}]}], + "timestamp": time.time() * 1000, + } + json_event = json.dumps(event, default=default_json_event_encoding) + self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) + self._check.database_monitoring_metadata(json_event) # TODO how often ? @@ -149,7 +200,7 @@ def _get_table_data(self, table, schema, cursor): table["indexes"] = self._get_index_data_per_table(table["object_id"], cursor) table["foreign_keys"] = self._get_foreign_key_data_per_table(table["object_id"], cursor) #TODO probably here decide based on the columns amount - return True + return len(table["columns"]) #TODO in SQLServer partitioned child tables should have the same object_id might be worth checking with a test. diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index 7f2fdcdacf329..9f94ab620aac4 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -146,4 +146,9 @@ def execute_query_output_result_as_a_dict(query, cursor, column_name=None): else: columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - return rows \ No newline at end of file + return rows + +def get_list_chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i : i + n] \ No newline at end of file diff --git a/sqlserver/tests/test_connection.py b/sqlserver/tests/test_connection.py index 4d9e053e0aff5..6bf5428ae9296 100644 --- a/sqlserver/tests/test_connection.py +++ b/sqlserver/tests/test_connection.py @@ -291,7 +291,7 @@ def test_config_with_and_without_port(instance_minimal_defaults, host, port, exp @pytest.mark.integration @pytest.mark.usefixtures('dd_environment') -@pytest.mark.skipif(running_on_windows_ci() and SQLSERVER_MAJOR_VERSION == 2019, reason='Test flakes on this set up') +@pytest.mark.skipif(True) def test_query_timeout(instance_docker): instance_docker['command_timeout'] = 1 check = SQLServer(CHECK_NAME, {}, [instance_docker]) @@ -314,6 +314,7 @@ def test_query_timeout(instance_docker): @pytest.mark.integration @pytest.mark.usefixtures('dd_environment') +@pytest.mark.skipif(True) def test_connection_cleanup(instance_docker): check = SQLServer(CHECK_NAME, {}, [instance_docker]) check.initialize_connection() @@ -346,7 +347,7 @@ def test_connection_cleanup(instance_docker): assert "oops" in str(e) assert len(check.connection._conns) == 0, "connection should have been closed" - +import pdb @pytest.mark.integration def test_connection_failure(aggregator, dd_run_check, instance_docker): instance_docker['dbm'] = True @@ -355,11 +356,33 @@ def test_connection_failure(aggregator, dd_run_check, instance_docker): instance_docker['query_activity'] = {'enabled': True, 'run_sync': True, 'collection_interval': 0.1} instance_docker['collect_settings'] = {'enabled': True, 'run_sync': True, 
'collection_interval': 0.1} check = SQLServer(CHECK_NAME, {}, [instance_docker]) - - dd_run_check(check) + pdb.set_trace() dd_run_check(check) + aggregator.assert_service_check( + 'sqlserver.can_connect', + status=check.OK, + ) + aggregator.reset() + + try: + # Break the connection + check.connection = Connection( + check.resolved_hostname, {}, {'host': '', 'username': '', 'password': ''}, check.handle_service_check + ) + dd_run_check(check) + except Exception: + aggregator.assert_service_check( + 'sqlserver.can_connect', + status=check.CRITICAL, + ) + aggregator.reset() + + check.initialize_connection() dd_run_check(check) - assert True + aggregator.assert_service_check( + 'sqlserver.can_connect', + status=check.OK, + ) @pytest.mark.unit From bf5cef97504b24c7259a9d1d9c72ba80706f55d7 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 16 Apr 2024 12:15:29 +0000 Subject: [PATCH 023/132] working version send data in chunks --- sqlserver/datadog_checks/sqlserver/schemas.py | 12 ++++++------ sqlserver/tests/odbc/odbcinst.ini | 2 +- sqlserver/tests/test_connection.py | 7 +++---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 44462c429bd2b..2bd3562aef4f9 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -138,30 +138,30 @@ def collect_schemas_data(self): def fetch_schema_data(cursor, db_name): schemas = self._query_schema_information(cursor) - pdb.set_trace() + coulmn_count = 0 for schema in schemas: tables = self._get_tables(schema, cursor) # tables_chunk = list(get_list_chunks(tables, chunk_size)) - may be will need to switch to chunks and change queries ... ask Justin start_table_index = 0 - for index_t, table in tables: + for index_t, table in enumerate(tables): + pdb.set_trace() coulmn_count += self._get_table_data(table, schema, cursor) if coulmn_count > self.MAX_COLUMN_COUNT or index_t == len(tables) -1: # we flush if the last table or columns threshold #flush data ... 
+ pdb.set_trace() self._flush_schema(base_event, db_name, schema, tables[start_table_index:index_t+1]) start_table_index = index_t+1 if index_t+1 < len(tables) else 0 # 0 if we ve finished the tables anyway coulmn_count = 0 # reset column coutnt - #if last - pdb.set_trace() - self._flush_schema(base_event, db_name, schema, tables[start_table_index:]) + #if last return True self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) def _flush_schema(self, base_event, database, schema, tables): event = { **base_event, - "metadata": [{**database, "schemas": [{**schema, "tables": tables}]}], + "metadata": [{"db_name":database, "schemas": [{**schema, "tables": tables}]}], "timestamp": time.time() * 1000, } json_event = json.dumps(event, default=default_json_event_encoding) diff --git a/sqlserver/tests/odbc/odbcinst.ini b/sqlserver/tests/odbc/odbcinst.ini index 75ffdd4b4d72d..58163f2833d9e 100644 --- a/sqlserver/tests/odbc/odbcinst.ini +++ b/sqlserver/tests/odbc/odbcinst.ini @@ -6,5 +6,5 @@ Driver=/usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so [ODBC Driver 18 for SQL Server] Description=Microsoft ODBC Driver 18 for SQL Server -Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.3.1 +Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.2.1 UsageCount=1 diff --git a/sqlserver/tests/test_connection.py b/sqlserver/tests/test_connection.py index 6bf5428ae9296..1f7613144351f 100644 --- a/sqlserver/tests/test_connection.py +++ b/sqlserver/tests/test_connection.py @@ -291,7 +291,7 @@ def test_config_with_and_without_port(instance_minimal_defaults, host, port, exp @pytest.mark.integration @pytest.mark.usefixtures('dd_environment') -@pytest.mark.skipif(True) +@pytest.mark.skipif(running_on_windows_ci() and SQLSERVER_MAJOR_VERSION == 2019, reason='Test flakes on this set up') def test_query_timeout(instance_docker): instance_docker['command_timeout'] = 1 check = SQLServer(CHECK_NAME, {}, [instance_docker]) @@ -314,7 +314,6 @@ def test_query_timeout(instance_docker): @pytest.mark.integration @pytest.mark.usefixtures('dd_environment') -@pytest.mark.skipif(True) def test_connection_cleanup(instance_docker): check = SQLServer(CHECK_NAME, {}, [instance_docker]) check.initialize_connection() @@ -347,7 +346,7 @@ def test_connection_cleanup(instance_docker): assert "oops" in str(e) assert len(check.connection._conns) == 0, "connection should have been closed" -import pdb + @pytest.mark.integration def test_connection_failure(aggregator, dd_run_check, instance_docker): instance_docker['dbm'] = True @@ -356,7 +355,7 @@ def test_connection_failure(aggregator, dd_run_check, instance_docker): instance_docker['query_activity'] = {'enabled': True, 'run_sync': True, 'collection_interval': 0.1} instance_docker['collect_settings'] = {'enabled': True, 'run_sync': True, 'collection_interval': 0.1} check = SQLServer(CHECK_NAME, {}, [instance_docker]) - pdb.set_trace() + dd_run_check(check) aggregator.assert_service_check( 'sqlserver.can_connect', From ba643edc973713f925987cc498daa26e51aa5ccf Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 16 Apr 2024 20:08:39 +0000 Subject: [PATCH 024/132] introduced collection per tables --- sqlserver/datadog_checks/sqlserver/const.py | 17 ++- sqlserver/datadog_checks/sqlserver/schemas.py | 104 +++++++++++++++++- 2 files changed, 112 insertions(+), 9 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 8a596bbd9daa1..40edc139e675c 100644 --- 
a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -271,11 +271,20 @@ SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas WHERE name NOT IN ('sys', 'information_schema');" TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}" -COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}' and TABLE_SCHEMA='{}';" +COLUMN_QUERY3 = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}' and TABLE_SCHEMA='{}';" #this query returns several values in case there is an alias for an int ... COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id OR c.user_type_id = t.user_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" + +#WHERE attrelid IN ({table_ids}) +COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" + + #PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" -PARTITIONS_QUERY = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" -INDEX_QUERY = "SELECT i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id = {};" +PARTITIONS_QUERY2 = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" +PARTITIONS_QUERY = "SELECT object_id, COUNT(*) AS partition_count FROM sys.partitions WHERE object_id IN ({}) GROUP BY object_id;" +FOREIGN_KEY_QUERY = "SELECT referenced_object_id, COUNT(*) AS foreign_key_count FROM sys.foreign_keys WHERE referenced_object_id IN ({}) GROUP BY referenced_object_id;" +INDEX_QUERY2 = "SELECT i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id = {};" +# May be this query is wrong like what if index is build on 2 columns will this work ? to test ? 
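# Regarding the question in the comment above: the INDEX_QUERY defined just below joins
# sys.indexes to sys.index_columns, so an index built on two columns comes back as two rows,
# one per column. A sketch (hypothetical helper, not part of the patch) of how the Python
# side could collapse those rows into one entry per index, keyed by table object_id;
# "rows" is assumed to be the list of dicts already built from the cursor results.
from collections import defaultdict

def group_index_rows(rows):
    indexes_by_table = defaultdict(dict)  # object_id -> {index name -> index info}
    for row in rows:
        table_id = row["object_id"]
        index = indexes_by_table[table_id].setdefault(
            row["name"], {"name": row["name"], "type": row["type"], "column_names": []}
        )
        index["column_names"].append(row["column_name"])
    # one list of indexes per table, each carrying its full list of column names
    return {table_id: list(indexes.values()) for table_id, indexes in indexes_by_table.items()}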
+INDEX_QUERY = "SELECT i.object_id AS object_id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id IN ({});" #FOREIGN_KEY_QUERY2 = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" -FOREIGN_KEY_QUERY = "SELECT COUNT(*) FROM sys.foreign_keys WHERE referenced_object_id = {};" +FOREIGN_KEY_QUERY2 = "SELECT COUNT(*) FROM sys.foreign_keys WHERE referenced_object_id = {};" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 2bd3562aef4f9..4d9e2a60a0639 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -74,6 +74,7 @@ def collect_schemas_data2(self): # flush previous collection pdb.set_trace() self.schemas_per_db = {} + self.dbs_metadata = [] # init the index self._init_schema_collection() if len(self._databases_to_query) == 0: @@ -142,25 +143,41 @@ def fetch_schema_data(cursor, db_name): coulmn_count = 0 for schema in schemas: tables = self._get_tables(schema, cursor) + # tables_chunk = list(get_list_chunks(tables, chunk_size)) - may be will need to switch to chunks and change queries ... ask Justin start_table_index = 0 - for index_t, table in enumerate(tables): + if len(tables) > 0: pdb.set_trace() + numer_columns,my_tables = self._get_tables_data(tables, schema, cursor) + pdb.set_trace() + print(my_tables, numer_columns) + + for index_t, table in enumerate(tables): + + before = time.time() * 1000 coulmn_count += self._get_table_data(table, schema, cursor) + after = time.time() * 1000 + total = after - before + pdb.set_trace() + print(total) + if coulmn_count > self.MAX_COLUMN_COUNT or index_t == len(tables) -1: # we flush if the last table or columns threshold - #flush data ... - pdb.set_trace() + #flush data ... + print(total) self._flush_schema(base_event, db_name, schema, tables[start_table_index:index_t+1]) start_table_index = index_t+1 if index_t+1 < len(tables) else 0 # 0 if we ve finished the tables anyway coulmn_count = 0 # reset column coutnt - #if last + #if last + return True self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) + # TODO this can be a separate class, we could stack in data on each loop iteration and it decides when to flush + def _flush_schema(self, base_event, database, schema, tables): event = { - **base_event, + **base_event, "metadata": [{"db_name":database, "schemas": [{**schema, "tables": tables}]}], "timestamp": time.time() * 1000, } @@ -190,6 +207,83 @@ def _query_schema_information(self, cursor): #TODO collect diffs : we need to take care of new DB / removed DB . 
schemas new removed # will nedd a separate query for changed indexes + def _get_tables_data(self, table_list, schema, cursor): + if len(table_list) == 0: + return + name_to_id = {} + id_to_all = {} + table_names = ",".join(["'{}'".format(t.get("name")) for t in table_list]) + table_ids = ",".join(["{}".format(t.get("object_id")) for t in table_list]) + for t in table_list: + name_to_id[t["name"]] = t["object_id"] + id_to_all[t["object_id"]] = t + total_columns_number = self._populate_with_columns_data(table_names, name_to_id, id_to_all, schema, cursor) + self._populate_with_partitions_data(table_ids, id_to_all, cursor) + self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) + pdb.set_trace() + self._populate_with_index_data(table_ids, id_to_all, cursor) + # unwrap id_to_all + return total_columns_number, list(id_to_all.values()) + + def _populate_with_columns_data(self, table_names, name_to_id, id_to_all, schema, cursor): + # get columns if we dont have a dict here unlike postgres + cursor.execute(COLUMN_QUERY.format(table_names, schema["name"])) + data = cursor.fetchall() + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in data] + for row in rows: + table_id = name_to_id.get(str(row.get("table_name"))) + if table_id is not None: + # exclude "table_name" from the row dict + row.pop("table_name", None) + id_to_all.get(table_id)["columns"] = id_to_all.get(table_id).get("columns",[]) + [row] + return len(data) + + def _populate_with_partitions_data(self, table_ids, id_to_all, cursor): + cursor.execute(PARTITIONS_QUERY.format(table_ids)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + id = row.pop("object_id", None) + if id is not None: + #TODO what happens if not found ? 
+ id_to_all.get(id)["partitions"] = row + else: + print("todo error") + row.pop("object_id", None) + print("end") + + def _populate_with_index_data(self, table_ids, id_to_all, cursor): + cursor.execute(INDEX_QUERY.format(table_ids)) + pdb.set_trace() + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + id = row.pop("object_id", None) + if id is not None: + id_to_all.get(id)["indexes"] = row + else: + print("todo error") + row.pop("object_id", None) + pdb.set_trace() + pdb.set_trace() + print("end") + + def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): + pdb.set_trace() + cursor.execute(FOREIGN_KEY_QUERY.format(table_ids)) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + for row in rows: + id = row.pop("object_id", None) + if id is not None: + id_to_all.get(id)["foreign_keys"] = row + else: + print("todo error") + pdb.set_trace() + print("end") + #return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) + def _get_table_data(self, table, schema, cursor): table["columns"] = self._get_columns_data_per_table(table["name"], schema["name"], cursor) From 0d6d27986ad9a7ee45d25d98bc3e110f23d03a86 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 17 Apr 2024 12:41:10 +0000 Subject: [PATCH 025/132] Introduced a class for data submit --- sqlserver/datadog_checks/sqlserver/schemas.py | 195 +++++++----------- 1 file changed, 70 insertions(+), 125 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 4d9e2a60a0639..22b916ff2774c 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -15,15 +15,65 @@ import time import json -from datadog_checks.base.utils.db.utils import DBMAsyncJob, default_json_event_encoding +import copy -class Schemas: - +from datadog_checks.base.utils.db.utils import default_json_event_encoding + +class SubmitData: MAX_COLUMN_COUNT = 100_000 + + def __init__(self, submit_data_function, base_event, logger): + self._submit = submit_data_function + self._columns_count = 0 + self.db_to_schemas = {} # dbname : { id : schema } + self._base_event = base_event + self._log = logger + + def store(self, db_name, schema, tables, columns_count): + self._columns_count += columns_count + schemas = self.db_to_schemas.setdefault(db_name, {}) + if schema["schema_id"] in schemas: + known_tables = schemas[schema["schema_id"]].setdefault("tables",[]) + known_tables = known_tables + tables + else: + schemas[schema["schema_id"]] = copy.deepcopy(schema) # TODO a deep copy ? 
kind of costs not much to be safe + schemas[schema["schema_id"]]["tables"] = tables + if self._columns_count > self.MAX_COLUMN_COUNT: + self._submit() + + def submit(self): + pdb.set_trace() + if not bool(self.db_to_schemas): + return + self._columns_count = 0 + event = {**self._base_event, + "metadata" : [], + "timestamp": time.time() * 1000 + } + for db, schemas_by_id in self.db_to_schemas.items(): + event["metadata"] = event["metadata"] + [{"db_name":db, "schemas": list(schemas_by_id.values()) }] + json_event = json.dumps(event, default=default_json_event_encoding) + self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) + self._submit(json_event) + self.db_to_schemas = {} + +#TODO Introduce total max for data +class Schemas: def __init__(self, check): self._check = check self._log = check.log self.schemas_per_db = {} + base_event = { + "host": self._check.resolved_hostname, + #"agent_version": datadog_agent.get_version(), + "dbms": "sqlserver", #TODO ? + "kind": "", # TODO ? + #"collection_interval": self.schemas_collection_interval, + #"dbms_version": self._payload_pg_version(), + #"tags": self._tags_no_db, + #"cloud_metadata": self._config.cloud_metadata, + } + self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) # These are fields related to the work to do while doing the initial intake # for diffs there should eb a self._done_db_list which will be used to see if new dbs have appeared/disappeared. @@ -69,63 +119,10 @@ def _init_schema_collection(self): foreign_keys : list of foreign keys partitions useful to know the number """ - def collect_schemas_data2(self): - #schemas per db - # flush previous collection - pdb.set_trace() - self.schemas_per_db = {} - self.dbs_metadata = [] - # init the index - self._init_schema_collection() - if len(self._databases_to_query) == 0: - return - - # dont need an index just always safe the last one. 
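# Usage sketch for the SubmitData buffer introduced in this patch, assuming the temporary
# pdb.set_trace() calls are removed. It accumulates schemas per database and sends one
# metadata event per flush; the submit function, base event and logger below are
# placeholders for check.database_monitoring_metadata and the real check fields.
import logging

collected_payloads = []
submitter = SubmitData(
    collected_payloads.append,
    {"host": "my-sqlserver-host", "dbms": "sqlserver"},
    logging.getLogger(__name__),
)

schema = {"name": "dbo", "schema_id": "5", "principal_id": "1"}
tables = [{"object_id": "17", "name": "orders", "columns": [{"name": "id", "data_type": "int"}]}]

submitter.store("datadog_test", schema, tables, columns_count=1)  # buffered, still under MAX_COLUMN_COUNT
submitter.submit()                                                # explicit flush at the end of the run
print(collected_payloads[0])                                      # one JSON event with the buffered schema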
- def fetch_schema_data(cursor, db_name): - # check if we start from scratch or not - pdb.set_trace() - if self._current_schema_list is None: - # find new schemas: - schemas = self._query_schema_information(cursor) - else: - schemas = self._current_schema_list - - if self._current_table_list is None: - schemas[0]["tables"] = self._get_tables(schemas[0], cursor) - else: - schemas[0]["tables"] = self._current_table_list - - for index_sh, schema in enumerate(schemas): - if schema["tables"] is None or len(schema["tables"]) == 0: - schema["tables"] = self._get_tables(schema, cursor) - for index_t,table in enumerate(schema["tables"]): - - #TODO later can stop after a certain amount of columns - # thus stop - self._number_of_collected_tables+=1 - column_amount = self._get_table_data(table, schema, cursor) - pdb.set_trace() - if column_amount > 100_000 or self._number_of_collected_tables == 2: - self._number_of_collected_tables = 0 - self._current_table_list = schema["tables"][index_t+1:] - self._current_schema_list = schemas[index_sh:] - # TODO this will send not only schemas with tables filled but schemas that are yet empty, not that bad but can be fixed - self.schemas_per_db[db_name] = schemas - self._databases_to_query = self._databases_to_query[self._databases_to_query.index(db_name):] - pdb.set_trace() - return False - self.schemas_per_db[db_name] = schemas - # if we reached this point means we went through all the list thus we can reset : - self.reset_data_collection() - self._databases_to_query = [] - return True - self._check.do_for_databases(fetch_schema_data, self._databases_to_query) - pdb.set_trace() - print(self.schemas_per_db) #sends all the data in one go but split in chunks (like Seth's solution) def collect_schemas_data(self): - pdb.set_trace() + base_event = { "host": self._check.resolved_hostname, #"agent_version": datadog_agent.get_version(), @@ -139,51 +136,22 @@ def collect_schemas_data(self): def fetch_schema_data(cursor, db_name): schemas = self._query_schema_information(cursor) - - coulmn_count = 0 + chunk_size = 50 for schema in schemas: - tables = self._get_tables(schema, cursor) - - # tables_chunk = list(get_list_chunks(tables, chunk_size)) - may be will need to switch to chunks and change queries ... ask Justin - start_table_index = 0 - if len(tables) > 0: - pdb.set_trace() - numer_columns,my_tables = self._get_tables_data(tables, schema, cursor) - pdb.set_trace() - print(my_tables, numer_columns) - - for index_t, table in enumerate(tables): - - before = time.time() * 1000 - coulmn_count += self._get_table_data(table, schema, cursor) - after = time.time() * 1000 - total = after - before - pdb.set_trace() - print(total) - - if coulmn_count > self.MAX_COLUMN_COUNT or index_t == len(tables) -1: # we flush if the last table or columns threshold - #flush data ... 
- print(total) - self._flush_schema(base_event, db_name, schema, tables[start_table_index:index_t+1]) - start_table_index = index_t+1 if index_t+1 < len(tables) else 0 # 0 if we ve finished the tables anyway - coulmn_count = 0 - # reset column coutnt - #if last - + tables = self._get_tables(schema, cursor) + tables_chunk = list(get_list_chunks(tables, chunk_size)) + for tables_chunk in tables_chunk: + columns_count, tables = self._get_tables_data(tables_chunk, schema, cursor) + self._dataSubmitter.store(db_name, schema, tables, columns_count) + #self._dataSubmitter.submit() # we force submit when we reach the end of schema, it's like in Seths solution + if len(tables) == 0: + self._dataSubmitter.store(db_name, schema, [], 0) + # to ask him if this is needed or we can submit only on 100 000 column + # tells if we want to move to the next DB or stop return True self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) - - # TODO this can be a separate class, we could stack in data on each loop iteration and it decides when to flush - - def _flush_schema(self, base_event, database, schema, tables): - event = { - **base_event, - "metadata": [{"db_name":database, "schemas": [{**schema, "tables": tables}]}], - "timestamp": time.time() * 1000, - } - json_event = json.dumps(event, default=default_json_event_encoding) - self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) - self._check.database_monitoring_metadata(json_event) + # submit the last chunk of data if any + self._dataSubmitter.submit() # TODO how often ? @@ -220,7 +188,6 @@ def _get_tables_data(self, table_list, schema, cursor): total_columns_number = self._populate_with_columns_data(table_names, name_to_id, id_to_all, schema, cursor) self._populate_with_partitions_data(table_ids, id_to_all, cursor) self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) - pdb.set_trace() self._populate_with_index_data(table_ids, id_to_all, cursor) # unwrap id_to_all return total_columns_number, list(id_to_all.values()) @@ -255,7 +222,6 @@ def _populate_with_partitions_data(self, table_ids, id_to_all, cursor): def _populate_with_index_data(self, table_ids, id_to_all, cursor): cursor.execute(INDEX_QUERY.format(table_ids)) - pdb.set_trace() columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: @@ -265,12 +231,9 @@ def _populate_with_index_data(self, table_ids, id_to_all, cursor): else: print("todo error") row.pop("object_id", None) - pdb.set_trace() - pdb.set_trace() print("end") def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): - pdb.set_trace() cursor.execute(FOREIGN_KEY_QUERY.format(table_ids)) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] @@ -280,21 +243,9 @@ def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): id_to_all.get(id)["foreign_keys"] = row else: print("todo error") - pdb.set_trace() print("end") #return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) - - def _get_table_data(self, table, schema, cursor): - table["columns"] = self._get_columns_data_per_table(table["name"], schema["name"], cursor) - table["partitions"] = self._get_partitions_data_per_table(table["object_id"], cursor) - if str(table["object_id"]) == "1803153469": - pdb.set_trace() - print("should have index") - table["indexes"] = 
self._get_index_data_per_table(table["object_id"], cursor) - table["foreign_keys"] = self._get_foreign_key_data_per_table(table["object_id"], cursor) - #TODO probably here decide based on the columns amount - return len(table["columns"]) #TODO in SQLServer partitioned child tables should have the same object_id might be worth checking with a test. @@ -306,20 +257,14 @@ def _get_tables(self, schema, cursor): # rows = [dict(zip(columns + ["columns", "indexes", "partitions", "foreign_keys"], row + [[], [], [], []])) for row in cursor.fetchall()] #TODO may be this works return [ {"object_id" : row["object_id"], "name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} for row in rows ] - def _get_columns_data_per_table(self, table_name, schema_name, cursor): - return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) #TODO table 1803153469 is in sys.indexes but not in sys.index_columns ... shell we do something about it ? - def _get_index_data_per_table(self, table_object_id, cursor): - return execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_object_id), cursor) + #TODO its hard to get the partition key - for later ? - def _get_partitions_data_per_table(self, table_object_id, cursor): + # TODO check out sys.partitions in postgres we deliver some data about patitions # "partition_key": str (if has partitions) - equiv ? # may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ # for more in depth search, it's not trivial to determine partition key like in Postgres - return execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_object_id), cursor, "partition_count") - - def _get_foreign_key_data_per_table(self, table_object_id, cursor): - return execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_object_id), cursor, "foreign_key_count") + From f204f7a4a2a4c950f2132376d21b1ae667c63106 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 18 Apr 2024 15:10:25 +0000 Subject: [PATCH 026/132] pretending to be postgres for testing --- sqlserver/datadog_checks/sqlserver/schemas.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 22b916ff2774c..d9b88bf24bae9 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -1,3 +1,8 @@ +try: + import datadog_agent +except ImportError: + from ..stubs import datadog_agent + from datadog_checks.sqlserver.const import ( TABLES_IN_SCHEMA_QUERY, COLUMN_QUERY, @@ -63,16 +68,29 @@ def __init__(self, check): self._check = check self._log = check.log self.schemas_per_db = {} + """ base_event = { "host": self._check.resolved_hostname, - #"agent_version": datadog_agent.get_version(), + "agent_version": datadog_agent.get_version(), "dbms": "sqlserver", #TODO ? - "kind": "", # TODO ? 
+ "kind": "", # TODO #"collection_interval": self.schemas_collection_interval, #"dbms_version": self._payload_pg_version(), #"tags": self._tags_no_db, #"cloud_metadata": self._config.cloud_metadata, } + """ + base_event = { + "host": self._check.resolved_hostname, + "agent_version": datadog_agent.get_version(), + "dbms": "postgres", #TODO fake it until you make it - trying to pass this data as postgres for now + "kind": "pg_databases", # TODO pg_databases - will result in KindPgDatabases and so processor would thing its postgres + "collection_interval": 100, #dummy + "dbms_version": 1, #dummy + #"tags": self._tags_no_db, + #"cloud_metadata": self._config.cloud_metadata, + } + self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) # These are fields related to the work to do while doing the initial intake @@ -192,6 +210,7 @@ def _get_tables_data(self, table_list, schema, cursor): # unwrap id_to_all return total_columns_number, list(id_to_all.values()) + # TODO refactor the next 3 to have a base function when everythng is settled. def _populate_with_columns_data(self, table_names, name_to_id, id_to_all, schema, cursor): # get columns if we dont have a dict here unlike postgres cursor.execute(COLUMN_QUERY.format(table_names, schema["name"])) From 0f08c7f64808c0b2189cb03447516ba53b546d53 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 19 Apr 2024 19:03:55 +0000 Subject: [PATCH 027/132] Adopted to Postgres --- postgres/datadog_checks/postgres/metadata.py | 3 +- sqlserver/datadog_checks/sqlserver/const.py | 10 +- sqlserver/datadog_checks/sqlserver/schemas.py | 108 ++++++++++++------ 3 files changed, 85 insertions(+), 36 deletions(-) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index 37dd85495f137..ae2da66fbc622 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -4,7 +4,7 @@ import json import time from typing import Dict, List, Optional, Tuple, Union # noqa: F401 - +import pdb import psycopg2 from datadog_checks.postgres.cursor import CommenterDictCursor @@ -312,6 +312,7 @@ def report_postgres_metadata(self): self._is_schemas_collection_in_progress = False def _flush_schema(self, base_event, database, schema, tables): + pdb.set_trace() event = { **base_event, "metadata": [{**database, "schemas": [{**schema, "tables": tables}]}], diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 40edc139e675c..918aece9a3d8a 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -269,14 +269,20 @@ PROC_CHAR_LIMIT = 500 -SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas WHERE name NOT IN ('sys', 'information_schema');" +#for now description results in ('ODBC SQL type -150 is not yet supported. 
column-index=4 type=-150', 'HY106') +DB_QUERY2 = "SELECT db.database_id AS id, db.name AS NAME, db.collation_name AS collation, dp.name AS owner, ep.value AS description FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid LEFT JOIN sys.extended_properties ep ON ep.major_id = db.database_id AND ep.minor_id = 0 AND ep.class = 0 AND ep.name = 'MS_Description' WHERE db.name = '{}';" +DB_QUERY = "SELECT db.database_id AS id, db.name AS NAME, db.collation_name AS collation, dp.name AS owner FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid WHERE db.name = '{}';" + +#TODO as owner for the postgresbackend +SCHEMA_QUERY = "SELECT name,schema_id AS id,principal_id AS owner FROM sys.schemas WHERE name NOT IN ('sys', 'information_schema');" + TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}" COLUMN_QUERY3 = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}' and TABLE_SCHEMA='{}';" #this query returns several values in case there is an alias for an int ... COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id OR c.user_type_id = t.user_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" #WHERE attrelid IN ({table_ids}) -COLUMN_QUERY = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" +COLUMN_QUERY = "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" #PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index d9b88bf24bae9..69e8f983e1b42 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -10,6 +10,7 @@ INDEX_QUERY, FOREIGN_KEY_QUERY, SCHEMA_QUERY, + DB_QUERY ) from datadog_checks.sqlserver.utils import ( @@ -28,26 +29,35 @@ class SubmitData: MAX_COLUMN_COUNT = 100_000 def __init__(self, submit_data_function, base_event, logger): - self._submit = submit_data_function + self._submit_to_agent_queue = submit_data_function self._columns_count = 0 self.db_to_schemas = {} # dbname : { id : schema } + self.db_info = {} # name to info self._base_event = base_event self._log = logger + + def store_db_info(self, db_name, db_info): + self.db_info[db_name] = db_info def store(self, db_name, schema, tables, columns_count): self._columns_count += columns_count schemas = self.db_to_schemas.setdefault(db_name, {}) - if schema["schema_id"] in schemas: - known_tables = schemas[schema["schema_id"]].setdefault("tables",[]) + if schema["id"] in schemas: + known_tables = schemas[schema["id"]].setdefault("tables",[]) known_tables = known_tables + tables else: - schemas[schema["schema_id"]] = copy.deepcopy(schema) # TODO a deep copy ? 
kind of costs not much to be safe - schemas[schema["schema_id"]]["tables"] = tables + schemas[schema["id"]] = copy.deepcopy(schema) # TODO a deep copy ? kind of costs not much to be safe + schemas[schema["id"]]["tables"] = tables if self._columns_count > self.MAX_COLUMN_COUNT: self._submit() + #TODO P - disable for p. + def tmp_modify_to_fit_in_postgres(self, db_info): + if "collation" in db_info: + del db_info["collation"] + return db_info + def submit(self): - pdb.set_trace() if not bool(self.db_to_schemas): return self._columns_count = 0 @@ -56,10 +66,17 @@ def submit(self): "timestamp": time.time() * 1000 } for db, schemas_by_id in self.db_to_schemas.items(): - event["metadata"] = event["metadata"] + [{"db_name":db, "schemas": list(schemas_by_id.values()) }] + db_info = {} + if db not in self.db_info: + #TODO log error + db_info["name"] = db + else: + db_info = self.db_info[db] + event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) - self._submit(json_event) + pdb.set_trace() + self._submit_to_agent_queue(json_event) self.db_to_schemas = {} #TODO Introduce total max for data @@ -67,6 +84,8 @@ class Schemas: def __init__(self, check): self._check = check self._log = check.log + self._tags = [t for t in check.tags if not t.startswith('dd.internal')] + self._tags.append("boris:data") self.schemas_per_db = {} """ base_event = { @@ -85,10 +104,10 @@ def __init__(self, check): "agent_version": datadog_agent.get_version(), "dbms": "postgres", #TODO fake it until you make it - trying to pass this data as postgres for now "kind": "pg_databases", # TODO pg_databases - will result in KindPgDatabases and so processor would thing its postgres - "collection_interval": 100, #dummy + "collection_interval": 0.5, #dummy "dbms_version": 1, #dummy - #"tags": self._tags_no_db, - #"cloud_metadata": self._config.cloud_metadata, + "tags": self._tags, #in postgres it's no DB. 
+ "cloud_metadata": self._check._config.cloud_metadata, } self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) @@ -122,10 +141,10 @@ def _init_schema_collection(self): schema dict: "name": str - "schema_id": str + "id": str "principal_id": str "tables" : [] - object_id : str + id : str name : str columns: list of columns "columns": dict @@ -149,19 +168,21 @@ def collect_schemas_data(self): #"collection_interval": self.schemas_collection_interval, #"dbms_version": self._payload_pg_version(), #"tags": self._tags_no_db, - #"cloud_metadata": self._config.cloud_metadata, + "cloud_metadata": self._check._config.cloud_metadata, } def fetch_schema_data(cursor, db_name): + db_info = self._query_db_information(db_name, cursor) schemas = self._query_schema_information(cursor) + self._dataSubmitter.store_db_info(db_name, db_info) chunk_size = 50 for schema in schemas: tables = self._get_tables(schema, cursor) tables_chunk = list(get_list_chunks(tables, chunk_size)) for tables_chunk in tables_chunk: - columns_count, tables = self._get_tables_data(tables_chunk, schema, cursor) - self._dataSubmitter.store(db_name, schema, tables, columns_count) - #self._dataSubmitter.submit() # we force submit when we reach the end of schema, it's like in Seths solution + columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) + self._dataSubmitter.store(db_name, schema, tables_info, columns_count) + self._dataSubmitter.submit() # we force submit when we reach the end of schema, it's like in Seths solution if len(tables) == 0: self._dataSubmitter.store(db_name, schema, [], 0) # to ask him if this is needed or we can submit only on 100 000 column @@ -170,7 +191,14 @@ def fetch_schema_data(cursor, db_name): self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) # submit the last chunk of data if any self._dataSubmitter.submit() - + + + def _query_db_information(self, db_name, cursor): + db_info = execute_query_output_result_as_a_dict(DB_QUERY.format(db_name), cursor) + if len(db_info) == 1: + return db_info[0] + else: + return None # TODO how often ? #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? 
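Note: the collection loop added above boils down to a chunk-and-accumulate pattern. Below is a minimal self-contained sketch of that pattern for reference only; the names (chunks, Accumulator, fetch_tables, fetch_columns) are placeholders and are not part of the check itself.

def chunks(lst, n):
    # yield successive n-sized slices of lst (same idea as get_list_chunks)
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

class Accumulator:
    # buffers (schema, tables) pairs and flushes once a column budget is exceeded
    MAX_COLUMNS = 100_000

    def __init__(self, flush):
        self._flush = flush
        self._buffer = []
        self._columns = 0

    def store(self, schema_name, tables, columns_count):
        self._buffer.append({"schema": schema_name, "tables": tables})
        self._columns += columns_count
        if self._columns > self.MAX_COLUMNS:
            self.submit()

    def submit(self):
        if self._buffer:
            self._flush(self._buffer)
            self._buffer = []
            self._columns = 0

def collect(schemas, fetch_tables, fetch_columns, accumulator, chunk_size=50):
    # fetch_tables(schema) returns the table list, fetch_columns(chunk) returns (count, enriched chunk)
    for schema in schemas:
        for chunk in chunks(fetch_tables(schema), chunk_size):
            count, enriched = fetch_columns(chunk)
            accumulator.store(schema, enriched, count)
    accumulator.submit()  # flush whatever is left at the end

Flushing on a column budget keeps each payload bounded without holding the whole catalog in memory, which is the reason for the per-chunk store/submit calls in fetch_schema_data.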
@@ -199,14 +227,14 @@ def _get_tables_data(self, table_list, schema, cursor): name_to_id = {} id_to_all = {} table_names = ",".join(["'{}'".format(t.get("name")) for t in table_list]) - table_ids = ",".join(["{}".format(t.get("object_id")) for t in table_list]) + table_ids = ",".join(["{}".format(t.get("id")) for t in table_list]) for t in table_list: - name_to_id[t["name"]] = t["object_id"] - id_to_all[t["object_id"]] = t + name_to_id[t["name"]] = t["id"] + id_to_all[t["id"]] = t total_columns_number = self._populate_with_columns_data(table_names, name_to_id, id_to_all, schema, cursor) - self._populate_with_partitions_data(table_ids, id_to_all, cursor) - self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) - self._populate_with_index_data(table_ids, id_to_all, cursor) + #self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model + #self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model + #self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model # unwrap id_to_all return total_columns_number, list(id_to_all.values()) @@ -215,13 +243,27 @@ def _populate_with_columns_data(self, table_names, name_to_id, id_to_all, schema # get columns if we dont have a dict here unlike postgres cursor.execute(COLUMN_QUERY.format(table_names, schema["name"])) data = cursor.fetchall() - columns = [str(i[0]).lower() for i in cursor.description] + columns = [] + #TODO we need it cause if I put AS default its a forbidden key word and to be inline with postgres we need it + for i in cursor.description: + if str(i[0]).lower() == "column_default": + columns.append("default") + else: + columns.append(str(i[0]).lower()) + + rows = [dict(zip(columns, row)) for row in data] for row in rows: table_id = name_to_id.get(str(row.get("table_name"))) if table_id is not None: # exclude "table_name" from the row dict row.pop("table_name", None) + if "nullable" in row: + if row["nullable"].lower() == "no" or row["nullable"].lower() == "false": + #to make compatible with postgres + row["nullable"] = "false" + else: + row["nullable"] = "true" id_to_all.get(table_id)["columns"] = id_to_all.get(table_id).get("columns",[]) + [row] return len(data) @@ -230,13 +272,13 @@ def _populate_with_partitions_data(self, table_ids, id_to_all, cursor): columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: - id = row.pop("object_id", None) + id = row.pop("id", None) if id is not None: #TODO what happens if not found ? 
id_to_all.get(id)["partitions"] = row else: print("todo error") - row.pop("object_id", None) + row.pop("id", None) print("end") def _populate_with_index_data(self, table_ids, id_to_all, cursor): @@ -244,12 +286,12 @@ def _populate_with_index_data(self, table_ids, id_to_all, cursor): columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: - id = row.pop("object_id", None) + id = row.pop("id", None) if id is not None: id_to_all.get(id)["indexes"] = row else: print("todo error") - row.pop("object_id", None) + row.pop("id", None) print("end") def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): @@ -257,7 +299,7 @@ def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: - id = row.pop("object_id", None) + id = row.pop("id", None) if id is not None: id_to_all.get(id)["foreign_keys"] = row else: @@ -270,12 +312,12 @@ def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? def _get_tables(self, schema, cursor): - cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["schema_id"])) + cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["id"])) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] #TODO may be more optimal to patch columns with index etc # rows = [dict(zip(columns + ["columns", "indexes", "partitions", "foreign_keys"], row + [[], [], [], []])) for row in cursor.fetchall()] #TODO may be this works - return [ {"object_id" : row["object_id"], "name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} for row in rows ] - + #return [ {"id" : row["object_id"], "name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} for row in rows ] # TODO P disabled because of postgres later enable + return [ {"id" : row["object_id"], "name" : row['name'], "columns" : []} for row in rows ] #TODO table 1803153469 is in sys.indexes but not in sys.index_columns ... shell we do something about it ? From 37962ae700e82b7eb0eac1fbcd641da4285ff564 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 22 Apr 2024 12:44:11 +0000 Subject: [PATCH 028/132] adopted payload to the backend --- sqlserver/datadog_checks/sqlserver/schemas.py | 13 +++++++------ sqlserver/datadog_checks/sqlserver/utils.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 69e8f983e1b42..0d4c3fd056d2e 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -105,7 +105,7 @@ def __init__(self, check): "dbms": "postgres", #TODO fake it until you make it - trying to pass this data as postgres for now "kind": "pg_databases", # TODO pg_databases - will result in KindPgDatabases and so processor would thing its postgres "collection_interval": 0.5, #dummy - "dbms_version": 1, #dummy + "dbms_version": "v14.2", #dummy but may be format i v11 is important ? "tags": self._tags, #in postgres it's no DB. 
"cloud_metadata": self._check._config.cloud_metadata, } @@ -213,7 +213,8 @@ def _query_schema_information(self, cursor): cursor.execute(SCHEMA_QUERY) schemas = [] columns = [i[0] for i in cursor.description] - schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] + schemas = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] + #TODO we can refactor it , doesnt have to have a tables :[] if there is nothing. for schema in schemas: schema["tables"] = [] self._log.debug("fetched schemas len(rows)=%s", len(schemas)) @@ -252,7 +253,7 @@ def _populate_with_columns_data(self, table_names, name_to_id, id_to_all, schema columns.append(str(i[0]).lower()) - rows = [dict(zip(columns, row)) for row in data] + rows = [dict(zip(columns, [str(item) for item in row])) for row in data] for row in rows: table_id = name_to_id.get(str(row.get("table_name"))) if table_id is not None: @@ -261,9 +262,9 @@ def _populate_with_columns_data(self, table_names, name_to_id, id_to_all, schema if "nullable" in row: if row["nullable"].lower() == "no" or row["nullable"].lower() == "false": #to make compatible with postgres - row["nullable"] = "false" + row["nullable"] = False else: - row["nullable"] = "true" + row["nullable"] = True id_to_all.get(table_id)["columns"] = id_to_all.get(table_id).get("columns",[]) + [row] return len(data) @@ -317,7 +318,7 @@ def _get_tables(self, schema, cursor): rows = [dict(zip(columns, row)) for row in cursor.fetchall()] #TODO may be more optimal to patch columns with index etc # rows = [dict(zip(columns + ["columns", "indexes", "partitions", "foreign_keys"], row + [[], [], [], []])) for row in cursor.fetchall()] #TODO may be this works #return [ {"id" : row["object_id"], "name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} for row in rows ] # TODO P disabled because of postgres later enable - return [ {"id" : row["object_id"], "name" : row['name'], "columns" : []} for row in rows ] + return [ {"id" : str(row["object_id"]), "name" : row['name'], "columns" : []} for row in rows ] #TODO table 1803153469 is in sys.indexes but not in sys.index_columns ... shell we do something about it ? 
diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index 9f94ab620aac4..cfe1f64e2a254 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -145,7 +145,7 @@ def execute_query_output_result_as_a_dict(query, cursor, column_name=None): columns = [str(column_name).lower() for i in cursor.description] else: columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + rows = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] return rows def get_list_chunks(lst, n): From 2bef067fe18cac2da6a5455b6835eb3b5f77d322 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 22 Apr 2024 13:13:06 +0000 Subject: [PATCH 029/132] remove breakpoints --- sqlserver/datadog_checks/sqlserver/schemas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 0d4c3fd056d2e..93c240ec3a781 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -75,7 +75,6 @@ def submit(self): event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) - pdb.set_trace() self._submit_to_agent_queue(json_event) self.db_to_schemas = {} From 0c3f0b9c40ffc2bd48aa7ff8b4afda5d09d9c674 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 24 Apr 2024 10:35:39 +0000 Subject: [PATCH 030/132] adding a test --- sqlserver/datadog_checks/sqlserver/const.py | 4 +- sqlserver/datadog_checks/sqlserver/schemas.py | 16 +++--- .../datadog_checks/sqlserver/sqlserver.py | 36 ++++++------- sqlserver/tests/compose/setup.sh | 2 + sqlserver/tests/compose/setup.sql | 10 ++++ sqlserver/tests/test_metadata.py | 53 ++++++++++++++++--- 6 files changed, 87 insertions(+), 34 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 918aece9a3d8a..ad8a9d95d52b6 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -282,8 +282,10 @@ COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id OR c.user_type_id = t.user_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" #WHERE attrelid IN ({table_ids}) +COLUMN_QUERY3 = "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" COLUMN_QUERY = "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" - +#TODO add ORDER BY ORDINAL_POSITION; ? 
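If deterministic column order is ever needed (the collection test below compares columns in order), the query could grow an ORDER BY along these lines. This is an untested illustration, not a constant the check currently uses.

# Hypothetical ordered variant of COLUMN_QUERY, not referenced anywhere in the check
COLUMN_QUERY_ORDERED = (
    "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, "
    "IS_NULLABLE AS nullable, TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS "
    "WHERE TABLE_NAME IN ({}) AND TABLE_SCHEMA='{}' "
    "ORDER BY TABLE_NAME, ORDINAL_POSITION;"
)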
+#"SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ('boris', OBJECT_NAME(917578307)) #PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" PARTITIONS_QUERY2 = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 93c240ec3a781..c75a0dd162a35 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -185,8 +185,8 @@ def fetch_schema_data(cursor, db_name): if len(tables) == 0: self._dataSubmitter.store(db_name, schema, [], 0) # to ask him if this is needed or we can submit only on 100 000 column - # tells if we want to move to the next DB or stop - return True + # tells if we want to move to the next DB or stop, stop == TRUE + return False self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) # submit the last chunk of data if any self._dataSubmitter.submit() @@ -226,12 +226,14 @@ def _get_tables_data(self, table_list, schema, cursor): return name_to_id = {} id_to_all = {} - table_names = ",".join(["'{}'".format(t.get("name")) for t in table_list]) - table_ids = ",".join(["{}".format(t.get("id")) for t in table_list]) + #table_names = ",".join(["'{}'".format(t.get("name")) for t in table_list]) + #OBJECT_NAME is needed to make it work for special characters + table_ids = ",".join(["OBJECT_NAME({})".format(t.get("id")) for t in table_list]) + #pdb.set_trace() for t in table_list: name_to_id[t["name"]] = t["id"] id_to_all[t["id"]] = t - total_columns_number = self._populate_with_columns_data(table_names, name_to_id, id_to_all, schema, cursor) + total_columns_number = self._populate_with_columns_data(table_ids, name_to_id, id_to_all, schema, cursor) #self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model #self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model #self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model @@ -239,9 +241,9 @@ def _get_tables_data(self, table_list, schema, cursor): return total_columns_number, list(id_to_all.values()) # TODO refactor the next 3 to have a base function when everythng is settled. 
- def _populate_with_columns_data(self, table_names, name_to_id, id_to_all, schema, cursor): + def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, cursor): # get columns if we dont have a dict here unlike postgres - cursor.execute(COLUMN_QUERY.format(table_names, schema["name"])) + cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) data = cursor.fetchall() columns = [] #TODO we need it cause if I put AS default its a forbidden key word and to be inline with postgres we need it diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index b147a521c3297..f88e1489c9154 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -109,7 +109,7 @@ class SQLServer(AgentCheck): def __init__(self, name, init_config, instances): super(SQLServer, self).__init__(name, init_config, instances) - self._resolved_hostname = None + self.resolved_hostname = None self._agent_hostname = None self.connection = None self.failed_connections = {} @@ -209,10 +209,10 @@ def set_resource_tags(self): self._config.cloud_metadata.get("aws")["instance_endpoint"], ) ) - elif AWS_RDS_HOSTNAME_SUFFIX in self._resolved_hostname: + elif AWS_RDS_HOSTNAME_SUFFIX in self.resolved_hostname: # allow for detecting if the host is an RDS host, and emit # the resource properly even if the `aws` config is unset - self.tags.append("dd.internal.resource:aws_rds_instance:{}".format(self._resolved_hostname)) + self.tags.append("dd.internal.resource:aws_rds_instance:{}".format(self.resolved_hostname)) if self._config.cloud_metadata.get("azure") is not None: deployment_type = self._config.cloud_metadata.get("azure")["deployment_type"] name = self._config.cloud_metadata.get("azure")["name"] @@ -221,7 +221,7 @@ def set_resource_tags(self): # azure sql databases have a special format, which is set for DBM # customers in the resolved_hostname. 
# If user is not DBM customer, the resource_name should just be set to the `name` - db_instance = self._resolved_hostname + db_instance = self.resolved_hostname # some `deployment_type`s map to multiple `resource_type`s resource_types = AZURE_DEPLOYMENT_TYPE_TO_RESOURCE_TYPES.get(deployment_type).split(",") for r_type in resource_types: @@ -232,18 +232,18 @@ def set_resource_tags(self): # finally, emit a `database_instance` resource for this instance self.tags.append( "dd.internal.resource:database_instance:{}".format( - self._resolved_hostname, + self.resolved_hostname, ) ) def set_resolved_hostname(self): self.load_static_information() - if self._resolved_hostname is None: + if self.resolved_hostname is None: if self._config.reported_hostname: - self._resolved_hostname = self._config.reported_hostname + self.resolved_hostname = self._config.reported_hostname else: host, _ = split_sqlserver_host_port(self.instance.get("host")) - self._resolved_hostname = resolve_db_host(host) + self.resolved_hostname = resolve_db_host(host) engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) if engine_edition == ENGINE_EDITION_SQL_DATABASE: configured_database = self.instance.get("database", None) @@ -316,7 +316,7 @@ def debug_stats_kwargs(self, tags=None): tags = tags if tags else [] return { "tags": self.debug_tags() + tags, - "hostname": self.resolved_hostname, + "hostname": self._resolved_hostname, "raw": True, } @@ -329,7 +329,7 @@ def agent_hostname(self): def initialize_connection(self): self.connection = Connection( - host=self.resolved_hostname, + host=self._resolved_hostname, init_config=self.init_config, instance_config=self.instance, service_check_handler=self.handle_service_check, @@ -367,12 +367,12 @@ def handle_service_check(self, status, connection_host, database, message=None, custom_tags = self.instance.get("tags", []) disable_generic_tags = self.instance.get("disable_generic_tags", False) service_check_tags = [ - "sqlserver_host:{}".format(self.resolved_hostname), + "sqlserver_host:{}".format(self._resolved_hostname), "db:{}".format(database), "connection_host:{}".format(connection_host), ] if not disable_generic_tags: - service_check_tags.append("host:{}".format(self.resolved_hostname)) + service_check_tags.append("host:{}".format(self._resolved_hostname)) if custom_tags is not None: service_check_tags.extend(custom_tags) service_check_tags = list(set(service_check_tags)) @@ -691,7 +691,7 @@ def typed_metric(self, cfg_inst, table, base_name=None, user_type=None, sql_coun metric_type_str, cls = metrics.TABLE_MAPPING[table] metric_type = getattr(self, metric_type_str) - cfg_inst["hostname"] = self.resolved_hostname + cfg_inst["hostname"] = self._resolved_hostname return cls(cfg_inst, base_name, metric_type, column, self.log) @@ -768,7 +768,7 @@ def check(self, _): if self._query_manager is None: # use QueryManager to process custom queries self._query_manager = QueryManager( - self, self.execute_query_raw, tags=self.tags, hostname=self.resolved_hostname + self, self.execute_query_raw, tags=self.tags, hostname=self._resolved_hostname ) self._query_manager.compile_queries() if self.server_state_queries is None: @@ -785,8 +785,6 @@ def check(self, _): if self._config.autodiscovery and self._config.autodiscovery_db_service_check: self._check_database_conns() if self._config.dbm_enabled: - #TODO limit this check by some minutes ... 
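A minimal way to honor the TODO above and run schema collection only every few minutes is a plain time gate. This is a sketch under the assumption that a wall-clock check is enough; it is not the scheduling mechanism the check actually uses, and the 600 second interval is a placeholder.

import time

class TimeGate:
    # lets a callable run at most once per interval_s seconds
    def __init__(self, collect, interval_s=600):
        self._collect = collect
        self._interval_s = interval_s
        self._last_run = 0.0

    def maybe_collect(self):
        now = time.time()
        if now - self._last_run >= self._interval_s:
            self._last_run = now
            self._collect()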
- self._schemas.collect_schemas_data() self.statement_metrics.run_job_loop(self.tags) self.procedure_metrics.run_job_loop(self.tags) self.activity.run_job_loop(self.tags) @@ -1043,9 +1041,9 @@ def proc_check_guard(self, sql): return should_run def _send_database_instance_metadata(self): - if self.resolved_hostname not in self._database_instance_emitted: + if self._resolved_hostname not in self._database_instance_emitted: event = { - "host": self.resolved_hostname, + "host": self._resolved_hostname, "agent_version": datadog_agent.get_version(), "dbms": "sqlserver", "kind": "database_instance", @@ -1063,5 +1061,5 @@ def _send_database_instance_metadata(self): "connection_host": self._config.connection_host, }, } - self._database_instance_emitted[self.resolved_hostname] = event + self._database_instance_emitted[self._resolved_hostname] = event self.database_monitoring_metadata(json.dumps(event, default=default_json_event_encoding)) diff --git a/sqlserver/tests/compose/setup.sh b/sqlserver/tests/compose/setup.sh index e0b3cc7a678e4..f4aa33bb663b7 100644 --- a/sqlserver/tests/compose/setup.sh +++ b/sqlserver/tests/compose/setup.sh @@ -13,7 +13,9 @@ do fi done + /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P $SA_PASSWORD -d master -i setup.sql -b + if [ $? -eq 0 ] then echo "INFO: setup.sql completed." diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index bea74fdfbcb1b..838ccb28f6f3a 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -16,6 +16,14 @@ CREATE USER fred FOR LOGIN fred; GRANT CONNECT ANY DATABASE to fred; GO + +CREATE DATABASE datadog_test_schemas; +GO +USE datadog_test_schemas; +GO + +CREATE SCHEMA test_schema; +GO -- Create test database for integration tests -- only bob and fred have read/write access to this database CREATE DATABASE [datadog_test-1]; @@ -30,6 +38,8 @@ CREATE USER fred FOR LOGIN fred; CREATE CLUSTERED INDEX thingsindex ON [datadog_test-1].dbo.ϑings (name); GO + + EXEC sp_addrolemember 'db_datareader', 'bob' EXEC sp_addrolemember 'db_datareader', 'fred' EXEC sp_addrolemember 'db_datawriter', 'bob' diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 226519eb6ebdb..9744f60aef1a9 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -10,6 +10,7 @@ import pytest from datadog_checks.sqlserver import SQLServer +#from deepdiff import DeepDiff - not clear how to add it to ddev from .common import CHECK_NAME @@ -18,6 +19,7 @@ except ImportError: pyodbc = None +import pdb @pytest.fixture def dbm_instance(instance_docker): @@ -35,6 +37,8 @@ def dbm_instance(instance_docker): return copy(instance_docker) + + @pytest.mark.integration @pytest.mark.usefixtures('dd_environment') @pytest.mark.parametrize( @@ -51,13 +55,14 @@ def dbm_instance(instance_docker): ], ) def test_get_available_settings_columns(dbm_instance, expected_columns, available_columns): - check = SQLServer(CHECK_NAME, {}, [dbm_instance]) - check.initialize_connection() - _conn_key_prefix = "dbm-metadata-" - with check.connection.open_managed_default_connection(key_prefix=_conn_key_prefix): - with check.connection.get_managed_cursor(key_prefix=_conn_key_prefix) as cursor: - result_available_columns = check.sql_metadata._get_available_settings_columns(cursor, expected_columns) - assert result_available_columns == available_columns + pass + #check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + #check.initialize_connection() + #_conn_key_prefix = "dbm-metadata-" + #with 
check.connection.open_managed_default_connection(key_prefix=_conn_key_prefix): + #with check.connection.get_managed_cursor(key_prefix=_conn_key_prefix) as cursor: + #result_available_columns = check.sql_metadata._get_available_settings_columns(cursor, expected_columns) + #assert result_available_columns == available_columns @pytest.mark.integration @@ -90,3 +95,37 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): assert event['dbms'] == "sqlserver" assert event['kind'] == "sqlserver_configs" assert len(event["metadata"]) > 0 + +def test_collect_schemas(aggregator, dd_run_check, dbm_instance): + pdb.set_trace() + dbm_instance['database_autodiscovery'] = True + dbm_instance['autodiscovery_include'] = ['datadog_test_schemas','datadog_test'] + + check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + dd_run_check(check) + #check.initialize_connection() + #check.check(dbm_instance) + + #extracting events. + dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") + + # check that all expected tables are present + tables_set = { + "cities" + } + + #result = + tables_got = [] + #TODO later modify kind + for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): + + #First should be equal without order: + #diff = DeepDiff(r1, r2, ignore_order=True) + ##assert not diff, f"difference in response: {diff}" + # For tables order is important pick up these tables and check with order: + assert schema_event.get("timestamp") is not None + # there should only be one database, datadog_test + pdb.set_trace() + database_metadata = schema_event['metadata'] + assert len(database_metadata) == 1 + assert 'datadog_test' == database_metadata[0]['name'] From cafb6afc128bbbf6b955c9eacea35fe4da570e18 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 24 Apr 2024 10:41:29 +0000 Subject: [PATCH 031/132] Put back resolved hostname --- sqlserver/datadog_checks/sqlserver/sqlserver.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index f88e1489c9154..719bfdc7173f6 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -109,7 +109,7 @@ class SQLServer(AgentCheck): def __init__(self, name, init_config, instances): super(SQLServer, self).__init__(name, init_config, instances) - self.resolved_hostname = None + self._resolved_hostname = None self._agent_hostname = None self.connection = None self.failed_connections = {} @@ -209,10 +209,10 @@ def set_resource_tags(self): self._config.cloud_metadata.get("aws")["instance_endpoint"], ) ) - elif AWS_RDS_HOSTNAME_SUFFIX in self.resolved_hostname: + elif AWS_RDS_HOSTNAME_SUFFIX in self._resolved_hostname: # allow for detecting if the host is an RDS host, and emit # the resource properly even if the `aws` config is unset - self.tags.append("dd.internal.resource:aws_rds_instance:{}".format(self.resolved_hostname)) + self.tags.append("dd.internal.resource:aws_rds_instance:{}".format(self._resolved_hostname)) if self._config.cloud_metadata.get("azure") is not None: deployment_type = self._config.cloud_metadata.get("azure")["deployment_type"] name = self._config.cloud_metadata.get("azure")["name"] @@ -221,7 +221,7 @@ def set_resource_tags(self): # azure sql databases have a special format, which is set for DBM # customers in the resolved_hostname. 
# If user is not DBM customer, the resource_name should just be set to the `name` - db_instance = self.resolved_hostname + db_instance = self._resolved_hostname # some `deployment_type`s map to multiple `resource_type`s resource_types = AZURE_DEPLOYMENT_TYPE_TO_RESOURCE_TYPES.get(deployment_type).split(",") for r_type in resource_types: From 8a2af6c0a114e3daeced1b81667b82c7510fc888 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 24 Apr 2024 10:45:30 +0000 Subject: [PATCH 032/132] ficed resolved host name --- sqlserver/datadog_checks/sqlserver/sqlserver.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 719bfdc7173f6..20d23bbe3e47f 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -232,18 +232,18 @@ def set_resource_tags(self): # finally, emit a `database_instance` resource for this instance self.tags.append( "dd.internal.resource:database_instance:{}".format( - self.resolved_hostname, + self._resolved_hostname, ) ) def set_resolved_hostname(self): self.load_static_information() - if self.resolved_hostname is None: + if self._resolved_hostname is None: if self._config.reported_hostname: - self.resolved_hostname = self._config.reported_hostname + self._resolved_hostname = self._config.reported_hostname else: host, _ = split_sqlserver_host_port(self.instance.get("host")) - self.resolved_hostname = resolve_db_host(host) + self._resolved_hostname = resolve_db_host(host) engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) if engine_edition == ENGINE_EDITION_SQL_DATABASE: configured_database = self.instance.get("database", None) @@ -316,7 +316,7 @@ def debug_stats_kwargs(self, tags=None): tags = tags if tags else [] return { "tags": self.debug_tags() + tags, - "hostname": self._resolved_hostname, + "hostname": self.resolved_hostname, "raw": True, } @@ -329,7 +329,7 @@ def agent_hostname(self): def initialize_connection(self): self.connection = Connection( - host=self._resolved_hostname, + host=self.resolved_hostname, init_config=self.init_config, instance_config=self.instance, service_check_handler=self.handle_service_check, @@ -367,12 +367,12 @@ def handle_service_check(self, status, connection_host, database, message=None, custom_tags = self.instance.get("tags", []) disable_generic_tags = self.instance.get("disable_generic_tags", False) service_check_tags = [ - "sqlserver_host:{}".format(self._resolved_hostname), + "sqlserver_host:{}".format(self.resolved_hostname), "db:{}".format(database), "connection_host:{}".format(connection_host), ] if not disable_generic_tags: - service_check_tags.append("host:{}".format(self._resolved_hostname)) + service_check_tags.append("host:{}".format(self.resolved_hostname)) if custom_tags is not None: service_check_tags.extend(custom_tags) service_check_tags = list(set(service_check_tags)) From f1501026b0b7d36d4e78a64c4e566c1c0e6e8587 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 24 Apr 2024 10:47:35 +0000 Subject: [PATCH 033/132] Fixed more resolved host name --- sqlserver/datadog_checks/sqlserver/sqlserver.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 20d23bbe3e47f..cbc4aedb3f431 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ 
b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -691,7 +691,7 @@ def typed_metric(self, cfg_inst, table, base_name=None, user_type=None, sql_coun metric_type_str, cls = metrics.TABLE_MAPPING[table] metric_type = getattr(self, metric_type_str) - cfg_inst["hostname"] = self._resolved_hostname + cfg_inst["hostname"] = self.resolved_hostname return cls(cfg_inst, base_name, metric_type, column, self.log) @@ -768,7 +768,7 @@ def check(self, _): if self._query_manager is None: # use QueryManager to process custom queries self._query_manager = QueryManager( - self, self.execute_query_raw, tags=self.tags, hostname=self._resolved_hostname + self, self.execute_query_raw, tags=self.tags, hostname=self.resolved_hostname ) self._query_manager.compile_queries() if self.server_state_queries is None: @@ -1041,9 +1041,9 @@ def proc_check_guard(self, sql): return should_run def _send_database_instance_metadata(self): - if self._resolved_hostname not in self._database_instance_emitted: + if self.resolved_hostname not in self._database_instance_emitted: event = { - "host": self._resolved_hostname, + "host": self.resolved_hostname, "agent_version": datadog_agent.get_version(), "dbms": "sqlserver", "kind": "database_instance", @@ -1061,5 +1061,5 @@ def _send_database_instance_metadata(self): "connection_host": self._config.connection_host, }, } - self._database_instance_emitted[self._resolved_hostname] = event + self._database_instance_emitted[self.resolved_hostname] = event self.database_monitoring_metadata(json.dumps(event, default=default_json_event_encoding)) From b6f096eed2e0fb98cf1e60d1e9f27709c9b97f21 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 24 Apr 2024 15:07:44 +0000 Subject: [PATCH 034/132] Imporved unit test --- sqlserver/datadog_checks/sqlserver/schemas.py | 6 ++- sqlserver/tests/test_metadata.py | 42 ++++++++++++------- sqlserver/tests/utils.py | 19 +++++++++ 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index c75a0dd162a35..847c52c5d2c60 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -98,8 +98,12 @@ def __init__(self, check): #"cloud_metadata": self._config.cloud_metadata, } """ + #TODO remove : hosts were null onstaging /.... 
+ hostname = "boris" + if self._check.resolved_hostname is not None: + hostname = self._check.resolved_hostname base_event = { - "host": self._check.resolved_hostname, + "host": hostname, "agent_version": datadog_agent.get_version(), "dbms": "postgres", #TODO fake it until you make it - trying to pass this data as postgres for now "kind": "pg_databases", # TODO pg_databases - will result in KindPgDatabases and so processor would thing its postgres diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 9744f60aef1a9..71e995bd76e8a 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -13,13 +13,14 @@ #from deepdiff import DeepDiff - not clear how to add it to ddev from .common import CHECK_NAME - +from .utils import delete_if_found, compare_coumns_in_tables try: import pyodbc except ImportError: pyodbc = None import pdb +import json @pytest.fixture def dbm_instance(instance_docker): @@ -97,35 +98,46 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): assert len(event["metadata"]) > 0 def test_collect_schemas(aggregator, dd_run_check, dbm_instance): + + databases_to_find = ['datadog_test_schemas','datadog_test'] + exp_datadog_test = {'id': '6', 'name': 'datadog_test', 'owner': 'dbo', 'schemas': [ {'name': 'dbo', 'id': '1', 'owner': '1', 'tables': [{'id': '885578193', 'name': 'ϑings', 'columns': [{'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}]}]}]} + exp_datadog_test_schemas = {'id': '5', 'name': 'datadog_test_schemas', 'owner': 'dbo', 'schemas': [{'name': 'test_schema', 'id': '5', 'owner': '1', 'tables': []}]} + expected_data_for_db = {'datadog_test' : exp_datadog_test, 'datadog_test_schemas' : exp_datadog_test_schemas} + pdb.set_trace() dbm_instance['database_autodiscovery'] = True dbm_instance['autodiscovery_include'] = ['datadog_test_schemas','datadog_test'] check = SQLServer(CHECK_NAME, {}, [dbm_instance]) dd_run_check(check) - #check.initialize_connection() - #check.check(dbm_instance) #extracting events. 
dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - - # check that all expected tables are present - tables_set = { - "cities" - } - #result = - tables_got = [] + #TODO later modify kind for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): + if len(databases_to_find) == 0: + # we may see the correct payload for the database several times in events + return - #First should be equal without order: - #diff = DeepDiff(r1, r2, ignore_order=True) - ##assert not diff, f"difference in response: {diff}" - # For tables order is important pick up these tables and check with order: assert schema_event.get("timestamp") is not None # there should only be one database, datadog_test pdb.set_trace() database_metadata = schema_event['metadata'] assert len(database_metadata) == 1 - assert 'datadog_test' == database_metadata[0]['name'] + db_name = database_metadata[0]['name'] + assert delete_if_found(databases_to_find, db_name) + + # TODO enable when we add the package + #difference = DeepDiff(database_metadata[0], expected_data_for_db[db_name], ignore_order=True) + difference = [] + diff_keys = list(difference.keys()) + if len(diff_keys) > 0 and list(diff_keys.keys()) is not ['iterable_item_removed']: + logging.debug("found the following diffs %s", json.dumps(difference)) + assert False + + # we need a special comparison as order of columns matter + assert compare_coumns_in_tables(expected_data_for_db[db_name], database_metadata[0]) + + assert len(databases_to_find) == 0 \ No newline at end of file diff --git a/sqlserver/tests/utils.py b/sqlserver/tests/utils.py index 1d009b47ed6f5..63ff63cdc3b37 100644 --- a/sqlserver/tests/utils.py +++ b/sqlserver/tests/utils.py @@ -220,3 +220,22 @@ def run_query_and_ignore_exception(conn, query): @staticmethod def _create_rand_string(length=5): return ''.join(choice(string.ascii_lowercase + string.digits) for _ in range(length)) + +def delete_if_found(my_list, value): + try: + index = my_list.index(value) + del my_list[index] + return True + except ValueError: + return None + +def compare_coumns_in_tables(expected_data_for_db, actual_data_for_db): + for schema in expected_data_for_db['schemas']: + actual_schema = next(filter(lambda x: x['id'] == schema['id'], actual_data_for_db['schemas'])) + for table in schema['tables']: + #find a table and then finally compare columns + actual_table = next(filter(lambda x: x['id'] == table['id'], actual_schema['tables'])) + if actual_table['columns'] == table['columns']: + return True + else: + return False \ No newline at end of file From e4e1ada7a30d82e35fb9a0be55f49be9690e9b46 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 24 Apr 2024 16:26:51 +0000 Subject: [PATCH 035/132] trying to add deepdiff pkg --- ddev/hatch.toml | 1 + sqlserver/datadog_checks/sqlserver/schemas.py | 2 + sqlserver/hatch.toml | 5 +++ sqlserver/tests/test_metadata.py | 38 +++++++++++-------- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/ddev/hatch.toml b/ddev/hatch.toml index 2f299a9ceb09c..b39663cdf11e4 100644 --- a/ddev/hatch.toml +++ b/ddev/hatch.toml @@ -10,6 +10,7 @@ e2e-env = false dependencies = [ "pyyaml", "vcrpy", + "deepdiff", ] # TODO: remove this when the old CLI is gone pre-install-commands = [ diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 847c52c5d2c60..004d3da1c82fc 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -190,6 +190,8 @@ def 
fetch_schema_data(cursor, db_name): self._dataSubmitter.store(db_name, schema, [], 0) # to ask him if this is needed or we can submit only on 100 000 column # tells if we want to move to the next DB or stop, stop == TRUE + # we want to submit for each DB for clarity + self._dataSubmitter.submit() return False self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) # submit the last chunk of data if any diff --git a/sqlserver/hatch.toml b/sqlserver/hatch.toml index a305f161e8fcf..dc30a882a15cb 100644 --- a/sqlserver/hatch.toml +++ b/sqlserver/hatch.toml @@ -1,3 +1,7 @@ +post-install-commands = [ + "python -m pip install deepdiff", +] + [env.collectors.datadog-checks] base-package-features = ["deps", "db", "json"] @@ -65,3 +69,4 @@ matrix.driver.env-vars = [ name.linux-odbc-2019-high-cardinality.env-vars = "COMPOSE_FOLDER=compose-high-cardinality" name.linux-odbc-2022-high-cardinality.env-vars = "COMPOSE_FOLDER=compose-high-cardinality" name.windows-odbc-2019-high-cardinality.env-vars = "COMPOSE_FOLDER=compose-high-cardinality-windows" + diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 71e995bd76e8a..ad5ece5a28046 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -9,6 +9,8 @@ import pytest +from deepdiff import DeepDiff + from datadog_checks.sqlserver import SQLServer #from deepdiff import DeepDiff - not clear how to add it to ddev @@ -86,25 +88,28 @@ def test_get_settings_query_cached(dbm_instance, caplog): def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): - check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + pass + #check = SQLServer(CHECK_NAME, {}, [dbm_instance]) # dd_run_check(check) - check.initialize_connection() - check.check(dbm_instance) - dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - event = next((e for e in dbm_metadata if e['kind'] == 'sqlserver_configs'), None) - assert event is not None - assert event['dbms'] == "sqlserver" - assert event['kind'] == "sqlserver_configs" - assert len(event["metadata"]) > 0 - + #check.initialize_connection() + #check.check(dbm_instance) + #dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") + #event = next((e for e in dbm_metadata if e['kind'] == 'sqlserver_configs'), None) + #assert event is not None + #assert event['dbms'] == "sqlserver" + #assert event['kind'] == "sqlserver_configs" + #assert len(event["metadata"]) > 0 + +#TODO this test relies on a certain granularity +#later we need to upgrade it to accumulate data for each DB before checking. 
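One way to make the assertion independent of how the payloads are chunked is to merge the events per database before diffing, which is what the next revision of this test does with actual_payloads. A small sketch of that merge (the function name is illustrative):

def merge_schema_events(events):
    # events: schema-collection payloads whose "metadata" holds exactly one database dict each
    merged = {}
    for event in events:
        db = event["metadata"][0]
        name = db["name"]
        if name in merged:
            merged[name]["schemas"].extend(db["schemas"])
        else:
            merged[name] = db
    return merged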
def test_collect_schemas(aggregator, dd_run_check, dbm_instance): - + databases_to_find = ['datadog_test_schemas','datadog_test'] exp_datadog_test = {'id': '6', 'name': 'datadog_test', 'owner': 'dbo', 'schemas': [ {'name': 'dbo', 'id': '1', 'owner': '1', 'tables': [{'id': '885578193', 'name': 'ϑings', 'columns': [{'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}]}]}]} exp_datadog_test_schemas = {'id': '5', 'name': 'datadog_test_schemas', 'owner': 'dbo', 'schemas': [{'name': 'test_schema', 'id': '5', 'owner': '1', 'tables': []}]} expected_data_for_db = {'datadog_test' : exp_datadog_test, 'datadog_test_schemas' : exp_datadog_test_schemas} - pdb.set_trace() + dbm_instance['database_autodiscovery'] = True dbm_instance['autodiscovery_include'] = ['datadog_test_schemas','datadog_test'] @@ -112,9 +117,9 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): dd_run_check(check) #extracting events. + pdb.set_trace() dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - #TODO later modify kind for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): if len(databases_to_find) == 0: @@ -123,15 +128,16 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): assert schema_event.get("timestamp") is not None # there should only be one database, datadog_test - pdb.set_trace() + database_metadata = schema_event['metadata'] assert len(database_metadata) == 1 db_name = database_metadata[0]['name'] assert delete_if_found(databases_to_find, db_name) # TODO enable when we add the package - #difference = DeepDiff(database_metadata[0], expected_data_for_db[db_name], ignore_order=True) - difference = [] + difference = DeepDiff(database_metadata[0], expected_data_for_db[db_name], ignore_order=True) + pdb.set_trace() + #difference = {} diff_keys = list(difference.keys()) if len(diff_keys) > 0 and list(diff_keys.keys()) is not ['iterable_item_removed']: logging.debug("found the following diffs %s", json.dumps(difference)) From 8c7a958d4007928651c84f48e403dbb6899dcb57 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 24 Apr 2024 17:42:32 +0000 Subject: [PATCH 036/132] Fixed test to combine payloads --- sqlserver/tests/test_metadata.py | 33 +++++++++++++++++++++++--------- sqlserver/tests/utils.py | 10 ++++++---- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index ad5ece5a28046..c941e72109f77 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -9,7 +9,7 @@ import pytest -from deepdiff import DeepDiff +#from deepdiff import DeepDiff from datadog_checks.sqlserver import SQLServer #from deepdiff import DeepDiff - not clear how to add it to ddev @@ -117,9 +117,11 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): dd_run_check(check) #extracting events. 
- pdb.set_trace() + dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") + actual_payloads = {} + #TODO later modify kind for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): if len(databases_to_find) == 0: @@ -132,18 +134,31 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): database_metadata = schema_event['metadata'] assert len(database_metadata) == 1 db_name = database_metadata[0]['name'] - assert delete_if_found(databases_to_find, db_name) + + if db_name in actual_payloads: + actual_payloads[db_name]['schemas'] = actual_payloads[db_name]['schemas'] + database_metadata[0]['schemas'] + else: + actual_payloads[db_name] = database_metadata[0] + + assert len(actual_payloads) == len(expected_data_for_db) + + for db_name, actual_payload in actual_payloads.items(): + + #assert delete_if_found(databases_to_find, db_name) + assert db_name in databases_to_find + # we need to accumulate all data ... as payloads may differ # TODO enable when we add the package - difference = DeepDiff(database_metadata[0], expected_data_for_db[db_name], ignore_order=True) - pdb.set_trace() - #difference = {} + #difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) + + difference = {} diff_keys = list(difference.keys()) if len(diff_keys) > 0 and list(diff_keys.keys()) is not ['iterable_item_removed']: logging.debug("found the following diffs %s", json.dumps(difference)) assert False # we need a special comparison as order of columns matter - assert compare_coumns_in_tables(expected_data_for_db[db_name], database_metadata[0]) - - assert len(databases_to_find) == 0 \ No newline at end of file + pdb.set_trace() + assert compare_coumns_in_tables(expected_data_for_db[db_name], actual_payload) + pdb.set_trace() + print("ok") diff --git a/sqlserver/tests/utils.py b/sqlserver/tests/utils.py index 63ff63cdc3b37..00c25d807d95f 100644 --- a/sqlserver/tests/utils.py +++ b/sqlserver/tests/utils.py @@ -229,13 +229,15 @@ def delete_if_found(my_list, value): except ValueError: return None +import pdb def compare_coumns_in_tables(expected_data_for_db, actual_data_for_db): + pdb.set_trace() for schema in expected_data_for_db['schemas']: actual_schema = next(filter(lambda x: x['id'] == schema['id'], actual_data_for_db['schemas'])) for table in schema['tables']: #find a table and then finally compare columns actual_table = next(filter(lambda x: x['id'] == table['id'], actual_schema['tables'])) - if actual_table['columns'] == table['columns']: - return True - else: - return False \ No newline at end of file + if actual_table['columns'] != table['columns']: + return False + + return True \ No newline at end of file From 80714f3a12bc18148f8e904847307d386b7d67a2 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 25 Apr 2024 08:55:24 +0000 Subject: [PATCH 037/132] added deepdiff to the sqlserver hatch --- sqlserver/hatch.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sqlserver/hatch.toml b/sqlserver/hatch.toml index dc30a882a15cb..ffdd9ea6deff7 100644 --- a/sqlserver/hatch.toml +++ b/sqlserver/hatch.toml @@ -2,6 +2,10 @@ post-install-commands = [ "python -m pip install deepdiff", ] +dependencies = [ + "deepdiff", +] + [env.collectors.datadog-checks] base-package-features = ["deps", "db", "json"] From c1a0576c063b4718286e6f446289af284036d4ba Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 25 Apr 2024 09:32:46 +0000 Subject: [PATCH 038/132] Tried to add deepdifff deferently --- sqlserver/hatch.toml | 17 +++++++++-------- 
sqlserver/tests/test_metadata.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sqlserver/hatch.toml b/sqlserver/hatch.toml index ffdd9ea6deff7..b8eb10090de73 100644 --- a/sqlserver/hatch.toml +++ b/sqlserver/hatch.toml @@ -1,19 +1,19 @@ -post-install-commands = [ - "python -m pip install deepdiff", -] - -dependencies = [ - "deepdiff", -] - [env.collectors.datadog-checks] base-package-features = ["deps", "db", "json"] +[envs.default] +pre-install-commands = [ + "python -m pip install deepdiff", +] + [[envs.default.matrix]] python = ["3.11"] os = ["linux"] version = ["2017", "2019", "2022"] setup = ["single", "ha"] +dependencies = [ + "deepdiff" +] # test the full combination of python-version/driver against a the latest sql server version # ideally we'd test this against all sql server versions but that makes the test take too long and time out. @@ -26,6 +26,7 @@ driver = ["SQLOLEDB", "SQLNCLI11", "MSOLEDBSQL", "odbc"] version = ["2019", "2022"] setup = ["single"] + # The high cardinality environment is meant to be used for local dev/testing # for example, when we want to do performance testing on local changes to the metrics # query, we can do that by uncommenting this env setup. Note, you should make sure to set you diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index c941e72109f77..cb6215c4991eb 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -9,7 +9,7 @@ import pytest -#from deepdiff import DeepDiff +from deepdiff import DeepDiff from datadog_checks.sqlserver import SQLServer #from deepdiff import DeepDiff - not clear how to add it to ddev From 1d9e43c9c2ae33c4baaecf313d34c8ea08a479ea Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 25 Apr 2024 19:49:33 +0000 Subject: [PATCH 039/132] Enabled test --- sqlserver/datadog_checks/sqlserver/metadata.py | 1 - sqlserver/datadog_checks/sqlserver/schemas.py | 18 ++++-------------- .../datadog_checks/sqlserver/sqlserver.py | 1 - sqlserver/hatch.toml | 7 +------ sqlserver/tests/compose/setup.sql | 5 +++++ sqlserver/tests/test_metadata.py | 14 +++++++------- sqlserver/tests/utils.py | 4 +--- 7 files changed, 18 insertions(+), 32 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/metadata.py b/sqlserver/datadog_checks/sqlserver/metadata.py index 33cf24a92e8ab..15fddbbce47af 100644 --- a/sqlserver/datadog_checks/sqlserver/metadata.py +++ b/sqlserver/datadog_checks/sqlserver/metadata.py @@ -2,7 +2,6 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) import time -import pdb from datadog_checks.base import is_affirmative from datadog_checks.base.utils.db.utils import ( DBMAsyncJob, diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 004d3da1c82fc..3213f164b1903 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -17,7 +17,7 @@ execute_query_output_result_as_a_dict, get_list_chunks ) -import pdb + import time import json @@ -163,16 +163,6 @@ def _init_schema_collection(self): #sends all the data in one go but split in chunks (like Seth's solution) def collect_schemas_data(self): - base_event = { - "host": self._check.resolved_hostname, - #"agent_version": datadog_agent.get_version(), - "dbms": "sqlserver", #TODO ? - "kind": "", # TODO ? 
- #"collection_interval": self.schemas_collection_interval, - #"dbms_version": self._payload_pg_version(), - #"tags": self._tags_no_db, - "cloud_metadata": self._check._config.cloud_metadata, - } def fetch_schema_data(cursor, db_name): db_info = self._query_db_information(db_name, cursor) @@ -234,12 +224,12 @@ def _get_tables_data(self, table_list, schema, cursor): id_to_all = {} #table_names = ",".join(["'{}'".format(t.get("name")) for t in table_list]) #OBJECT_NAME is needed to make it work for special characters - table_ids = ",".join(["OBJECT_NAME({})".format(t.get("id")) for t in table_list]) - #pdb.set_trace() + table_ids_object = ",".join(["OBJECT_NAME({})".format(t.get("id")) for t in table_list]) + table_ids = ",".join(["{}".format(t.get("id")) for t in table_list]) for t in table_list: name_to_id[t["name"]] = t["id"] id_to_all[t["id"]] = t - total_columns_number = self._populate_with_columns_data(table_ids, name_to_id, id_to_all, schema, cursor) + total_columns_number = self._populate_with_columns_data(table_ids_object, name_to_id, id_to_all, schema, cursor) #self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model #self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model #self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index cbc4aedb3f431..4987d3bb5d862 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -2,7 +2,6 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) from __future__ import division -import pdb import copy import time from collections import defaultdict diff --git a/sqlserver/hatch.toml b/sqlserver/hatch.toml index b8eb10090de73..59de0ead06750 100644 --- a/sqlserver/hatch.toml +++ b/sqlserver/hatch.toml @@ -2,18 +2,13 @@ base-package-features = ["deps", "db", "json"] [envs.default] -pre-install-commands = [ - "python -m pip install deepdiff", -] +dependencies = ["deepdiff"] [[envs.default.matrix]] python = ["3.11"] os = ["linux"] version = ["2017", "2019", "2022"] setup = ["single", "ha"] -dependencies = [ - "deepdiff" -] # test the full combination of python-version/driver against a the latest sql server version # ideally we'd test this against all sql server versions but that makes the test take too long and time out. 
diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 838ccb28f6f3a..cedc070565559 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -24,6 +24,11 @@ GO CREATE SCHEMA test_schema; GO + +CREATE TABLE datadog_test_schemas.test_schema.cities (id int DEFAULT 0, name varchar(255)); +CREATE INDEX one_column_index ON datadog_test_schemas.test_schema.cities (id); +CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey'), (2, 'bar'); -- Create test database for integration tests -- only bob and fred have read/write access to this database CREATE DATABASE [datadog_test-1]; diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index cb6215c4991eb..8c4e27855bbb3 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -21,7 +21,7 @@ except ImportError: pyodbc = None -import pdb + import json @pytest.fixture @@ -106,7 +106,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): databases_to_find = ['datadog_test_schemas','datadog_test'] exp_datadog_test = {'id': '6', 'name': 'datadog_test', 'owner': 'dbo', 'schemas': [ {'name': 'dbo', 'id': '1', 'owner': '1', 'tables': [{'id': '885578193', 'name': 'ϑings', 'columns': [{'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}]}]}]} - exp_datadog_test_schemas = {'id': '5', 'name': 'datadog_test_schemas', 'owner': 'dbo', 'schemas': [{'name': 'test_schema', 'id': '5', 'owner': '1', 'tables': []}]} + exp_datadog_test_schemas = {'id': '5', 'name': 'datadog_test_schemas', 'owner': 'dbo', 'schemas': [{'name': 'test_schema', 'id': '5', 'owner': '1', 'tables': [{'id': '885578193', 'name': 'cities', 'columns': [{'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}]}]}]} expected_data_for_db = {'datadog_test' : exp_datadog_test, 'datadog_test_schemas' : exp_datadog_test_schemas} @@ -149,16 +149,16 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): # we need to accumulate all data ... 
as payloads may differ # TODO enable when we add the package - #difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) + difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) - difference = {} + #difference = {} diff_keys = list(difference.keys()) - if len(diff_keys) > 0 and list(diff_keys.keys()) is not ['iterable_item_removed']: + if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: logging.debug("found the following diffs %s", json.dumps(difference)) assert False # we need a special comparison as order of columns matter - pdb.set_trace() + assert compare_coumns_in_tables(expected_data_for_db[db_name], actual_payload) - pdb.set_trace() + print("ok") diff --git a/sqlserver/tests/utils.py b/sqlserver/tests/utils.py index 00c25d807d95f..05bd4b12ccb30 100644 --- a/sqlserver/tests/utils.py +++ b/sqlserver/tests/utils.py @@ -228,10 +228,8 @@ def delete_if_found(my_list, value): return True except ValueError: return None - -import pdb + def compare_coumns_in_tables(expected_data_for_db, actual_data_for_db): - pdb.set_trace() for schema in expected_data_for_db['schemas']: actual_schema = next(filter(lambda x: x['id'] == schema['id'], actual_data_for_db['schemas'])) for table in schema['tables']: From 769d155c78c80344fbbd1b18f7b0be285eac99c1 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 29 Apr 2024 12:53:32 +0000 Subject: [PATCH 040/132] Added a total limit of columns --- sqlserver/datadog_checks/sqlserver/schemas.py | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 3213f164b1903..b60f3a3a221a5 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -17,8 +17,6 @@ execute_query_output_result_as_a_dict, get_list_chunks ) - - import time import json import copy @@ -28,9 +26,13 @@ class SubmitData: MAX_COLUMN_COUNT = 100_000 + # REDAPL has a 3MB limit per resource + MAX_TOTAL_COLUMN_COUNT = 250_000 + def __init__(self, submit_data_function, base_event, logger): self._submit_to_agent_queue = submit_data_function self._columns_count = 0 + self._total_columns_count = 0 self.db_to_schemas = {} # dbname : { id : schema } self.db_info = {} # name to info self._base_event = base_event @@ -41,6 +43,7 @@ def store_db_info(self, db_name, db_info): def store(self, db_name, schema, tables, columns_count): self._columns_count += columns_count + self._total_columns_count += columns_count schemas = self.db_to_schemas.setdefault(db_name, {}) if schema["id"] in schemas: known_tables = schemas[schema["id"]].setdefault("tables",[]) @@ -56,6 +59,9 @@ def tmp_modify_to_fit_in_postgres(self, db_info): if "collation" in db_info: del db_info["collation"] return db_info + + def exceeded_total_columns_number(self): + return self._total_columns_count > self.MAX_TOTAL_COLUMN_COUNT def submit(self): if not bool(self.db_to_schemas): @@ -73,6 +79,13 @@ def submit(self): else: db_info = self.db_info[db] event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] + #TODO Remove Debug Code, calculate tables and schemas sent : + schemas_debug = list(schemas_by_id.values()) + t_count = 0 + for schema in schemas_debug: + t_count += len(schema['tables']) + self._log.error("Boris Adding event to Agent queue with : {} schemas and {} tables.".format(len(schemas_debug), t_count)) + #END 
debug code json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) self._submit_to_agent_queue(json_event) @@ -163,24 +176,25 @@ def _init_schema_collection(self): #sends all the data in one go but split in chunks (like Seth's solution) def collect_schemas_data(self): - + #returns Stop, Stop == True. def fetch_schema_data(cursor, db_name): db_info = self._query_db_information(db_name, cursor) schemas = self._query_schema_information(cursor) self._dataSubmitter.store_db_info(db_name, db_info) chunk_size = 50 for schema in schemas: + if self._dataSubmitter.exceeded_total_columns_number(): + self._log.warning("Truncated data due to the max limit") + return True tables = self._get_tables(schema, cursor) tables_chunk = list(get_list_chunks(tables, chunk_size)) for tables_chunk in tables_chunk: columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) self._dataSubmitter.store(db_name, schema, tables_info, columns_count) - self._dataSubmitter.submit() # we force submit when we reach the end of schema, it's like in Seths solution + self._dataSubmitter.submit() # we force submit after each 50 tables chunk if len(tables) == 0: self._dataSubmitter.store(db_name, schema, [], 0) - # to ask him if this is needed or we can submit only on 100 000 column - # tells if we want to move to the next DB or stop, stop == TRUE - # we want to submit for each DB for clarity + # we want to submit for each DB separetly for clarity self._dataSubmitter.submit() return False self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) From 4ed01eb2aef66aef89aa2e6b04d133c3e6321141 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 29 Apr 2024 13:10:47 +0000 Subject: [PATCH 041/132] Improved exception treatment --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 +- sqlserver/datadog_checks/sqlserver/sqlserver.py | 2 +- sqlserver/tests/test_metadata.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index b60f3a3a221a5..8c72ce06d199d 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -184,7 +184,7 @@ def fetch_schema_data(cursor, db_name): chunk_size = 50 for schema in schemas: if self._dataSubmitter.exceeded_total_columns_number(): - self._log.warning("Truncated data due to the max limit") + self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) return True tables = self._get_tables(schema, cursor) tables_chunk = list(get_list_chunks(tables, chunk_size)) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 4987d3bb5d862..9bb2754ce1244 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -747,7 +747,7 @@ def do_for_databases(self, action, databases): if stop: break; except Exception as e: - print("TODO") + print("An exception occurred during do_for_databases in db - {}: {}".format(db, e)) # Switch DB back to MASTER if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 8c4e27855bbb3..3268e481afd77 100644 --- a/sqlserver/tests/test_metadata.py +++ 
b/sqlserver/tests/test_metadata.py @@ -148,7 +148,6 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): assert db_name in databases_to_find # we need to accumulate all data ... as payloads may differ - # TODO enable when we add the package difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) #difference = {} From 292cb520f389a6954b65097443b11005c8204c80 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 29 Apr 2024 18:05:05 +0000 Subject: [PATCH 042/132] fixed hostname --- sqlserver/datadog_checks/sqlserver/schemas.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 8c72ce06d199d..3630d815c712f 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -31,12 +31,24 @@ class SubmitData: def __init__(self, submit_data_function, base_event, logger): self._submit_to_agent_queue = submit_data_function + self._base_event = base_event + self._log = logger + self._columns_count = 0 self._total_columns_count = 0 self.db_to_schemas = {} # dbname : { id : schema } self.db_info = {} # name to info - self._base_event = base_event - self._log = logger + + def set_base_event_data(self, hostname, tags, cloud_metadata): + self._base_event["host"] = hostname + self._base_event["tags"] = tags + self._base_event["cloud_metadata"] = cloud_metadata + + def reset(self): + self._columns_count = 0 + self._total_columns_count = 0 + self.db_to_schemas = {} + self.db_info = {} def store_db_info(self, db_name, db_info): self.db_info[db_name] = db_info @@ -80,11 +92,16 @@ def submit(self): db_info = self.db_info[db] event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] #TODO Remove Debug Code, calculate tables and schemas sent : - schemas_debug = list(schemas_by_id.values()) + schemas_debug = list(schemas_by_id.values()) t_count = 0 + printed_first = False for schema in schemas_debug: t_count += len(schema['tables']) - self._log.error("Boris Adding event to Agent queue with : {} schemas and {} tables.".format(len(schemas_debug), t_count)) + if not printed_first and len(schema['tables']) >0: + printed_first = True + self._log.warning("One of tables db {} schema {} table {}".format( list(schemas_by_id.keys()), schema['name'], schema['tables'][0]["name"])) + + self._log.warning("Boris Adding event to Agent queue with : {} schemas and {} tables.".format(len(schemas_debug), t_count)) #END debug code json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) @@ -111,8 +128,8 @@ def __init__(self, check): #"cloud_metadata": self._config.cloud_metadata, } """ - #TODO remove : hosts were null onstaging /.... - hostname = "boris" + #TODO error is just so that payload passes, shoud be removed + hostname = "error" if self._check.resolved_hostname is not None: hostname = self._check.resolved_hostname base_event = { @@ -175,7 +192,9 @@ def _init_schema_collection(self): #sends all the data in one go but split in chunks (like Seth's solution) def collect_schemas_data(self): - + self._dataSubmitter.reset() + # for now only setting host and tags and metada + self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._tags, self._check._config.cloud_metadata) #returns Stop, Stop == True. 
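# The returned boolean is consumed by SQLServer.do_for_databases, which treats it as a stop flag,
# roughly (a sketch; the exact call site lives in sqlserver.py):
#     stop = action(cursor, db_name)
#     if stop:
#         break
# so returning True here skips the remaining databases once collection has to be aborted.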
def fetch_schema_data(cursor, db_name): db_info = self._query_db_information(db_name, cursor) From ddba122d4a93ad9fdb209d8cb75386f886b1b3bb Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 30 Apr 2024 19:17:51 +0000 Subject: [PATCH 043/132] Added Foreign key columns --- sqlserver/datadog_checks/sqlserver/const.py | 9 +++-- sqlserver/datadog_checks/sqlserver/schemas.py | 14 +++++--- sqlserver/tests/compose/setup.sql | 33 ++++++++++++++++++- sqlserver/tests/test_metadata.py | 3 -- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index ad8a9d95d52b6..d5f7a50b98b4e 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -290,9 +290,14 @@ #PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" PARTITIONS_QUERY2 = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" PARTITIONS_QUERY = "SELECT object_id, COUNT(*) AS partition_count FROM sys.partitions WHERE object_id IN ({}) GROUP BY object_id;" -FOREIGN_KEY_QUERY = "SELECT referenced_object_id, COUNT(*) AS foreign_key_count FROM sys.foreign_keys WHERE referenced_object_id IN ({}) GROUP BY referenced_object_id;" +#parent_object_id - is the one of the parent table. +FOREIGN_KEY_QUERY3 = "SELECT referenced_object_id, COUNT(*) AS foreign_key_count FROM sys.foreign_keys WHERE referenced_object_id IN ({}) GROUP BY referenced_object_id;" INDEX_QUERY2 = "SELECT i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id = {};" # May be this query is wrong like what if index is build on 2 columns will this work ? to test ? 
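# A two-column index makes the per-column join above return one row per indexed column, e.g.
# two_columns_index on cities (id, name) comes back as two rows (column_name 'id' and column_name 'name').
# The reworked INDEX_QUERY below folds them into a single row per index via
# STRING_AGG(c.name, ',') ... GROUP BY i.object_id, i.name, ...; note that STRING_AGG needs
# SQL Server 2017+, which matches the 2017/2019/2022 test matrix.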
-INDEX_QUERY = "SELECT i.object_id AS object_id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id IN ({});" +INDEX_QUERY = "SELECT i.object_id AS id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, STRING_AGG(c.name, ',') AS column_names FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled;" +#INDEX_QUERY = "SELECT i.object_id AS object_id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, STRING_AGG(c.name, ',') AS column_names FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id IN ({}) GROUP BY i.object_id, i.name;" + #FOREIGN_KEY_QUERY2 = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" FOREIGN_KEY_QUERY2 = "SELECT COUNT(*) FROM sys.foreign_keys WHERE referenced_object_id = {};" + +FOREIGN_KEY_QUERY="SELECT FK.referenced_object_id AS id, FK.name AS foreign_key_name, OBJECT_NAME(FK.parent_object_id) AS referencing_table, STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, OBJECT_NAME(FK.referenced_object_id) AS referenced_table, STRING_AGG(COL_NAME(FKC.referenced_object_id, FKC.referenced_column_id),',') AS referenced_column FROM sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id WHERE FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id;" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 3630d815c712f..028755e090eba 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -20,6 +20,7 @@ import time import json import copy +import pdb from datadog_checks.base.utils.db.utils import default_json_event_encoding @@ -27,6 +28,7 @@ class SubmitData: MAX_COLUMN_COUNT = 100_000 # REDAPL has a 3MB limit per resource + #TODO Report truncation to the backend MAX_TOTAL_COLUMN_COUNT = 250_000 def __init__(self, submit_data_function, base_event, logger): @@ -99,7 +101,7 @@ def submit(self): t_count += len(schema['tables']) if not printed_first and len(schema['tables']) >0: printed_first = True - self._log.warning("One of tables db {} schema {} table {}".format( list(schemas_by_id.keys()), schema['name'], schema['tables'][0]["name"])) + self._log.warning("One of tables db {} schema {} table {}".format( list(self.db_to_schemas.keys()), schema['name'], schema['tables'][0]["name"])) self._log.warning("Boris Adding event to Agent queue with : {} schemas and {} tables.".format(len(schemas_debug), t_count)) #END debug code @@ -264,7 +266,9 @@ def _get_tables_data(self, table_list, schema, cursor): id_to_all[t["id"]] = t total_columns_number = self._populate_with_columns_data(table_ids_object, name_to_id, id_to_all, schema, cursor) 
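# With the compose fixtures, FOREIGN_KEY_QUERY would attach rows shaped like
#     {"foreign_key_name": "FK_CityId", "referencing_table": "landmarks", "referencing_column": "city_id",
#      "referenced_table": "cities", "referenced_column": "id"}
# and a composite key such as FK_RestaurantNameDistrict arrives with its columns aggregated,
# e.g. referencing_column "RestaurantName,District" (values mirror setup.sql, shown for illustration only).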
#self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - #self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model + pdb.set_trace() + self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model + pdb.set_trace() #self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model # unwrap id_to_all return total_columns_number, list(id_to_all.values()) @@ -306,7 +310,7 @@ def _populate_with_partitions_data(self, table_ids, id_to_all, cursor): id = row.pop("id", None) if id is not None: #TODO what happens if not found ? - id_to_all.get(id)["partitions"] = row + id_to_all.get(str(id))["partitions"] = row else: print("todo error") row.pop("id", None) @@ -319,7 +323,7 @@ def _populate_with_index_data(self, table_ids, id_to_all, cursor): for row in rows: id = row.pop("id", None) if id is not None: - id_to_all.get(id)["indexes"] = row + id_to_all.get(str(id))["indexes"] = row else: print("todo error") row.pop("id", None) @@ -332,7 +336,7 @@ def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): for row in rows: id = row.pop("id", None) if id is not None: - id_to_all.get(id)["foreign_keys"] = row + id_to_all.get(str(id))["foreign_keys"] = row else: print("todo error") print("end") diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index cedc070565559..024a25a7601dc 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -26,9 +26,40 @@ CREATE SCHEMA test_schema; GO CREATE TABLE datadog_test_schemas.test_schema.cities (id int DEFAULT 0, name varchar(255)); -CREATE INDEX one_column_index ON datadog_test_schemas.test_schema.cities (id); +GO +ALTER TABLE datadog_test_schemas.test_schema.cities +ALTER COLUMN id INT NOT NULL; +GO CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +ALTER TABLE datadog_test_schemas.test_schema.cities +ADD CONSTRAINT PK_Cities PRIMARY KEY (id); +GO INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey'), (2, 'bar'); +GO +CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); +GO +ALTER TABLE datadog_test_schemas.test_schema.landmarks ADD CONSTRAINT FK_CityId FOREIGN KEY (city_id) REFERENCES datadog_test_schemas.test_schema.cities(id); +GO + +-------------------------------------------------- +CREATE TABLE datadog_test_schemas.test_schema.Restaurants ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Cuisine VARCHAR(100), + CONSTRAINT UC_RestaurantNameDistrict UNIQUE (RestaurantName, District) +); +GO + +CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Review VARCHAR(MAX), + CONSTRAINT FK_RestaurantNameDistrict FOREIGN KEY (RestaurantName, District) REFERENCES datadog_test_schemas.test_schema.Restaurants(RestaurantName, District) +); +GO + + + -- Create test database for integration tests -- only bob and fred have read/write access to this database CREATE DATABASE [datadog_test-1]; diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 3268e481afd77..70ebf02c2df31 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -21,7 +21,6 @@ except ImportError: pyodbc = None - 
import json @pytest.fixture @@ -40,8 +39,6 @@ def dbm_instance(instance_docker): return copy(instance_docker) - - @pytest.mark.integration @pytest.mark.usefixtures('dd_environment') @pytest.mark.parametrize( From d3a04fcff65e3636a747ca5b472cfebc68655df5 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 30 Apr 2024 20:32:33 +0000 Subject: [PATCH 044/132] Added Foreign key columns --- sqlserver/datadog_checks/sqlserver/schemas.py | 4 +-- sqlserver/tests/compose/setup.sql | 25 +++++++++++++------ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 028755e090eba..ad89cb342d529 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -266,9 +266,7 @@ def _get_tables_data(self, table_list, schema, cursor): id_to_all[t["id"]] = t total_columns_number = self._populate_with_columns_data(table_ids_object, name_to_id, id_to_all, schema, cursor) #self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - pdb.set_trace() - self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - pdb.set_trace() + #self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model #self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model # unwrap id_to_all return total_columns_number, list(id_to_all.values()) diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 024a25a7601dc..b9d3136944e66 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -25,15 +25,24 @@ GO CREATE SCHEMA test_schema; GO -CREATE TABLE datadog_test_schemas.test_schema.cities (id int DEFAULT 0, name varchar(255)); -GO -ALTER TABLE datadog_test_schemas.test_schema.cities -ALTER COLUMN id INT NOT NULL; -GO +--CREATE TABLE datadog_test_schemas.test_schema.cities (id int DEFAULT 0, name varchar(255)); +--GO +--ALTER TABLE datadog_test_schemas.test_schema.cities +--ALTER COLUMN id INT NOT NULL; +--GO +--CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +--ALTER TABLE datadog_test_schemas.test_schema.cities +--ADD CONSTRAINT PK_Cities PRIMARY KEY (id); +--GO + +CREATE TABLE datadog_test_schemas.test_schema.cities ( + id INT NOT NULL DEFAULT 0, + name VARCHAR(255), + CONSTRAINT PK_Cities PRIMARY KEY (id) +); + CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); -ALTER TABLE datadog_test_schemas.test_schema.cities -ADD CONSTRAINT PK_Cities PRIMARY KEY (id); -GO + INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey'), (2, 'bar'); GO CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); From c3731f6ba0cccfa461df93f1a5ec2a1671523192 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 3 May 2024 12:34:36 +0000 Subject: [PATCH 045/132] Sorted tables --- sqlserver/datadog_checks/sqlserver/const.py | 2 +- sqlserver/datadog_checks/sqlserver/schemas.py | 21 +++-- .../datadog_checks/sqlserver/sqlserver.py | 1 - sqlserver/tests/compose/setup.sql | 91 ++++++++++++++++++- 4 files changed, 105 insertions(+), 10 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py 
b/sqlserver/datadog_checks/sqlserver/const.py index d5f7a50b98b4e..efe5beb3d57ba 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -289,7 +289,7 @@ #PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" PARTITIONS_QUERY2 = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" -PARTITIONS_QUERY = "SELECT object_id, COUNT(*) AS partition_count FROM sys.partitions WHERE object_id IN ({}) GROUP BY object_id;" +PARTITIONS_QUERY = "SELECT object_id AS id, COUNT(*) AS partition_count FROM sys.partitions WHERE object_id IN ({}) GROUP BY object_id;" #parent_object_id - is the one of the parent table. FOREIGN_KEY_QUERY3 = "SELECT referenced_object_id, COUNT(*) AS foreign_key_count FROM sys.foreign_keys WHERE referenced_object_id IN ({}) GROUP BY referenced_object_id;" INDEX_QUERY2 = "SELECT i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id = {};" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index ad89cb342d529..d778348e6a2b7 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -2,6 +2,7 @@ import datadog_agent except ImportError: from ..stubs import datadog_agent +import time from datadog_checks.sqlserver.const import ( TABLES_IN_SCHEMA_QUERY, @@ -20,7 +21,6 @@ import time import json import copy -import pdb from datadog_checks.base.utils.db.utils import default_json_event_encoding @@ -106,7 +106,7 @@ def submit(self): self._log.warning("Boris Adding event to Agent queue with : {} schemas and {} tables.".format(len(schemas_debug), t_count)) #END debug code json_event = json.dumps(event, default=default_json_event_encoding) - self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) + #self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) self._submit_to_agent_queue(json_event) self.db_to_schemas = {} @@ -195,6 +195,8 @@ def _init_schema_collection(self): #sends all the data in one go but split in chunks (like Seth's solution) def collect_schemas_data(self): self._dataSubmitter.reset() + start_time = time.time() + self._log.warning("Starting schema collection {}".format(start_time)) # for now only setting host and tags and metada self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._tags, self._check._config.cloud_metadata) #returns Stop, Stop == True. 
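For reference, get_list_chunks, imported from datadog_checks.sqlserver.utils and used in the loop below, only needs to yield successive fixed-size slices of the table list; a minimal equivalent sketch:

    def get_list_chunks(items, chunk_size):
        # yield successive chunk_size-sized slices of items
        for i in range(0, len(items), chunk_size):
            yield items[i:i + chunk_size]

    # e.g. 120 tables with chunk_size=50 -> chunks of 50, 50 and 20 tables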
@@ -204,12 +206,16 @@ def fetch_schema_data(cursor, db_name): self._dataSubmitter.store_db_info(db_name, db_info) chunk_size = 50 for schema in schemas: - if self._dataSubmitter.exceeded_total_columns_number(): - self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) - return True - tables = self._get_tables(schema, cursor) - tables_chunk = list(get_list_chunks(tables, chunk_size)) + + tables = self._get_tables(schema, cursor) + #TODO sorting is purely for testing + sorted_tables = sorted(tables, key=lambda x: x['name']) + tables_chunk = list(get_list_chunks(sorted_tables, chunk_size)) for tables_chunk in tables_chunk: + if self._dataSubmitter.exceeded_total_columns_number(): + self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) + return True + self._log.warning("elapsed time {}".format(time.time() - start_time)) columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) self._dataSubmitter.store(db_name, schema, tables_info, columns_count) self._dataSubmitter.submit() # we force submit after each 50 tables chunk @@ -217,6 +223,7 @@ def fetch_schema_data(cursor, db_name): self._dataSubmitter.store(db_name, schema, [], 0) # we want to submit for each DB separetly for clarity self._dataSubmitter.submit() + self._log.error("Finished collecting for DB elapsed time {}".format(time.time() - start_time)) return False self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) # submit the last chunk of data if any diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 9bb2754ce1244..ff7475a86fffd 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -5,7 +5,6 @@ import copy import time from collections import defaultdict - import six from cachetools import TTLCache diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index b9d3136944e66..3b8f00fc63e18 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -35,11 +35,27 @@ GO --ADD CONSTRAINT PK_Cities PRIMARY KEY (id); --GO +--CREATE TABLE datadog_test_schemas.test_schema.cities ( +-- id INT NOT NULL DEFAULT 0, +-- name VARCHAR(255), +-- CONSTRAINT PK_Cities PRIMARY KEY (id) +--); + +-- Create the partition function +CREATE PARTITION FUNCTION CityPartitionFunction (INT) +AS RANGE LEFT FOR VALUES (100, 200, 300); -- Define your partition boundaries here + +-- Create the partition scheme +CREATE PARTITION SCHEME CityPartitionScheme +AS PARTITION CityPartitionFunction ALL TO ([PRIMARY]); -- Assign partitions to filegroups + +-- Create the partitioned table CREATE TABLE datadog_test_schemas.test_schema.cities ( id INT NOT NULL DEFAULT 0, name VARCHAR(255), CONSTRAINT PK_Cities PRIMARY KEY (id) -); +) ON CityPartitionScheme(id); -- Assign the partition scheme to the table + CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); @@ -67,7 +83,80 @@ CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( ); GO +-- Start of populate.sql +DECLARE @TableNamePrefix NVARCHAR(100) = 'dbm_employee_boris'; +DECLARE @Index INT = 1; +DECLARE @MaxTables INT = 10000; +WHILE @Index <= @MaxTables +BEGIN + DECLARE @TableName NVARCHAR(200) = @TableNamePrefix + '_' + CAST(@Index AS NVARCHAR(10)); + DECLARE @SQL NVARCHAR(MAX); + + SET @SQL = ' + CREATE TABLE ' + 
QUOTENAME(@TableName) + ' ( + id INT NOT NULL IDENTITY PRIMARY KEY, + username VARCHAR(200), + nickname VARCHAR(200), + email VARCHAR(200), + created_at DATETIME DEFAULT GETDATE(), + updated_at DATETIME DEFAULT GETDATE(), + username2 VARCHAR(200), +username3 VARCHAR(200), +username4 VARCHAR(200), +username5 VARCHAR(200), +username6 VARCHAR(200), +username7 VARCHAR(200), +username8 VARCHAR(200), +username9 VARCHAR(200), +username10 VARCHAR(200), +username11 VARCHAR(200), +username12 VARCHAR(200), +username13 VARCHAR(200), +username14 VARCHAR(200), +username15 VARCHAR(200), +username16 VARCHAR(200), +username17 VARCHAR(200), +username18 VARCHAR(200), +username19 VARCHAR(200), +username20 VARCHAR(200), +username21 VARCHAR(200), +username22 VARCHAR(200), +username23 VARCHAR(200), +username24 VARCHAR(200), +username25 VARCHAR(200), +username26 VARCHAR(200), +username27 VARCHAR(200), +username28 VARCHAR(200), +username29 VARCHAR(200), +username30 VARCHAR(200), +username31 VARCHAR(200), +username32 VARCHAR(200), +username33 VARCHAR(200), +username34 VARCHAR(200), +username35 VARCHAR(200), +username36 VARCHAR(200), +username37 VARCHAR(200), +username38 VARCHAR(200), +username39 VARCHAR(200), +username40 VARCHAR(200), +username41 VARCHAR(200), +username42 VARCHAR(200), +username43 VARCHAR(200), +username44 VARCHAR(200), +username45 VARCHAR(200), +username46 VARCHAR(200), +username47 VARCHAR(200), +username48 VARCHAR(200), +username49 VARCHAR(200), +username50 VARCHAR(200) + );'; + + EXEC sp_executesql @SQL, N'@TableNamePrefix NVARCHAR(100)', @TableNamePrefix; + + SET @Index = @Index + 1; +END; +-- End of populate.sql -- Create test database for integration tests -- only bob and fred have read/write access to this database From 1dc10e95dbb65749bb6957f220873d901076d7bf Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 3 May 2024 15:59:28 +0000 Subject: [PATCH 046/132] add time log for individual query --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index d778348e6a2b7..2272dc5b42e67 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -281,7 +281,9 @@ def _get_tables_data(self, table_list, schema, cursor): # TODO refactor the next 3 to have a base function when everythng is settled. 
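# One possible shape for that shared helper (an illustrative sketch, assuming every query keeps
# aliasing the object id as "id"): each _populate_with_*_data variant runs a query filtered on
# object_id IN (...) and attaches the resulting row to the matching table dict under a fixed key.
#
#     def _populate_with_data(self, query, key, table_ids, id_to_all, cursor):
#         cursor.execute(query.format(table_ids))
#         columns = [str(i[0]).lower() for i in cursor.description]
#         for row in (dict(zip(columns, r)) for r in cursor.fetchall()):
#             table_id = row.pop("id", None)
#             if table_id is None:
#                 self._log.warning("result row without an object id, skipping")
#                 continue
#             id_to_all[str(table_id)][key] = row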
def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, cursor): # get columns if we dont have a dict here unlike postgres + start_time = time.time() cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) + self._log.warning("Executed columns query for {} seconds".format(time.time() - start_time)) data = cursor.fetchall() columns = [] #TODO we need it cause if I put AS default its a forbidden key word and to be inline with postgres we need it From dd1f4380e0ab3dd636afa9c3fbdb5a16335528ae Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 3 May 2024 16:30:50 +0000 Subject: [PATCH 047/132] removed other jobs --- sqlserver/datadog_checks/sqlserver/schemas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 2272dc5b42e67..4a8435697fbc7 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -223,10 +223,11 @@ def fetch_schema_data(cursor, db_name): self._dataSubmitter.store(db_name, schema, [], 0) # we want to submit for each DB separetly for clarity self._dataSubmitter.submit() - self._log.error("Finished collecting for DB elapsed time {}".format(time.time() - start_time)) + self._log.error("Finished collecting for DB - {} elapsed time {}".format(db_name, time.time() - start_time)) return False self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) # submit the last chunk of data if any + self._log.error("Finished collect_schemas_data") self._dataSubmitter.submit() From cf20c1829ead56329d301533a0090f039aba6df2 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 3 May 2024 17:38:24 +0000 Subject: [PATCH 048/132] Added timestamps --- sqlserver/datadog_checks/sqlserver/schemas.py | 9 +++++++++ sqlserver/tests/compose/setup.sql | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 4a8435697fbc7..fe826a4da22b9 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -216,9 +216,18 @@ def fetch_schema_data(cursor, db_name): self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) return True self._log.warning("elapsed time {}".format(time.time() - start_time)) + + start_get_tables_time = time.time() columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) + self._log.warning("_get_tables_data time {}".format(time.time() - start_get_tables_time)) + + start_store_time = time.time() self._dataSubmitter.store(db_name, schema, tables_info, columns_count) + self._log.warning("store time {}".format(time.time() - start_store_time)) + + start_submit_time = time.time() self._dataSubmitter.submit() # we force submit after each 50 tables chunk + self._log.warning("submit time {}".format(time.time() - start_submit_time)) if len(tables) == 0: self._dataSubmitter.store(db_name, schema, [], 0) # we want to submit for each DB separetly for clarity diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 3b8f00fc63e18..5703699e1788d 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -86,7 +86,7 @@ GO -- Start of populate.sql DECLARE @TableNamePrefix NVARCHAR(100) = 'dbm_employee_boris'; DECLARE @Index INT = 1; -DECLARE @MaxTables INT = 10000; +DECLARE 
@MaxTables INT = 10; WHILE @Index <= @MaxTables BEGIN From 7366af8ab11da46320a240143bc2f85572c9d8f9 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Sat, 4 May 2024 00:13:28 +0000 Subject: [PATCH 049/132] Add more logs --- sqlserver/datadog_checks/sqlserver/schemas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index fe826a4da22b9..d88925112d114 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -294,7 +294,10 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, start_time = time.time() cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) self._log.warning("Executed columns query for {} seconds".format(time.time() - start_time)) + start_time_fetch = time.time() data = cursor.fetchall() + self._log.warning("Executed cursor.fetchall()for {} seconds".format(time.time() - start_time_fetch)) + start_time_rest = time.time() columns = [] #TODO we need it cause if I put AS default its a forbidden key word and to be inline with postgres we need it for i in cursor.description: @@ -317,6 +320,7 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, else: row["nullable"] = True id_to_all.get(table_id)["columns"] = id_to_all.get(table_id).get("columns",[]) + [row] + self._log.warning("Executed loops for {} seconds".format(time.time() - start_time_rest)) return len(data) def _populate_with_partitions_data(self, table_ids, id_to_all, cursor): From 983bd9ed53730ae66227e85509a039854e414f1a Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Sat, 4 May 2024 00:28:21 +0000 Subject: [PATCH 050/132] increase to 500 --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index d88925112d114..f9d6ce090f9b8 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -204,7 +204,7 @@ def fetch_schema_data(cursor, db_name): db_info = self._query_db_information(db_name, cursor) schemas = self._query_schema_information(cursor) self._dataSubmitter.store_db_info(db_name, db_info) - chunk_size = 50 + chunk_size = 500 for schema in schemas: tables = self._get_tables(schema, cursor) From 6b591b185f56571af8c9fda1e4cfb48747ed2cc0 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 6 May 2024 08:54:13 +0000 Subject: [PATCH 051/132] removing postgres simulation --- sqlserver/datadog_checks/sqlserver/const.py | 2 +- sqlserver/datadog_checks/sqlserver/schemas.py | 15 +++++++++------ sqlserver/tests/compose/setup.sql | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index efe5beb3d57ba..863af5cbf14a6 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -283,7 +283,7 @@ #WHERE attrelid IN ({table_ids}) COLUMN_QUERY3 = "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" -COLUMN_QUERY = "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" +COLUMN_QUERY = "SELECT 
COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME, ORDINAL_POSITION FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" #TODO add ORDER BY ORDINAL_POSITION; ? #"SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ('boris', OBJECT_NAME(917578307)) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index f9d6ce090f9b8..894bf0d2c1d28 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -17,7 +17,7 @@ from datadog_checks.sqlserver.utils import ( execute_query_output_result_as_a_dict, get_list_chunks ) - +import pdb import time import json import copy @@ -92,7 +92,9 @@ def submit(self): db_info["name"] = db else: db_info = self.db_info[db] + #event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] + pdb.set_trace() #TODO Remove Debug Code, calculate tables and schemas sent : schemas_debug = list(schemas_by_id.values()) t_count = 0 @@ -204,9 +206,10 @@ def fetch_schema_data(cursor, db_name): db_info = self._query_db_information(db_name, cursor) schemas = self._query_schema_information(cursor) self._dataSubmitter.store_db_info(db_name, db_info) - chunk_size = 500 + chunk_size = 50 for schema in schemas: - + if schema['name'] != 'test_schema': + continue tables = self._get_tables(schema, cursor) #TODO sorting is purely for testing sorted_tables = sorted(tables, key=lambda x: x['name']) @@ -282,9 +285,9 @@ def _get_tables_data(self, table_list, schema, cursor): name_to_id[t["name"]] = t["id"] id_to_all[t["id"]] = t total_columns_number = self._populate_with_columns_data(table_ids_object, name_to_id, id_to_all, schema, cursor) - #self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - #self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - #self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model + self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model + self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model + self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model # unwrap id_to_all return total_columns_number, list(id_to_all.values()) diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 5703699e1788d..deaee35cd17a8 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -86,7 +86,7 @@ GO -- Start of populate.sql DECLARE @TableNamePrefix NVARCHAR(100) = 'dbm_employee_boris'; DECLARE @Index INT = 1; -DECLARE @MaxTables INT = 10; +DECLARE @MaxTables INT = 0; WHILE @Index <= @MaxTables BEGIN From 3009c4523ac2354508de3cac239f5fe0f66ea272 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 6 May 2024 09:44:30 +0000 Subject: [PATCH 052/132] fix errors --- sqlserver/datadog_checks/sqlserver/schemas.py | 51 +++---------------- sqlserver/tests/test_metadata.py | 8 ++- 2 files 
changed, 11 insertions(+), 48 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 894bf0d2c1d28..cd2f56024aec1 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -25,11 +25,11 @@ from datadog_checks.base.utils.db.utils import default_json_event_encoding class SubmitData: - MAX_COLUMN_COUNT = 100_000 + MAX_COLUMN_COUNT = 10_000 # REDAPL has a 3MB limit per resource #TODO Report truncation to the backend - MAX_TOTAL_COLUMN_COUNT = 250_000 + MAX_TOTAL_COLUMN_COUNT = 100_000 def __init__(self, submit_data_function, base_event, logger): self._submit_to_agent_queue = submit_data_function @@ -68,12 +68,6 @@ def store(self, db_name, schema, tables, columns_count): if self._columns_count > self.MAX_COLUMN_COUNT: self._submit() - #TODO P - disable for p. - def tmp_modify_to_fit_in_postgres(self, db_info): - if "collation" in db_info: - del db_info["collation"] - return db_info - def exceeded_total_columns_number(self): return self._total_columns_count > self.MAX_TOTAL_COLUMN_COUNT @@ -92,23 +86,9 @@ def submit(self): db_info["name"] = db else: db_info = self.db_info[db] - #event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] - event["metadata"] = event["metadata"] + [{**(self.tmp_modify_to_fit_in_postgres(db_info)), "schemas": list(schemas_by_id.values())}] - pdb.set_trace() - #TODO Remove Debug Code, calculate tables and schemas sent : - schemas_debug = list(schemas_by_id.values()) - t_count = 0 - printed_first = False - for schema in schemas_debug: - t_count += len(schema['tables']) - if not printed_first and len(schema['tables']) >0: - printed_first = True - self._log.warning("One of tables db {} schema {} table {}".format( list(self.db_to_schemas.keys()), schema['name'], schema['tables'][0]["name"])) - - self._log.warning("Boris Adding event to Agent queue with : {} schemas and {} tables.".format(len(schemas_debug), t_count)) - #END debug code + event["metadata"] = event["metadata"] + [{**(db_info), "schemas": list(schemas_by_id.values())}] json_event = json.dumps(event, default=default_json_event_encoding) - #self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) + self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) self._submit_to_agent_queue(json_event) self.db_to_schemas = {} @@ -139,27 +119,14 @@ def __init__(self, check): base_event = { "host": hostname, "agent_version": datadog_agent.get_version(), - "dbms": "postgres", #TODO fake it until you make it - trying to pass this data as postgres for now - "kind": "pg_databases", # TODO pg_databases - will result in KindPgDatabases and so processor would thing its postgres + "dbms": "sqlserver", #TODO fake it until you make it - trying to pass this data as postgres for now + "kind": "sqlserver_databases", # TODO pg_databases - will result in KindPgDatabases and so processor would thing its postgres "collection_interval": 0.5, #dummy "dbms_version": "v14.2", #dummy but may be format i v11 is important ? "tags": self._tags, #in postgres it's no DB. 
"cloud_metadata": self._check._config.cloud_metadata, } - self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) - - # These are fields related to the work to do while doing the initial intake - # for diffs there should eb a self._done_db_list which will be used to see if new dbs have appeared/disappeared. - self._databases_to_query = [] - self._current_table_list = None - self._current_schema_list = None - self._number_of_collected_tables = 0 #TODO later switch to columns - - def reset_data_collection(self): - self._current_table_list = None - self._current_schema_list = None - self._number_of_collected_tables = 0 def _init_schema_collection(self): currently_known_databases = self._check.get_databases() @@ -208,12 +175,10 @@ def fetch_schema_data(cursor, db_name): self._dataSubmitter.store_db_info(db_name, db_info) chunk_size = 50 for schema in schemas: - if schema['name'] != 'test_schema': - continue tables = self._get_tables(schema, cursor) #TODO sorting is purely for testing - sorted_tables = sorted(tables, key=lambda x: x['name']) - tables_chunk = list(get_list_chunks(sorted_tables, chunk_size)) + #sorted_tables = sorted(tables, key=lambda x: x['name']) + tables_chunk = list(get_list_chunks(tables, chunk_size)) for tables_chunk in tables_chunk: if self._dataSubmitter.exceeded_total_columns_number(): self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 70ebf02c2df31..25cd66414d1d2 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -20,7 +20,7 @@ import pyodbc except ImportError: pyodbc = None - +import pdb import json @pytest.fixture @@ -106,7 +106,6 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): exp_datadog_test_schemas = {'id': '5', 'name': 'datadog_test_schemas', 'owner': 'dbo', 'schemas': [{'name': 'test_schema', 'id': '5', 'owner': '1', 'tables': [{'id': '885578193', 'name': 'cities', 'columns': [{'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}]}]}]} expected_data_for_db = {'datadog_test' : exp_datadog_test, 'datadog_test_schemas' : exp_datadog_test_schemas} - dbm_instance['database_autodiscovery'] = True dbm_instance['autodiscovery_include'] = ['datadog_test_schemas','datadog_test'] @@ -119,8 +118,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): actual_payloads = {} - #TODO later modify kind - for schema_event in (e for e in dbm_metadata if e['kind'] == 'pg_databases'): + for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): if len(databases_to_find) == 0: # we may see the correct payload for the database several times in events return @@ -136,7 +134,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): actual_payloads[db_name]['schemas'] = actual_payloads[db_name]['schemas'] + database_metadata[0]['schemas'] else: actual_payloads[db_name] = database_metadata[0] - + pdb.set_trace() assert len(actual_payloads) == len(expected_data_for_db) for db_name, actual_payload in actual_payloads.items(): From c44b8bd9b14604c8c8f8c64c949e7cdaf5bce7a3 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 6 May 2024 18:57:01 +0000 Subject: [PATCH 053/132] added collection interval --- sqlserver/assets/configuration/spec.yaml | 8 + 
sqlserver/datadog_checks/sqlserver/config.py | 3 +- sqlserver/datadog_checks/sqlserver/const.py | 9 +- sqlserver/datadog_checks/sqlserver/schemas.py | 278 ++++++++++-------- .../datadog_checks/sqlserver/sqlserver.py | 6 +- 5 files changed, 179 insertions(+), 125 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 0126a1af7f63a..53414b37e09b8 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -713,6 +713,14 @@ files: type: number example: 1800 display_default: false + - name: schemas_collection_interval + description: | + The database schema collection interval (in seconds). + Defaults to 1200 seconds to include everything. + value: + type: number + example: 600 + display_default: false - template: instances/default - template: logs example: diff --git a/sqlserver/datadog_checks/sqlserver/config.py b/sqlserver/datadog_checks/sqlserver/config.py index 99c3a12aa52ed..010f3352cb082 100644 --- a/sqlserver/datadog_checks/sqlserver/config.py +++ b/sqlserver/datadog_checks/sqlserver/config.py @@ -7,7 +7,7 @@ from datadog_checks.base.config import is_affirmative from datadog_checks.base.utils.common import to_native_string -from datadog_checks.sqlserver.const import DEFAULT_AUTODISCOVERY_INTERVAL, PROC_CHAR_LIMIT +from datadog_checks.sqlserver.const import DEFAULT_AUTODISCOVERY_INTERVAL, PROC_CHAR_LIMIT, DEFAULT_SCHEMAS_COLLECTION_INTERVAL class SQLServerConfig: @@ -23,6 +23,7 @@ def __init__(self, init_config, instance, log): self.autodiscovery_interval: int = instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL) self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include) self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude) + self.schemas_collection_interval: int = instance.get('schemas_collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL) self.proc: str = instance.get('stored_procedure') self.custom_metrics: list[dict] = init_config.get('custom_metrics', []) or [] diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 863af5cbf14a6..107c8fadf0daa 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -269,14 +269,17 @@ PROC_CHAR_LIMIT = 500 +#Schemas +DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 1200 + #for now description results in ('ODBC SQL type -150 is not yet supported. 
column-index=4 type=-150', 'HY106') DB_QUERY2 = "SELECT db.database_id AS id, db.name AS NAME, db.collation_name AS collation, dp.name AS owner, ep.value AS description FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid LEFT JOIN sys.extended_properties ep ON ep.major_id = db.database_id AND ep.minor_id = 0 AND ep.class = 0 AND ep.name = 'MS_Description' WHERE db.name = '{}';" DB_QUERY = "SELECT db.database_id AS id, db.name AS NAME, db.collation_name AS collation, dp.name AS owner FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid WHERE db.name = '{}';" #TODO as owner for the postgresbackend -SCHEMA_QUERY = "SELECT name,schema_id AS id,principal_id AS owner FROM sys.schemas WHERE name NOT IN ('sys', 'information_schema');" - -TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id FROM sys.tables WHERE schema_id={}" +SCHEMA_QUERY = "SELECT name,schema_id AS id, dp.name AS OwnerName, FROM sys.schemas AS s LEFT JOIN sys.database_principals dp ON s.principal_id = dp.principal_id WHERE s.name NOT IN ('sys', 'information_schema');" +SCHEMA_QUERY = "SELECT s.name AS name ,s.schema_id AS id, dp.name AS owner_name FROM sys.schemas AS s LEFT JOIN sys.database_principals dp ON s.principal_id = dp.principal_id WHERE s.name NOT IN ('sys', 'information_schema')"; +TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id AS id FROM sys.tables WHERE schema_id={}" COLUMN_QUERY3 = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}' and TABLE_SCHEMA='{}';" #this query returns several values in case there is an alias for an int ... COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id OR c.user_type_id = t.user_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index cd2f56024aec1..bb0a876d42637 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -11,7 +11,9 @@ INDEX_QUERY, FOREIGN_KEY_QUERY, SCHEMA_QUERY, - DB_QUERY + DB_QUERY, + STATIC_INFO_VERSION, + STATIC_INFO_ENGINE_EDITION ) from datadog_checks.sqlserver.utils import ( @@ -41,10 +43,11 @@ def __init__(self, submit_data_function, base_event, logger): self.db_to_schemas = {} # dbname : { id : schema } self.db_info = {} # name to info - def set_base_event_data(self, hostname, tags, cloud_metadata): + def set_base_event_data(self, hostname, tags, cloud_metadata, dbms_version): self._base_event["host"] = hostname self._base_event["tags"] = tags self._base_event["cloud_metadata"] = cloud_metadata + self._base_event["dbms_version"] = dbms_version def reset(self): self._columns_count = 0 @@ -92,119 +95,102 @@ def submit(self): self._submit_to_agent_queue(json_event) self.db_to_schemas = {} -#TODO Introduce total max for data class Schemas: - def __init__(self, check): + + # Requests for infromation about tables are done for a certain amount of tables at the time + # This number of tables doesnt slow down performance by much (15% compared to 500 tables) + # but allows the queue to be stable. 
+ TABLES_CHUNK_SIZE = 50 + + def __init__(self, check, schemas_collection_interval): self._check = check self._log = check.log self._tags = [t for t in check.tags if not t.startswith('dd.internal')] self._tags.append("boris:data") self.schemas_per_db = {} - """ - base_event = { - "host": self._check.resolved_hostname, - "agent_version": datadog_agent.get_version(), - "dbms": "sqlserver", #TODO ? - "kind": "", # TODO - #"collection_interval": self.schemas_collection_interval, - #"dbms_version": self._payload_pg_version(), - #"tags": self._tags_no_db, - #"cloud_metadata": self._config.cloud_metadata, - } - """ - #TODO error is just so that payload passes, shoud be removed - hostname = "error" - if self._check.resolved_hostname is not None: - hostname = self._check.resolved_hostname + base_event = { - "host": hostname, + "host": None, "agent_version": datadog_agent.get_version(), - "dbms": "sqlserver", #TODO fake it until you make it - trying to pass this data as postgres for now - "kind": "sqlserver_databases", # TODO pg_databases - will result in KindPgDatabases and so processor would thing its postgres - "collection_interval": 0.5, #dummy - "dbms_version": "v14.2", #dummy but may be format i v11 is important ? - "tags": self._tags, #in postgres it's no DB. + "dbms": "sqlserver", + "kind": "sqlserver_databases", + "collection_interval": schemas_collection_interval, + "dbms_version": None, + "tags": self._tags, #in postgres it's no DB ? "cloud_metadata": self._check._config.cloud_metadata, } self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) - - def _init_schema_collection(self): - currently_known_databases = self._check.get_databases() - if len(self._databases_to_query) == 0: - self._databases_to_query = self._check.get_databases() - return - else: - if self._databases_to_query[0] not in currently_known_databases: - #TODO if db dissapeared we invalidate indexes should be done in exception treatment of use DB ? 
- #if DB is not there the first use db will throw and we continue until we find an existing db or exaust the list - # the idea is always finish the existing DB list and then run "diff" logic which will create a new list of "tasks" - self.reset_data_collection() - #TODO update this at the very end as it constantly changing """schemas data struct is a dictionnary with key being a schema name the value is schema dict: "name": str "id": str - "principal_id": str - "tables" : [] - id : str - name : str - columns: list of columns - "columns": dict - name: str - data_type: str - default: str - is_nullable : str - indexes : list of indexes - important - foreign_keys : list of foreign keys + "owner_name": str + "tables" : list of tables dicts + table + dict: + "id" : str + "name" : str + columns: list of columns dicts + columns + dict: + "name": str + "data_type": str + "default": str + "nullable": bool + indexes : list of index dicts + index + dict: + "name": str + "type": str + "is_unique": bool + "is_primary_key": bool + "is_unique_constraint": bool + "is_disabled": bool, + "column_names": str + foreign_keys : list of foreign key dicts + foreign_key + dict: + "foreign_key_name": str + "referencing_table": str + "referencing_column": str + "referenced_table": str + "referenced_column": str + partitions: list of partitions dict + partition + dict: + "partition_count": int partitions useful to know the number """ - - #sends all the data in one go but split in chunks (like Seth's solution) def collect_schemas_data(self): self._dataSubmitter.reset() - start_time = time.time() - self._log.warning("Starting schema collection {}".format(start_time)) - # for now only setting host and tags and metada - self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._tags, self._check._config.cloud_metadata) - #returns Stop, Stop == True. + self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._tags, self._check._config.cloud_metadata, + "{},{}".format( + self._check.static_info_cache.get(STATIC_INFO_VERSION, ""), + self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION, ""),) + ) + #returns if to stop, True means stop iterating. 
def fetch_schema_data(cursor, db_name): db_info = self._query_db_information(db_name, cursor) schemas = self._query_schema_information(cursor) self._dataSubmitter.store_db_info(db_name, db_info) - chunk_size = 50 for schema in schemas: - tables = self._get_tables(schema, cursor) - #TODO sorting is purely for testing - #sorted_tables = sorted(tables, key=lambda x: x['name']) - tables_chunk = list(get_list_chunks(tables, chunk_size)) + tables = self._get_tables(schema, cursor) + tables_chunk = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) for tables_chunk in tables_chunk: if self._dataSubmitter.exceeded_total_columns_number(): self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) - return True - self._log.warning("elapsed time {}".format(time.time() - start_time)) - - start_get_tables_time = time.time() + return True columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) - self._log.warning("_get_tables_data time {}".format(time.time() - start_get_tables_time)) - - start_store_time = time.time() self._dataSubmitter.store(db_name, schema, tables_info, columns_count) - self._log.warning("store time {}".format(time.time() - start_store_time)) - - start_submit_time = time.time() - self._dataSubmitter.submit() # we force submit after each 50 tables chunk - self._log.warning("submit time {}".format(time.time() - start_submit_time)) + self._dataSubmitter.submit() # Submit is forced after each 50 tables chunk if len(tables) == 0: self._dataSubmitter.store(db_name, schema, [], 0) - # we want to submit for each DB separetly for clarity self._dataSubmitter.submit() - self._log.error("Finished collecting for DB - {} elapsed time {}".format(db_name, time.time() - start_time)) return False self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) - # submit the last chunk of data if any - self._log.error("Finished collect_schemas_data") + self._log.debug("Finished collect_schemas_data") self._dataSubmitter.submit() @@ -216,56 +202,120 @@ def _query_db_information(self, db_name, cursor): return None # TODO how often ? - #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? - - #TODO Looks fine similar to Postgres, do we need to do someting with prinicipal_id - # or reporting principal_id is ok + """schemas data struct is a dictionnary with key being a schema name the value is + schema + dict: + "name": str + "id": str + "owner_name": str + "tables" : list of tables dicts + table + dict: + "id" : str + "name" : str + columns: list of columns dicts + columns + dict: + "name": str + "data_type": str + "default": str + "nullable": bool + indexes : list of index dicts + index + dict: + "name": str + "type": str + "is_unique": bool + "is_primary_key": bool + "is_unique_constraint": bool + "is_disabled": bool, + "column_names": str + foreign_keys : list of foreign key dicts + foreign_key + dict: + "foreign_key_name": str + "referencing_table": str + "referencing_column": str + "referenced_table": str + "referenced_column": str + partitions: list of partitions dict + partition + dict: + "partition_count": int + partitions useful to know the number + """ + """fetches schemas dict + schema + dict: + "name": str + "id": str + "owner_name": str""" def _query_schema_information(self, cursor): - - # principal_id is kind of like an owner not sure if need it. 
- self._log.debug("collecting db schemas") self._log.debug("Running query [%s]", SCHEMA_QUERY) cursor.execute(SCHEMA_QUERY) schemas = [] columns = [i[0] for i in cursor.description] schemas = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] - #TODO we can refactor it , doesnt have to have a tables :[] if there is nothing. - for schema in schemas: - schema["tables"] = [] self._log.debug("fetched schemas len(rows)=%s", len(schemas)) return schemas - - #TODO collect diffs : we need to take care of new DB / removed DB . schemas new removed - # will nedd a separate query for changed indexes + + """ returns extracted column numbers and a list of tables + "tables" : list of tables dicts + table + dict: + "id" : str + "name" : str + columns: list of columns dicts + columns + dict: + "name": str + "data_type": str + "default": str + "nullable": bool + indexes : list of index dicts + index + dict: + "name": str + "type": str + "is_unique": bool + "is_primary_key": bool + "is_unique_constraint": bool + "is_disabled": bool, + "column_names": str + foreign_keys : list of foreign key dicts + foreign_key + dict: + "foreign_key_name": str + "referencing_table": str + "referencing_column": str + "referenced_table": str + "referenced_column": str + partitions: list of partitions dict + partition + dict: + "partition_count": int + """ def _get_tables_data(self, table_list, schema, cursor): if len(table_list) == 0: return name_to_id = {} - id_to_all = {} - #table_names = ",".join(["'{}'".format(t.get("name")) for t in table_list]) - #OBJECT_NAME is needed to make it work for special characters + id_to_table_data = {} table_ids_object = ",".join(["OBJECT_NAME({})".format(t.get("id")) for t in table_list]) table_ids = ",".join(["{}".format(t.get("id")) for t in table_list]) for t in table_list: name_to_id[t["name"]] = t["id"] - id_to_all[t["id"]] = t - total_columns_number = self._populate_with_columns_data(table_ids_object, name_to_id, id_to_all, schema, cursor) - self._populate_with_partitions_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - self._populate_with_foreign_keys_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - self._populate_with_index_data(table_ids, id_to_all, cursor) #TODO P DISABLED as postgrss backend accepts different data model - # unwrap id_to_all - return total_columns_number, list(id_to_all.values()) + id_to_table_data[t["id"]] = t + total_columns_number = self._populate_with_columns_data(table_ids_object, name_to_id, id_to_table_data, schema, cursor) + self._populate_with_partitions_data(table_ids, id_to_table_data, cursor) + self._populate_with_foreign_keys_data(table_ids, id_to_table_data, cursor) + self._populate_with_index_data(table_ids, id_to_table_data, cursor) + return total_columns_number, list(id_to_table_data.values()) # TODO refactor the next 3 to have a base function when everythng is settled. 
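Each of the schema queries above follows the same cursor-to-dict pattern, which a later patch in this series factors into execute_query_output_result_as_a_dict in utils.py. A simplified sketch of that shared helper, assuming a plain DB-API cursor (the name rows_as_dicts is used only for illustration):

    def rows_as_dicts(query, cursor):
        # Execute the query and read the selected column names from cursor.description.
        cursor.execute(query)
        columns = [str(column[0]).lower() for column in cursor.description]
        # Zip every fetched row with the lowercased column names; values are stringified
        # so the result can be serialized to JSON without type surprises.
        return [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()]

The real helper in utils.py additionally accepts an optional modify_columns callback for callers that need to rename or adjust the column names.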
def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, cursor): # get columns if we dont have a dict here unlike postgres - start_time = time.time() cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) - self._log.warning("Executed columns query for {} seconds".format(time.time() - start_time)) - start_time_fetch = time.time() data = cursor.fetchall() - self._log.warning("Executed cursor.fetchall()for {} seconds".format(time.time() - start_time_fetch)) - start_time_rest = time.time() columns = [] #TODO we need it cause if I put AS default its a forbidden key word and to be inline with postgres we need it for i in cursor.description: @@ -288,7 +338,6 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, else: row["nullable"] = True id_to_all.get(table_id)["columns"] = id_to_all.get(table_id).get("columns",[]) + [row] - self._log.warning("Executed loops for {} seconds".format(time.time() - start_time_rest)) return len(data) def _populate_with_partitions_data(self, table_ids, id_to_all, cursor): @@ -330,21 +379,12 @@ def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): print("todo error") print("end") #return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) - - - #TODO in SQLServer partitioned child tables should have the same object_id might be worth checking with a test. - #TODOTODO do we need this map/list format if we are not dumping in json ??? May be we need to send query results as they are ? def _get_tables(self, schema, cursor): cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["id"])) columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] #TODO may be more optimal to patch columns with index etc - # rows = [dict(zip(columns + ["columns", "indexes", "partitions", "foreign_keys"], row + [[], [], [], []])) for row in cursor.fetchall()] #TODO may be this works - #return [ {"id" : row["object_id"], "name" : row['name'], "columns" : [], "indexes" : [], "partitions" : [], "foreign_keys" : []} for row in rows ] # TODO P disabled because of postgres later enable - return [ {"id" : str(row["object_id"]), "name" : row['name'], "columns" : []} for row in rows ] - - #TODO table 1803153469 is in sys.indexes but not in sys.index_columns ... shell we do something about it ? - + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + return [ {"id" : str(row["id"]), "name" : row['name'], "columns" : []} for row in rows ] #TODO its hard to get the partition key - for later ? 
diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index ff7475a86fffd..a738fd5dff069 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -84,7 +84,7 @@ is_azure_sql_database, set_default_driver_conf, ) - +import pdb try: import adodbapi except ImportError: @@ -125,7 +125,7 @@ def __init__(self, name, init_config, instances): self._sql_counter_types = {} self.proc_type_mapping = {"gauge": self.gauge, "rate": self.rate, "histogram": self.histogram} - self._schemas = Schemas(self) + self._schemas = Schemas(self, self._config.schemas_collection_interval) # DBM self.statement_metrics = SqlserverStatementMetrics(self, self._config) @@ -159,6 +159,7 @@ def __init__(self, name, init_config, instances): self.sqlserver_incr_fraction_metric_previous_values = {} self._database_metrics = None + self._last_schemas_collect_time = None def cancel(self): self.statement_metrics.cancel() @@ -746,6 +747,7 @@ def do_for_databases(self, action, databases): if stop: break; except Exception as e: + pdb.set_trace() print("An exception occurred during do_for_databases in db - {}: {}".format(db, e)) # Switch DB back to MASTER if not is_azure_sql_database(engine_edition): From 03858a13e0b8111cc66cd1cdd08dc51cdc2204f9 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 6 May 2024 19:41:17 +0000 Subject: [PATCH 054/132] Added arrays to indexes nd partitions --- sqlserver/datadog_checks/sqlserver/schemas.py | 33 ++++++++++++------- sqlserver/tests/compose/setup.sql | 4 ++- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index bb0a876d42637..5f43b646e599b 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -193,13 +193,27 @@ def fetch_schema_data(cursor, db_name): self._log.debug("Finished collect_schemas_data") self._dataSubmitter.submit() - def _query_db_information(self, db_name, cursor): db_info = execute_query_output_result_as_a_dict(DB_QUERY.format(db_name), cursor) if len(db_info) == 1: return db_info[0] else: - return None + self._log.error("Couldnt query database information for %s", db_name) + return None + + """ returns a list of tables for schema with their names and empty column array + list of table dicts + "id": str + "name": str + "columns": [] + """ + def _get_tables(self, schema, cursor): + cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["id"])) + columns = [str(i[0]).lower() for i in cursor.description] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + return [ {"id" : str(row["id"]), "name" : row['name'], "columns" : []} for row in rows ] + + # TODO how often ? 
"""schemas data struct is a dictionnary with key being a schema name the value is @@ -251,12 +265,10 @@ def _query_db_information(self, db_name, cursor): "id": str "owner_name": str""" def _query_schema_information(self, cursor): - self._log.debug("Running query [%s]", SCHEMA_QUERY) cursor.execute(SCHEMA_QUERY) schemas = [] columns = [i[0] for i in cursor.description] schemas = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] - self._log.debug("fetched schemas len(rows)=%s", len(schemas)) return schemas """ returns extracted column numbers and a list of tables @@ -311,7 +323,6 @@ def _get_tables_data(self, table_list, schema, cursor): self._populate_with_index_data(table_ids, id_to_table_data, cursor) return total_columns_number, list(id_to_table_data.values()) - # TODO refactor the next 3 to have a base function when everythng is settled. def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, cursor): # get columns if we dont have a dict here unlike postgres cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) @@ -361,7 +372,8 @@ def _populate_with_index_data(self, table_ids, id_to_all, cursor): for row in rows: id = row.pop("id", None) if id is not None: - id_to_all.get(str(id))["indexes"] = row + id_to_all.get(str(id)).setdefault("indexes", []) + id_to_all.get(str(id))["indexes"].append(row) else: print("todo error") row.pop("id", None) @@ -374,17 +386,14 @@ def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): for row in rows: id = row.pop("id", None) if id is not None: - id_to_all.get(str(id))["foreign_keys"] = row + id_to_all.get(str(id)).setdefault("foreign_keys", []) + id_to_all.get(str(id))["foreign_keys"].append(row) else: print("todo error") print("end") #return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) - def _get_tables(self, schema, cursor): - cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["id"])) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - return [ {"id" : str(row["id"]), "name" : row['name'], "columns" : []} for row in rows ] + #TODO its hard to get the partition key - for later ? 
diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index deaee35cd17a8..d3f75fec8a1d5 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -53,13 +53,15 @@ AS PARTITION CityPartitionFunction ALL TO ([PRIMARY]); -- Assign partitions to f CREATE TABLE datadog_test_schemas.test_schema.cities ( id INT NOT NULL DEFAULT 0, name VARCHAR(255), + population INT NOT NULL DEFAULT 0, CONSTRAINT PK_Cities PRIMARY KEY (id) ) ON CityPartitionScheme(id); -- Assign the partition scheme to the table CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +CREATE INDEX single_column_index ON datadog_test_schemas.test_schema.cities (population); -INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey'), (2, 'bar'); +INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey', 100), (2, 'bar', 200); GO CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); GO From 5452f01477d5cf1c92e7ab20ccde37e0d81a06bf Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 6 May 2024 20:49:22 +0000 Subject: [PATCH 055/132] added error logs --- sqlserver/datadog_checks/sqlserver/schemas.py | 59 ++++++++++--------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 5f43b646e599b..f991c377b62d3 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -213,9 +213,6 @@ def _get_tables(self, schema, cursor): rows = [dict(zip(columns, row)) for row in cursor.fetchall()] return [ {"id" : str(row["id"]), "name" : row['name'], "columns" : []} for row in rows ] - - # TODO how often ? 
- """schemas data struct is a dictionnary with key being a schema name the value is schema dict: @@ -323,71 +320,75 @@ def _get_tables_data(self, table_list, schema, cursor): self._populate_with_index_data(table_ids, id_to_table_data, cursor) return total_columns_number, list(id_to_table_data.values()) - def _populate_with_columns_data(self, table_ids, name_to_id, id_to_all, schema, cursor): - # get columns if we dont have a dict here unlike postgres + + """ + adds columns list data to each table in a provided list + """ + def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, schema, cursor): cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) data = cursor.fetchall() columns = [] - #TODO we need it cause if I put AS default its a forbidden key word and to be inline with postgres we need it - for i in cursor.description: - if str(i[0]).lower() == "column_default": - columns.append("default") - else: - columns.append(str(i[0]).lower()) - - + # AS default - cannot be used in sqlserver query as this word is reserved + columns = ["default" if str(i[0]).lower() == "column_default" else str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, [str(item) for item in row])) for row in data] for row in rows: table_id = name_to_id.get(str(row.get("table_name"))) if table_id is not None: - # exclude "table_name" from the row dict row.pop("table_name", None) if "nullable" in row: if row["nullable"].lower() == "no" or row["nullable"].lower() == "false": - #to make compatible with postgres row["nullable"] = False else: row["nullable"] = True - id_to_all.get(table_id)["columns"] = id_to_all.get(table_id).get("columns",[]) + [row] + if table_id in id_to_table_data: + id_to_table_data.get(table_id)["columns"] = id_to_table_data.get(table_id).get("columns",[]) + [row] + else: + self._log.error("Columns found for an unkown table with the object_id: %s", table_id) + else: + self._log.error("Couldn't find id of a table: %s", table_id) return len(data) - def _populate_with_partitions_data(self, table_ids, id_to_all, cursor): + """ + adds partitions dict to each table in a provided list + """ + def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): cursor.execute(PARTITIONS_QUERY.format(table_ids)) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: id = row.pop("id", None) - if id is not None: - #TODO what happens if not found ? 
- id_to_all.get(str(id))["partitions"] = row + if id is not None: + id_str = str(id) + if id_str in id_to_table_data: + id_to_table_data[id_str]["partitions"] = row + else: + self._log.error("Partition found for an unkown table with the object_id: %s", id_str) else: - print("todo error") - row.pop("id", None) - print("end") + self._log.error("Return rows of [%s] query should have id column", PARTITIONS_QUERY) - def _populate_with_index_data(self, table_ids, id_to_all, cursor): + def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): cursor.execute(INDEX_QUERY.format(table_ids)) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: id = row.pop("id", None) if id is not None: - id_to_all.get(str(id)).setdefault("indexes", []) - id_to_all.get(str(id))["indexes"].append(row) + id_to_table_data.get(str(id)).setdefault("indexes", []) + id_to_table_data.get(str(id))["indexes"].append(row) else: print("todo error") row.pop("id", None) print("end") - def _populate_with_foreign_keys_data(self, table_ids, id_to_all, cursor): + def _populate_with_foreign_keys_data(self, table_ids, id_to_table_data, cursor): cursor.execute(FOREIGN_KEY_QUERY.format(table_ids)) columns = [str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, row)) for row in cursor.fetchall()] for row in rows: id = row.pop("id", None) if id is not None: - id_to_all.get(str(id)).setdefault("foreign_keys", []) - id_to_all.get(str(id))["foreign_keys"].append(row) + id_to_table_data.get(str(id)).setdefault("foreign_keys", []) + id_to_table_data.get(str(id))["foreign_keys"].append(row) else: print("todo error") print("end") From 1b9fc98e74edfaa81608faa30aee56593a7b2260 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 6 May 2024 22:24:56 +0000 Subject: [PATCH 056/132] formatted queries --- sqlserver/datadog_checks/sqlserver/const.py | 84 ++++++++++++------- sqlserver/datadog_checks/sqlserver/schemas.py | 46 +++++----- .../datadog_checks/sqlserver/sqlserver.py | 8 +- sqlserver/tests/test_metadata.py | 4 +- 4 files changed, 78 insertions(+), 64 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 107c8fadf0daa..8762726a72ec9 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -272,35 +272,55 @@ #Schemas DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 1200 -#for now description results in ('ODBC SQL type -150 is not yet supported. 
column-index=4 type=-150', 'HY106') -DB_QUERY2 = "SELECT db.database_id AS id, db.name AS NAME, db.collation_name AS collation, dp.name AS owner, ep.value AS description FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid LEFT JOIN sys.extended_properties ep ON ep.major_id = db.database_id AND ep.minor_id = 0 AND ep.class = 0 AND ep.name = 'MS_Description' WHERE db.name = '{}';" -DB_QUERY = "SELECT db.database_id AS id, db.name AS NAME, db.collation_name AS collation, dp.name AS owner FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid WHERE db.name = '{}';" - -#TODO as owner for the postgresbackend -SCHEMA_QUERY = "SELECT name,schema_id AS id, dp.name AS OwnerName, FROM sys.schemas AS s LEFT JOIN sys.database_principals dp ON s.principal_id = dp.principal_id WHERE s.name NOT IN ('sys', 'information_schema');" -SCHEMA_QUERY = "SELECT s.name AS name ,s.schema_id AS id, dp.name AS owner_name FROM sys.schemas AS s LEFT JOIN sys.database_principals dp ON s.principal_id = dp.principal_id WHERE s.name NOT IN ('sys', 'information_schema')"; -TABLES_IN_SCHEMA_QUERY = "SELECT name, object_id AS id FROM sys.tables WHERE schema_id={}" -COLUMN_QUERY3 = "SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT , IS_NULLABLE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{}' and TABLE_SCHEMA='{}';" -#this query returns several values in case there is an alias for an int ... -COLUMN_QUERY2 = "SELECT c.name AS name, t.name AS data_type, c.is_nullable AS is_nullable, dc.definition AS default_value FROM sys.columns c JOIN sys.types t ON c.system_type_id = t.system_type_id OR c.user_type_id = t.user_type_id LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id WHERE c.object_id = {}" - -#WHERE attrelid IN ({table_ids}) -COLUMN_QUERY3 = "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" -COLUMN_QUERY = "SELECT COLUMN_NAME AS name, DATA_TYPE AS data_type, COLUMN_DEFAULT, IS_NULLABLE AS nullable , TABLE_NAME, ORDINAL_POSITION FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ({}) and TABLE_SCHEMA='{}';" -#TODO add ORDER BY ORDINAL_POSITION; ? -#"SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME IN ('boris', OBJECT_NAME(917578307)) - -#PARTITIONS_QUERY2 = "SELECT ps.name AS partition_scheme, pf.name AS partition_function FROM sys.tables t INNER JOIN sys.indexes i ON t.object_id = i.object_id INNER JOIN sys.partition_schemes ps ON i.data_space_id = ps.data_space_id INNER JOIN sys.partition_functions pf ON ps.function_id = pf.function_id WHERE t.object_id = {};" -PARTITIONS_QUERY2 = "SELECT COUNT(*) FROM sys.partitions WHERE object_id = {};" -PARTITIONS_QUERY = "SELECT object_id AS id, COUNT(*) AS partition_count FROM sys.partitions WHERE object_id IN ({}) GROUP BY object_id;" -#parent_object_id - is the one of the parent table. 
-FOREIGN_KEY_QUERY3 = "SELECT referenced_object_id, COUNT(*) AS foreign_key_count FROM sys.foreign_keys WHERE referenced_object_id IN ({}) GROUP BY referenced_object_id;" -INDEX_QUERY2 = "SELECT i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, c.name AS column_name FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id = {};" -# May be this query is wrong like what if index is build on 2 columns will this work ? to test ? -INDEX_QUERY = "SELECT i.object_id AS id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, STRING_AGG(c.name, ',') AS column_names FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled;" -#INDEX_QUERY = "SELECT i.object_id AS object_id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, STRING_AGG(c.name, ',') AS column_names FROM sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id IN ({}) GROUP BY i.object_id, i.name;" - -#FOREIGN_KEY_QUERY2 = "SELECT name , OBJECT_NAME(parent_object_id) AS parent_table FROM sys.foreign_keys WHERE object_id={};" -FOREIGN_KEY_QUERY2 = "SELECT COUNT(*) FROM sys.foreign_keys WHERE referenced_object_id = {};" - -FOREIGN_KEY_QUERY="SELECT FK.referenced_object_id AS id, FK.name AS foreign_key_name, OBJECT_NAME(FK.parent_object_id) AS referencing_table, STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, OBJECT_NAME(FK.referenced_object_id) AS referenced_table, STRING_AGG(COL_NAME(FKC.referenced_object_id, FKC.referenced_column_id),',') AS referenced_column FROM sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id WHERE FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id;" +DB_QUERY = """SELECT + db.database_id AS id, db.name AS name, db.collation_name AS collation, dp.name AS owner + FROM + sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid + WHERE db.name = '{}';""" + +SCHEMA_QUERY = """SELECT + s.name AS name, s.schema_id AS id, dp.name AS owner_name + FROM + sys.schemas AS s JOIN sys.database_principals dp ON s.principal_id = dp.principal_id + WHERE s.name NOT IN ('sys', 'information_schema')"""; + +TABLES_IN_SCHEMA_QUERY = """SELECT + name, object_id AS id + FROM + sys.tables + WHERE schema_id={}""" + +COLUMN_QUERY = """SELECT + column_name AS name, data_type, column_default, is_nullable AS nullable , table_name, ordinal_position + FROM + information_schema.columns + WHERE + table_name IN ({}) and table_schema='{}';""" + +PARTITIONS_QUERY = """SELECT + object_id AS id, COUNT(*) AS partition_count + FROM + sys.partitions + WHERE + object_id IN ({}) GROUP BY object_id;""" + +INDEX_QUERY = """SELECT + i.object_id AS id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, + i.is_disabled, STRING_AGG(c.name, ',') AS column_names + FROM + sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id + AND 
i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id + WHERE + i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, + i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled;""" + +FOREIGN_KEY_QUERY="""SELECT + FK.referenced_object_id AS id, FK.name AS foreign_key_name, + OBJECT_NAME(FK.parent_object_id) AS referencing_table, + STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, + OBJECT_NAME(FK.referenced_object_id) AS referenced_table, + STRING_AGG(COL_NAME(FKC.referenced_object_id, FKC.referenced_column_id),',') AS referenced_column + FROM + sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id + WHERE + FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id;""" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index f991c377b62d3..6fa95975917bc 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -30,7 +30,6 @@ class SubmitData: MAX_COLUMN_COUNT = 10_000 # REDAPL has a 3MB limit per resource - #TODO Report truncation to the backend MAX_TOTAL_COLUMN_COUNT = 100_000 def __init__(self, submit_data_function, base_event, logger): @@ -66,7 +65,7 @@ def store(self, db_name, schema, tables, columns_count): known_tables = schemas[schema["id"]].setdefault("tables",[]) known_tables = known_tables + tables else: - schemas[schema["id"]] = copy.deepcopy(schema) # TODO a deep copy ? kind of costs not much to be safe + schemas[schema["id"]] = copy.deepcopy(schema) schemas[schema["id"]]["tables"] = tables if self._columns_count > self.MAX_COLUMN_COUNT: self._submit() @@ -85,7 +84,7 @@ def submit(self): for db, schemas_by_id in self.db_to_schemas.items(): db_info = {} if db not in self.db_info: - #TODO log error + self._log.error("Couldn't find database info for %s", db) db_info["name"] = db else: db_info = self.db_info[db] @@ -105,8 +104,6 @@ class Schemas: def __init__(self, check, schemas_collection_interval): self._check = check self._log = check.log - self._tags = [t for t in check.tags if not t.startswith('dd.internal')] - self._tags.append("boris:data") self.schemas_per_db = {} base_event = { @@ -116,7 +113,7 @@ def __init__(self, check, schemas_collection_interval): "kind": "sqlserver_databases", "collection_interval": schemas_collection_interval, "dbms_version": None, - "tags": self._tags, #in postgres it's no DB ? 
+ "tags": self._check.non_internal_tags, "cloud_metadata": self._check._config.cloud_metadata, } self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) @@ -165,7 +162,7 @@ def __init__(self, check, schemas_collection_interval): """ def collect_schemas_data(self): self._dataSubmitter.reset() - self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._tags, self._check._config.cloud_metadata, + self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._check.non_internal_tags, self._check._config.cloud_metadata, "{},{}".format( self._check.static_info_cache.get(STATIC_INFO_VERSION, ""), self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION, ""),) @@ -180,6 +177,7 @@ def fetch_schema_data(cursor, db_name): tables_chunk = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) for tables_chunk in tables_chunk: if self._dataSubmitter.exceeded_total_columns_number(): + #TODO Report truncation to the backend self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) return True columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) @@ -320,7 +318,6 @@ def _get_tables_data(self, table_list, schema, cursor): self._populate_with_index_data(table_ids, id_to_table_data, cursor) return total_columns_number, list(id_to_table_data.values()) - """ adds columns list data to each table in a provided list """ @@ -373,12 +370,14 @@ def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): for row in rows: id = row.pop("id", None) if id is not None: - id_to_table_data.get(str(id)).setdefault("indexes", []) - id_to_table_data.get(str(id))["indexes"].append(row) + id_str = str(id) + if id_str in id_to_table_data: + id_to_table_data[id_str].setdefault("indexes", []) + id_to_table_data[id_str]["indexes"].append(row) + else: + self._log.error("Index found for an unkown table with the object_id: %s", id_str) else: - print("todo error") - row.pop("id", None) - print("end") + self._log.error("Return rows of [%s] query should have id column", INDEX_QUERY) def _populate_with_foreign_keys_data(self, table_ids, id_to_table_data, cursor): cursor.execute(FOREIGN_KEY_QUERY.format(table_ids)) @@ -387,19 +386,12 @@ def _populate_with_foreign_keys_data(self, table_ids, id_to_table_data, cursor): for row in rows: id = row.pop("id", None) if id is not None: - id_to_table_data.get(str(id)).setdefault("foreign_keys", []) - id_to_table_data.get(str(id))["foreign_keys"].append(row) + id_str = str(id) + if id_str in id_to_table_data: + id_to_table_data.get(str(id)).setdefault("foreign_keys", []) + id_to_table_data.get(str(id))["foreign_keys"].append(row) + else: + self._log.error("Foreign key found for an unkown table with the object_id: %s", id_str) else: - print("todo error") - print("end") - #return execute_query_output_result_as_a_dict(COLUMN_QUERY.format(table_name, schema_name), cursor) - - - - #TODO its hard to get the partition key - for later ? - - # TODO check out sys.partitions in postgres we deliver some data about patitions - # "partition_key": str (if has partitions) - equiv ? 
- # may be use this https://littlekendra.com/2016/03/15/find-the-partitioning-key-on-an-existing-table-with-partition_ordinal/ - # for more in depth search, it's not trivial to determine partition key like in Postgres + self._log.error("Return rows of [%s] query should have id column", FOREIGN_KEY_QUERY) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index a738fd5dff069..1b30ef2bd9c85 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -125,8 +125,6 @@ def __init__(self, name, init_config, instances): self._sql_counter_types = {} self.proc_type_mapping = {"gauge": self.gauge, "rate": self.rate, "histogram": self.histogram} - self._schemas = Schemas(self, self._config.schemas_collection_interval) - # DBM self.statement_metrics = SqlserverStatementMetrics(self, self._config) self.procedure_metrics = SqlserverProcedureMetrics(self, self._config) @@ -145,7 +143,7 @@ def __init__(self, name, init_config, instances): ) # type: TTLCache # Keep a copy of the tags before the internal resource tags are set so they can be used for paths that don't # go through the agent internal metrics submission processing those tags - self._non_internal_tags = copy.deepcopy(self.tags) + self.non_internal_tags = copy.deepcopy(self.tags) self.check_initializations.append(self.initialize_connection) self.check_initializations.append(self.set_resolved_hostname) self.check_initializations.append(self.set_resolved_hostname_metadata) @@ -159,7 +157,9 @@ def __init__(self, name, init_config, instances): self.sqlserver_incr_fraction_metric_previous_values = {} self._database_metrics = None + self._last_schemas_collect_time = None + self._schemas = Schemas(self, self._config.schemas_collection_interval) def cancel(self): self.statement_metrics.cancel() @@ -1053,7 +1053,7 @@ def _send_database_instance_metadata(self): self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION, ""), ), "integration_version": __version__, - "tags": self._non_internal_tags, + "tags": self.non_internal_tags, "timestamp": time.time() * 1000, "cloud_metadata": self._config.cloud_metadata, "metadata": { diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 25cd66414d1d2..5594d36024d86 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -146,9 +146,11 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) #difference = {} + diff_keys = list(difference.keys()) if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: - logging.debug("found the following diffs %s", json.dumps(difference)) + pdb.set_trace() + logging.debug("found the following diffs %s", str(difference)) assert False # we need a special comparison as order of columns matter From c68e849d2a2a102fa2aa87480510aa624a13e023 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 07:40:29 +0000 Subject: [PATCH 057/132] format queries --- sqlserver/datadog_checks/sqlserver/const.py | 42 ++++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 8762726a72ec9..e9b303a00dcac 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -272,39 +272,50 @@ #Schemas DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 1200 -DB_QUERY = 
"""SELECT +DB_QUERY = """ + SELECT db.database_id AS id, db.name AS name, db.collation_name AS collation, dp.name AS owner FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid - WHERE db.name = '{}';""" + WHERE db.name = '{}'; + """ -SCHEMA_QUERY = """SELECT +SCHEMA_QUERY = """ + SELECT s.name AS name, s.schema_id AS id, dp.name AS owner_name FROM sys.schemas AS s JOIN sys.database_principals dp ON s.principal_id = dp.principal_id - WHERE s.name NOT IN ('sys', 'information_schema')"""; + WHERE s.name NOT IN ('sys', 'information_schema') + """; -TABLES_IN_SCHEMA_QUERY = """SELECT +TABLES_IN_SCHEMA_QUERY = """ + SELECT name, object_id AS id FROM sys.tables - WHERE schema_id={}""" + WHERE schema_id={} + """ -COLUMN_QUERY = """SELECT +COLUMN_QUERY = """ + SELECT column_name AS name, data_type, column_default, is_nullable AS nullable , table_name, ordinal_position FROM information_schema.columns WHERE - table_name IN ({}) and table_schema='{}';""" + table_name IN ({}) and table_schema='{}'; + """ -PARTITIONS_QUERY = """SELECT +PARTITIONS_QUERY = """ + SELECT object_id AS id, COUNT(*) AS partition_count FROM sys.partitions WHERE - object_id IN ({}) GROUP BY object_id;""" + object_id IN ({}) GROUP BY object_id; + """ -INDEX_QUERY = """SELECT +INDEX_QUERY = """ + SELECT i.object_id AS id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled, STRING_AGG(c.name, ',') AS column_names FROM @@ -312,9 +323,11 @@ AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id WHERE i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, - i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled;""" + i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled; + """ -FOREIGN_KEY_QUERY="""SELECT +FOREIGN_KEY_QUERY=""" + SELECT FK.referenced_object_id AS id, FK.name AS foreign_key_name, OBJECT_NAME(FK.parent_object_id) AS referencing_table, STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, @@ -323,4 +336,5 @@ FROM sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id WHERE - FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id;""" + FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id; + """ From bcc95b1dbe3e547915ddc607b660eabc5c27afc4 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 09:22:47 +0000 Subject: [PATCH 058/132] refactored queries execution --- sqlserver/datadog_checks/sqlserver/const.py | 2 +- .../datadog_checks/sqlserver/metadata.py | 10 -- sqlserver/datadog_checks/sqlserver/schemas.py | 109 ++++++------------ sqlserver/datadog_checks/sqlserver/utils.py | 8 +- 4 files changed, 42 insertions(+), 87 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index e9b303a00dcac..1b03f77f1c456 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -290,7 +290,7 @@ TABLES_IN_SCHEMA_QUERY = """ SELECT - name, object_id AS id + object_id AS id, name FROM sys.tables WHERE schema_id={} diff --git a/sqlserver/datadog_checks/sqlserver/metadata.py b/sqlserver/datadog_checks/sqlserver/metadata.py index 15fddbbce47af..1989422440264 100644 --- a/sqlserver/datadog_checks/sqlserver/metadata.py +++ b/sqlserver/datadog_checks/sqlserver/metadata.py @@ -246,13 +246,3 @@ def 
report_sqlserver_metadata(self): "metadata": settings_rows, } self._check.database_monitoring_metadata(json.dumps(event, default=default_json_event_encoding)) - - #TODO split in functions - #NEXT BIg thing whats with different DBS , filtering , partitions - #Trade off dict vs normal data structure ? - - #TODO do it per DB if not Azure otherwise connect , kind of bad main thread ? - #schemas = self._query_schema_information(cursor) - #self._get_table_infos(schemas, cursor) - #print(schemas) - #pdb.set_trace() diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 6fa95975917bc..776bb74b5df26 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -2,7 +2,6 @@ import datadog_agent except ImportError: from ..stubs import datadog_agent -import time from datadog_checks.sqlserver.const import ( TABLES_IN_SCHEMA_QUERY, @@ -19,6 +18,9 @@ from datadog_checks.sqlserver.utils import ( execute_query_output_result_as_a_dict, get_list_chunks ) + +from datadog_checks.base.utils.tracking import tracked_method + import pdb import time import json @@ -94,6 +96,9 @@ def submit(self): self._submit_to_agent_queue(json_event) self.db_to_schemas = {} +def agent_check_getter(self): + return self._check + class Schemas: # Requests for infromation about tables are done for a certain amount of tables at the time @@ -118,27 +123,27 @@ def __init__(self, check, schemas_collection_interval): } self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) - """schemas data struct is a dictionnary with key being a schema name the value is - schema - dict: + """Collects database information and schemas and submits to the agent's queue as dictionaries + schema dict + key/value: "name": str "id": str "owner_name": str "tables" : list of tables dicts table - dict: + key/value: "id" : str "name" : str columns: list of columns dicts columns - dict: + key/value: "name": str "data_type": str "default": str "nullable": bool indexes : list of index dicts index - dict: + key/value: "name": str "type": str "is_unique": bool @@ -148,18 +153,18 @@ def __init__(self, check, schemas_collection_interval): "column_names": str foreign_keys : list of foreign key dicts foreign_key - dict: + key/value: "foreign_key_name": str "referencing_table": str "referencing_column": str "referenced_table": str "referenced_column": str - partitions: list of partitions dict + partitions: partition dict partition - dict: - "partition_count": int - partitions useful to know the number + key/value: + "partition_count": int """ + @tracked_method(agent_check_getter=agent_check_getter) def collect_schemas_data(self): self._dataSubmitter.reset() self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._check.non_internal_tags, self._check._config.cloud_metadata, @@ -205,83 +210,39 @@ def _query_db_information(self, db_name, cursor): "name": str "columns": [] """ + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables(self, schema, cursor): - cursor.execute(TABLES_IN_SCHEMA_QUERY.format(schema["id"])) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - return [ {"id" : str(row["id"]), "name" : row['name'], "columns" : []} for row in rows ] - - """schemas data struct is a dictionnary with key being a schema name the value is - schema - dict: - "name": str - "id": str - 
"owner_name": str - "tables" : list of tables dicts - table - dict: - "id" : str - "name" : str - columns: list of columns dicts - columns - dict: - "name": str - "data_type": str - "default": str - "nullable": bool - indexes : list of index dicts - index - dict: - "name": str - "type": str - "is_unique": bool - "is_primary_key": bool - "is_unique_constraint": bool - "is_disabled": bool, - "column_names": str - foreign_keys : list of foreign key dicts - foreign_key - dict: - "foreign_key_name": str - "referencing_table": str - "referencing_column": str - "referenced_table": str - "referenced_column": str - partitions: list of partitions dict - partition - dict: - "partition_count": int - partitions useful to know the number - """ - """fetches schemas dict + tables_info = execute_query_output_result_as_a_dict(TABLES_IN_SCHEMA_QUERY.format(schema["id"]), cursor) + for t in tables_info: + t.setdefault("columns", []) + return tables_info + + """ returns a list of schema dicts schema dict: "name": str "id": str "owner_name": str""" + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _query_schema_information(self, cursor): - cursor.execute(SCHEMA_QUERY) - schemas = [] - columns = [i[0] for i in cursor.description] - schemas = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] - return schemas + return execute_query_output_result_as_a_dict(SCHEMA_QUERY, cursor) """ returns extracted column numbers and a list of tables "tables" : list of tables dicts table - dict: + key/value: "id" : str "name" : str columns: list of columns dicts columns - dict: + key/value: "name": str "data_type": str "default": str "nullable": bool indexes : list of index dicts index - dict: + key/value: "name": str "type": str "is_unique": bool @@ -291,17 +252,18 @@ def _query_schema_information(self, cursor): "column_names": str foreign_keys : list of foreign key dicts foreign_key - dict: + key/value: "foreign_key_name": str "referencing_table": str "referencing_column": str "referenced_table": str "referenced_column": str - partitions: list of partitions dict + partitions: partition dict partition - dict: + key/value: "partition_count": int """ + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables_data(self, table_list, schema, cursor): if len(table_list) == 0: return @@ -321,10 +283,10 @@ def _get_tables_data(self, table_list, schema, cursor): """ adds columns list data to each table in a provided list """ + @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, schema, cursor): cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) data = cursor.fetchall() - columns = [] # AS default - cannot be used in sqlserver query as this word is reserved columns = ["default" if str(i[0]).lower() == "column_default" else str(i[0]).lower() for i in cursor.description] rows = [dict(zip(columns, [str(item) for item in row])) for row in data] @@ -348,6 +310,7 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, s """ adds partitions dict to each table in a provided list """ + @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): cursor.execute(PARTITIONS_QUERY.format(table_ids)) columns = [str(i[0]).lower() for i in cursor.description] @@ -363,6 +326,7 @@ def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): else: 
self._log.error("Return rows of [%s] query should have id column", PARTITIONS_QUERY) + @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): cursor.execute(INDEX_QUERY.format(table_ids)) columns = [str(i[0]).lower() for i in cursor.description] @@ -379,6 +343,7 @@ def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): else: self._log.error("Return rows of [%s] query should have id column", INDEX_QUERY) + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _populate_with_foreign_keys_data(self, table_ids, id_to_table_data, cursor): cursor.execute(FOREIGN_KEY_QUERY.format(table_ids)) columns = [str(i[0]).lower() for i in cursor.description] diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index cfe1f64e2a254..b30d3070b001a 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -138,13 +138,13 @@ def is_azure_sql_database(engine_edition): """ return engine_edition == ENGINE_EDITION_SQL_DATABASE -def execute_query_output_result_as_a_dict(query, cursor, column_name=None): +def execute_query_output_result_as_a_dict(query, cursor, modify_columns=None): cursor.execute(query) columns = [] - if column_name: - columns = [str(column_name).lower() for i in cursor.description] + if modify_columns: + columns = modify_columns(cursor.description) else: - columns = [str(i[0]).lower() for i in cursor.description] + columns = [str(column[0]).lower() for column in cursor.description] rows = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] return rows From 060eacbdc63e80430e483affc8c9662749f76670 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 10:41:25 +0000 Subject: [PATCH 059/132] improved formatting --- sqlserver/datadog_checks/sqlserver/config.py | 10 +- sqlserver/datadog_checks/sqlserver/const.py | 112 +++++----- .../datadog_checks/sqlserver/metadata.py | 98 +-------- sqlserver/datadog_checks/sqlserver/schemas.py | 197 +++++++++--------- .../datadog_checks/sqlserver/sqlserver.py | 22 +- sqlserver/datadog_checks/sqlserver/utils.py | 16 +- sqlserver/tests/test_metadata.py | 116 +++++++---- sqlserver/tests/utils.py | 13 +- 8 files changed, 272 insertions(+), 312 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/config.py b/sqlserver/datadog_checks/sqlserver/config.py index 010f3352cb082..382ae3c3d364d 100644 --- a/sqlserver/datadog_checks/sqlserver/config.py +++ b/sqlserver/datadog_checks/sqlserver/config.py @@ -7,7 +7,11 @@ from datadog_checks.base.config import is_affirmative from datadog_checks.base.utils.common import to_native_string -from datadog_checks.sqlserver.const import DEFAULT_AUTODISCOVERY_INTERVAL, PROC_CHAR_LIMIT, DEFAULT_SCHEMAS_COLLECTION_INTERVAL +from datadog_checks.sqlserver.const import ( + DEFAULT_AUTODISCOVERY_INTERVAL, + DEFAULT_SCHEMAS_COLLECTION_INTERVAL, + PROC_CHAR_LIMIT, +) class SQLServerConfig: @@ -23,7 +27,9 @@ def __init__(self, init_config, instance, log): self.autodiscovery_interval: int = instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL) self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include) self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude) - self.schemas_collection_interval: int = instance.get('schemas_collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL) + self.schemas_collection_interval: int = 
instance.get( + 'schemas_collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL + ) self.proc: str = instance.get('stored_procedure') self.custom_metrics: list[dict] = init_config.get('custom_metrics', []) or [] diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index 1b03f77f1c456..e30a049a82625 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -269,72 +269,72 @@ PROC_CHAR_LIMIT = 500 -#Schemas +# Schemas DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 1200 DB_QUERY = """ - SELECT - db.database_id AS id, db.name AS name, db.collation_name AS collation, dp.name AS owner - FROM - sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid - WHERE db.name = '{}'; - """ +SELECT + db.database_id AS id, db.name AS name, db.collation_name AS collation, dp.name AS owner +FROM + sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid +WHERE db.name = '{}'; +""" SCHEMA_QUERY = """ - SELECT - s.name AS name, s.schema_id AS id, dp.name AS owner_name - FROM - sys.schemas AS s JOIN sys.database_principals dp ON s.principal_id = dp.principal_id - WHERE s.name NOT IN ('sys', 'information_schema') - """; +SELECT + s.name AS name, s.schema_id AS id, dp.name AS owner_name +FROM + sys.schemas AS s JOIN sys.database_principals dp ON s.principal_id = dp.principal_id +WHERE s.name NOT IN ('sys', 'information_schema') +""" TABLES_IN_SCHEMA_QUERY = """ - SELECT - object_id AS id, name - FROM - sys.tables - WHERE schema_id={} - """ +SELECT + object_id AS id, name +FROM + sys.tables +WHERE schema_id={} +""" COLUMN_QUERY = """ - SELECT - column_name AS name, data_type, column_default, is_nullable AS nullable , table_name, ordinal_position - FROM - information_schema.columns - WHERE - table_name IN ({}) and table_schema='{}'; - """ +SELECT + column_name AS name, data_type, column_default, is_nullable AS nullable , table_name, ordinal_position +FROM + information_schema.columns +WHERE + table_name IN ({}) and table_schema='{}'; +""" PARTITIONS_QUERY = """ - SELECT - object_id AS id, COUNT(*) AS partition_count - FROM - sys.partitions - WHERE - object_id IN ({}) GROUP BY object_id; - """ +SELECT + object_id AS id, COUNT(*) AS partition_count +FROM + sys.partitions +WHERE + object_id IN ({}) GROUP BY object_id; +""" INDEX_QUERY = """ - SELECT - i.object_id AS id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, - i.is_disabled, STRING_AGG(c.name, ',') AS column_names - FROM - sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id - AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id - WHERE - i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, - i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled; - """ - -FOREIGN_KEY_QUERY=""" - SELECT - FK.referenced_object_id AS id, FK.name AS foreign_key_name, - OBJECT_NAME(FK.parent_object_id) AS referencing_table, - STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, - OBJECT_NAME(FK.referenced_object_id) AS referenced_table, - STRING_AGG(COL_NAME(FKC.referenced_object_id, FKC.referenced_column_id),',') AS referenced_column - FROM - sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id - WHERE - FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id; - """ +SELECT + i.object_id AS id, i.name, 
i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, + i.is_disabled, STRING_AGG(c.name, ',') AS column_names +FROM + sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id + AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id +WHERE + i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, + i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled; +""" + +FOREIGN_KEY_QUERY = """ +SELECT + FK.referenced_object_id AS id, FK.name AS foreign_key_name, + OBJECT_NAME(FK.parent_object_id) AS referencing_table, + STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, + OBJECT_NAME(FK.referenced_object_id) AS referenced_table, + STRING_AGG(COL_NAME(FKC.referenced_object_id, FKC.referenced_column_id),',') AS referenced_column +FROM + sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id +WHERE + FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id; +""" diff --git a/sqlserver/datadog_checks/sqlserver/metadata.py b/sqlserver/datadog_checks/sqlserver/metadata.py index 1989422440264..4550118a9b0c4 100644 --- a/sqlserver/datadog_checks/sqlserver/metadata.py +++ b/sqlserver/datadog_checks/sqlserver/metadata.py @@ -2,6 +2,7 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) import time + from datadog_checks.base import is_affirmative from datadog_checks.base.utils.db.utils import ( DBMAsyncJob, @@ -127,104 +128,7 @@ def _load_settings_rows(self, cursor): rows = [dict(zip(columns, row)) for row in cursor.fetchall()] self.log.debug("loaded sql server settings len(rows)=%s", len(rows)) return rows - - """schemas data struct is a dictionnary with key being a schema name the value is - schema - dict: - "name": str - "schema_id": str - "principal_id": str - "tables" : dict - name: list of columns - "columns": dict - name: str - data_type: str - default: str - - - """ - @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) - def _query_schema_information(self, cursor): - - # principal_id is kind of like an owner - # Todo put in consts - # there is also principal_id not sure if need it. - SCHEMA_QUERY = "SELECT name,schema_id,principal_id FROM sys.schemas;" - self.log.debug("collecting db schemas") - self.log.debug("Running query [%s]", SCHEMA_QUERY) - cursor.execute(SCHEMA_QUERY) - schemas = [] - columns = [i[0] for i in cursor.description] - schemas = [dict(zip(columns, row)) for row in cursor.fetchall()] - schemas_by_name = {} - - schemas_by_name = {} - - for schema in schemas: - name = schema['name'].lower() - #add tables - schema['tables'] = {} - schemas_by_name[name] = schema - - self.log.debug("fetched schemas len(rows)=%s", len(schemas)) - return schemas_by_name - - def _get_table_infos(self, schemas, cursor): - #TODO do we need this for sqlserver ? - #If any tables are partitioned, only the master paritition table name will be returned, and none of its children. - - # TODO - #Do we need a limit ? like in postgress , seems not - #limit = self._config.schemas_metadata_config.get("max_tables", 300) - - TABLES_QUERY = "SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS;" - cursor.execute(TABLES_QUERY) - #TODO - # nullable: bool column ? 
- #TODO - #"foreign_keys": dict (if has foreign keys) - # name: str - # definition: str - #TODO - # "indexes": dict (if has indexes) - # name: str - # definition: str - #TODO - #"toast_table": str (if associated toast table exists) - equivalent in sql server - - # "partition_key": str (if has partitions) - equiv ? - - # "num_partitions": int (if has partitions) - equiv ? - #apply lower case ? - #this is just to avoid doing something like row[0] , row[1] etc - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - - for row in rows: - if len(row) != 5: - #TODO some warning ? - print("warning") - - #TODO treat not found - schema = schemas[row['table_schema']] - - tables_dict_for_schema = schema['tables'] - - #do the same mapping as in postgres for some uniformity otherwise could've just loop and exclude some keys - if row['table_name'] not in tables_dict_for_schema: - #new table - tables_dict_for_schema[row['table_name']] = [] - column = {} - column['name'] = row['column_name'] - column['data_type'] = row['data_type'] - column['default'] = row['column_default'] - #table is an array of column dict for now. - tables_dict_for_schema[row['table_name']].append(column) - # table dict has a key columns with value arrray of dicts - -#self._sort_and_limit_table_info(cursor, dbname, table_info, limit) -# for now not sort and limit @tracked_method(agent_check_getter=agent_check_getter) def report_sqlserver_metadata(self): with self._check.connection.open_managed_default_connection(key_prefix=self._conn_key_prefix): diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 776bb74b5df26..1fe4aef47bcb6 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -3,59 +3,55 @@ except ImportError: from ..stubs import datadog_agent +import copy +import json +import pdb +import time + +from datadog_checks.base.utils.db.utils import default_json_event_encoding +from datadog_checks.base.utils.tracking import tracked_method from datadog_checks.sqlserver.const import ( - TABLES_IN_SCHEMA_QUERY, COLUMN_QUERY, - PARTITIONS_QUERY, - INDEX_QUERY, + DB_QUERY, FOREIGN_KEY_QUERY, + INDEX_QUERY, + PARTITIONS_QUERY, SCHEMA_QUERY, - DB_QUERY, + STATIC_INFO_ENGINE_EDITION, STATIC_INFO_VERSION, - STATIC_INFO_ENGINE_EDITION + TABLES_IN_SCHEMA_QUERY, ) +from datadog_checks.sqlserver.utils import execute_query_output_result_as_a_dict, get_list_chunks -from datadog_checks.sqlserver.utils import ( - execute_query_output_result_as_a_dict, get_list_chunks -) -from datadog_checks.base.utils.tracking import tracked_method - -import pdb -import time -import json -import copy - -from datadog_checks.base.utils.db.utils import default_json_event_encoding - -class SubmitData: - MAX_COLUMN_COUNT = 10_000 +class SubmitData: + MAX_COLUMN_COUNT = 10_000 # REDAPL has a 3MB limit per resource - MAX_TOTAL_COLUMN_COUNT = 100_000 + MAX_TOTAL_COLUMN_COUNT = 100_000 def __init__(self, submit_data_function, base_event, logger): self._submit_to_agent_queue = submit_data_function self._base_event = base_event self._log = logger - self._columns_count = 0 + self._columns_count = 0 self._total_columns_count = 0 - self.db_to_schemas = {} # dbname : { id : schema } - self.db_info = {} # name to info + self.db_to_schemas = {} # dbname : { id : schema } + self.db_info = {} # name to info def set_base_event_data(self, hostname, tags, cloud_metadata, dbms_version): 
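+        # Filled in once per collection run by Schemas.collect_schemas_data, so every
+        # payload built by submit() shares the same host, tags, cloud metadata and
+        # dbms_version.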
self._base_event["host"] = hostname self._base_event["tags"] = tags self._base_event["cloud_metadata"] = cloud_metadata - self._base_event["dbms_version"] = dbms_version + self._base_event["dbms_version"] = dbms_version def reset(self): self._columns_count = 0 self._total_columns_count = 0 self.db_to_schemas = {} self.db_info = {} - + def store_db_info(self, db_name, db_info): self.db_info[db_name] = db_info @@ -64,7 +60,7 @@ def store(self, db_name, schema, tables, columns_count): self._total_columns_count += columns_count schemas = self.db_to_schemas.setdefault(db_name, {}) if schema["id"] in schemas: - known_tables = schemas[schema["id"]].setdefault("tables",[]) + known_tables = schemas[schema["id"]].setdefault("tables", []) known_tables = known_tables + tables else: schemas[schema["id"]] = copy.deepcopy(schema) @@ -78,11 +74,8 @@ def exceeded_total_columns_number(self): def submit(self): if not bool(self.db_to_schemas): return - self._columns_count = 0 - event = {**self._base_event, - "metadata" : [], - "timestamp": time.time() * 1000 - } + self._columns_count = 0 + event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000} for db, schemas_by_id in self.db_to_schemas.items(): db_info = {} if db not in self.db_info: @@ -90,15 +83,17 @@ def submit(self): db_info["name"] = db else: db_info = self.db_info[db] - event["metadata"] = event["metadata"] + [{**(db_info), "schemas": list(schemas_by_id.values())}] + event["metadata"] = event["metadata"] + [{**(db_info), "schemas": list(schemas_by_id.values())}] json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) self._submit_to_agent_queue(json_event) self.db_to_schemas = {} + def agent_check_getter(self): return self._check + class Schemas: # Requests for infromation about tables are done for a certain amount of tables at the time @@ -107,9 +102,9 @@ class Schemas: TABLES_CHUNK_SIZE = 50 def __init__(self, check, schemas_collection_interval): - self._check = check + self._check = check self._log = check.log - self.schemas_per_db = {} + self.schemas_per_db = {} base_event = { "host": None, @@ -130,12 +125,12 @@ def __init__(self, check, schemas_collection_interval): "id": str "owner_name": str "tables" : list of tables dicts - table + table key/value: "id" : str "name" : str - columns: list of columns dicts - columns + columns: list of columns dicts + columns key/value: "name": str "data_type": str @@ -162,79 +157,94 @@ def __init__(self, check, schemas_collection_interval): partitions: partition dict partition key/value: - "partition_count": int + "partition_count": int """ + @tracked_method(agent_check_getter=agent_check_getter) def collect_schemas_data(self): self._dataSubmitter.reset() - self._dataSubmitter.set_base_event_data(self._check.resolved_hostname, self._check.non_internal_tags, self._check._config.cloud_metadata, - "{},{}".format( - self._check.static_info_cache.get(STATIC_INFO_VERSION, ""), - self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION, ""),) + self._dataSubmitter.set_base_event_data( + self._check.resolved_hostname, + self._check.non_internal_tags, + self._check._config.cloud_metadata, + "{},{}".format( + self._check.static_info_cache.get(STATIC_INFO_VERSION, ""), + self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION, ""), + ), ) - #returns if to stop, True means stop iterating. + + # returns if to stop, True means stop iterating. 
def fetch_schema_data(cursor, db_name): - db_info = self._query_db_information(db_name, cursor) + db_info = self._query_db_information(db_name, cursor) schemas = self._query_schema_information(cursor) self._dataSubmitter.store_db_info(db_name, db_info) for schema in schemas: - tables = self._get_tables(schema, cursor) + tables = self._get_tables(schema, cursor) tables_chunk = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) for tables_chunk in tables_chunk: if self._dataSubmitter.exceeded_total_columns_number(): - #TODO Report truncation to the backend - self._log.warning("Truncated data due to the max limit, stopped on db - {} on schema {}".format(db_name, schema["name"])) - return True + # TODO Report truncation to the backend + self._log.warning( + "Truncated data due to the max limit, stopped on db - {} on schema {}".format( + db_name, schema["name"] + ) + ) + return True columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) - self._dataSubmitter.store(db_name, schema, tables_info, columns_count) - self._dataSubmitter.submit() # Submit is forced after each 50 tables chunk + self._dataSubmitter.store(db_name, schema, tables_info, columns_count) + self._dataSubmitter.submit() # Submit is forced after each 50 tables chunk if len(tables) == 0: self._dataSubmitter.store(db_name, schema, [], 0) self._dataSubmitter.submit() return False + self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) self._log.debug("Finished collect_schemas_data") self._dataSubmitter.submit() def _query_db_information(self, db_name, cursor): - db_info = execute_query_output_result_as_a_dict(DB_QUERY.format(db_name), cursor) + db_info = execute_query_output_result_as_a_dict(DB_QUERY.format(db_name), cursor, convert_results_to_str=True) if len(db_info) == 1: return db_info[0] else: self._log.error("Couldnt query database information for %s", db_name) - return None + return None """ returns a list of tables for schema with their names and empty column array list of table dicts "id": str "name": str - "columns": [] + "columns": [] """ + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables(self, schema, cursor): - tables_info = execute_query_output_result_as_a_dict(TABLES_IN_SCHEMA_QUERY.format(schema["id"]), cursor) + tables_info = execute_query_output_result_as_a_dict( + TABLES_IN_SCHEMA_QUERY.format(schema["id"]), cursor, convert_results_to_str=True + ) for t in tables_info: t.setdefault("columns", []) return tables_info - + """ returns a list of schema dicts schema dict: "name": str "id": str "owner_name": str""" + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _query_schema_information(self, cursor): - return execute_query_output_result_as_a_dict(SCHEMA_QUERY, cursor) - + return execute_query_output_result_as_a_dict(SCHEMA_QUERY, cursor, convert_results_to_str=True) + """ returns extracted column numbers and a list of tables "tables" : list of tables dicts - table + table key/value: "id" : str "name" : str - columns: list of columns dicts - columns + columns: list of columns dicts + columns key/value: "name": str "data_type": str @@ -263,6 +273,7 @@ def _query_schema_information(self, cursor): key/value: "partition_count": int """ + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables_data(self, table_list, schema, cursor): if len(table_list) == 0: @@ -272,24 +283,25 @@ def _get_tables_data(self, table_list, schema, cursor): 
table_ids_object = ",".join(["OBJECT_NAME({})".format(t.get("id")) for t in table_list]) table_ids = ",".join(["{}".format(t.get("id")) for t in table_list]) for t in table_list: - name_to_id[t["name"]] = t["id"] + name_to_id[t["name"]] = t["id"] id_to_table_data[t["id"]] = t - total_columns_number = self._populate_with_columns_data(table_ids_object, name_to_id, id_to_table_data, schema, cursor) + total_columns_number = self._populate_with_columns_data( + table_ids_object, name_to_id, id_to_table_data, schema, cursor + ) self._populate_with_partitions_data(table_ids, id_to_table_data, cursor) self._populate_with_foreign_keys_data(table_ids, id_to_table_data, cursor) self._populate_with_index_data(table_ids, id_to_table_data, cursor) return total_columns_number, list(id_to_table_data.values()) - """ - adds columns list data to each table in a provided list - """ @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, schema, cursor): cursor.execute(COLUMN_QUERY.format(table_ids, schema["name"])) data = cursor.fetchall() # AS default - cannot be used in sqlserver query as this word is reserved - columns = ["default" if str(i[0]).lower() == "column_default" else str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, [str(item) for item in row])) for row in data] + columns = [ + "default" if str(i[0]).lower() == "column_default" else str(i[0]).lower() for i in cursor.description + ] + rows = [dict(zip(columns, [str(item) for item in row])) for row in data] for row in rows: table_id = name_to_id.get(str(row.get("table_name"))) if table_id is not None: @@ -299,25 +311,22 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, s row["nullable"] = False else: row["nullable"] = True - if table_id in id_to_table_data: - id_to_table_data.get(table_id)["columns"] = id_to_table_data.get(table_id).get("columns",[]) + [row] + if table_id in id_to_table_data: + id_to_table_data.get(table_id)["columns"] = id_to_table_data.get(table_id).get("columns", []) + [ + row + ] else: self._log.error("Columns found for an unkown table with the object_id: %s", table_id) else: self._log.error("Couldn't find id of a table: %s", table_id) return len(data) - - """ - adds partitions dict to each table in a provided list - """ + @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): - cursor.execute(PARTITIONS_QUERY.format(table_ids)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + rows = execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_ids), cursor) for row in rows: - id = row.pop("id", None) - if id is not None: + id = row.pop("id", None) + if id is not None: id_str = str(id) if id_str in id_to_table_data: id_to_table_data[id_str]["partitions"] = row @@ -326,13 +335,12 @@ def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): else: self._log.error("Return rows of [%s] query should have id column", PARTITIONS_QUERY) + # TODO update example , apply linter @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): - cursor.execute(INDEX_QUERY.format(table_ids)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] + rows = 
execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_ids), cursor) for row in rows: - id = row.pop("id", None) + id = row.pop("id", None) if id is not None: id_str = str(id) if id_str in id_to_table_data: @@ -345,18 +353,15 @@ def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _populate_with_foreign_keys_data(self, table_ids, id_to_table_data, cursor): - cursor.execute(FOREIGN_KEY_QUERY.format(table_ids)) - columns = [str(i[0]).lower() for i in cursor.description] - rows = [dict(zip(columns, row)) for row in cursor.fetchall()] - for row in rows: - id = row.pop("id", None) - if id is not None: - id_str = str(id) - if id_str in id_to_table_data: - id_to_table_data.get(str(id)).setdefault("foreign_keys", []) - id_to_table_data.get(str(id))["foreign_keys"].append(row) - else: - self._log.error("Foreign key found for an unkown table with the object_id: %s", id_str) + rows = execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_ids), cursor) + for row in rows: + id = row.pop("id", None) + if id is not None: + id_str = str(id) + if id_str in id_to_table_data: + id_to_table_data.get(str(id)).setdefault("foreign_keys", []) + id_to_table_data.get(str(id))["foreign_keys"].append(row) else: - self._log.error("Return rows of [%s] query should have id column", FOREIGN_KEY_QUERY) - + self._log.error("Foreign key found for an unkown table with the object_id: %s", id_str) + else: + self._log.error("Return rows of [%s] query should have id column", FOREIGN_KEY_QUERY) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 1b30ef2bd9c85..5c8e98cb040ab 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -2,9 +2,11 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) from __future__ import division + import copy import time from collections import defaultdict + import six from cachetools import TTLCache @@ -21,16 +23,18 @@ SqlserverIndexUsageMetrics, ) from datadog_checks.sqlserver.metadata import SqlserverMetadata +from datadog_checks.sqlserver.schemas import Schemas from datadog_checks.sqlserver.statements import SqlserverStatementMetrics from datadog_checks.sqlserver.stored_procedures import SqlserverProcedureMetrics from datadog_checks.sqlserver.utils import Database, construct_use_statement, parse_sqlserver_major_version -from datadog_checks.sqlserver.schemas import Schemas try: import datadog_agent except ImportError: from ..stubs import datadog_agent +import pdb + from datadog_checks.sqlserver import metrics from datadog_checks.sqlserver.__about__ import __version__ from datadog_checks.sqlserver.connection import Connection, SQLConnectionError, split_sqlserver_host_port @@ -84,7 +88,7 @@ is_azure_sql_database, set_default_driver_conf, ) -import pdb + try: import adodbapi except ImportError: @@ -725,12 +729,14 @@ def _check_connections_by_use_db(self): continue # Switch DB back to MASTER cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) - - #TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check + + # TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check def get_databases(self): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) if not 
is_azure_sql_database(engine_edition): - db_names = [d.name for d in self.databases] or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] + db_names = [d.name for d in self.databases] or [ + self.instance.get('database', self.connection.DEFAULT_DATABASE) + ] else: db_names = [self.instance.get('database', self.connection.DEFAULT_DATABASE)] return db_names @@ -739,13 +745,13 @@ def do_for_databases(self, action, databases): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: - for db in databases: + for db in databases: try: if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(db)) - stop = action(cursor, db) + stop = action(cursor, db) if stop: - break; + break except Exception as e: pdb.set_trace() print("An exception occurred during do_for_databases in db - {}: {}".format(db, e)) diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index b30d3070b001a..421c5f446485b 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -138,17 +138,19 @@ def is_azure_sql_database(engine_edition): """ return engine_edition == ENGINE_EDITION_SQL_DATABASE -def execute_query_output_result_as_a_dict(query, cursor, modify_columns=None): + +def execute_query_output_result_as_a_dict(query, cursor, convert_results_to_str=False): cursor.execute(query) - columns = [] - if modify_columns: - columns = modify_columns(cursor.description) + columns = [str(column[0]).lower() for column in cursor.description] + rows = [] + if convert_results_to_str: + rows = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] else: - columns = [str(column[0]).lower() for column in cursor.description] - rows = [dict(zip(columns, [str(item) for item in row])) for row in cursor.fetchall()] + rows = [dict(zip(columns, row)) for row in cursor.fetchall()] return rows + def get_list_chunks(lst, n): """Yield successive n-sized chunks from lst.""" for i in range(0, len(lst), n): - yield lst[i : i + n] \ No newline at end of file + yield lst[i : i + n] diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 5594d36024d86..955de5af56d0b 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -8,20 +8,21 @@ from copy import copy import pytest - from deepdiff import DeepDiff from datadog_checks.sqlserver import SQLServer -#from deepdiff import DeepDiff - not clear how to add it to ddev +# from deepdiff import DeepDiff - not clear how to add it to ddev from .common import CHECK_NAME -from .utils import delete_if_found, compare_coumns_in_tables +from .utils import compare_coumns_in_tables + try: import pyodbc except ImportError: pyodbc = None -import pdb import json +import pdb + @pytest.fixture def dbm_instance(instance_docker): @@ -56,13 +57,13 @@ def dbm_instance(instance_docker): ) def test_get_available_settings_columns(dbm_instance, expected_columns, available_columns): pass - #check = SQLServer(CHECK_NAME, {}, [dbm_instance]) - #check.initialize_connection() - #_conn_key_prefix = "dbm-metadata-" - #with check.connection.open_managed_default_connection(key_prefix=_conn_key_prefix): - #with check.connection.get_managed_cursor(key_prefix=_conn_key_prefix) as cursor: - #result_available_columns = check.sql_metadata._get_available_settings_columns(cursor, expected_columns) - #assert 
result_available_columns == available_columns + # check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + # check.initialize_connection() + # _conn_key_prefix = "dbm-metadata-" + # with check.connection.open_managed_default_connection(key_prefix=_conn_key_prefix): + # with check.connection.get_managed_cursor(key_prefix=_conn_key_prefix) as cursor: + # result_available_columns = check.sql_metadata._get_available_settings_columns(cursor, expected_columns) + # assert result_available_columns == available_columns @pytest.mark.integration @@ -86,36 +87,79 @@ def test_get_settings_query_cached(dbm_instance, caplog): def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): pass - #check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + # check = SQLServer(CHECK_NAME, {}, [dbm_instance]) # dd_run_check(check) - #check.initialize_connection() - #check.check(dbm_instance) - #dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - #event = next((e for e in dbm_metadata if e['kind'] == 'sqlserver_configs'), None) - #assert event is not None - #assert event['dbms'] == "sqlserver" - #assert event['kind'] == "sqlserver_configs" - #assert len(event["metadata"]) > 0 - -#TODO this test relies on a certain granularity -#later we need to upgrade it to accumulate data for each DB before checking. + # check.initialize_connection() + # check.check(dbm_instance) + # dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") + # event = next((e for e in dbm_metadata if e['kind'] == 'sqlserver_configs'), None) + # assert event is not None + # assert event['dbms'] == "sqlserver" + # assert event['kind'] == "sqlserver_configs" + # assert len(event["metadata"]) > 0 + + +# TODO this test relies on a certain granularity +# later we need to upgrade it to accumulate data for each DB before checking. 
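+# The expected payloads below mirror the fixture tables created in
+# tests/compose/setup.sql for the datadog_test and datadog_test_schemas databases.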
def test_collect_schemas(aggregator, dd_run_check, dbm_instance): - - databases_to_find = ['datadog_test_schemas','datadog_test'] - exp_datadog_test = {'id': '6', 'name': 'datadog_test', 'owner': 'dbo', 'schemas': [ {'name': 'dbo', 'id': '1', 'owner': '1', 'tables': [{'id': '885578193', 'name': 'ϑings', 'columns': [{'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}]}]}]} - exp_datadog_test_schemas = {'id': '5', 'name': 'datadog_test_schemas', 'owner': 'dbo', 'schemas': [{'name': 'test_schema', 'id': '5', 'owner': '1', 'tables': [{'id': '885578193', 'name': 'cities', 'columns': [{'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}]}]}]} - expected_data_for_db = {'datadog_test' : exp_datadog_test, 'datadog_test_schemas' : exp_datadog_test_schemas} + + databases_to_find = ['datadog_test_schemas', 'datadog_test'] + exp_datadog_test = { + 'id': '6', + 'name': 'datadog_test', + 'owner': 'dbo', + 'schemas': [ + { + 'name': 'dbo', + 'id': '1', + 'owner': '1', + 'tables': [ + { + 'id': '885578193', + 'name': 'ϑings', + 'columns': [ + {'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, + {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}, + ], + } + ], + } + ], + } + exp_datadog_test_schemas = { + 'id': '5', + 'name': 'datadog_test_schemas', + 'owner': 'dbo', + 'schemas': [ + { + 'name': 'test_schema', + 'id': '5', + 'owner': '1', + 'tables': [ + { + 'id': '885578193', + 'name': 'cities', + 'columns': [ + {'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, + {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}, + ], + } + ], + } + ], + } + expected_data_for_db = {'datadog_test': exp_datadog_test, 'datadog_test_schemas': exp_datadog_test_schemas} dbm_instance['database_autodiscovery'] = True - dbm_instance['autodiscovery_include'] = ['datadog_test_schemas','datadog_test'] + dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test'] check = SQLServer(CHECK_NAME, {}, [dbm_instance]) dd_run_check(check) - #extracting events. + # extracting events. dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - + actual_payloads = {} for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): @@ -125,7 +169,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): assert schema_event.get("timestamp") is not None # there should only be one database, datadog_test - + database_metadata = schema_event['metadata'] assert len(database_metadata) == 1 db_name = database_metadata[0]['name'] @@ -135,17 +179,17 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): else: actual_payloads[db_name] = database_metadata[0] pdb.set_trace() - assert len(actual_payloads) == len(expected_data_for_db) + assert len(actual_payloads) == len(expected_data_for_db) for db_name, actual_payload in actual_payloads.items(): - #assert delete_if_found(databases_to_find, db_name) + # assert delete_if_found(databases_to_find, db_name) assert db_name in databases_to_find - # we need to accumulate all data ... as payloads may differ + # we need to accumulate all data ... 
as payloads may differ difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) - #difference = {} + # difference = {} diff_keys = list(difference.keys()) if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: diff --git a/sqlserver/tests/utils.py b/sqlserver/tests/utils.py index 05bd4b12ccb30..f0d303d263b97 100644 --- a/sqlserver/tests/utils.py +++ b/sqlserver/tests/utils.py @@ -221,21 +221,14 @@ def run_query_and_ignore_exception(conn, query): def _create_rand_string(length=5): return ''.join(choice(string.ascii_lowercase + string.digits) for _ in range(length)) -def delete_if_found(my_list, value): - try: - index = my_list.index(value) - del my_list[index] - return True - except ValueError: - return None - + def compare_coumns_in_tables(expected_data_for_db, actual_data_for_db): for schema in expected_data_for_db['schemas']: actual_schema = next(filter(lambda x: x['id'] == schema['id'], actual_data_for_db['schemas'])) for table in schema['tables']: - #find a table and then finally compare columns + # find a table and then finally compare columns actual_table = next(filter(lambda x: x['id'] == table['id'], actual_schema['tables'])) if actual_table['columns'] != table['columns']: return False - return True \ No newline at end of file + return True From 648914e689821abb5a842ffe08b82b666c47b540 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 11:27:57 +0000 Subject: [PATCH 060/132] applied lnter --- sqlserver/datadog_checks/sqlserver/schemas.py | 8 ++++---- sqlserver/datadog_checks/sqlserver/sqlserver.py | 1 - sqlserver/tests/test_metadata.py | 13 ++----------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 1fe4aef47bcb6..c84595daf2145 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -129,7 +129,7 @@ def __init__(self, check, schemas_collection_interval): key/value: "id" : str "name" : str - columns: list of columns dicts + columns: list of columns dicts columns key/value: "name": str @@ -180,8 +180,8 @@ def fetch_schema_data(cursor, db_name): self._dataSubmitter.store_db_info(db_name, db_info) for schema in schemas: tables = self._get_tables(schema, cursor) - tables_chunk = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) - for tables_chunk in tables_chunk: + tables_chunks = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) + for tables_chunk in tables_chunks: if self._dataSubmitter.exceeded_total_columns_number(): # TODO Report truncation to the backend self._log.warning( @@ -243,7 +243,7 @@ def _query_schema_information(self, cursor): key/value: "id" : str "name" : str - columns: list of columns dicts + columns: list of columns dicts columns key/value: "name": str diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 5c8e98cb040ab..6f900f111fc84 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -730,7 +730,6 @@ def _check_connections_by_use_db(self): # Switch DB back to MASTER cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) - # TODO as we do it a second type iterate connection through DB make a function and unite it with _get_table_infos check def get_databases(self): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) if not is_azure_sql_database(engine_edition): 
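
For context on the hunk above: `get_databases` and `do_for_databases` are the two helpers that `Schemas.collect_schemas_data` relies on to walk every database with a single managed cursor. A minimal, self-contained sketch of that control flow follows; the `StubCursor`, the inline `USE` statements, and the database names are placeholders, and the signatures are simplified, so treat it as illustrative rather than the module's actual code:

```python
# Illustrative sketch only: mirrors the control flow of do_for_databases() in this
# patch, with the connection machinery replaced by a stub cursor.


class StubCursor:
    def execute(self, statement):
        print("executing:", statement)


def do_for_databases(action, databases, cursor, is_azure_sql_database=False):
    for db in databases:
        try:
            if not is_azure_sql_database:
                cursor.execute("USE [{}]".format(db))  # SWITCH_DB_STATEMENT in the real code
            stop = action(cursor, db)
            if stop:  # the callback returns True to stop iterating early
                break
        except Exception as e:
            print("An exception occurred during do_for_databases in db - {}: {}".format(db, e))
    if not is_azure_sql_database:
        cursor.execute("USE [master]")  # switch back to the default database


def fetch_schema_data(cursor, db_name):
    print("collecting schema metadata for", db_name)
    return False  # keep going


do_for_databases(fetch_schema_data, ["datadog_test", "datadog_test_schemas"], StubCursor())
```

Having the callback return a stop flag is what lets the collector abort cleanly once SubmitData's total column budget is spent, and the Azure SQL Database flag skips the `USE` switch where only the configured database is reachable.
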
diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 955de5af56d0b..ca53be97a9a69 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -99,8 +99,7 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): # assert len(event["metadata"]) > 0 -# TODO this test relies on a certain granularity -# later we need to upgrade it to accumulate data for each DB before checking. + def test_collect_schemas(aggregator, dd_run_check, dbm_instance): databases_to_find = ['datadog_test_schemas', 'datadog_test'] @@ -183,22 +182,14 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): for db_name, actual_payload in actual_payloads.items(): - # assert delete_if_found(databases_to_find, db_name) assert db_name in databases_to_find - # we need to accumulate all data ... as payloads may differ difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) - # difference = {} - diff_keys = list(difference.keys()) if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: pdb.set_trace() - logging.debug("found the following diffs %s", str(difference)) - assert False + raise AssertionError(Exception("found the following diffs: " + str(difference))) # we need a special comparison as order of columns matter - assert compare_coumns_in_tables(expected_data_for_db[db_name], actual_payload) - - print("ok") From 50aa7b97808d029e849cc2d2abe02e409b50b0ca Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 12:04:25 +0000 Subject: [PATCH 061/132] Updated test expectations --- sqlserver/datadog_checks/sqlserver/schemas.py | 1 - sqlserver/tests/test_metadata.py | 206 ++++++++++++++++-- sqlserver/tests/utils.py | 12 - 3 files changed, 191 insertions(+), 28 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index c84595daf2145..bdc4062702088 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -335,7 +335,6 @@ def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): else: self._log.error("Return rows of [%s] query should have id column", PARTITIONS_QUERY) - # TODO update example , apply linter @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): rows = execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_ids), cursor) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index ca53be97a9a69..fd8ddf56b9682 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -12,9 +12,7 @@ from datadog_checks.sqlserver import SQLServer -# from deepdiff import DeepDiff - not clear how to add it to ddev from .common import CHECK_NAME -from .utils import compare_coumns_in_tables try: import pyodbc @@ -99,26 +97,50 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): # assert len(event["metadata"]) > 0 - def test_collect_schemas(aggregator, dd_run_check, dbm_instance): databases_to_find = ['datadog_test_schemas', 'datadog_test'] exp_datadog_test = { 'id': '6', 'name': 'datadog_test', + "collation":"SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', 'schemas': [ { 'name': 'dbo', 'id': '1', - 'owner': '1', + 'owner_name': 'dbo', 'tables': [ { 'id': '885578193', 'name': 'ϑings', 'columns': [ - {'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, - {'name': 'name', 
'data_type': 'varchar', 'default': 'None', 'nullable': True}, + { + 'name': 'id', + 'data_type': 'int', + 'default': '((0))', + 'nullable': True, + 'ordinal_position': '1', + }, + { + 'name': 'name', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '2', + }, + ], + 'partitions': {'partition_count': 1}, + 'indexes': [ + { + 'name': 'thingsindex', + 'type': 1, + 'is_unique': False, + 'is_primary_key': False, + 'is_unique_constraint': False, + 'is_disabled': False, + 'column_names': 'name', + } ], } ], @@ -128,21 +150,177 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): exp_datadog_test_schemas = { 'id': '5', 'name': 'datadog_test_schemas', + "collation":"SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', 'schemas': [ { 'name': 'test_schema', 'id': '5', - 'owner': '1', + 'owner_name': 'dbo', 'tables': [ { 'id': '885578193', 'name': 'cities', 'columns': [ - {'name': 'id', 'data_type': 'int', 'default': '((0))', 'nullable': True}, - {'name': 'name', 'data_type': 'varchar', 'default': 'None', 'nullable': True}, + { + 'name': 'id', + 'data_type': 'int', + 'default': '((0))', + 'nullable': False, + 'ordinal_position': '1', + }, + { + 'name': 'name', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '2', + }, + { + 'name': 'population', + 'data_type': 'int', + 'default': '((0))', + 'nullable': False, + 'ordinal_position': '3', + }, ], - } + 'partitions': {'partition_count': 12}, + 'foreign_keys': [ + { + 'foreign_key_name': 'FK_CityId', + 'referencing_table': 'landmarks', + 'referencing_column': 'city_id', + 'referenced_table': 'cities', + 'referenced_column': 'id', + } + ], + 'indexes': [ + { + 'name': 'PK_Cities', + 'type': 1, + 'is_unique': True, + 'is_primary_key': True, + 'is_unique_constraint': False, + 'is_disabled': False, + 'column_names': 'id', + }, + { + 'name': 'single_column_index', + 'type': 2, + 'is_unique': False, + 'is_primary_key': False, + 'is_unique_constraint': False, + 'is_disabled': False, + 'column_names': 'population,id', + }, + { + 'name': 'two_columns_index', + 'type': 2, + 'is_unique': False, + 'is_primary_key': False, + 'is_unique_constraint': False, + 'is_disabled': False, + 'column_names': 'id,name', + }, + ], + }, + { + 'id': '949578421', + 'name': 'landmarks', + 'columns': [ + { + 'name': 'name', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '1', + }, + { + 'name': 'city_id', + 'data_type': 'int', + 'default': '((0))', + 'nullable': True, + 'ordinal_position': '2', + }, + ], + 'partitions': {'partition_count': 1}, + }, + { + 'id': '1029578706', + 'name': 'RestaurantReviews', + 'columns': [ + { + 'name': 'RestaurantName', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '1', + }, + { + 'name': 'District', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '2', + }, + { + 'name': 'Review', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '3', + }, + ], + 'partitions': {'partition_count': 1}, + }, + { + 'id': '997578592', + 'name': 'Restaurants', + 'columns': [ + { + 'name': 'RestaurantName', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '1', + }, + { + 'name': 'District', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 'ordinal_position': '2', + }, + { + 'name': 'Cuisine', + 'data_type': 'varchar', + 'default': 'None', + 'nullable': True, + 
'ordinal_position': '3', + }, + ], + 'partitions': {'partition_count': 2}, + 'foreign_keys': [ + { + 'foreign_key_name': 'FK_RestaurantNameDistrict', + 'referencing_table': 'RestaurantReviews', + 'referencing_column': 'RestaurantName,District', + 'referenced_table': 'Restaurants', + 'referenced_column': 'RestaurantName,District', + } + ], + 'indexes': [ + { + 'name': 'UC_RestaurantNameDistrict', + 'type': 2, + 'is_unique': True, + 'is_primary_key': False, + 'is_unique_constraint': True, + 'is_disabled': False, + 'column_names': 'RestaurantName,District', + } + ], + }, ], } ], @@ -167,7 +345,6 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): return assert schema_event.get("timestamp") is not None - # there should only be one database, datadog_test database_metadata = schema_event['metadata'] assert len(database_metadata) == 1 @@ -187,9 +364,8 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) diff_keys = list(difference.keys()) + # schema data also collects certain built in schemas which are ignored in the test if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: - pdb.set_trace() raise AssertionError(Exception("found the following diffs: " + str(difference))) - - # we need a special comparison as order of columns matter - assert compare_coumns_in_tables(expected_data_for_db[db_name], actual_payload) + pdb.set_trace() + print("end") \ No newline at end of file diff --git a/sqlserver/tests/utils.py b/sqlserver/tests/utils.py index f0d303d263b97..1d009b47ed6f5 100644 --- a/sqlserver/tests/utils.py +++ b/sqlserver/tests/utils.py @@ -220,15 +220,3 @@ def run_query_and_ignore_exception(conn, query): @staticmethod def _create_rand_string(length=5): return ''.join(choice(string.ascii_lowercase + string.digits) for _ in range(length)) - - -def compare_coumns_in_tables(expected_data_for_db, actual_data_for_db): - for schema in expected_data_for_db['schemas']: - actual_schema = next(filter(lambda x: x['id'] == schema['id'], actual_data_for_db['schemas'])) - for table in schema['tables']: - # find a table and then finally compare columns - actual_table = next(filter(lambda x: x['id'] == table['id'], actual_schema['tables'])) - if actual_table['columns'] != table['columns']: - return False - - return True From 861eef04720f7b47c4d4c1858c0b73c954573a35 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 12:36:43 +0000 Subject: [PATCH 062/132] Removed pdb --- sqlserver/datadog_checks/sqlserver/schemas.py | 1 - .../datadog_checks/sqlserver/sqlserver.py | 5 +- sqlserver/tests/test_metadata.py | 56 ++++++++----------- 3 files changed, 25 insertions(+), 37 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index bdc4062702088..fb91bf1a9d152 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -5,7 +5,6 @@ import copy import json -import pdb import time from datadog_checks.base.utils.db.utils import default_json_event_encoding diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 6f900f111fc84..cfd266cc7feab 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -33,8 +33,6 @@ except ImportError: from ..stubs import datadog_agent -import pdb - from datadog_checks.sqlserver import metrics from 
datadog_checks.sqlserver.__about__ import __version__ from datadog_checks.sqlserver.connection import Connection, SQLConnectionError, split_sqlserver_host_port @@ -752,8 +750,7 @@ def do_for_databases(self, action, databases): if stop: break except Exception as e: - pdb.set_trace() - print("An exception occurred during do_for_databases in db - {}: {}".format(db, e)) + self.log.error("An exception occurred during do_for_databases in db - %s: %s", db, e) # Switch DB back to MASTER if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index fd8ddf56b9682..56edc3cb9ddb4 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -18,8 +18,6 @@ import pyodbc except ImportError: pyodbc = None -import json -import pdb @pytest.fixture @@ -54,14 +52,13 @@ def dbm_instance(instance_docker): ], ) def test_get_available_settings_columns(dbm_instance, expected_columns, available_columns): - pass - # check = SQLServer(CHECK_NAME, {}, [dbm_instance]) - # check.initialize_connection() - # _conn_key_prefix = "dbm-metadata-" - # with check.connection.open_managed_default_connection(key_prefix=_conn_key_prefix): - # with check.connection.get_managed_cursor(key_prefix=_conn_key_prefix) as cursor: - # result_available_columns = check.sql_metadata._get_available_settings_columns(cursor, expected_columns) - # assert result_available_columns == available_columns + check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + check.initialize_connection() + _conn_key_prefix = "dbm-metadata-" + with check.connection.open_managed_default_connection(key_prefix=_conn_key_prefix): + with check.connection.get_managed_cursor(key_prefix=_conn_key_prefix) as cursor: + result_available_columns = check.sql_metadata._get_available_settings_columns(cursor, expected_columns) + assert result_available_columns == available_columns @pytest.mark.integration @@ -84,17 +81,16 @@ def test_get_settings_query_cached(dbm_instance, caplog): def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): - pass - # check = SQLServer(CHECK_NAME, {}, [dbm_instance]) - # dd_run_check(check) - # check.initialize_connection() - # check.check(dbm_instance) - # dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") - # event = next((e for e in dbm_metadata if e['kind'] == 'sqlserver_configs'), None) - # assert event is not None - # assert event['dbms'] == "sqlserver" - # assert event['kind'] == "sqlserver_configs" - # assert len(event["metadata"]) > 0 + check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + dd_run_check(check) + check.initialize_connection() + check.check(dbm_instance) + dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") + event = next((e for e in dbm_metadata if e['kind'] == 'sqlserver_configs'), None) + assert event is not None + assert event['dbms'] == "sqlserver" + assert event['kind'] == "sqlserver_configs" + assert len(event["metadata"]) > 0 def test_collect_schemas(aggregator, dd_run_check, dbm_instance): @@ -333,18 +329,17 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): check = SQLServer(CHECK_NAME, {}, [dbm_instance]) dd_run_check(check) - # extracting events. 
- dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") actual_payloads = {} for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): - if len(databases_to_find) == 0: - # we may see the correct payload for the database several times in events - return - assert schema_event.get("timestamp") is not None + assert schema_event["host"] == "stubbed.hostname" + assert schema_event["agent_version"] == "0.0.0" + assert schema_event["dbms"] == "sqlserver" + assert schema_event.get("collection_interval") is not None + assert schema_event.get("dbms_version") is not None database_metadata = schema_event['metadata'] assert len(database_metadata) == 1 @@ -354,7 +349,6 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): actual_payloads[db_name]['schemas'] = actual_payloads[db_name]['schemas'] + database_metadata[0]['schemas'] else: actual_payloads[db_name] = database_metadata[0] - pdb.set_trace() assert len(actual_payloads) == len(expected_data_for_db) for db_name, actual_payload in actual_payloads.items(): @@ -364,8 +358,6 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) diff_keys = list(difference.keys()) - # schema data also collects certain built in schemas which are ignored in the test + # schema data also collects certain built default schemas which are ignored in the test if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: - raise AssertionError(Exception("found the following diffs: " + str(difference))) - pdb.set_trace() - print("end") \ No newline at end of file + raise AssertionError(Exception("found the following diffs: " + str(difference))) \ No newline at end of file From a24433d11e75c745ca0f18c9bf36d84e57a42f03 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 12:40:35 +0000 Subject: [PATCH 063/132] Adding a changelog --- datadog_checks_base/changelog.d/17258.added | 1 + 1 file changed, 1 insertion(+) create mode 100644 datadog_checks_base/changelog.d/17258.added diff --git a/datadog_checks_base/changelog.d/17258.added b/datadog_checks_base/changelog.d/17258.added new file mode 100644 index 0000000000000..d5ffc4b7d356a --- /dev/null +++ b/datadog_checks_base/changelog.d/17258.added @@ -0,0 +1 @@ +Adding schema collection to sqlserver From c318747721cad1cc91c8cc45618788ae07ceb0d9 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 12:42:39 +0000 Subject: [PATCH 064/132] removed pdb --- postgres/datadog_checks/postgres/metadata.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index ae2da66fbc622..21ad5b992aec6 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -4,7 +4,6 @@ import json import time from typing import Dict, List, Optional, Tuple, Union # noqa: F401 -import pdb import psycopg2 from datadog_checks.postgres.cursor import CommenterDictCursor @@ -312,7 +311,6 @@ def report_postgres_metadata(self): self._is_schemas_collection_in_progress = False def _flush_schema(self, base_event, database, schema, tables): - pdb.set_trace() event = { **base_event, "metadata": [{**database, "schemas": [{**schema, "tables": tables}]}], From 0035119fac9f25d8245d8d1427b65fa23398df35 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 12:49:33 +0000 Subject: [PATCH 065/132] Formatted --- 
postgres/datadog_checks/postgres/metadata.py | 1 + sqlserver/hatch.toml | 2 -- sqlserver/tests/compose/setup.sh | 2 -- sqlserver/tests/compose/setup.sql | 23 +++++--------------- 4 files changed, 6 insertions(+), 22 deletions(-) diff --git a/postgres/datadog_checks/postgres/metadata.py b/postgres/datadog_checks/postgres/metadata.py index 21ad5b992aec6..37dd85495f137 100644 --- a/postgres/datadog_checks/postgres/metadata.py +++ b/postgres/datadog_checks/postgres/metadata.py @@ -4,6 +4,7 @@ import json import time from typing import Dict, List, Optional, Tuple, Union # noqa: F401 + import psycopg2 from datadog_checks.postgres.cursor import CommenterDictCursor diff --git a/sqlserver/hatch.toml b/sqlserver/hatch.toml index 59de0ead06750..27cd54574a225 100644 --- a/sqlserver/hatch.toml +++ b/sqlserver/hatch.toml @@ -21,7 +21,6 @@ driver = ["SQLOLEDB", "SQLNCLI11", "MSOLEDBSQL", "odbc"] version = ["2019", "2022"] setup = ["single"] - # The high cardinality environment is meant to be used for local dev/testing # for example, when we want to do performance testing on local changes to the metrics # query, we can do that by uncommenting this env setup. Note, you should make sure to set you @@ -69,4 +68,3 @@ matrix.driver.env-vars = [ name.linux-odbc-2019-high-cardinality.env-vars = "COMPOSE_FOLDER=compose-high-cardinality" name.linux-odbc-2022-high-cardinality.env-vars = "COMPOSE_FOLDER=compose-high-cardinality" name.windows-odbc-2019-high-cardinality.env-vars = "COMPOSE_FOLDER=compose-high-cardinality-windows" - diff --git a/sqlserver/tests/compose/setup.sh b/sqlserver/tests/compose/setup.sh index f4aa33bb663b7..e0b3cc7a678e4 100644 --- a/sqlserver/tests/compose/setup.sh +++ b/sqlserver/tests/compose/setup.sh @@ -13,9 +13,7 @@ do fi done - /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P $SA_PASSWORD -d master -i setup.sql -b - if [ $? -eq 0 ] then echo "INFO: setup.sql completed." 
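
The setup.sql hunks that follow tidy up the fixture script; the fixtures defined there (the partitioned `cities` table, `landmarks`, `Restaurants`, `RestaurantReviews`) are what the updated `test_collect_schemas` expectations assert on. A condensed sketch of the per-database payload shape those expectations encode is below — the ids and counts are example values lifted from the test, and only a subset of the index and foreign-key fields is shown:

```python
# Condensed, illustrative example of one entry in the "metadata" list of a
# sqlserver_databases event, following the structure used by the test expectations
# in sqlserver/tests/test_metadata.py.
example_database_payload = {
    "name": "datadog_test_schemas",
    "collation": "SQL_Latin1_General_CP1_CI_AS",
    "owner": "dbo",
    "schemas": [
        {
            "name": "test_schema",
            "id": "5",
            "owner_name": "dbo",
            "tables": [
                {
                    "id": "885578193",
                    "name": "cities",
                    "columns": [
                        {
                            "name": "id",
                            "data_type": "int",
                            "default": "((0))",
                            "nullable": False,
                            "ordinal_position": "1",
                        },
                    ],
                    "partitions": {"partition_count": 12},
                    "indexes": [
                        {"name": "PK_Cities", "is_primary_key": True, "column_names": "id"},
                    ],
                    "foreign_keys": [
                        {
                            "foreign_key_name": "FK_CityId",
                            "referencing_table": "landmarks",
                            "referencing_column": "city_id",
                            "referenced_table": "cities",
                            "referenced_column": "id",
                        },
                    ],
                },
            ],
        },
    ],
}

print(example_database_payload["schemas"][0]["tables"][0]["name"])  # -> cities
```

Each `sqlserver_databases` event carries a list of these database dicts under `"metadata"`, together with the host, tags, timestamp, and dbms_version taken from the base event.
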
diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index d3f75fec8a1d5..5ccd96521b254 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -25,22 +25,6 @@ GO CREATE SCHEMA test_schema; GO ---CREATE TABLE datadog_test_schemas.test_schema.cities (id int DEFAULT 0, name varchar(255)); ---GO ---ALTER TABLE datadog_test_schemas.test_schema.cities ---ALTER COLUMN id INT NOT NULL; ---GO ---CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); ---ALTER TABLE datadog_test_schemas.test_schema.cities ---ADD CONSTRAINT PK_Cities PRIMARY KEY (id); ---GO - ---CREATE TABLE datadog_test_schemas.test_schema.cities ( --- id INT NOT NULL DEFAULT 0, --- name VARCHAR(255), --- CONSTRAINT PK_Cities PRIMARY KEY (id) ---); - -- Create the partition function CREATE PARTITION FUNCTION CityPartitionFunction (INT) AS RANGE LEFT FOR VALUES (100, 200, 300); -- Define your partition boundaries here @@ -57,18 +41,20 @@ CREATE TABLE datadog_test_schemas.test_schema.cities ( CONSTRAINT PK_Cities PRIMARY KEY (id) ) ON CityPartitionScheme(id); -- Assign the partition scheme to the table - +-- Create indexes CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); CREATE INDEX single_column_index ON datadog_test_schemas.test_schema.cities (population); INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey', 100), (2, 'bar', 200); GO + +-- Create table with a foreign key CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); GO ALTER TABLE datadog_test_schemas.test_schema.landmarks ADD CONSTRAINT FK_CityId FOREIGN KEY (city_id) REFERENCES datadog_test_schemas.test_schema.cities(id); GO --------------------------------------------------- +-- Create table with unique constraint CREATE TABLE datadog_test_schemas.test_schema.Restaurants ( RestaurantName VARCHAR(255), District VARCHAR(100), @@ -77,6 +63,7 @@ CREATE TABLE datadog_test_schemas.test_schema.Restaurants ( ); GO +-- Create table with a foreign key on two columns CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( RestaurantName VARCHAR(255), District VARCHAR(100), From 2b2531b628c7387dc260536f45196cfd4840917d Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:34:15 +0000 Subject: [PATCH 066/132] removed populate --- sqlserver/tests/compose/setup.sql | 75 ------------------------------- 1 file changed, 75 deletions(-) diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 5ccd96521b254..3c4e610bb992a 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -72,81 +72,6 @@ CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( ); GO --- Start of populate.sql -DECLARE @TableNamePrefix NVARCHAR(100) = 'dbm_employee_boris'; -DECLARE @Index INT = 1; -DECLARE @MaxTables INT = 0; - -WHILE @Index <= @MaxTables -BEGIN - DECLARE @TableName NVARCHAR(200) = @TableNamePrefix + '_' + CAST(@Index AS NVARCHAR(10)); - DECLARE @SQL NVARCHAR(MAX); - - SET @SQL = ' - CREATE TABLE ' + QUOTENAME(@TableName) + ' ( - id INT NOT NULL IDENTITY PRIMARY KEY, - username VARCHAR(200), - nickname VARCHAR(200), - email VARCHAR(200), - created_at DATETIME DEFAULT GETDATE(), - updated_at DATETIME DEFAULT GETDATE(), - username2 VARCHAR(200), -username3 VARCHAR(200), -username4 VARCHAR(200), -username5 VARCHAR(200), -username6 VARCHAR(200), -username7 VARCHAR(200), -username8 VARCHAR(200), -username9 VARCHAR(200), 
-username10 VARCHAR(200), -username11 VARCHAR(200), -username12 VARCHAR(200), -username13 VARCHAR(200), -username14 VARCHAR(200), -username15 VARCHAR(200), -username16 VARCHAR(200), -username17 VARCHAR(200), -username18 VARCHAR(200), -username19 VARCHAR(200), -username20 VARCHAR(200), -username21 VARCHAR(200), -username22 VARCHAR(200), -username23 VARCHAR(200), -username24 VARCHAR(200), -username25 VARCHAR(200), -username26 VARCHAR(200), -username27 VARCHAR(200), -username28 VARCHAR(200), -username29 VARCHAR(200), -username30 VARCHAR(200), -username31 VARCHAR(200), -username32 VARCHAR(200), -username33 VARCHAR(200), -username34 VARCHAR(200), -username35 VARCHAR(200), -username36 VARCHAR(200), -username37 VARCHAR(200), -username38 VARCHAR(200), -username39 VARCHAR(200), -username40 VARCHAR(200), -username41 VARCHAR(200), -username42 VARCHAR(200), -username43 VARCHAR(200), -username44 VARCHAR(200), -username45 VARCHAR(200), -username46 VARCHAR(200), -username47 VARCHAR(200), -username48 VARCHAR(200), -username49 VARCHAR(200), -username50 VARCHAR(200) - );'; - - EXEC sp_executesql @SQL, N'@TableNamePrefix NVARCHAR(100)', @TableNamePrefix; - - SET @Index = @Index + 1; -END; --- End of populate.sql - -- Create test database for integration tests -- only bob and fred have read/write access to this database CREATE DATABASE [datadog_test-1]; From f6cebf05b16f83c38ee7467ea6839de475f66be9 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:37:57 +0000 Subject: [PATCH 067/132] Clean up empty lines --- sqlserver/tests/compose/setup.sql | 3 --- 1 file changed, 3 deletions(-) diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 3c4e610bb992a..23fa756c303c4 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -5,7 +5,6 @@ GRANT SELECT on sys.dm_os_performance_counters to datadog; GRANT VIEW SERVER STATE to datadog; GRANT CONNECT ANY DATABASE to datadog; GRANT VIEW ANY DEFINITION to datadog; -GRANT CREATE TYPE TO datadog; -- test users CREATE LOGIN bob WITH PASSWORD = 'Password12!'; @@ -86,8 +85,6 @@ CREATE USER fred FOR LOGIN fred; CREATE CLUSTERED INDEX thingsindex ON [datadog_test-1].dbo.ϑings (name); GO - - EXEC sp_addrolemember 'db_datareader', 'bob' EXEC sp_addrolemember 'db_datareader', 'fred' EXEC sp_addrolemember 'db_datawriter', 'bob' From 7261e4c292e315a0898ebe009ffa562e35071f57 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:46:14 +0000 Subject: [PATCH 068/132] put back the driver --- sqlserver/tests/odbc/odbcinst.ini | 2 +- sqlserver/tests/test_metadata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sqlserver/tests/odbc/odbcinst.ini b/sqlserver/tests/odbc/odbcinst.ini index 58163f2833d9e..75ffdd4b4d72d 100644 --- a/sqlserver/tests/odbc/odbcinst.ini +++ b/sqlserver/tests/odbc/odbcinst.ini @@ -6,5 +6,5 @@ Driver=/usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so [ODBC Driver 18 for SQL Server] Description=Microsoft ODBC Driver 18 for SQL Server -Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.2.1 +Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.3.1 UsageCount=1 diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 56edc3cb9ddb4..0aa2cbfb2a650 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -358,6 +358,6 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) diff_keys = 
list(difference.keys()) - # schema data also collects certain built default schemas which are ignored in the test + # schema data also collects certain builtin default schemas which are ignored in the test if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: raise AssertionError(Exception("found the following diffs: " + str(difference))) \ No newline at end of file From c9f8e0b4c7a3281b5917a8b59694ed28133cad58 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:48:29 +0000 Subject: [PATCH 069/132] put remove check --- sqlserver/tests/test_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 0aa2cbfb2a650..a1c85b5d91551 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -82,7 +82,7 @@ def test_get_settings_query_cached(dbm_instance, caplog): def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): check = SQLServer(CHECK_NAME, {}, [dbm_instance]) - dd_run_check(check) + #dd_run_check(check) check.initialize_connection() check.check(dbm_instance) dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") From c029d077dadf323d611e74e75b8c602bc57bf0e5 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:49:15 +0000 Subject: [PATCH 070/132] put back space --- sqlserver/tests/test_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index a1c85b5d91551..248c428cd758c 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -82,7 +82,7 @@ def test_get_settings_query_cached(dbm_instance, caplog): def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): check = SQLServer(CHECK_NAME, {}, [dbm_instance]) - #dd_run_check(check) + # dd_run_check(check) check.initialize_connection() check.check(dbm_instance) dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") From 75c7c34769a1831525cd438d362ae668ef88bb3f Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:50:22 +0000 Subject: [PATCH 071/132] remove space --- sqlserver/tests/test_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sqlserver/tests/test_metrics.py b/sqlserver/tests/test_metrics.py index 6f2b88011214c..9cd60b1aa92bf 100644 --- a/sqlserver/tests/test_metrics.py +++ b/sqlserver/tests/test_metrics.py @@ -212,7 +212,6 @@ def test_check_index_usage_metrics( ): instance_docker_metrics['database'] = 'datadog_test-1' instance_docker_metrics['include_index_usage_metrics'] = True - instance_docker_metrics['ignore_missing_database'] = True # Cause an index seek From 3426b14f52f422a5bbd0cf53e20604e89f2f5a49 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:52:15 +0000 Subject: [PATCH 072/132] reapplied linter --- sqlserver/tests/test_metadata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 248c428cd758c..2088056dde959 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -99,7 +99,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): exp_datadog_test = { 'id': '6', 'name': 'datadog_test', - "collation":"SQL_Latin1_General_CP1_CI_AS", + "collation": "SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', 'schemas': [ { @@ -146,7 +146,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): 
exp_datadog_test_schemas = { 'id': '5', 'name': 'datadog_test_schemas', - "collation":"SQL_Latin1_General_CP1_CI_AS", + "collation": "SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', 'schemas': [ { @@ -335,7 +335,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): assert schema_event.get("timestamp") is not None - assert schema_event["host"] == "stubbed.hostname" + assert schema_event["host"] == "stubbed.hostname" assert schema_event["agent_version"] == "0.0.0" assert schema_event["dbms"] == "sqlserver" assert schema_event.get("collection_interval") is not None @@ -360,4 +360,4 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): diff_keys = list(difference.keys()) # schema data also collects certain builtin default schemas which are ignored in the test if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: - raise AssertionError(Exception("found the following diffs: " + str(difference))) \ No newline at end of file + raise AssertionError(Exception("found the following diffs: " + str(difference))) From f7b89ea0b3a59dfc6919b562abea3a6f43492e47 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 14:56:05 +0000 Subject: [PATCH 073/132] Improved changelog --- datadog_checks_base/changelog.d/17258.added | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datadog_checks_base/changelog.d/17258.added b/datadog_checks_base/changelog.d/17258.added index d5ffc4b7d356a..800afe1e7b738 100644 --- a/datadog_checks_base/changelog.d/17258.added +++ b/datadog_checks_base/changelog.d/17258.added @@ -1 +1,3 @@ -Adding schema collection to sqlserver +Added schema collection to the SQL Server integration. +Schema data includes information about the tables, their columns, indexes, foreign keys, and partitions. +The total amount of fetched columns is limited to 100_000. From 2209de4241a68a56820d6afb82f3cdb0b50c6765 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 15:29:31 +0000 Subject: [PATCH 074/132] Improved docs --- sqlserver/assets/configuration/spec.yaml | 2 +- sqlserver/datadog_checks/sqlserver/data/conf.yaml.example | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 53414b37e09b8..f1a81e420f330 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -719,7 +719,7 @@ files: Defaults to 1200 seconds to include everything. value: type: number - example: 600 + example: 1200 display_default: false - template: instances/default - template: logs diff --git a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example index 8d3fff9c006c2..e106aab5cbbe4 100644 --- a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example +++ b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example @@ -658,6 +658,9 @@ instances: ## If the DB specified doesn't exist on the server then don't do the check # # ignore_missing_database: false + + # @param schemas_collection_interval - int - optional - dafault: 1200 + # Schema collection interval in seconds. ## @param tags - list of strings - optional ## A list of tags to attach to every metric and service check emitted by this instance. 
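Note on the option documented in the hunk above: a minimal sketch (not part of any patch in this series) of how an instance is expected to resolve `schemas_collection_interval` on the check side. The helper name and the standalone form are illustrative assumptions; only the option key and the 1200-second default come from the patches.

DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 1200  # seconds; matches the documented default


def resolve_schemas_collection_interval(instance):
    # Fall back to the default when the option is absent from the instance config in conf.yaml
    return int(instance.get('schemas_collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL))


# Example: minimal instance dicts as they would be parsed from conf.yaml
assert resolve_schemas_collection_interval({'host': 'localhost,1433', 'schemas_collection_interval': 600}) == 600
assert resolve_schemas_collection_interval({'host': 'localhost,1433'}) == 1200

Later patches in this series (PATCH 078 onward) replace this flat option with a nested `schemas_collection` block carrying `enabled` and `collection_interval`, so the key shown here is transitional.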
From 233ceacedf8cbaf725d66580ab70442eb47865e7 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 7 May 2024 15:32:54 +0000 Subject: [PATCH 075/132] improved example --- sqlserver/datadog_checks/sqlserver/data/conf.yaml.example | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example index e106aab5cbbe4..91d9f9ca8df1a 100644 --- a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example +++ b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example @@ -660,7 +660,9 @@ instances: # ignore_missing_database: false # @param schemas_collection_interval - int - optional - dafault: 1200 - # Schema collection interval in seconds. + # Frequency in seconds of schema collections. Defaults to `1200`. + # + # schemas_collection_interval: 1200 ## @param tags - list of strings - optional ## A list of tags to attach to every metric and service check emitted by this instance. From d98d1d1920916b93465bfe471d6144dfc913b0b0 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 8 May 2024 11:28:36 +0000 Subject: [PATCH 076/132] corrected comment --- sqlserver/datadog_checks/sqlserver/schemas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index fb91bf1a9d152..c485d99188f8f 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -26,7 +26,10 @@ class SubmitData: MAX_COLUMN_COUNT = 10_000 - # REDAPL has a 3MB limit per resource + # TBD - REDAPL has a 3MB limit per resource + # If a column payload is ~ 10bytes : name, type, default , if nullable nullable + # then the limit should be only 25_000. 
+ MAX_TOTAL_COLUMN_COUNT = 100_000 def __init__(self, submit_data_function, base_event, logger): From 65d190d224eab2f76de4250421feed9058f6057a Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 8 May 2024 12:28:34 +0000 Subject: [PATCH 077/132] added submitter unit test --- sqlserver/datadog_checks/sqlserver/schemas.py | 1 + sqlserver/tests/test_unit.py | 133 ++++++++++++++++++ 2 files changed, 134 insertions(+) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index c485d99188f8f..3420afec2113c 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -73,6 +73,7 @@ def store(self, db_name, schema, tables, columns_count): def exceeded_total_columns_number(self): return self._total_columns_count > self.MAX_TOTAL_COLUMN_COUNT + #NOTE: DB with no schemas is never submitted def submit(self): if not bool(self.db_to_schemas): return diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index 0f65e631a01cc..c2660f77fbe20 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -8,6 +8,7 @@ import mock import pytest +import json from datadog_checks.dev import EnvVars from datadog_checks.sqlserver import SQLServer @@ -21,6 +22,11 @@ set_default_driver_conf, ) +from datadog_checks.sqlserver.schemas import SubmitData +from deepdiff import DeepDiff +from datadog_checks.base.utils.db.utils import default_json_event_encoding + +import pdb from .common import CHECK_NAME, DOCKER_SERVER, assert_metrics from .utils import windows_ci @@ -735,3 +741,130 @@ def test_extract_sql_comments_and_procedure_name(query, expected_comments, is_pr assert comments == expected_comments assert p == is_proc assert re.match(name, expected_name, re.IGNORECASE) if expected_name else expected_name == name + + +class DummyLogger: + def debug(*args): + pass + def error(*args): + pass + +def set_up_submitter_unit_test(): + submitted_data = [] + base_event = { + "host": "some", + "agent_version": 0, + "dbms": "sqlserver", + "kind": "sqlserver_databases", + "collection_interval": 1200, + "dbms_version": "some", + "tags": "some", + "cloud_metadata": "some", + } + def submitData(data): + submitted_data.append(data) + + dataSubmitter = SubmitData(submitData, base_event, DummyLogger()) + return dataSubmitter, submitted_data + +def test_submit_data(): + + dataSubmitter, submitted_data = set_up_submitter_unit_test() + + dataSubmitter.store_db_info("test_db1", {"id": 3, "name" : "test_db1"}) + dataSubmitter.store_db_info("test_db2", {"id": 4, "name" : "test_db2"}) + schema1 = {"id" : "1"} + schema2 = {"id" : "2"} + schema3 = {"id" : "3"} + + dataSubmitter.store("test_db1", schema1, [1,2], 5) + dataSubmitter.store("test_db2", schema3, [1,2], 5) + dataSubmitter.store("test_db1", schema2, [1,2], 10) + + dataSubmitter.submit() + + expected_data = { + "host":"some", + "agent_version":0, + "dbms":"sqlserver", + "kind":"sqlserver_databases", + "collection_interval":1200, + "dbms_version":"some", + "tags":"some", + "cloud_metadata":"some", + "metadata":[ + { + "id":3, + "name":"test_db1", + "schemas":[ + { + "id":"1", + "tables":[ + 1, + 2 + ] + }, + { + "id":"2", + "tables":[ + 1, + 2 + ] + } + ] + }, + { + "id":4, + "name":"test_db2", + "schemas":[ + { + "id":"3", + "tables":[ + 1, + 2 + ] + } + ] + }, + ], + "timestamp":1.1 + } + difference = DeepDiff(json.loads(submitted_data[0]),expected_data , exclude_paths="root['timestamp']", ignore_order=True) + assert len(difference) == 0 + +def 
test_store_large_amount_of_columns(): + + dataSubmitter, submitted_data = set_up_submitter_unit_test() + dataSubmitter.store_db_info("test_db1", {"id": 3, "name" : "test_db1"}) + schema1 = {"id" : "1"} + dataSubmitter.store("test_db1", schema1, [1,2], SubmitData.MAX_COLUMN_COUNT+SubmitData.MAX_TOTAL_COLUMN_COUNT+1) + expected_data = { + "host":"some", + "agent_version":0, + "dbms":"sqlserver", + "kind":"sqlserver_databases", + "collection_interval":1200, + "dbms_version":"some", + "tags":"some", + "cloud_metadata":"some", + "metadata":[ + { + "id":3, + "name":"test_db1", + "schemas":[ + { + "id":"1", + "tables":[ + 1, + 2 + ] + } + ] + }, + ], + "timestamp":1.1 + } + assert dataSubmitter.exceeded_total_columns_number() + difference = DeepDiff(json.loads(submitted_data[0]),expected_data , exclude_paths="root['timestamp']", ignore_order=True) + assert len(difference) == 0 + From b41fa38b8dc70d5be18a740c70ba5aa720c2fb27 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 10 May 2024 13:20:04 +0000 Subject: [PATCH 078/132] xchanged config --- sqlserver/assets/configuration/spec.yaml | 18 +++++++++++++++--- sqlserver/datadog_checks/sqlserver/config.py | 6 +----- sqlserver/datadog_checks/sqlserver/const.py | 2 +- sqlserver/datadog_checks/sqlserver/schemas.py | 14 +++++++++++--- .../datadog_checks/sqlserver/sqlserver.py | 3 ++- sqlserver/tests/test_metadata.py | 2 ++ 6 files changed, 32 insertions(+), 13 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index f1a81e420f330..39f7850372108 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -713,10 +713,22 @@ files: type: number example: 1800 display_default: false - - name: schemas_collection_interval + - name: schemas_collection description: | - The database schema collection interval (in seconds). - Defaults to 1200 seconds to include everything. + Configure collection of schemas. + options: + - name: enabled + description: | + Enable schema collection. Requires `dbm: true`. + value: + type: boolean + example: false + - name: collection_interval + description: | + Set the database schema collection interval (in seconds). 
Defaults to 600 seconds + value: + type: number + example: 600 value: type: number example: 1200 diff --git a/sqlserver/datadog_checks/sqlserver/config.py b/sqlserver/datadog_checks/sqlserver/config.py index 382ae3c3d364d..23819f8336dba 100644 --- a/sqlserver/datadog_checks/sqlserver/config.py +++ b/sqlserver/datadog_checks/sqlserver/config.py @@ -9,7 +9,6 @@ from datadog_checks.base.utils.common import to_native_string from datadog_checks.sqlserver.const import ( DEFAULT_AUTODISCOVERY_INTERVAL, - DEFAULT_SCHEMAS_COLLECTION_INTERVAL, PROC_CHAR_LIMIT, ) @@ -27,10 +26,6 @@ def __init__(self, init_config, instance, log): self.autodiscovery_interval: int = instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL) self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include) self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude) - self.schemas_collection_interval: int = instance.get( - 'schemas_collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL - ) - self.proc: str = instance.get('stored_procedure') self.custom_metrics: list[dict] = init_config.get('custom_metrics', []) or [] self.include_index_usage_metrics_tempdb: bool = is_affirmative( @@ -52,6 +47,7 @@ def __init__(self, init_config, instance, log): self.procedure_metrics_config: dict = instance.get('procedure_metrics', {}) or {} self.settings_config: dict = instance.get('collect_settings', {}) or {} self.activity_config: dict = instance.get('query_activity', {}) or {} + self.schema_config: dict = instance.get('schemas_collection', {}) or {} self.cloud_metadata: dict = {} aws: dict = instance.get('aws', {}) or {} gcp: dict = instance.get('gcp', {}) or {} diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index e30a049a82625..c18f3f464fd5c 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -270,7 +270,7 @@ PROC_CHAR_LIMIT = 500 # Schemas -DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 1200 +DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 600 DB_QUERY = """ SELECT diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 3420afec2113c..cee2cf9e546fa 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -19,6 +19,7 @@ STATIC_INFO_ENGINE_EDITION, STATIC_INFO_VERSION, TABLES_IN_SCHEMA_QUERY, + DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) from datadog_checks.sqlserver.utils import execute_query_output_result_as_a_dict, get_list_chunks @@ -104,17 +105,22 @@ class Schemas: # but allows the queue to be stable. 
TABLES_CHUNK_SIZE = 50 - def __init__(self, check, schemas_collection_interval): + def __init__(self, check, config): self._check = check self._log = check.log self.schemas_per_db = {} - + collection_interval = config.schema_config.get( + 'collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL + ) + self._collection_interval = collection_interval if collection_interval > 0 else DEFAULT_SCHEMAS_COLLECTION_INTERVAL + self._enabled = config.schema_config.get('enabled', False) + base_event = { "host": None, "agent_version": datadog_agent.get_version(), "dbms": "sqlserver", "kind": "sqlserver_databases", - "collection_interval": schemas_collection_interval, + "collection_interval": self._collection_interval, "dbms_version": None, "tags": self._check.non_internal_tags, "cloud_metadata": self._check._config.cloud_metadata, @@ -165,6 +171,8 @@ def __init__(self, check, schemas_collection_interval): @tracked_method(agent_check_getter=agent_check_getter) def collect_schemas_data(self): + if not self._enabled: + return self._dataSubmitter.reset() self._dataSubmitter.set_base_event_data( self._check.resolved_hostname, diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index cfd266cc7feab..e14e2a9529efa 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -161,7 +161,7 @@ def __init__(self, name, init_config, instances): self._database_metrics = None self._last_schemas_collect_time = None - self._schemas = Schemas(self, self._config.schemas_collection_interval) + self._schemas = Schemas(self, self._config) def cancel(self): self.statement_metrics.cancel() @@ -791,6 +791,7 @@ def check(self, _): self.procedure_metrics.run_job_loop(self.tags) self.activity.run_job_loop(self.tags) self.sql_metadata.run_job_loop(self.tags) + self._schemas.collect_schemas_data() else: self.log.debug("Skipping check") diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 2088056dde959..cccb5b7766b49 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -325,6 +325,8 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): dbm_instance['database_autodiscovery'] = True dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test'] + dbm_instance['dbm'] = True + dbm_instance['schemas_collection'] = {"enabled" : True} check = SQLServer(CHECK_NAME, {}, [dbm_instance]) dd_run_check(check) From b611a8b4d33af86123f6cfa99e264e92eeadfb2c Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 10 May 2024 19:16:25 +0000 Subject: [PATCH 079/132] Added param query --- sqlserver/datadog_checks/sqlserver/const.py | 4 +- sqlserver/datadog_checks/sqlserver/schemas.py | 53 ++++++++++++------- .../datadog_checks/sqlserver/sqlserver.py | 1 - sqlserver/datadog_checks/sqlserver/utils.py | 7 ++- 4 files changed, 40 insertions(+), 25 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index c18f3f464fd5c..f931b7774292e 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -277,7 +277,7 @@ db.database_id AS id, db.name AS name, db.collation_name AS collation, dp.name AS owner FROM sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid -WHERE db.name = '{}'; +WHERE db.name IN ({}); """ SCHEMA_QUERY = """ @@ -293,7 +293,7 @@ object_id AS id, name FROM sys.tables -WHERE schema_id={} +WHERE 
schema_id=? """ COLUMN_QUERY = """ diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index cee2cf9e546fa..216a12505e552 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -6,7 +6,7 @@ import copy import json import time - +import pdb from datadog_checks.base.utils.db.utils import default_json_event_encoding from datadog_checks.base.utils.tracking import tracked_method from datadog_checks.sqlserver.const import ( @@ -21,7 +21,7 @@ TABLES_IN_SCHEMA_QUERY, DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) -from datadog_checks.sqlserver.utils import execute_query_output_result_as_a_dict, get_list_chunks +from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks class SubmitData: @@ -55,8 +55,9 @@ def reset(self): self.db_to_schemas = {} self.db_info = {} - def store_db_info(self, db_name, db_info): - self.db_info[db_name] = db_info + def store_db_infos(self, db_infos): + for db_info in db_infos: + self.db_info[db_info['name']] = db_info def store(self, db_name, schema, tables, columns_count): self._columns_count += columns_count @@ -109,6 +110,7 @@ def __init__(self, check, config): self._check = check self._log = check.log self.schemas_per_db = {} + self._last_schemas_collect_time = None collection_interval = config.schema_config.get( 'collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) @@ -169,10 +171,22 @@ def __init__(self, check, config): "partition_count": int """ - @tracked_method(agent_check_getter=agent_check_getter) def collect_schemas_data(self): if not self._enabled: return + if ( + self._last_schemas_collect_time is None + or time.time() - self._last_schemas_collect_time > self._config.schemas_collection_interval + ): + try: + self._collect_schemas_data() + except: + raise + finally: + self._last_schemas_collect_time = time.time() + + @tracked_method(agent_check_getter=agent_check_getter) + def _collect_schemas_data(self): self._dataSubmitter.reset() self._dataSubmitter.set_base_event_data( self._check.resolved_hostname, @@ -184,11 +198,12 @@ def collect_schemas_data(self): ), ) + databases = self._check.get_databases() + db_infos = self._query_db_informations(databases) + self._dataSubmitter.store_db_infos(db_infos) # returns if to stop, True means stop iterating. 
def fetch_schema_data(cursor, db_name): - db_info = self._query_db_information(db_name, cursor) schemas = self._query_schema_information(cursor) - self._dataSubmitter.store_db_info(db_name, db_info) for schema in schemas: tables = self._get_tables(schema, cursor) tables_chunks = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) @@ -213,13 +228,11 @@ def fetch_schema_data(cursor, db_name): self._log.debug("Finished collect_schemas_data") self._dataSubmitter.submit() - def _query_db_information(self, db_name, cursor): - db_info = execute_query_output_result_as_a_dict(DB_QUERY.format(db_name), cursor, convert_results_to_str=True) - if len(db_info) == 1: - return db_info[0] - else: - self._log.error("Couldnt query database information for %s", db_name) - return None + def _query_db_informations(self, db_names): + with self._check.connection.open_managed_default_connection(): + with self._check.connection.get_managed_cursor() as cursor: + db_names_formatted = ",".join(["'{}'".format(t) for t in db_names]) + return execute_query_output_result_as_dicts(DB_QUERY.format(db_names_formatted), cursor, convert_results_to_str=True) """ returns a list of tables for schema with their names and empty column array list of table dicts @@ -230,8 +243,8 @@ def _query_db_information(self, db_name, cursor): @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables(self, schema, cursor): - tables_info = execute_query_output_result_as_a_dict( - TABLES_IN_SCHEMA_QUERY.format(schema["id"]), cursor, convert_results_to_str=True + tables_info = execute_query_output_result_as_dicts( + TABLES_IN_SCHEMA_QUERY, cursor, convert_results_to_str=True, parameter=schema["id"] ) for t in tables_info: t.setdefault("columns", []) @@ -246,7 +259,7 @@ def _get_tables(self, schema, cursor): @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _query_schema_information(self, cursor): - return execute_query_output_result_as_a_dict(SCHEMA_QUERY, cursor, convert_results_to_str=True) + return execute_query_output_result_as_dicts(SCHEMA_QUERY, cursor, convert_results_to_str=True) """ returns extracted column numbers and a list of tables "tables" : list of tables dicts @@ -334,7 +347,7 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, s @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): - rows = execute_query_output_result_as_a_dict(PARTITIONS_QUERY.format(table_ids), cursor) + rows = execute_query_output_result_as_dicts(PARTITIONS_QUERY.format(table_ids), cursor) for row in rows: id = row.pop("id", None) if id is not None: @@ -348,7 +361,7 @@ def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): - rows = execute_query_output_result_as_a_dict(INDEX_QUERY.format(table_ids), cursor) + rows = execute_query_output_result_as_dicts(INDEX_QUERY.format(table_ids), cursor) for row in rows: id = row.pop("id", None) if id is not None: @@ -363,7 +376,7 @@ def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _populate_with_foreign_keys_data(self, table_ids, id_to_table_data, cursor): - rows = execute_query_output_result_as_a_dict(FOREIGN_KEY_QUERY.format(table_ids), cursor) + rows = 
execute_query_output_result_as_dicts(FOREIGN_KEY_QUERY.format(table_ids), cursor) for row in rows: id = row.pop("id", None) if id is not None: diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index e14e2a9529efa..f869723e6f3fa 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -160,7 +160,6 @@ def __init__(self, name, init_config, instances): self._database_metrics = None - self._last_schemas_collect_time = None self._schemas = Schemas(self, self._config) def cancel(self): diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index 421c5f446485b..b65799c49366a 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -139,8 +139,11 @@ def is_azure_sql_database(engine_edition): return engine_edition == ENGINE_EDITION_SQL_DATABASE -def execute_query_output_result_as_a_dict(query, cursor, convert_results_to_str=False): - cursor.execute(query) +def execute_query_output_result_as_dicts(query, cursor, convert_results_to_str=False, parameter=None): + if parameter is not None: + cursor.execute(query,(parameter,)) + else: + cursor.execute(query) columns = [str(column[0]).lower() for column in cursor.description] rows = [] if convert_results_to_str: From 63178b894d24ab3ed273cf5c9323bb9df42f6c70 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 10 May 2024 19:54:18 +0000 Subject: [PATCH 080/132] improved logging --- sqlserver/datadog_checks/sqlserver/schemas.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 216a12505e552..b9e795d76ba58 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -6,7 +6,7 @@ import copy import json import time -import pdb + from datadog_checks.base.utils.db.utils import default_json_event_encoding from datadog_checks.base.utils.tracking import tracked_method from datadog_checks.sqlserver.const import ( @@ -74,10 +74,17 @@ def store(self, db_name, schema, tables, columns_count): def exceeded_total_columns_number(self): return self._total_columns_count > self.MAX_TOTAL_COLUMN_COUNT + + def truncate(self, json_event): + max_length = 1000 + if len(json_event) > max_length: + return json_event[:max_length] + " ... (truncated)" + else: + return json_event #NOTE: DB with no schemas is never submitted def submit(self): - if not bool(self.db_to_schemas): + if not self.db_to_schemas: return self._columns_count = 0 event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000} @@ -90,7 +97,7 @@ def submit(self): db_info = self.db_info[db] event["metadata"] = event["metadata"] + [{**(db_info), "schemas": list(schemas_by_id.values())}] json_event = json.dumps(event, default=default_json_event_encoding) - self._log.debug("Reporting the following payload for schema collection: {}".format(json_event)) + self._log.debug("Reporting the following payload for schema collection: {}".format(self.truncate(json_event))) self._submit_to_agent_queue(json_event) self.db_to_schemas = {} @@ -202,6 +209,7 @@ def _collect_schemas_data(self): db_infos = self._query_db_informations(databases) self._dataSubmitter.store_db_infos(db_infos) # returns if to stop, True means stop iterating. 
+ @tracked_method(agent_check_getter=agent_check_getter) def fetch_schema_data(cursor, db_name): schemas = self._query_schema_information(cursor) for schema in schemas: From 8346d3cf35db38a22802d78b4f2d38768fd6fc06 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 13 May 2024 17:19:21 +0000 Subject: [PATCH 081/132] changelog changed --- datadog_checks_base/changelog.d/17258.added | 2 +- sqlserver/tests/odbc/odbcinst.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datadog_checks_base/changelog.d/17258.added b/datadog_checks_base/changelog.d/17258.added index 800afe1e7b738..389ea6b571c4a 100644 --- a/datadog_checks_base/changelog.d/17258.added +++ b/datadog_checks_base/changelog.d/17258.added @@ -1,3 +1,3 @@ Added schema collection to the SQL Server integration. Schema data includes information about the tables, their columns, indexes, foreign keys, and partitions. -The total amount of fetched columns is limited to 100_000. +The total amount of fetched columns is limited to 100,000. diff --git a/sqlserver/tests/odbc/odbcinst.ini b/sqlserver/tests/odbc/odbcinst.ini index 75ffdd4b4d72d..58163f2833d9e 100644 --- a/sqlserver/tests/odbc/odbcinst.ini +++ b/sqlserver/tests/odbc/odbcinst.ini @@ -6,5 +6,5 @@ Driver=/usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so [ODBC Driver 18 for SQL Server] Description=Microsoft ODBC Driver 18 for SQL Server -Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.3.1 +Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.2.1 UsageCount=1 From 1073f060d3113dd820c8a7b003a6514916e95eee Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 13 May 2024 19:03:16 +0000 Subject: [PATCH 082/132] Added stop iteration --- sqlserver/datadog_checks/sqlserver/schemas.py | 7 +++++-- sqlserver/datadog_checks/sqlserver/sqlserver.py | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index b9e795d76ba58..0548125b2b675 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -223,7 +223,7 @@ def fetch_schema_data(cursor, db_name): db_name, schema["name"] ) ) - return True + raise StopIteration columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) self._dataSubmitter.store(db_name, schema, tables_info, columns_count) self._dataSubmitter.submit() # Submit is forced after each 50 tables chunk @@ -232,7 +232,10 @@ def fetch_schema_data(cursor, db_name): self._dataSubmitter.submit() return False - self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) + errors = self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) + if errors: + for e in errors: + self._log.error("While executing fetch schemas for databse - %s, the following exception occured - %s", e[0], e[1]) self._log.debug("Finished collect_schemas_data") self._dataSubmitter.submit() diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index f869723e6f3fa..c624f5ca9989d 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -738,6 +738,7 @@ def get_databases(self): return db_names def do_for_databases(self, action, databases): + exceptions = [] engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: @@ 
-745,10 +746,12 @@ def do_for_databases(self, action, databases): try: if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(db)) - stop = action(cursor, db) - if stop: - break + action(cursor, db) + except StopIteration as e: + exceptions.append((db, "StopIteration")) + return exceptions except Exception as e: + exceptions.append((db, e)) self.log.error("An exception occurred during do_for_databases in db - %s: %s", db, e) # Switch DB back to MASTER if not is_azure_sql_database(engine_edition): From 04f77c9c5b5927005a8b9f84c9587f8c96b5d64f Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 13 May 2024 19:04:36 +0000 Subject: [PATCH 083/132] pujt back odb --- sqlserver/tests/odbc/odbcinst.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/tests/odbc/odbcinst.ini b/sqlserver/tests/odbc/odbcinst.ini index 58163f2833d9e..75ffdd4b4d72d 100644 --- a/sqlserver/tests/odbc/odbcinst.ini +++ b/sqlserver/tests/odbc/odbcinst.ini @@ -6,5 +6,5 @@ Driver=/usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so [ODBC Driver 18 for SQL Server] Description=Microsoft ODBC Driver 18 for SQL Server -Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.2.1 +Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.3.1 UsageCount=1 From 183ce9c8e2628ff6abab8d45a10527dd6db793e8 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 21 May 2024 18:04:30 +0000 Subject: [PATCH 084/132] Inherited from async job --- sqlserver/datadog_checks/sqlserver/schemas.py | 87 ++++++++++--------- .../datadog_checks/sqlserver/sqlserver.py | 3 +- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 0548125b2b675..815e5d6a416a7 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -7,7 +7,12 @@ import json import time -from datadog_checks.base.utils.db.utils import default_json_event_encoding +from datadog_checks.base import is_affirmative +from datadog_checks.base.utils.db.utils import ( + default_json_event_encoding, + DBMAsyncJob +) + from datadog_checks.base.utils.tracking import tracked_method from datadog_checks.sqlserver.const import ( COLUMN_QUERY, @@ -23,15 +28,13 @@ ) from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks +#TODO +# make it a subclass of async but set sync +# remove total amount of columns and put total exec time +# pull out stop logic - submit tables one by one ? and control columns number for payload ? +# I can do a timer but in case of multithreading how to ensure ??? disable ? as kiiled by the class SubmitData: - MAX_COLUMN_COUNT = 10_000 - - # TBD - REDAPL has a 3MB limit per resource - # If a column payload is ~ 10bytes : name, type, default , if nullable nullable - # then the limit should be only 25_000. 
- - MAX_TOTAL_COLUMN_COUNT = 100_000 def __init__(self, submit_data_function, base_event, logger): self._submit_to_agent_queue = submit_data_function @@ -39,7 +42,6 @@ def __init__(self, submit_data_function, base_event, logger): self._log = logger self._columns_count = 0 - self._total_columns_count = 0 self.db_to_schemas = {} # dbname : { id : schema } self.db_info = {} # name to info @@ -51,7 +53,6 @@ def set_base_event_data(self, hostname, tags, cloud_metadata, dbms_version): def reset(self): self._columns_count = 0 - self._total_columns_count = 0 self.db_to_schemas = {} self.db_info = {} @@ -61,7 +62,6 @@ def store_db_infos(self, db_infos): def store(self, db_name, schema, tables, columns_count): self._columns_count += columns_count - self._total_columns_count += columns_count schemas = self.db_to_schemas.setdefault(db_name, {}) if schema["id"] in schemas: known_tables = schemas[schema["id"]].setdefault("tables", []) @@ -69,11 +69,9 @@ def store(self, db_name, schema, tables, columns_count): else: schemas[schema["id"]] = copy.deepcopy(schema) schemas[schema["id"]]["tables"] = tables - if self._columns_count > self.MAX_COLUMN_COUNT: - self._submit() - def exceeded_total_columns_number(self): - return self._total_columns_count > self.MAX_TOTAL_COLUMN_COUNT + def columns_since_last_submit(self): + return self._columns_count def truncate(self, json_event): max_length = 1000 @@ -106,36 +104,56 @@ def agent_check_getter(self): return self._check -class Schemas: +class Schemas(DBMAsyncJob): # Requests for infromation about tables are done for a certain amount of tables at the time # This number of tables doesnt slow down performance by much (15% compared to 500 tables) # but allows the queue to be stable. TABLES_CHUNK_SIZE = 50 + # Note: in async mode execution time also cannot exceed 2 checks. + MAX_EXECUTION_TIME = 10 + MAX_COLUMNS_PER_EVENT = 100_000 def __init__(self, check, config): self._check = check self._log = check.log self.schemas_per_db = {} + #TODO to add + self._max_execution_time = config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME) self._last_schemas_collect_time = None collection_interval = config.schema_config.get( 'collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) - self._collection_interval = collection_interval if collection_interval > 0 else DEFAULT_SCHEMAS_COLLECTION_INTERVAL - self._enabled = config.schema_config.get('enabled', False) - + super(Schemas, self).__init__( + check, + run_sync=is_affirmative(config.schema_config.get('run_sync', True)), + enabled=is_affirmative(config.schema_config.get('enabled', False)), + expected_db_exceptions=(), + # min collection interval is a desired collection interval for a check as a whole. 
+ min_collection_interval=config.min_collection_interval, + dbms="sqlserver", + rate_limit=1 / float(collection_interval), + job_name="query-schemas", + shutdown_callback=self.shut_down, + ) base_event = { "host": None, "agent_version": datadog_agent.get_version(), "dbms": "sqlserver", "kind": "sqlserver_databases", - "collection_interval": self._collection_interval, + "collection_interval": collection_interval, "dbms_version": None, "tags": self._check.non_internal_tags, "cloud_metadata": self._check._config.cloud_metadata, } self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) + def run_job(self): + self._collect_schemas_data() + + def shut_down(self): + self._dataSubmitter.submit() + """Collects database information and schemas and submits to the agent's queue as dictionaries schema dict key/value: @@ -177,23 +195,9 @@ def __init__(self, check, config): key/value: "partition_count": int """ - - def collect_schemas_data(self): - if not self._enabled: - return - if ( - self._last_schemas_collect_time is None - or time.time() - self._last_schemas_collect_time > self._config.schemas_collection_interval - ): - try: - self._collect_schemas_data() - except: - raise - finally: - self._last_schemas_collect_time = time.time() - @tracked_method(agent_check_getter=agent_check_getter) def _collect_schemas_data(self): + start_time = time.thread_time() self._dataSubmitter.reset() self._dataSubmitter.set_base_event_data( self._check.resolved_hostname, @@ -216,19 +220,20 @@ def fetch_schema_data(cursor, db_name): tables = self._get_tables(schema, cursor) tables_chunks = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) for tables_chunk in tables_chunks: - if self._dataSubmitter.exceeded_total_columns_number(): + if time.thread_time() - start_time > self.MAX_EXECUTION_TIME: # TODO Report truncation to the backend self._log.warning( - "Truncated data due to the max limit, stopped on db - {} on schema {}".format( - db_name, schema["name"] + "Truncated data due to the effective execution time reaching {}, stopped on db - {} on schema {}".format( + self.MAX_EXECUTION_TIME, db_name, schema["name"] ) ) raise StopIteration + columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) + self._dataSubmitter.store(db_name, schema, tables_info, columns_count) - self._dataSubmitter.submit() # Submit is forced after each 50 tables chunk - if len(tables) == 0: - self._dataSubmitter.store(db_name, schema, [], 0) + if self._dataSubmitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: + self._dataSubmitter.submit() self._dataSubmitter.submit() return False diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index c624f5ca9989d..b3d756a717c09 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -167,6 +167,7 @@ def cancel(self): self.procedure_metrics.cancel() self.activity.cancel() self.sql_metadata.cancel() + self._schemas.cancel() def config_checks(self): if self._config.autodiscovery and self.instance.get("database"): @@ -793,7 +794,7 @@ def check(self, _): self.procedure_metrics.run_job_loop(self.tags) self.activity.run_job_loop(self.tags) self.sql_metadata.run_job_loop(self.tags) - self._schemas.collect_schemas_data() + self._schemas.run_job_loop(self.tags) else: self.log.debug("Skipping check") From 9148cb3ab5f35e74d32848e21ad818ede55775df Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 21 May 2024 20:24:54 
+0000 Subject: [PATCH 085/132] Added conf parameters --- sqlserver/assets/configuration/spec.yaml | 18 +++++++++++++----- sqlserver/datadog_checks/sqlserver/schemas.py | 3 +-- .../datadog_checks/sqlserver/sqlserver.py | 4 ++-- sqlserver/tests/test_unit.py | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 39f7850372108..67dd33a0f3f3c 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -725,14 +725,22 @@ files: example: false - name: collection_interval description: | - Set the database schema collection interval (in seconds). Defaults to 600 seconds + Set the database schema collection interval (in seconds). Defaults to 600 seconds. value: type: number example: 600 - value: - type: number - example: 1200 - display_default: false + - name: max_execution_time + description: | + Set the maximum time for schema collection (in seconds). Defaults to 10 seconds. Capped by `collection_interval` + value: + type: number + example: 10 + - name: run_sync + description: | + Configures if schema collection is ran on the main thread. + value: + type: boolean + example: false - template: instances/default - template: logs example: diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 815e5d6a416a7..b96c228554c3b 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -118,12 +118,11 @@ def __init__(self, check, config): self._check = check self._log = check.log self.schemas_per_db = {} - #TODO to add - self._max_execution_time = config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME) self._last_schemas_collect_time = None collection_interval = config.schema_config.get( 'collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) + self._max_execution_time = min(config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME), collection_interval) super(Schemas, self).__init__( check, run_sync=is_affirmative(config.schema_config.get('run_sync', True)), diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index b3d756a717c09..5384a82db0ffb 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -102,7 +102,7 @@ set_default_driver_conf() - +import pdb class SQLServer(AgentCheck): __NAMESPACE__ = "sqlserver" @@ -116,7 +116,7 @@ def __init__(self, name, init_config, instances): self.instance_metrics = [] self.instance_per_type_metrics = defaultdict(set) self.do_check = True - + #pdb.set_trace() self._config = SQLServerConfig(self.init_config, self.instance, self.log) self.tags = self._config.tags diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index c2660f77fbe20..09792b333fa8c 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -766,7 +766,7 @@ def submitData(data): dataSubmitter = SubmitData(submitData, base_event, DummyLogger()) return dataSubmitter, submitted_data - +#TODO simplidy this test partly moves to schema def test_submit_data(): dataSubmitter, submitted_data = set_up_submitter_unit_test() From e2dc3e5372c3a46583c0e3f15b48178706034aaf Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 22 May 2024 09:56:50 +0000 Subject: [PATCH 086/132] Fixed unit test --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 +- 
sqlserver/tests/odbc/odbcinst.ini | 2 +- sqlserver/tests/test_unit.py | 42 +------------------ 3 files changed, 4 insertions(+), 42 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index b96c228554c3b..13ddf8f1a5590 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -211,7 +211,7 @@ def _collect_schemas_data(self): databases = self._check.get_databases() db_infos = self._query_db_informations(databases) self._dataSubmitter.store_db_infos(db_infos) - # returns if to stop, True means stop iterating. + @tracked_method(agent_check_getter=agent_check_getter) def fetch_schema_data(cursor, db_name): schemas = self._query_schema_information(cursor) diff --git a/sqlserver/tests/odbc/odbcinst.ini b/sqlserver/tests/odbc/odbcinst.ini index 75ffdd4b4d72d..58163f2833d9e 100644 --- a/sqlserver/tests/odbc/odbcinst.ini +++ b/sqlserver/tests/odbc/odbcinst.ini @@ -6,5 +6,5 @@ Driver=/usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so [ODBC Driver 18 for SQL Server] Description=Microsoft ODBC Driver 18 for SQL Server -Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.3.1 +Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.2.1 UsageCount=1 diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index 09792b333fa8c..45f01d79c4595 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -766,13 +766,12 @@ def submitData(data): dataSubmitter = SubmitData(submitData, base_event, DummyLogger()) return dataSubmitter, submitted_data -#TODO simplidy this test partly moves to schema + def test_submit_data(): dataSubmitter, submitted_data = set_up_submitter_unit_test() - dataSubmitter.store_db_info("test_db1", {"id": 3, "name" : "test_db1"}) - dataSubmitter.store_db_info("test_db2", {"id": 4, "name" : "test_db2"}) + dataSubmitter.store_db_infos([{"id": 3, "name" : "test_db1"},{"id": 4, "name" : "test_db2"}]) schema1 = {"id" : "1"} schema2 = {"id" : "2"} schema3 = {"id" : "3"} @@ -831,40 +830,3 @@ def test_submit_data(): } difference = DeepDiff(json.loads(submitted_data[0]),expected_data , exclude_paths="root['timestamp']", ignore_order=True) assert len(difference) == 0 - -def test_store_large_amount_of_columns(): - - dataSubmitter, submitted_data = set_up_submitter_unit_test() - dataSubmitter.store_db_info("test_db1", {"id": 3, "name" : "test_db1"}) - schema1 = {"id" : "1"} - dataSubmitter.store("test_db1", schema1, [1,2], SubmitData.MAX_COLUMN_COUNT+SubmitData.MAX_TOTAL_COLUMN_COUNT+1) - expected_data = { - "host":"some", - "agent_version":0, - "dbms":"sqlserver", - "kind":"sqlserver_databases", - "collection_interval":1200, - "dbms_version":"some", - "tags":"some", - "cloud_metadata":"some", - "metadata":[ - { - "id":3, - "name":"test_db1", - "schemas":[ - { - "id":"1", - "tables":[ - 1, - 2 - ] - } - ] - }, - ], - "timestamp":1.1 - } - assert dataSubmitter.exceeded_total_columns_number() - difference = DeepDiff(json.loads(submitted_data[0]),expected_data , exclude_paths="root['timestamp']", ignore_order=True) - assert len(difference) == 0 - From d9c1a0042bb39d61ca6eda6d5f5299bf3abf54ef Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 22 May 2024 10:02:13 +0000 Subject: [PATCH 087/132] removed pdb --- sqlserver/datadog_checks/sqlserver/sqlserver.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 
5384a82db0ffb..c4b17c45556e5 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -102,7 +102,6 @@ set_default_driver_conf() -import pdb class SQLServer(AgentCheck): __NAMESPACE__ = "sqlserver" @@ -116,7 +115,6 @@ def __init__(self, name, init_config, instances): self.instance_metrics = [] self.instance_per_type_metrics = defaultdict(set) self.do_check = True - #pdb.set_trace() self._config = SQLServerConfig(self.init_config, self.instance, self.log) self.tags = self._config.tags From 9b90162e49c0e0b06ee12d2b52c34cf999177cc8 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 22 May 2024 11:40:03 +0000 Subject: [PATCH 088/132] Formatted comments --- sqlserver/datadog_checks/sqlserver/config.py | 1 + sqlserver/datadog_checks/sqlserver/schemas.py | 210 +++++++++--------- 2 files changed, 103 insertions(+), 108 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/config.py b/sqlserver/datadog_checks/sqlserver/config.py index 23819f8336dba..de7dcfea0aa4e 100644 --- a/sqlserver/datadog_checks/sqlserver/config.py +++ b/sqlserver/datadog_checks/sqlserver/config.py @@ -26,6 +26,7 @@ def __init__(self, init_config, instance, log): self.autodiscovery_interval: int = instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL) self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include) self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude) + self.proc: str = instance.get('stored_procedure') self.custom_metrics: list[dict] = init_config.get('custom_metrics', []) or [] self.include_index_usage_metrics_tempdb: bool = is_affirmative( diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 13ddf8f1a5590..ff88088818dd9 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -28,12 +28,6 @@ ) from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks -#TODO -# make it a subclass of async but set sync -# remove total amount of columns and put total exec time -# pull out stop logic - submit tables one by one ? and control columns number for payload ? -# I can do a timer but in case of multithreading how to ensure ??? disable ? as kiiled by the - class SubmitData: def __init__(self, submit_data_function, base_event, logger): @@ -109,7 +103,7 @@ class Schemas(DBMAsyncJob): # Requests for infromation about tables are done for a certain amount of tables at the time # This number of tables doesnt slow down performance by much (15% compared to 500 tables) # but allows the queue to be stable. - TABLES_CHUNK_SIZE = 50 + TABLES_CHUNK_SIZE = 500 # Note: in async mode execution time also cannot exceed 2 checks. 
MAX_EXECUTION_TIME = 10 MAX_COLUMNS_PER_EVENT = 100_000 @@ -145,60 +139,60 @@ def __init__(self, check, config): "tags": self._check.non_internal_tags, "cloud_metadata": self._check._config.cloud_metadata, } - self._dataSubmitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) + self._data_submitter = SubmitData(self._check.database_monitoring_metadata, base_event, self._log) def run_job(self): self._collect_schemas_data() def shut_down(self): - self._dataSubmitter.submit() + self._data_submitter.submit() - """Collects database information and schemas and submits to the agent's queue as dictionaries - schema dict - key/value: - "name": str - "id": str - "owner_name": str - "tables" : list of tables dicts - table - key/value: - "id" : str - "name" : str - columns: list of columns dicts - columns - key/value: - "name": str - "data_type": str - "default": str - "nullable": bool - indexes : list of index dicts - index - key/value: - "name": str - "type": str - "is_unique": bool - "is_primary_key": bool - "is_unique_constraint": bool - "is_disabled": bool, - "column_names": str - foreign_keys : list of foreign key dicts - foreign_key - key/value: - "foreign_key_name": str - "referencing_table": str - "referencing_column": str - "referenced_table": str - "referenced_column": str - partitions: partition dict - partition - key/value: - "partition_count": int - """ @tracked_method(agent_check_getter=agent_check_getter) def _collect_schemas_data(self): + """Collects database information and schemas and submits to the agent's queue as dictionaries + schema dict + key/value: + "name": str + "id": str + "owner_name": str + "tables" : list of tables dicts + table + key/value: + "id" : str + "name" : str + columns: list of columns dicts + columns + key/value: + "name": str + "data_type": str + "default": str + "nullable": bool + indexes : list of index dicts + index + key/value: + "name": str + "type": str + "is_unique": bool + "is_primary_key": bool + "is_unique_constraint": bool + "is_disabled": bool, + "column_names": str + foreign_keys : list of foreign key dicts + foreign_key + key/value: + "foreign_key_name": str + "referencing_table": str + "referencing_column": str + "referenced_table": str + "referenced_column": str + partitions: partition dict + partition + key/value: + "partition_count": int + """ start_time = time.thread_time() - self._dataSubmitter.reset() - self._dataSubmitter.set_base_event_data( + self._data_submitter.reset() + self._data_submitter.set_base_event_data( self._check.resolved_hostname, self._check.non_internal_tags, self._check._config.cloud_metadata, @@ -210,7 +204,7 @@ def _collect_schemas_data(self): databases = self._check.get_databases() db_infos = self._query_db_informations(databases) - self._dataSubmitter.store_db_infos(db_infos) + self._data_submitter.store_db_infos(db_infos) @tracked_method(agent_check_getter=agent_check_getter) def fetch_schema_data(cursor, db_name): @@ -230,10 +224,10 @@ def fetch_schema_data(cursor, db_name): columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) - self._dataSubmitter.store(db_name, schema, tables_info, columns_count) - if self._dataSubmitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: - self._dataSubmitter.submit() - self._dataSubmitter.submit() + self._data_submitter.store(db_name, schema, tables_info, columns_count) + if self._data_submitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: + self._data_submitter.submit() + 
self._data_submitter.submit() return False errors = self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) @@ -241,7 +235,7 @@ def fetch_schema_data(cursor, db_name): for e in errors: self._log.error("While executing fetch schemas for databse - %s, the following exception occured - %s", e[0], e[1]) self._log.debug("Finished collect_schemas_data") - self._dataSubmitter.submit() + self._data_submitter.submit() def _query_db_informations(self, db_names): with self._check.connection.open_managed_default_connection(): @@ -249,15 +243,16 @@ def _query_db_informations(self, db_names): db_names_formatted = ",".join(["'{}'".format(t) for t in db_names]) return execute_query_output_result_as_dicts(DB_QUERY.format(db_names_formatted), cursor, convert_results_to_str=True) - """ returns a list of tables for schema with their names and empty column array - list of table dicts - "id": str - "name": str - "columns": [] - """ + @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables(self, schema, cursor): + """ returns a list of tables for schema with their names and empty column array + list of table dicts + "id": str + "name": str + "columns": [] + """ tables_info = execute_query_output_result_as_dicts( TABLES_IN_SCHEMA_QUERY, cursor, convert_results_to_str=True, parameter=schema["id"] ) @@ -265,56 +260,55 @@ def _get_tables(self, schema, cursor): t.setdefault("columns", []) return tables_info - """ returns a list of schema dicts - schema - dict: - "name": str - "id": str - "owner_name": str""" - @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _query_schema_information(self, cursor): + """ returns a list of schema dicts + schema + dict: + "name": str + "id": str + "owner_name": str + """ return execute_query_output_result_as_dicts(SCHEMA_QUERY, cursor, convert_results_to_str=True) - """ returns extracted column numbers and a list of tables - "tables" : list of tables dicts - table - key/value: - "id" : str - "name" : str - columns: list of columns dicts - columns - key/value: - "name": str - "data_type": str - "default": str - "nullable": bool - indexes : list of index dicts - index - key/value: - "name": str - "type": str - "is_unique": bool - "is_primary_key": bool - "is_unique_constraint": bool - "is_disabled": bool, - "column_names": str - foreign_keys : list of foreign key dicts - foreign_key - key/value: - "foreign_key_name": str - "referencing_table": str - "referencing_column": str - "referenced_table": str - "referenced_column": str - partitions: partition dict - partition - key/value: - "partition_count": int - """ - @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables_data(self, table_list, schema, cursor): + """ returns extracted column numbers and a list of tables + "tables" : list of tables dicts + table + key/value: + "id" : str + "name" : str + columns: list of columns dicts + columns + key/value: + "name": str + "data_type": str + "default": str + "nullable": bool + indexes : list of index dicts + index + key/value: + "name": str + "type": str + "is_unique": bool + "is_primary_key": bool + "is_unique_constraint": bool + "is_disabled": bool, + "column_names": str + foreign_keys : list of foreign key dicts + foreign_key + key/value: + "foreign_key_name": str + "referencing_table": str + "referencing_column": str + "referenced_table": str + "referenced_column": str + partitions: partition dict + partition + key/value: + "partition_count": int + """ 
if len(table_list) == 0: return name_to_id = {} From 220e22825ab8e94dde2e52d012c0b5aad2787f70 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 22 May 2024 13:27:14 +0000 Subject: [PATCH 089/132] Added a chnage to dbmasync --- .../datadog_checks/base/utils/db/utils.py | 9 +++++++ .../sqlserver/data/conf.yaml.example | 27 +++++++++++++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py index 2a2d081b9de76..cdf35476d43db 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/utils.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py @@ -365,6 +365,15 @@ def _job_loop(self): def _set_rate_limit(self, rate_limit): if self._rate_limiter.rate_limit_s != rate_limit: self._rate_limiter = ConstantRateLimiter(rate_limit) + + def _run_sync_job_rate_limited(self): + if self._rate_limiter.shell_execute(): + try: + self._run_job_traced() + except: + raise + finally: + self._rate_limiter.update_last_time() def _run_sync_job_rate_limited(self): if self._rate_limiter.shall_execute(): diff --git a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example index 91d9f9ca8df1a..cf8a22d7ab741 100644 --- a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example +++ b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example @@ -361,6 +361,28 @@ instances: # # collection_interval: 10 + ## Configure collection of database schemas + # + ## schemas_collection + + ## @param enabled - boolean - optional - default: true + ## Enable collection of active sessions. Requires `dbm: true`. + # + # enabled: true + + ## @param collection_interval - number - optional - default: 600 + ## Set the database schema collection interval (in seconds). + ## If a non-default value is chosen, then that exact same value must be used for *every* check instance. TODO ? + ## Running different instances with different collection intervals is not supported. + # + # collection_interval: 600 + + ## @param max_execution_time - number - optional - default: 10 + ## Set the maximum time for schema collection (in seconds). + ## Capped by `collection_interval`. + # + # max_execution_time: 10 + ## @param stored_procedure_characters_limit - integer - optional - default: 500 ## Limit the number of characters of the text of a stored procedure that is collected. ## The characters limit is applicable to both query metrics and query samples. @@ -659,11 +681,6 @@ instances: # # ignore_missing_database: false - # @param schemas_collection_interval - int - optional - dafault: 1200 - # Frequency in seconds of schema collections. Defaults to `1200`. - # - # schemas_collection_interval: 1200 - ## @param tags - list of strings - optional ## A list of tags to attach to every metric and service check emitted by this instance. 
## From ea42501e6b04d13aa7449d6d9f06577cbb80d0c5 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 22 May 2024 14:05:27 +0000 Subject: [PATCH 090/132] Update spec --- sqlserver/assets/configuration/spec.yaml | 12 +++--------- .../datadog_checks/sqlserver/data/conf.yaml.example | 2 +- sqlserver/datadog_checks/sqlserver/schemas.py | 9 ++++++--- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 67dd33a0f3f3c..23b4fd5368e34 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -719,10 +719,10 @@ files: options: - name: enabled description: | - Enable schema collection. Requires `dbm: true`. + Enable schema collection. Requires `dbm: true`. Defaults to true. value: type: boolean - example: false + example: true - name: collection_interval description: | Set the database schema collection interval (in seconds). Defaults to 600 seconds. @@ -734,13 +734,7 @@ files: Set the maximum time for schema collection (in seconds). Defaults to 10 seconds. Capped by `collection_interval` value: type: number - example: 10 - - name: run_sync - description: | - Configures if schema collection is ran on the main thread. - value: - type: boolean - example: false + example: 10 - template: instances/default - template: logs example: diff --git a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example index cf8a22d7ab741..346e3b6174cac 100644 --- a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example +++ b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example @@ -680,7 +680,7 @@ instances: ## If the DB specified doesn't exist on the server then don't do the check # # ignore_missing_database: false - + ## @param tags - list of strings - optional ## A list of tags to attach to every metric and service check emitted by this instance. ## diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index ff88088818dd9..bc6bfe2fc83be 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -97,7 +97,7 @@ def submit(self): def agent_check_getter(self): return self._check - +import pdb class Schemas(DBMAsyncJob): # Requests for infromation about tables are done for a certain amount of tables at the time @@ -116,11 +116,14 @@ def __init__(self, check, config): collection_interval = config.schema_config.get( 'collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) + pdb.set_trace() self._max_execution_time = min(config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME), collection_interval) + e = is_affirmative(config.schema_config.get('enabled', True)) + print(e) super(Schemas, self).__init__( check, - run_sync=is_affirmative(config.schema_config.get('run_sync', True)), - enabled=is_affirmative(config.schema_config.get('enabled', False)), + run_sync=True, + enabled=is_affirmative(config.schema_config.get('enabled', True)), expected_db_exceptions=(), # min collection interval is a desired collection interval for a check as a whole. 
min_collection_interval=config.min_collection_interval, From 84efb894c3bd579b6275991168e733fc6679d506 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 22 May 2024 14:10:36 +0000 Subject: [PATCH 091/132] Removed pdb --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index bc6bfe2fc83be..e2f137e1f0263 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -97,7 +97,6 @@ def submit(self): def agent_check_getter(self): return self._check -import pdb class Schemas(DBMAsyncJob): # Requests for infromation about tables are done for a certain amount of tables at the time @@ -116,7 +115,6 @@ def __init__(self, check, config): collection_interval = config.schema_config.get( 'collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) - pdb.set_trace() self._max_execution_time = min(config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME), collection_interval) e = is_affirmative(config.schema_config.get('enabled', True)) print(e) From cd1fbbd04e64858090abcdcb3f8fed5ca64ed8e2 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 22 May 2024 14:11:45 +0000 Subject: [PATCH 092/132] put back driver --- sqlserver/tests/odbc/odbcinst.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/tests/odbc/odbcinst.ini b/sqlserver/tests/odbc/odbcinst.ini index 58163f2833d9e..75ffdd4b4d72d 100644 --- a/sqlserver/tests/odbc/odbcinst.ini +++ b/sqlserver/tests/odbc/odbcinst.ini @@ -6,5 +6,5 @@ Driver=/usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so [ODBC Driver 18 for SQL Server] Description=Microsoft ODBC Driver 18 for SQL Server -Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.2.1 +Driver=/opt/microsoft/msodbcsql18/lib64/libmsodbcsql-18.3.so.3.1 UsageCount=1 From 92776765f4656b6b0e32cdc8f947a08731dc26ae Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 23 May 2024 08:32:46 +0000 Subject: [PATCH 093/132] fixed changelogs --- datadog_checks_base/changelog.d/17258.added | 3 +-- sqlserver/changelog.d/17258.added | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 sqlserver/changelog.d/17258.added diff --git a/datadog_checks_base/changelog.d/17258.added b/datadog_checks_base/changelog.d/17258.added index 389ea6b571c4a..ac15210ed68ba 100644 --- a/datadog_checks_base/changelog.d/17258.added +++ b/datadog_checks_base/changelog.d/17258.added @@ -1,3 +1,2 @@ -Added schema collection to the SQL Server integration. +Adding schema collection to sqlserver Schema data includes information about the tables, their columns, indexes, foreign keys, and partitions. -The total amount of fetched columns is limited to 100,000. diff --git a/sqlserver/changelog.d/17258.added b/sqlserver/changelog.d/17258.added new file mode 100644 index 0000000000000..ac15210ed68ba --- /dev/null +++ b/sqlserver/changelog.d/17258.added @@ -0,0 +1,2 @@ +Adding schema collection to sqlserver +Schema data includes information about the tables, their columns, indexes, foreign keys, and partitions. 
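As context for the changelog entry above: the collected schema data is emitted as a "sqlserver_databases" metadata event. The sketch below is assembled from the base event fields and the expected payloads used in the unit tests later in this series; it is illustrative only, and every concrete value is a placeholder rather than output from a real run.

# Rough shape of one "sqlserver_databases" event (illustrative placeholders only).
example_schema_event = {
    "host": "my-sqlserver-host",                  # placeholder hostname
    "agent_version": "7.x.x",                     # placeholder
    "dbms": "sqlserver",
    "kind": "sqlserver_databases",
    "collection_interval": 600,
    "dbms_version": "some-version",               # placeholder
    "tags": ["env:test"],                         # placeholder
    "cloud_metadata": {},                         # placeholder
    "timestamp": 1716390000000.0,                 # placeholder, epoch in milliseconds
    "metadata": [
        {
            "id": "5",                            # placeholder database id
            "name": "datadog_test",               # placeholder database name
            "collation": "SQL_Latin1_General_CP1_CI_AS",
            "owner": "dbo",
            "schemas": [
                {
                    "id": "1",
                    "name": "dbo",
                    "owner_name": "dbo",
                    "tables": [],                 # table dicts with columns, indexes,
                                                  # foreign_keys and partitions
                }
            ],
        }
    ],
}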
From 5e298186b78347f333d7d219213dee3dc3d27df5 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 23 May 2024 08:44:33 +0000 Subject: [PATCH 094/132] applied linter --- sqlserver/datadog_checks/sqlserver/schemas.py | 121 +++++++++--------- .../datadog_checks/sqlserver/sqlserver.py | 5 +- sqlserver/datadog_checks/sqlserver/utils.py | 6 +- sqlserver/tests/test_metadata.py | 2 +- sqlserver/tests/test_unit.py | 98 +++++--------- 5 files changed, 103 insertions(+), 129 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index e2f137e1f0263..5458f4cec4e0c 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -8,15 +8,12 @@ import time from datadog_checks.base import is_affirmative -from datadog_checks.base.utils.db.utils import ( - default_json_event_encoding, - DBMAsyncJob -) - +from datadog_checks.base.utils.db.utils import DBMAsyncJob, default_json_event_encoding from datadog_checks.base.utils.tracking import tracked_method from datadog_checks.sqlserver.const import ( COLUMN_QUERY, DB_QUERY, + DEFAULT_SCHEMAS_COLLECTION_INTERVAL, FOREIGN_KEY_QUERY, INDEX_QUERY, PARTITIONS_QUERY, @@ -24,10 +21,10 @@ STATIC_INFO_ENGINE_EDITION, STATIC_INFO_VERSION, TABLES_IN_SCHEMA_QUERY, - DEFAULT_SCHEMAS_COLLECTION_INTERVAL ) from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks + class SubmitData: def __init__(self, submit_data_function, base_event, logger): @@ -66,7 +63,7 @@ def store(self, db_name, schema, tables, columns_count): def columns_since_last_submit(self): return self._columns_count - + def truncate(self, json_event): max_length = 1000 if len(json_event) > max_length: @@ -74,7 +71,7 @@ def truncate(self, json_event): else: return json_event - #NOTE: DB with no schemas is never submitted + # NOTE: DB with no schemas is never submitted def submit(self): if not self.db_to_schemas: return @@ -97,6 +94,7 @@ def submit(self): def agent_check_getter(self): return self._check + class Schemas(DBMAsyncJob): # Requests for infromation about tables are done for a certain amount of tables at the time @@ -112,10 +110,10 @@ def __init__(self, check, config): self._log = check.log self.schemas_per_db = {} self._last_schemas_collect_time = None - collection_interval = config.schema_config.get( - 'collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL + collection_interval = config.schema_config.get('collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL) + self._max_execution_time = min( + config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME), collection_interval ) - self._max_execution_time = min(config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME), collection_interval) e = is_affirmative(config.schema_config.get('enabled', True)) print(e) super(Schemas, self).__init__( @@ -135,7 +133,7 @@ def __init__(self, check, config): "agent_version": datadog_agent.get_version(), "dbms": "sqlserver", "kind": "sqlserver_databases", - "collection_interval": collection_interval, + "collection_interval": collection_interval, "dbms_version": None, "tags": self._check.non_internal_tags, "cloud_metadata": self._check._config.cloud_metadata, @@ -214,10 +212,11 @@ def fetch_schema_data(cursor, db_name): tables = self._get_tables(schema, cursor) tables_chunks = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) for tables_chunk in tables_chunks: - if time.thread_time() - start_time > self.MAX_EXECUTION_TIME: + if 
time.thread_time() - start_time > self.MAX_EXECUTION_TIME: # TODO Report truncation to the backend self._log.warning( - "Truncated data due to the effective execution time reaching {}, stopped on db - {} on schema {}".format( + """Truncated data due to the effective execution time reaching {}, + stopped on db - {} on schema {}""".format( self.MAX_EXECUTION_TIME, db_name, schema["name"] ) ) @@ -234,7 +233,9 @@ def fetch_schema_data(cursor, db_name): errors = self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) if errors: for e in errors: - self._log.error("While executing fetch schemas for databse - %s, the following exception occured - %s", e[0], e[1]) + self._log.error( + "While executing fetch schemas for databse - %s, the following exception occured - %s", e[0], e[1] + ) self._log.debug("Finished collect_schemas_data") self._data_submitter.submit() @@ -242,13 +243,13 @@ def _query_db_informations(self, db_names): with self._check.connection.open_managed_default_connection(): with self._check.connection.get_managed_cursor() as cursor: db_names_formatted = ",".join(["'{}'".format(t) for t in db_names]) - return execute_query_output_result_as_dicts(DB_QUERY.format(db_names_formatted), cursor, convert_results_to_str=True) - - + return execute_query_output_result_as_dicts( + DB_QUERY.format(db_names_formatted), cursor, convert_results_to_str=True + ) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables(self, schema, cursor): - """ returns a list of tables for schema with their names and empty column array + """returns a list of tables for schema with their names and empty column array list of table dicts "id": str "name": str @@ -263,52 +264,52 @@ def _get_tables(self, schema, cursor): @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _query_schema_information(self, cursor): - """ returns a list of schema dicts - schema - dict: - "name": str - "id": str - "owner_name": str + """returns a list of schema dicts + schema + dict: + "name": str + "id": str + "owner_name": str """ return execute_query_output_result_as_dicts(SCHEMA_QUERY, cursor, convert_results_to_str=True) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables_data(self, table_list, schema, cursor): - """ returns extracted column numbers and a list of tables - "tables" : list of tables dicts - table - key/value: - "id" : str - "name" : str - columns: list of columns dicts - columns - key/value: - "name": str - "data_type": str - "default": str - "nullable": bool - indexes : list of index dicts - index - key/value: - "name": str - "type": str - "is_unique": bool - "is_primary_key": bool - "is_unique_constraint": bool - "is_disabled": bool, - "column_names": str - foreign_keys : list of foreign key dicts - foreign_key - key/value: - "foreign_key_name": str - "referencing_table": str - "referencing_column": str - "referenced_table": str - "referenced_column": str - partitions: partition dict - partition - key/value: - "partition_count": int + """returns extracted column numbers and a list of tables + "tables" : list of tables dicts + table + key/value: + "id" : str + "name" : str + columns: list of columns dicts + columns + key/value: + "name": str + "data_type": str + "default": str + "nullable": bool + indexes : list of index dicts + index + key/value: + "name": str + "type": str + "is_unique": bool + "is_primary_key": bool + "is_unique_constraint": bool + "is_disabled": bool, + 
"column_names": str + foreign_keys : list of foreign key dicts + foreign_key + key/value: + "foreign_key_name": str + "referencing_table": str + "referencing_column": str + "referenced_table": str + "referenced_column": str + partitions: partition dict + partition + key/value: + "partition_count": int """ if len(table_list) == 0: return diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index c4b17c45556e5..56c41c9e4519e 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -102,6 +102,7 @@ set_default_driver_conf() + class SQLServer(AgentCheck): __NAMESPACE__ = "sqlserver" @@ -746,9 +747,9 @@ def do_for_databases(self, action, databases): if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(db)) action(cursor, db) - except StopIteration as e: + except StopIteration: exceptions.append((db, "StopIteration")) - return exceptions + return exceptions except Exception as e: exceptions.append((db, e)) self.log.error("An exception occurred during do_for_databases in db - %s: %s", db, e) diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index b65799c49366a..b816b6a8cea8b 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -141,9 +141,9 @@ def is_azure_sql_database(engine_edition): def execute_query_output_result_as_dicts(query, cursor, convert_results_to_str=False, parameter=None): if parameter is not None: - cursor.execute(query,(parameter,)) - else: - cursor.execute(query) + cursor.execute(query, (parameter,)) + else: + cursor.execute(query) columns = [str(column[0]).lower() for column in cursor.description] rows = [] if convert_results_to_str: diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index cccb5b7766b49..2a42e1ca40e21 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -326,7 +326,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): dbm_instance['database_autodiscovery'] = True dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test'] dbm_instance['dbm'] = True - dbm_instance['schemas_collection'] = {"enabled" : True} + dbm_instance['schemas_collection'] = {"enabled": True} check = SQLServer(CHECK_NAME, {}, [dbm_instance]) dd_run_check(check) diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index 45f01d79c4595..004a3b81739e4 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -2,18 +2,20 @@ # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) import copy +import json import os import re from collections import namedtuple import mock import pytest -import json +from deepdiff import DeepDiff from datadog_checks.dev import EnvVars from datadog_checks.sqlserver import SQLServer from datadog_checks.sqlserver.connection import split_sqlserver_host_port from datadog_checks.sqlserver.metrics import SqlFractionMetric, SqlMasterDatabaseFileStats +from datadog_checks.sqlserver.schemas import SubmitData from datadog_checks.sqlserver.sqlserver import SQLConnectionError from datadog_checks.sqlserver.utils import ( Database, @@ -22,11 +24,6 @@ set_default_driver_conf, ) -from datadog_checks.sqlserver.schemas import SubmitData -from deepdiff import DeepDiff -from datadog_checks.base.utils.db.utils import default_json_event_encoding - -import pdb from 
.common import CHECK_NAME, DOCKER_SERVER, assert_metrics from .utils import windows_ci @@ -746,9 +743,11 @@ def test_extract_sql_comments_and_procedure_name(query, expected_comments, is_pr class DummyLogger: def debug(*args): pass + def error(*args): pass - + + def set_up_submitter_unit_test(): submitted_data = [] base_event = { @@ -761,72 +760,45 @@ def set_up_submitter_unit_test(): "tags": "some", "cloud_metadata": "some", } + def submitData(data): - submitted_data.append(data) - + submitted_data.append(data) + dataSubmitter = SubmitData(submitData, base_event, DummyLogger()) - return dataSubmitter, submitted_data + return dataSubmitter, submitted_data + def test_submit_data(): dataSubmitter, submitted_data = set_up_submitter_unit_test() - dataSubmitter.store_db_infos([{"id": 3, "name" : "test_db1"},{"id": 4, "name" : "test_db2"}]) - schema1 = {"id" : "1"} - schema2 = {"id" : "2"} - schema3 = {"id" : "3"} + dataSubmitter.store_db_infos([{"id": 3, "name": "test_db1"}, {"id": 4, "name": "test_db2"}]) + schema1 = {"id": "1"} + schema2 = {"id": "2"} + schema3 = {"id": "3"} + + dataSubmitter.store("test_db1", schema1, [1, 2], 5) + dataSubmitter.store("test_db2", schema3, [1, 2], 5) + dataSubmitter.store("test_db1", schema2, [1, 2], 10) - dataSubmitter.store("test_db1", schema1, [1,2], 5) - dataSubmitter.store("test_db2", schema3, [1,2], 5) - dataSubmitter.store("test_db1", schema2, [1,2], 10) - dataSubmitter.submit() expected_data = { - "host":"some", - "agent_version":0, - "dbms":"sqlserver", - "kind":"sqlserver_databases", - "collection_interval":1200, - "dbms_version":"some", - "tags":"some", - "cloud_metadata":"some", - "metadata":[ - { - "id":3, - "name":"test_db1", - "schemas":[ - { - "id":"1", - "tables":[ - 1, - 2 - ] - }, - { - "id":"2", - "tables":[ - 1, - 2 - ] - } - ] - }, - { - "id":4, - "name":"test_db2", - "schemas":[ - { - "id":"3", - "tables":[ - 1, - 2 - ] - } - ] - }, - ], - "timestamp":1.1 + "host": "some", + "agent_version": 0, + "dbms": "sqlserver", + "kind": "sqlserver_databases", + "collection_interval": 1200, + "dbms_version": "some", + "tags": "some", + "cloud_metadata": "some", + "metadata": [ + {"id": 3, "name": "test_db1", "schemas": [{"id": "1", "tables": [1, 2]}, {"id": "2", "tables": [1, 2]}]}, + {"id": 4, "name": "test_db2", "schemas": [{"id": "3", "tables": [1, 2]}]}, + ], + "timestamp": 1.1, } - difference = DeepDiff(json.loads(submitted_data[0]),expected_data , exclude_paths="root['timestamp']", ignore_order=True) + difference = DeepDiff( + json.loads(submitted_data[0]), expected_data, exclude_paths="root['timestamp']", ignore_order=True + ) assert len(difference) == 0 From 8b98973af5acd4961f6d2fa45042de9ba8c99ae3 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 3 Jun 2024 19:19:21 +0000 Subject: [PATCH 095/132] minor improvments --- .../datadog_checks/base/utils/db/utils.py | 2 +- sqlserver/datadog_checks/sqlserver/schemas.py | 56 +++++++++---------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py index cdf35476d43db..8289e8a3ec4c6 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/utils.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py @@ -367,7 +367,7 @@ def _set_rate_limit(self, rate_limit): self._rate_limiter = ConstantRateLimiter(rate_limit) def _run_sync_job_rate_limited(self): - if self._rate_limiter.shell_execute(): + if self._rate_limiter.shall_execute(): try: self._run_job_traced() 
except: diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 5458f4cec4e0c..11386812d715f 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -125,7 +125,7 @@ def __init__(self, check, config): min_collection_interval=config.min_collection_interval, dbms="sqlserver", rate_limit=1 / float(collection_interval), - job_name="query-schemas", + job_name="schemas", shutdown_callback=self.shut_down, ) base_event = { @@ -146,6 +146,31 @@ def run_job(self): def shut_down(self): self._data_submitter.submit() + @tracked_method(agent_check_getter=agent_check_getter) + def __fetch_schema_data(self, cursor, db_name): + start_time = time.time() + schemas = self._query_schema_information(cursor) + for schema in schemas: + tables = self._get_tables(schema, cursor) + tables_chunks = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) + for tables_chunk in tables_chunks: + schema_collection_elapsed_time = time.time() - start_time + if schema_collection_elapsed_time > self.MAX_EXECUTION_TIME: + # TODO Report truncation to the backend + self._log.warning( + """Truncated data due to the effective execution time reaching {}, + stopped on db - {} on schema {}""".format( + self.MAX_EXECUTION_TIME, db_name, schema["name"] + ) + ) + raise StopIteration("Schema collections took {} which is longer than allowed limit {}".format(schema_collection_elapsed_time, self.MAX_EXECUTION_TIME)) + columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) + self._data_submitter.store(db_name, schema, tables_info, columns_count) + if self._data_submitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: + self._data_submitter.submit() + self._data_submitter.submit() + return False + @tracked_method(agent_check_getter=agent_check_getter) def _collect_schemas_data(self): """Collects database information and schemas and submits to the agent's queue as dictionaries @@ -189,7 +214,6 @@ def _collect_schemas_data(self): key/value: "partition_count": int """ - start_time = time.thread_time() self._data_submitter.reset() self._data_submitter.set_base_event_data( self._check.resolved_hostname, @@ -204,33 +228,7 @@ def _collect_schemas_data(self): databases = self._check.get_databases() db_infos = self._query_db_informations(databases) self._data_submitter.store_db_infos(db_infos) - - @tracked_method(agent_check_getter=agent_check_getter) - def fetch_schema_data(cursor, db_name): - schemas = self._query_schema_information(cursor) - for schema in schemas: - tables = self._get_tables(schema, cursor) - tables_chunks = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) - for tables_chunk in tables_chunks: - if time.thread_time() - start_time > self.MAX_EXECUTION_TIME: - # TODO Report truncation to the backend - self._log.warning( - """Truncated data due to the effective execution time reaching {}, - stopped on db - {} on schema {}""".format( - self.MAX_EXECUTION_TIME, db_name, schema["name"] - ) - ) - raise StopIteration - - columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) - - self._data_submitter.store(db_name, schema, tables_info, columns_count) - if self._data_submitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: - self._data_submitter.submit() - self._data_submitter.submit() - return False - - errors = self._check.do_for_databases(fetch_schema_data, self._check.get_databases()) + errors = self._check.do_for_databases(self.__fetch_schema_data, 
self._check.get_databases()) if errors: for e in errors: self._log.error( From 82f2b78ea25639298d01526b8b2e4f15df11205b Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 3 Jun 2024 20:09:39 +0000 Subject: [PATCH 096/132] fixed typo --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 11386812d715f..8be10c2865870 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -163,7 +163,7 @@ def __fetch_schema_data(self, cursor, db_name): self.MAX_EXECUTION_TIME, db_name, schema["name"] ) ) - raise StopIteration("Schema collections took {} which is longer than allowed limit {}".format(schema_collection_elapsed_time, self.MAX_EXECUTION_TIME)) + raise StopIteration("Schema collection took {} which is longer than allowed limit {}".format(schema_collection_elapsed_time, self.MAX_EXECUTION_TIME)) columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) self._data_submitter.store(db_name, schema, tables_info, columns_count) if self._data_submitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: From 653135a3060395ee34fef39f252952ef00cc0dea Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Mon, 3 Jun 2024 22:38:57 +0000 Subject: [PATCH 097/132] removed base change --- datadog_checks_base/changelog.d/17258.added | 2 -- .../datadog_checks/base/utils/db/utils.py | 32 +++---------------- 2 files changed, 5 insertions(+), 29 deletions(-) delete mode 100644 datadog_checks_base/changelog.d/17258.added diff --git a/datadog_checks_base/changelog.d/17258.added b/datadog_checks_base/changelog.d/17258.added deleted file mode 100644 index ac15210ed68ba..0000000000000 --- a/datadog_checks_base/changelog.d/17258.added +++ /dev/null @@ -1,2 +0,0 @@ -Adding schema collection to sqlserver -Schema data includes information about the tables, their columns, indexes, foreign keys, and partitions. diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py index 8289e8a3ec4c6..56f4a388b8368 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/utils.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py @@ -117,20 +117,13 @@ def __init__(self, rate_limit_s): self.period_s = 1.0 / self.rate_limit_s if self.rate_limit_s > 0 else 0 self.last_event = 0 - def update_last_time_and_sleep(self): + def sleep(self): """ Sleeps long enough to enforce the rate limit """ elapsed_s = time.time() - self.last_event sleep_amount = max(self.period_s - elapsed_s, 0) time.sleep(sleep_amount) - self.update_last_time() - - def shall_execute(self): - elapsed_s = time.time() - self.last_event - return elapsed_s >= self.period_s - - def update_last_time(self): self.last_event = time.time() @@ -301,7 +294,7 @@ def run_job_loop(self, tags): self._last_check_run = time.time() if self._run_sync or is_affirmative(os.environ.get('DBM_THREADED_JOB_RUN_SYNC', "false")): self._log.debug("Running threaded job synchronously. 
job=%s", self._job_name) - self._run_sync_job_rate_limited() + self._run_job_rate_limited() elif self._job_loop_future is None or not self._job_loop_future.running(): self._job_loop_future = DBMAsyncJob.executor.submit(self._job_loop) else: @@ -365,15 +358,6 @@ def _job_loop(self): def _set_rate_limit(self, rate_limit): if self._rate_limiter.rate_limit_s != rate_limit: self._rate_limiter = ConstantRateLimiter(rate_limit) - - def _run_sync_job_rate_limited(self): - if self._rate_limiter.shall_execute(): - try: - self._run_job_traced() - except: - raise - finally: - self._rate_limiter.update_last_time() def _run_sync_job_rate_limited(self): if self._rate_limiter.shall_execute(): @@ -385,15 +369,9 @@ def _run_sync_job_rate_limited(self): self._rate_limiter.update_last_time() def _run_job_rate_limited(self): - try: - self._run_job_traced() - except: - raise - finally: - if not self._cancel_event.isSet(): - self._rate_limiter.update_last_time_and_sleep() - else: - self._rate_limiter.update_last_time() + self._run_job_traced() + if not self._cancel_event.isSet(): + self._rate_limiter.sleep() @_traced_dbm_async_job_method def _run_job_traced(self): From cb87df5246eb069c67e593f3f4c01cd60d5f9e27 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 5 Jun 2024 14:20:35 +0000 Subject: [PATCH 098/132] Moved do for db in schemas --- sqlserver/datadog_checks/sqlserver/schemas.py | 43 +++++++++++++------ .../datadog_checks/sqlserver/sqlserver.py | 20 --------- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 8be10c2865870..5275164685d1e 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -21,8 +21,9 @@ STATIC_INFO_ENGINE_EDITION, STATIC_INFO_VERSION, TABLES_IN_SCHEMA_QUERY, + SWITCH_DB_STATEMENT, ) -from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks +from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks, is_azure_sql_database class SubmitData: @@ -44,8 +45,8 @@ def set_base_event_data(self, hostname, tags, cloud_metadata, dbms_version): def reset(self): self._columns_count = 0 - self.db_to_schemas = {} - self.db_info = {} + self.db_to_schemas.clear() + self.db_info.clear() def store_db_infos(self, db_infos): for db_info in db_infos: @@ -56,9 +57,9 @@ def store(self, db_name, schema, tables, columns_count): schemas = self.db_to_schemas.setdefault(db_name, {}) if schema["id"] in schemas: known_tables = schemas[schema["id"]].setdefault("tables", []) - known_tables = known_tables + tables + known_tables = known_tables.extend(tables) else: - schemas[schema["id"]] = copy.deepcopy(schema) + schemas[schema["id"]] = schema schemas[schema["id"]]["tables"] = tables def columns_since_last_submit(self): @@ -88,7 +89,7 @@ def submit(self): json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting the following payload for schema collection: {}".format(self.truncate(json_event))) self._submit_to_agent_queue(json_event) - self.db_to_schemas = {} + self.db_to_schemas.clear() def agent_check_getter(self): @@ -147,7 +148,7 @@ def shut_down(self): self._data_submitter.submit() @tracked_method(agent_check_getter=agent_check_getter) - def __fetch_schema_data(self, cursor, db_name): + def _fetch_schema_data(self, cursor, db_name): start_time = time.time() schemas = self._query_schema_information(cursor) for schema in schemas: @@ 
-169,7 +170,26 @@ def __fetch_schema_data(self, cursor, db_name): if self._data_submitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: self._data_submitter.submit() self._data_submitter.submit() - return False + return False + + def _fetch_for_databases(self): + databases = self._check.get_databases() + engine_edition = self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) + with self._check.connection.open_managed_default_connection(): + with self._check.connection.get_managed_cursor() as cursor: + for db_name in databases: + try: + if not is_azure_sql_database(engine_edition): + cursor.execute(SWITCH_DB_STATEMENT.format(db_name)) + self._fetch_schema_data(cursor, db_name) + except StopIteration: + self._log.error("While executing fetch schemas for databse - %s, the following exception occured - %s", db_name, e) + return + except Exception as e: + self._log.error("While executing fetch schemas for databse - %s, the following exception occured - %s", db_name, e) + # Switch DB back to MASTER + if not is_azure_sql_database(engine_edition): + cursor.execute(SWITCH_DB_STATEMENT.format(self._check.connection.DEFAULT_DATABASE)) @tracked_method(agent_check_getter=agent_check_getter) def _collect_schemas_data(self): @@ -228,12 +248,7 @@ def _collect_schemas_data(self): databases = self._check.get_databases() db_infos = self._query_db_informations(databases) self._data_submitter.store_db_infos(db_infos) - errors = self._check.do_for_databases(self.__fetch_schema_data, self._check.get_databases()) - if errors: - for e in errors: - self._log.error( - "While executing fetch schemas for databse - %s, the following exception occured - %s", e[0], e[1] - ) + self._fetch_for_databases() self._log.debug("Finished collect_schemas_data") self._data_submitter.submit() diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 56c41c9e4519e..37af0a80f1b4a 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -737,26 +737,6 @@ def get_databases(self): db_names = [self.instance.get('database', self.connection.DEFAULT_DATABASE)] return db_names - def do_for_databases(self, action, databases): - exceptions = [] - engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) - with self.connection.open_managed_default_connection(): - with self.connection.get_managed_cursor() as cursor: - for db in databases: - try: - if not is_azure_sql_database(engine_edition): - cursor.execute(SWITCH_DB_STATEMENT.format(db)) - action(cursor, db) - except StopIteration: - exceptions.append((db, "StopIteration")) - return exceptions - except Exception as e: - exceptions.append((db, e)) - self.log.error("An exception occurred during do_for_databases in db - %s: %s", db, e) - # Switch DB back to MASTER - if not is_azure_sql_database(engine_edition): - cursor.execute(SWITCH_DB_STATEMENT.format(self.connection.DEFAULT_DATABASE)) - def _check_database_conns(self): engine_edition = self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) if is_azure_sql_database(engine_edition): From f9025a48a56abb0bf1f5c346bacf68a5eec0a499 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 5 Jun 2024 14:29:19 +0000 Subject: [PATCH 099/132] Improved const --- sqlserver/datadog_checks/sqlserver/queries.py | 66 +++++++++++++++++++ sqlserver/datadog_checks/sqlserver/schemas.py | 17 +++-- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/queries.py 
b/sqlserver/datadog_checks/sqlserver/queries.py index 9f41eb09ccde9..15576673f6867 100644 --- a/sqlserver/datadog_checks/sqlserver/queries.py +++ b/sqlserver/datadog_checks/sqlserver/queries.py @@ -143,6 +143,72 @@ ], } +DB_QUERY = """ +SELECT + db.database_id AS id, db.name AS name, db.collation_name AS collation, dp.name AS owner +FROM + sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid +WHERE db.name IN ({}); +""" + +SCHEMA_QUERY = """ +SELECT + s.name AS name, s.schema_id AS id, dp.name AS owner_name +FROM + sys.schemas AS s JOIN sys.database_principals dp ON s.principal_id = dp.principal_id +WHERE s.name NOT IN ('sys', 'information_schema') +""" + +TABLES_IN_SCHEMA_QUERY = """ +SELECT + object_id AS id, name +FROM + sys.tables +WHERE schema_id=? +""" + +COLUMN_QUERY = """ +SELECT + column_name AS name, data_type, column_default, is_nullable AS nullable , table_name, ordinal_position +FROM + information_schema.columns +WHERE + table_name IN ({}) and table_schema='{}'; +""" + +PARTITIONS_QUERY = """ +SELECT + object_id AS id, COUNT(*) AS partition_count +FROM + sys.partitions +WHERE + object_id IN ({}) GROUP BY object_id; +""" + +INDEX_QUERY = """ +SELECT + i.object_id AS id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, + i.is_disabled, STRING_AGG(c.name, ',') AS column_names +FROM + sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id + AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id +WHERE + i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, + i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled; +""" + +FOREIGN_KEY_QUERY = """ +SELECT + FK.referenced_object_id AS id, FK.name AS foreign_key_name, + OBJECT_NAME(FK.parent_object_id) AS referencing_table, + STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, + OBJECT_NAME(FK.referenced_object_id) AS referenced_table, + STRING_AGG(COL_NAME(FKC.referenced_object_id, FKC.referenced_column_id),',') AS referenced_column +FROM + sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id +WHERE + FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id; +""" def get_query_ao_availability_groups(sqlserver_major_version): """ diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 5275164685d1e..f969d08797376 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -3,7 +3,6 @@ except ImportError: from ..stubs import datadog_agent -import copy import json import time @@ -11,18 +10,22 @@ from datadog_checks.base.utils.db.utils import DBMAsyncJob, default_json_event_encoding from datadog_checks.base.utils.tracking import tracked_method from datadog_checks.sqlserver.const import ( + DEFAULT_SCHEMAS_COLLECTION_INTERVAL, + STATIC_INFO_ENGINE_EDITION, + STATIC_INFO_VERSION, + SWITCH_DB_STATEMENT, +) +from datadog_checks.sqlserver.queries import ( COLUMN_QUERY, DB_QUERY, - DEFAULT_SCHEMAS_COLLECTION_INTERVAL, FOREIGN_KEY_QUERY, INDEX_QUERY, PARTITIONS_QUERY, SCHEMA_QUERY, - STATIC_INFO_ENGINE_EDITION, - STATIC_INFO_VERSION, TABLES_IN_SCHEMA_QUERY, - SWITCH_DB_STATEMENT, -) +) + + from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks, is_azure_sql_database @@ -164,7 +167,7 @@ def _fetch_schema_data(self, cursor, db_name): 
self.MAX_EXECUTION_TIME, db_name, schema["name"] ) ) - raise StopIteration("Schema collection took {} which is longer than allowed limit {}".format(schema_collection_elapsed_time, self.MAX_EXECUTION_TIME)) + raise StopIteration("Schema collection took {}s which is longer than allowed limit of {}s".format(schema_collection_elapsed_time, self.MAX_EXECUTION_TIME)) columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) self._data_submitter.store(db_name, schema, tables_info, columns_count) if self._data_submitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: From 6b0ab439f2ed14fe6a1d1b011f0e3da9a48fde97 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 5 Jun 2024 15:26:10 +0000 Subject: [PATCH 100/132] Applied linter --- sqlserver/assets/configuration/spec.yaml | 2 +- sqlserver/datadog_checks/sqlserver/queries.py | 1 + sqlserver/datadog_checks/sqlserver/schemas.py | 32 ++++++++++++------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 23b4fd5368e34..4cbb5088b166a 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -731,7 +731,7 @@ files: example: 600 - name: max_execution_time description: | - Set the maximum time for schema collection (in seconds). Defaults to 10 seconds. Capped by `collection_interval` + Set the maximum time for schema collection (in seconds). Defaults to 10 seconds. Capped by `schemas_collection.collection_interval` value: type: number example: 10 diff --git a/sqlserver/datadog_checks/sqlserver/queries.py b/sqlserver/datadog_checks/sqlserver/queries.py index 15576673f6867..f88d3f7231394 100644 --- a/sqlserver/datadog_checks/sqlserver/queries.py +++ b/sqlserver/datadog_checks/sqlserver/queries.py @@ -210,6 +210,7 @@ FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id; """ + def get_query_ao_availability_groups(sqlserver_major_version): """ Construct the sys.availability_groups QueryExecutor configuration based on the SQL Server major version diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index f969d08797376..75e81b9ba526a 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -23,9 +23,7 @@ PARTITIONS_QUERY, SCHEMA_QUERY, TABLES_IN_SCHEMA_QUERY, -) - - +) from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks, is_azure_sql_database @@ -159,7 +157,7 @@ def _fetch_schema_data(self, cursor, db_name): tables_chunks = list(get_list_chunks(tables, self.TABLES_CHUNK_SIZE)) for tables_chunk in tables_chunks: schema_collection_elapsed_time = time.time() - start_time - if schema_collection_elapsed_time > self.MAX_EXECUTION_TIME: + if schema_collection_elapsed_time > self._max_execution_time: # TODO Report truncation to the backend self._log.warning( """Truncated data due to the effective execution time reaching {}, @@ -167,16 +165,20 @@ def _fetch_schema_data(self, cursor, db_name): self.MAX_EXECUTION_TIME, db_name, schema["name"] ) ) - raise StopIteration("Schema collection took {}s which is longer than allowed limit of {}s".format(schema_collection_elapsed_time, self.MAX_EXECUTION_TIME)) + raise StopIteration( + "Schema collection took {}s which is longer than allowed limit of {}s".format( + schema_collection_elapsed_time, self.MAX_EXECUTION_TIME + ) + ) columns_count, tables_info = 
self._get_tables_data(tables_chunk, schema, cursor) self._data_submitter.store(db_name, schema, tables_info, columns_count) if self._data_submitter.columns_since_last_submit() > self.MAX_COLUMNS_PER_EVENT: self._data_submitter.submit() self._data_submitter.submit() return False - + def _fetch_for_databases(self): - databases = self._check.get_databases() + databases = self._check.get_databases() engine_edition = self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) with self._check.connection.open_managed_default_connection(): with self._check.connection.get_managed_cursor() as cursor: @@ -185,15 +187,23 @@ def _fetch_for_databases(self): if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(db_name)) self._fetch_schema_data(cursor, db_name) - except StopIteration: - self._log.error("While executing fetch schemas for databse - %s, the following exception occured - %s", db_name, e) + except StopIteration as e: + self._log.error( + "While executing fetch schemas for databse - %s, the following exception occured - %s", + db_name, + e, + ) return except Exception as e: - self._log.error("While executing fetch schemas for databse - %s, the following exception occured - %s", db_name, e) + self._log.error( + "While executing fetch schemas for databse - %s, the following exception occured - %s", + db_name, + e, + ) # Switch DB back to MASTER if not is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(self._check.connection.DEFAULT_DATABASE)) - + @tracked_method(agent_check_getter=agent_check_getter) def _collect_schemas_data(self): """Collects database information and schemas and submits to the agent's queue as dictionaries From 573554cc7be1f1b7ea579ff3d03db5714f76ab65 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 5 Jun 2024 16:26:45 +0000 Subject: [PATCH 101/132] Improved specs --- sqlserver/assets/configuration/spec.yaml | 2 +- sqlserver/datadog_checks/sqlserver/const.py | 68 ------------------- sqlserver/datadog_checks/sqlserver/schemas.py | 4 +- 3 files changed, 2 insertions(+), 72 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 4cbb5088b166a..071f80d030bd2 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -719,7 +719,7 @@ files: options: - name: enabled description: | - Enable schema collection. Requires `dbm: true`. Defaults to true. + Enable schema collection. Requires `dbm: true`. Defaults to false. value: type: boolean example: true diff --git a/sqlserver/datadog_checks/sqlserver/const.py b/sqlserver/datadog_checks/sqlserver/const.py index f931b7774292e..3a6f77923b2aa 100644 --- a/sqlserver/datadog_checks/sqlserver/const.py +++ b/sqlserver/datadog_checks/sqlserver/const.py @@ -269,72 +269,4 @@ PROC_CHAR_LIMIT = 500 -# Schemas DEFAULT_SCHEMAS_COLLECTION_INTERVAL = 600 - -DB_QUERY = """ -SELECT - db.database_id AS id, db.name AS name, db.collation_name AS collation, dp.name AS owner -FROM - sys.databases db LEFT JOIN sys.database_principals dp ON db.owner_sid = dp.sid -WHERE db.name IN ({}); -""" - -SCHEMA_QUERY = """ -SELECT - s.name AS name, s.schema_id AS id, dp.name AS owner_name -FROM - sys.schemas AS s JOIN sys.database_principals dp ON s.principal_id = dp.principal_id -WHERE s.name NOT IN ('sys', 'information_schema') -""" - -TABLES_IN_SCHEMA_QUERY = """ -SELECT - object_id AS id, name -FROM - sys.tables -WHERE schema_id=? 
-""" - -COLUMN_QUERY = """ -SELECT - column_name AS name, data_type, column_default, is_nullable AS nullable , table_name, ordinal_position -FROM - information_schema.columns -WHERE - table_name IN ({}) and table_schema='{}'; -""" - -PARTITIONS_QUERY = """ -SELECT - object_id AS id, COUNT(*) AS partition_count -FROM - sys.partitions -WHERE - object_id IN ({}) GROUP BY object_id; -""" - -INDEX_QUERY = """ -SELECT - i.object_id AS id, i.name, i.type, i.is_unique, i.is_primary_key, i.is_unique_constraint, - i.is_disabled, STRING_AGG(c.name, ',') AS column_names -FROM - sys.indexes i JOIN sys.index_columns ic ON i.object_id = ic.object_id - AND i.index_id = ic.index_id JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id -WHERE - i.object_id IN ({}) GROUP BY i.object_id, i.name, i.type, - i.is_unique, i.is_primary_key, i.is_unique_constraint, i.is_disabled; -""" - -FOREIGN_KEY_QUERY = """ -SELECT - FK.referenced_object_id AS id, FK.name AS foreign_key_name, - OBJECT_NAME(FK.parent_object_id) AS referencing_table, - STRING_AGG(COL_NAME(FKC.parent_object_id, FKC.parent_column_id),',') AS referencing_column, - OBJECT_NAME(FK.referenced_object_id) AS referenced_table, - STRING_AGG(COL_NAME(FKC.referenced_object_id, FKC.referenced_column_id),',') AS referenced_column -FROM - sys.foreign_keys AS FK JOIN sys.foreign_key_columns AS FKC ON FK.object_id = FKC.constraint_object_id -WHERE - FK.referenced_object_id IN ({}) GROUP BY FK.name, FK.parent_object_id, FK.referenced_object_id; -""" diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 75e81b9ba526a..f307b7b459b88 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -116,12 +116,10 @@ def __init__(self, check, config): self._max_execution_time = min( config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME), collection_interval ) - e = is_affirmative(config.schema_config.get('enabled', True)) - print(e) super(Schemas, self).__init__( check, run_sync=True, - enabled=is_affirmative(config.schema_config.get('enabled', True)), + enabled=is_affirmative(config.schema_config.get('enabled', False)), expected_db_exceptions=(), # min collection interval is a desired collection interval for a check as a whole. min_collection_interval=config.min_collection_interval, From ebe3894bb92ac6eab4088fabb3b96fc424feac19 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 6 Jun 2024 20:02:53 +0000 Subject: [PATCH 102/132] added more tests --- sqlserver/datadog_checks/sqlserver/schemas.py | 9 +-- sqlserver/tests/test_unit.py | 68 +++++++++++++++++++ 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index f307b7b459b88..697b7a1509d4b 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -99,9 +99,6 @@ def agent_check_getter(self): class Schemas(DBMAsyncJob): - # Requests for infromation about tables are done for a certain amount of tables at the time - # This number of tables doesnt slow down performance by much (15% compared to 500 tables) - # but allows the queue to be stable. TABLES_CHUNK_SIZE = 500 # Note: in async mode execution time also cannot exceed 2 checks. 
MAX_EXECUTION_TIME = 10 @@ -160,12 +157,12 @@ def _fetch_schema_data(self, cursor, db_name): self._log.warning( """Truncated data due to the effective execution time reaching {}, stopped on db - {} on schema {}""".format( - self.MAX_EXECUTION_TIME, db_name, schema["name"] + self._max_execution_time, db_name, schema["name"] ) ) raise StopIteration( - "Schema collection took {}s which is longer than allowed limit of {}s".format( - schema_collection_elapsed_time, self.MAX_EXECUTION_TIME + "Schema collection took {}s which is longer than allowed limit of {}s, stopped while collecting for db - {}".format( + schema_collection_elapsed_time, self._max_execution_time, db_name ) ) columns_count, tables_info = self._get_tables_data(tables_chunk, schema, cursor) diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index 004a3b81739e4..7694366c74d26 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -16,6 +16,7 @@ from datadog_checks.sqlserver.connection import split_sqlserver_host_port from datadog_checks.sqlserver.metrics import SqlFractionMetric, SqlMasterDatabaseFileStats from datadog_checks.sqlserver.schemas import SubmitData +from datadog_checks.sqlserver.schemas import Schemas from datadog_checks.sqlserver.sqlserver import SQLConnectionError from datadog_checks.sqlserver.utils import ( Database, @@ -24,6 +25,8 @@ set_default_driver_conf, ) +from cachetools import TTLCache + from .common import CHECK_NAME, DOCKER_SERVER, assert_metrics from .utils import windows_ci @@ -779,10 +782,13 @@ def test_submit_data(): dataSubmitter.store("test_db1", schema1, [1, 2], 5) dataSubmitter.store("test_db2", schema3, [1, 2], 5) + assert dataSubmitter.columns_since_last_submit() == 10 dataSubmitter.store("test_db1", schema2, [1, 2], 10) dataSubmitter.submit() + assert dataSubmitter.columns_since_last_submit() == 0 + expected_data = { "host": "some", "agent_version": 0, @@ -802,3 +808,65 @@ def test_submit_data(): json.loads(submitted_data[0]), expected_data, exclude_paths="root['timestamp']", ignore_order=True ) assert len(difference) == 0 + +def test_submit_data_for_db_without_info(): + + dataSubmitter, submitted_data = set_up_submitter_unit_test() + + schema1 = {"id": "1"} + dataSubmitter.store("test_db1", schema1, [1, 2], 5) + + dataSubmitter.submit() + expected_data = { + "host": "some", + "agent_version": 0, + "dbms": "sqlserver", + "kind": "sqlserver_databases", + "collection_interval": 1200, + "dbms_version": "some", + "tags": "some", + "cloud_metadata": "some", + "metadata": [ + {"name": "test_db1", "schemas": [{"id": "1", "tables": [1, 2]}]}, + ], + "timestamp": 1.1, + } + + difference = DeepDiff( + json.loads(submitted_data[0]), expected_data, exclude_paths="root['timestamp']", ignore_order=True + ) + assert len(difference) == 0 + +def test_fetch_throws(instance_docker): + check = SQLServer(CHECK_NAME, {}, [instance_docker]) + schemas = Schemas(check, check._config) + with mock.patch('time.time', side_effect=[0, 9999999]), \ + mock.patch('datadog_checks.sqlserver.schemas.Schemas._query_schema_information', return_value = {"id" :1}), \ + mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables', return_value = [1,2]): + with pytest.raises(StopIteration): + schemas._fetch_schema_data("dummy_cursor", "my_db") + +def test_submit_is_called_if_too_many_columns(instance_docker): + check = SQLServer(CHECK_NAME, {}, [instance_docker]) + schemas = Schemas(check, check._config) + with mock.patch('time.time', side_effect=[0, 0]), \ + 
mock.patch('datadog_checks.sqlserver.schemas.Schemas._query_schema_information', return_value = {"id" :1}), \ + mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables', return_value = [1,2]), \ + mock.patch('datadog_checks.sqlserver.schemas.SubmitData.submit') as mocked_submit, \ + mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables_data', return_value = (1000_000, {"id" : 1})): + with pytest.raises(StopIteration): + schemas._fetch_schema_data("dummy_cursor", "my_db") + mocked_submit.called_once() + +def test_exception_handling_by_do_for_dbs(instance_docker): + check = SQLServer(CHECK_NAME, {}, [instance_docker]) + check.initialize_connection() + schemas = Schemas(check, check._config) + mock_cursor = mock.MagicMock() + with mock.patch('datadog_checks.sqlserver.schemas.Schemas._fetch_schema_data', side_effect=Exception("Can't connect to DB")), \ + mock.patch('datadog_checks.sqlserver.sqlserver.SQLServer.get_databases', return_value = ["db1"]), \ + mock.patch('cachetools.TTLCache.get', return_value = "dummy"), \ + mock.patch('datadog_checks.sqlserver.connection.Connection.open_managed_default_connection'), \ + mock.patch('datadog_checks.sqlserver.connection.Connection.get_managed_cursor', return_value = mock_cursor), \ + mock.patch('datadog_checks.sqlserver.utils.is_azure_sql_database', return_value = {}): + schemas._fetch_for_databases() \ No newline at end of file From 780eefbd702be42dfe46a1c7c4b8ef0c5928c369 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 6 Jun 2024 20:32:36 +0000 Subject: [PATCH 103/132] Improved doc --- sqlserver/assets/configuration/spec.yaml | 3 ++- sqlserver/datadog_checks/sqlserver/sqlserver.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 071f80d030bd2..0d5fde37db88c 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -715,7 +715,8 @@ files: display_default: false - name: schemas_collection description: | - Configure collection of schemas. + Configure collection of schemas. 
"\If database_autodiscovery is not enabled, data is collected + only for the database configured with database."\ options: - name: enabled description: | diff --git a/sqlserver/datadog_checks/sqlserver/sqlserver.py b/sqlserver/datadog_checks/sqlserver/sqlserver.py index 37af0a80f1b4a..17aecaaa6fee7 100644 --- a/sqlserver/datadog_checks/sqlserver/sqlserver.py +++ b/sqlserver/datadog_checks/sqlserver/sqlserver.py @@ -116,6 +116,7 @@ def __init__(self, name, init_config, instances): self.instance_metrics = [] self.instance_per_type_metrics = defaultdict(set) self.do_check = True + self._config = SQLServerConfig(self.init_config, self.instance, self.log) self.tags = self._config.tags From b0979fc88f0ca5fc3f4e8b8ae9718504fa76fd3b Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 7 Jun 2024 16:21:43 +0000 Subject: [PATCH 104/132] improve variable names --- sqlserver/assets/configuration/spec.yaml | 4 +- sqlserver/datadog_checks/sqlserver/schemas.py | 70 +++++++++---------- sqlserver/datadog_checks/sqlserver/utils.py | 4 +- 3 files changed, 38 insertions(+), 40 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index 0d5fde37db88c..ceac7ef919f72 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -715,8 +715,8 @@ files: display_default: false - name: schemas_collection description: | - Configure collection of schemas. "\If database_autodiscovery is not enabled, data is collected - only for the database configured with database."\ + Configure collection of schemas. If `database_autodiscovery` is not enabled, data is collected + only for the database configured with `database` parameter. options: - name: enabled description: | diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 697b7a1509d4b..8b32dc3f66cd1 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -24,7 +24,7 @@ SCHEMA_QUERY, TABLES_IN_SCHEMA_QUERY, ) -from datadog_checks.sqlserver.utils import execute_query_output_result_as_dicts, get_list_chunks, is_azure_sql_database +from datadog_checks.sqlserver.utils import execute_query, get_list_chunks, is_azure_sql_database class SubmitData: @@ -73,7 +73,6 @@ def truncate(self, json_event): else: return json_event - # NOTE: DB with no schemas is never submitted def submit(self): if not self.db_to_schemas: return @@ -82,7 +81,7 @@ def submit(self): for db, schemas_by_id in self.db_to_schemas.items(): db_info = {} if db not in self.db_info: - self._log.error("Couldn't find database info for %s", db) + self._log.error("Couldn't find database info for {}".format(db)) db_info["name"] = db else: db_info = self.db_info[db] @@ -101,7 +100,7 @@ class Schemas(DBMAsyncJob): TABLES_CHUNK_SIZE = 500 # Note: in async mode execution time also cannot exceed 2 checks. 
- MAX_EXECUTION_TIME = 10 + DEFAULT_MAX_EXECUTION_TIME = 10 MAX_COLUMNS_PER_EVENT = 100_000 def __init__(self, check, config): @@ -111,7 +110,7 @@ def __init__(self, check, config): self._last_schemas_collect_time = None collection_interval = config.schema_config.get('collection_interval', DEFAULT_SCHEMAS_COLLECTION_INTERVAL) self._max_execution_time = min( - config.schema_config.get('max_execution_time', self.MAX_EXECUTION_TIME), collection_interval + config.schema_config.get('max_execution_time', self.DEFAULT_MAX_EXECUTION_TIME), collection_interval ) super(Schemas, self).__init__( check, @@ -155,8 +154,8 @@ def _fetch_schema_data(self, cursor, db_name): if schema_collection_elapsed_time > self._max_execution_time: # TODO Report truncation to the backend self._log.warning( - """Truncated data due to the effective execution time reaching {}, - stopped on db - {} on schema {}""".format( + """Truncated data due to the execution time reaching {}s, + stopped on db {} on schema {}""".format( self._max_execution_time, db_name, schema["name"] ) ) @@ -184,16 +183,12 @@ def _fetch_for_databases(self): self._fetch_schema_data(cursor, db_name) except StopIteration as e: self._log.error( - "While executing fetch schemas for databse - %s, the following exception occured - %s", - db_name, - e, + "While executing fetch schemas for databse {}, the following exception occured {}".format(db_name, e) ) return except Exception as e: self._log.error( - "While executing fetch schemas for databse - %s, the following exception occured - %s", - db_name, - e, + "While executing fetch schemas for databse {}, the following exception occured {}".format(db_name, e) ) # Switch DB back to MASTER if not is_azure_sql_database(engine_edition): @@ -254,17 +249,17 @@ def _collect_schemas_data(self): ) databases = self._check.get_databases() - db_infos = self._query_db_informations(databases) + db_infos = self._query_db_information(databases) self._data_submitter.store_db_infos(db_infos) self._fetch_for_databases() - self._log.debug("Finished collect_schemas_data") self._data_submitter.submit() + self._log.debug("Finished collect_schemas_data") - def _query_db_informations(self, db_names): + def _query_db_information(self, db_names): with self._check.connection.open_managed_default_connection(): with self._check.connection.get_managed_cursor() as cursor: db_names_formatted = ",".join(["'{}'".format(t) for t in db_names]) - return execute_query_output_result_as_dicts( + return execute_query( DB_QUERY.format(db_names_formatted), cursor, convert_results_to_str=True ) @@ -276,7 +271,7 @@ def _get_tables(self, schema, cursor): "name": str "columns": [] """ - tables_info = execute_query_output_result_as_dicts( + tables_info = execute_query( TABLES_IN_SCHEMA_QUERY, cursor, convert_results_to_str=True, parameter=schema["id"] ) for t in tables_info: @@ -292,7 +287,7 @@ def _query_schema_information(self, cursor): "id": str "owner_name": str """ - return execute_query_output_result_as_dicts(SCHEMA_QUERY, cursor, convert_results_to_str=True) + return execute_query(SCHEMA_QUERY, cursor, convert_results_to_str=True) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables_data(self, table_list, schema, cursor): @@ -359,7 +354,8 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, s ] rows = [dict(zip(columns, [str(item) for item in row])) for row in data] for row in rows: - table_id = name_to_id.get(str(row.get("table_name"))) + table_name = str(row.get("table_name")) + 
table_id = name_to_id.get(table_name) if table_id is not None: row.pop("table_name", None) if "nullable" in row: @@ -372,14 +368,14 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, s row ] else: - self._log.error("Columns found for an unkown table with the object_id: %s", table_id) + self._log.debug("Columns found for an unkown table with the object_id: {}".format(table_id)) else: - self._log.error("Couldn't find id of a table: %s", table_id) + self._log.debug("Couldn't find id of a table: {}".format(table_name)) return len(data) @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): - rows = execute_query_output_result_as_dicts(PARTITIONS_QUERY.format(table_ids), cursor) + rows = execute_query(PARTITIONS_QUERY.format(table_ids), cursor) for row in rows: id = row.pop("id", None) if id is not None: @@ -387,13 +383,13 @@ def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): if id_str in id_to_table_data: id_to_table_data[id_str]["partitions"] = row else: - self._log.error("Partition found for an unkown table with the object_id: %s", id_str) + self._log.debug("Partition found for an unkown table with the object_id: {}".format(id_str)) else: - self._log.error("Return rows of [%s] query should have id column", PARTITIONS_QUERY) + self._log.debug("Return rows of [{}] query should have id column".format(PARTITIONS_QUERY)) @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): - rows = execute_query_output_result_as_dicts(INDEX_QUERY.format(table_ids), cursor) + rows = execute_query(INDEX_QUERY.format(table_ids), cursor) for row in rows: id = row.pop("id", None) if id is not None: @@ -402,21 +398,21 @@ def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): id_to_table_data[id_str].setdefault("indexes", []) id_to_table_data[id_str]["indexes"].append(row) else: - self._log.error("Index found for an unkown table with the object_id: %s", id_str) + self._log.debug("Index found for an unkown table with the object_id: {}".format(id_str)) else: - self._log.error("Return rows of [%s] query should have id column", INDEX_QUERY) + self._log.debug("Return rows of [{}] query should have id column".format(INDEX_QUERY)) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) - def _populate_with_foreign_keys_data(self, table_ids, id_to_table_data, cursor): - rows = execute_query_output_result_as_dicts(FOREIGN_KEY_QUERY.format(table_ids), cursor) + def _populate_with_foreign_keys_data(self, table_ids, table_id_to_table_data, cursor): + rows = execute_query(FOREIGN_KEY_QUERY.format(table_ids), cursor) for row in rows: - id = row.pop("id", None) + table_id = row.pop("id", None) if id is not None: - id_str = str(id) - if id_str in id_to_table_data: - id_to_table_data.get(str(id)).setdefault("foreign_keys", []) - id_to_table_data.get(str(id))["foreign_keys"].append(row) + table_id_str = str(table_id) + if table_id_str in table_id_to_table_data: + table_id_to_table_data.get(table_id_str).setdefault("foreign_keys", []) + table_id_to_table_data.get(table_id_str)["foreign_keys"].append(row) else: - self._log.error("Foreign key found for an unkown table with the object_id: %s", id_str) + self._log.debug("Foreign key found for an unkown table with the object_id: {}".format(table_id_str)) else: - self._log.error("Return rows of [%s] query should have id column", 
FOREIGN_KEY_QUERY) + self._log.debug("Return rows of [{}] query should have id column".format(FOREIGN_KEY_QUERY)) diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index b816b6a8cea8b..dd57242a91d7d 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -7,6 +7,8 @@ from datadog_checks.base.utils.platform import Platform from datadog_checks.sqlserver.const import ENGINE_EDITION_AZURE_MANAGED_INSTANCE, ENGINE_EDITION_SQL_DATABASE +from typing import Dict + CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) DRIVER_CONFIG_DIR = os.path.join(CURRENT_DIR, 'data', 'driver_config') @@ -139,7 +141,7 @@ def is_azure_sql_database(engine_edition): return engine_edition == ENGINE_EDITION_SQL_DATABASE -def execute_query_output_result_as_dicts(query, cursor, convert_results_to_str=False, parameter=None): +def execute_query(query, cursor, convert_results_to_str=False, parameter=None) -> Dict[str, str]: if parameter is not None: cursor.execute(query, (parameter,)) else: From 91df5f80629fdcacece6b1b7c8a400f37378c3b8 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 7 Jun 2024 16:24:57 +0000 Subject: [PATCH 105/132] Applied linter --- sqlserver/datadog_checks/sqlserver/schemas.py | 19 ++++---- sqlserver/datadog_checks/sqlserver/utils.py | 3 +- sqlserver/tests/test_unit.py | 48 +++++++++++-------- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 8b32dc3f66cd1..975c205c394e4 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -160,7 +160,8 @@ def _fetch_schema_data(self, cursor, db_name): ) ) raise StopIteration( - "Schema collection took {}s which is longer than allowed limit of {}s, stopped while collecting for db - {}".format( + """Schema collection took {}s which is longer than allowed limit of {}s, + stopped while collecting for db - {}""".format( schema_collection_elapsed_time, self._max_execution_time, db_name ) ) @@ -183,12 +184,16 @@ def _fetch_for_databases(self): self._fetch_schema_data(cursor, db_name) except StopIteration as e: self._log.error( - "While executing fetch schemas for databse {}, the following exception occured {}".format(db_name, e) + "While executing fetch schemas for databse {}, the following exception occured {}".format( + db_name, e + ) ) return except Exception as e: self._log.error( - "While executing fetch schemas for databse {}, the following exception occured {}".format(db_name, e) + "While executing fetch schemas for databse {}, the following exception occured {}".format( + db_name, e + ) ) # Switch DB back to MASTER if not is_azure_sql_database(engine_edition): @@ -259,9 +264,7 @@ def _query_db_information(self, db_names): with self._check.connection.open_managed_default_connection(): with self._check.connection.get_managed_cursor() as cursor: db_names_formatted = ",".join(["'{}'".format(t) for t in db_names]) - return execute_query( - DB_QUERY.format(db_names_formatted), cursor, convert_results_to_str=True - ) + return execute_query(DB_QUERY.format(db_names_formatted), cursor, convert_results_to_str=True) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _get_tables(self, schema, cursor): @@ -271,9 +274,7 @@ def _get_tables(self, schema, cursor): "name": str "columns": [] """ - tables_info = execute_query( - TABLES_IN_SCHEMA_QUERY, cursor, 
convert_results_to_str=True, parameter=schema["id"] - ) + tables_info = execute_query(TABLES_IN_SCHEMA_QUERY, cursor, convert_results_to_str=True, parameter=schema["id"]) for t in tables_info: t.setdefault("columns", []) return tables_info diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index dd57242a91d7d..667b1f8d1dff5 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -3,12 +3,11 @@ # Licensed under a 3-clause BSD style license (see LICENSE) import os import re +from typing import Dict from datadog_checks.base.utils.platform import Platform from datadog_checks.sqlserver.const import ENGINE_EDITION_AZURE_MANAGED_INSTANCE, ENGINE_EDITION_SQL_DATABASE -from typing import Dict - CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) DRIVER_CONFIG_DIR = os.path.join(CURRENT_DIR, 'data', 'driver_config') diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index 7694366c74d26..2be38bfb85776 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -15,8 +15,7 @@ from datadog_checks.sqlserver import SQLServer from datadog_checks.sqlserver.connection import split_sqlserver_host_port from datadog_checks.sqlserver.metrics import SqlFractionMetric, SqlMasterDatabaseFileStats -from datadog_checks.sqlserver.schemas import SubmitData -from datadog_checks.sqlserver.schemas import Schemas +from datadog_checks.sqlserver.schemas import Schemas, SubmitData from datadog_checks.sqlserver.sqlserver import SQLConnectionError from datadog_checks.sqlserver.utils import ( Database, @@ -25,8 +24,6 @@ set_default_driver_conf, ) -from cachetools import TTLCache - from .common import CHECK_NAME, DOCKER_SERVER, assert_metrics from .utils import windows_ci @@ -809,13 +806,14 @@ def test_submit_data(): ) assert len(difference) == 0 + def test_submit_data_for_db_without_info(): dataSubmitter, submitted_data = set_up_submitter_unit_test() schema1 = {"id": "1"} dataSubmitter.store("test_db1", schema1, [1, 2], 5) - + dataSubmitter.submit() expected_data = { "host": "some", @@ -837,36 +835,46 @@ def test_submit_data_for_db_without_info(): ) assert len(difference) == 0 + def test_fetch_throws(instance_docker): check = SQLServer(CHECK_NAME, {}, [instance_docker]) schemas = Schemas(check, check._config) - with mock.patch('time.time', side_effect=[0, 9999999]), \ - mock.patch('datadog_checks.sqlserver.schemas.Schemas._query_schema_information', return_value = {"id" :1}), \ - mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables', return_value = [1,2]): + with mock.patch('time.time', side_effect=[0, 9999999]), mock.patch( + 'datadog_checks.sqlserver.schemas.Schemas._query_schema_information', return_value={"id": 1} + ), mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables', return_value=[1, 2]): with pytest.raises(StopIteration): schemas._fetch_schema_data("dummy_cursor", "my_db") + def test_submit_is_called_if_too_many_columns(instance_docker): check = SQLServer(CHECK_NAME, {}, [instance_docker]) schemas = Schemas(check, check._config) - with mock.patch('time.time', side_effect=[0, 0]), \ - mock.patch('datadog_checks.sqlserver.schemas.Schemas._query_schema_information', return_value = {"id" :1}), \ - mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables', return_value = [1,2]), \ - mock.patch('datadog_checks.sqlserver.schemas.SubmitData.submit') as mocked_submit, \ - mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables_data', 
return_value = (1000_000, {"id" : 1})): + with mock.patch('time.time', side_effect=[0, 0]), mock.patch( + 'datadog_checks.sqlserver.schemas.Schemas._query_schema_information', return_value={"id": 1} + ), mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables', return_value=[1, 2]), mock.patch( + 'datadog_checks.sqlserver.schemas.SubmitData.submit' + ) as mocked_submit, mock.patch( + 'datadog_checks.sqlserver.schemas.Schemas._get_tables_data', return_value=(1000_000, {"id": 1}) + ): with pytest.raises(StopIteration): schemas._fetch_schema_data("dummy_cursor", "my_db") mocked_submit.called_once() + def test_exception_handling_by_do_for_dbs(instance_docker): check = SQLServer(CHECK_NAME, {}, [instance_docker]) check.initialize_connection() schemas = Schemas(check, check._config) mock_cursor = mock.MagicMock() - with mock.patch('datadog_checks.sqlserver.schemas.Schemas._fetch_schema_data', side_effect=Exception("Can't connect to DB")), \ - mock.patch('datadog_checks.sqlserver.sqlserver.SQLServer.get_databases', return_value = ["db1"]), \ - mock.patch('cachetools.TTLCache.get', return_value = "dummy"), \ - mock.patch('datadog_checks.sqlserver.connection.Connection.open_managed_default_connection'), \ - mock.patch('datadog_checks.sqlserver.connection.Connection.get_managed_cursor', return_value = mock_cursor), \ - mock.patch('datadog_checks.sqlserver.utils.is_azure_sql_database', return_value = {}): - schemas._fetch_for_databases() \ No newline at end of file + with mock.patch( + 'datadog_checks.sqlserver.schemas.Schemas._fetch_schema_data', side_effect=Exception("Can't connect to DB") + ), mock.patch('datadog_checks.sqlserver.sqlserver.SQLServer.get_databases', return_value=["db1"]), mock.patch( + 'cachetools.TTLCache.get', return_value="dummy" + ), mock.patch( + 'datadog_checks.sqlserver.connection.Connection.open_managed_default_connection' + ), mock.patch( + 'datadog_checks.sqlserver.connection.Connection.get_managed_cursor', return_value=mock_cursor + ), mock.patch( + 'datadog_checks.sqlserver.utils.is_azure_sql_database', return_value={} + ): + schemas._fetch_for_databases() From ff93303a7f0c91e2cb2e3d1b920263d03a95faf5 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Fri, 7 Jun 2024 17:41:36 +0000 Subject: [PATCH 106/132] linter --- sqlserver/datadog_checks/sqlserver/schemas.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 975c205c394e4..81350e1405582 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -375,31 +375,31 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, s return len(data) @tracked_method(agent_check_getter=agent_check_getter) - def _populate_with_partitions_data(self, table_ids, id_to_table_data, cursor): + def _populate_with_partitions_data(self, table_ids, table_id_to_table_data, cursor): rows = execute_query(PARTITIONS_QUERY.format(table_ids), cursor) for row in rows: - id = row.pop("id", None) - if id is not None: - id_str = str(id) - if id_str in id_to_table_data: - id_to_table_data[id_str]["partitions"] = row + table_id = row.pop("id", None) + if table_id is not None: + table_id_str = str(table_id) + if table_id_str in table_id_to_table_data: + table_id_to_table_data[table_id_str]["partitions"] = row else: - self._log.debug("Partition found for an unkown table with the object_id: {}".format(id_str)) + 
self._log.debug("Partition found for an unkown table with the object_id: {}".format(table_id_str)) else: self._log.debug("Return rows of [{}] query should have id column".format(PARTITIONS_QUERY)) @tracked_method(agent_check_getter=agent_check_getter) - def _populate_with_index_data(self, table_ids, id_to_table_data, cursor): + def _populate_with_index_data(self, table_ids, table_id_to_table_data, cursor): rows = execute_query(INDEX_QUERY.format(table_ids), cursor) for row in rows: - id = row.pop("id", None) - if id is not None: - id_str = str(id) - if id_str in id_to_table_data: - id_to_table_data[id_str].setdefault("indexes", []) - id_to_table_data[id_str]["indexes"].append(row) + table_id = row.pop("id", None) + if table_id is not None: + table_id_str = str(table_id) + if table_id_str in table_id_to_table_data: + table_id_to_table_data[table_id_str].setdefault("indexes", []) + table_id_to_table_data[table_id_str]["indexes"].append(row) else: - self._log.debug("Index found for an unkown table with the object_id: {}".format(id_str)) + self._log.debug("Index found for an unkown table with the object_id: {}".format(table_id_str)) else: self._log.debug("Return rows of [{}] query should have id column".format(INDEX_QUERY)) From 77b50173c3132e4671a7594467be96e5df0822a3 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 11 Jun 2024 18:59:01 +0000 Subject: [PATCH 107/132] Added test for truncation --- sqlserver/assets/configuration/spec.yaml | 5 +-- .../sqlserver/data/conf.yaml.example | 9 ++--- sqlserver/datadog_checks/sqlserver/schemas.py | 33 +++++++++++++------ sqlserver/tests/test_metadata.py | 23 +++++++++++++ 4 files changed, 54 insertions(+), 16 deletions(-) diff --git a/sqlserver/assets/configuration/spec.yaml b/sqlserver/assets/configuration/spec.yaml index ceac7ef919f72..01df2387bff92 100644 --- a/sqlserver/assets/configuration/spec.yaml +++ b/sqlserver/assets/configuration/spec.yaml @@ -723,7 +723,7 @@ files: Enable schema collection. Requires `dbm: true`. Defaults to false. value: type: boolean - example: true + example: false - name: collection_interval description: | Set the database schema collection interval (in seconds). Defaults to 600 seconds. @@ -732,7 +732,8 @@ files: example: 600 - name: max_execution_time description: | - Set the maximum time for schema collection (in seconds). Defaults to 10 seconds. Capped by `schemas_collection.collection_interval` + Set the maximum time for schema collection (in seconds). Defaults to 10 seconds. + Capped by `schemas_collection.collection_interval` value: type: number example: 10 diff --git a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example index 346e3b6174cac..82f994d824c43 100644 --- a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example +++ b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example @@ -365,14 +365,15 @@ instances: # ## schemas_collection - ## @param enabled - boolean - optional - default: true - ## Enable collection of active sessions. Requires `dbm: true`. + ## @param enabled - boolean - optional - default: true + ## Enable collection of schemas. Requires `dbm: true`. If `database_autodiscovery` is not enabled, + ## data is collected only for the database configured with `database` parameter. # - # enabled: true + # enabled: false ## @param collection_interval - number - optional - default: 600 ## Set the database schema collection interval (in seconds). 
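  ## For reference, a complete instance block enabling this feature could look roughly like
  ## the following (the connection values are placeholders, not defaults):
  #
  #   - host: localhost,1433
  #     username: datadog
  #     password: '<PASSWORD>'
  #     dbm: true
  #     database_autodiscovery: true
  #     schemas_collection:
  #       enabled: true
  #       collection_interval: 600
  #       max_execution_time: 10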
- ## If a non-default value is chosen, then that exact same value must be used for *every* check instance. TODO ? + ## If a non-default value is chosen, then that exact same value must be used for *every* check instance. ## Running different instances with different collection intervals is not supported. # # collection_interval: 600 diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 81350e1405582..0eb77a67c27e3 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -35,6 +35,7 @@ def __init__(self, submit_data_function, base_event, logger): self._log = logger self._columns_count = 0 + self._total_columns_sent = 0 self.db_to_schemas = {} # dbname : { id : schema } self.db_info = {} # name to info @@ -45,6 +46,7 @@ def set_base_event_data(self, hostname, tags, cloud_metadata, dbms_version): self._base_event["dbms_version"] = dbms_version def reset(self): + self._total_columns_sent = 0 self._columns_count = 0 self.db_to_schemas.clear() self.db_info.clear() @@ -73,9 +75,25 @@ def truncate(self, json_event): else: return json_event + def send_truncated_msg(self, db_name, time_spent): + event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000} + db_info = {} + if db_name not in self.db_to_schemas: + db_info = self.db_info[db_name] + else: + db_info = {"name": db_name} + db_info["truncated"] = "Truncated after fetching {} columns, elapsed time is {}s".format( + self._total_columns_sent, time_spent + ) + event["metadata"] = [{**(db_info)}] + json_event = json.dumps(event, default=default_json_event_encoding) + self._log.debug("Reporting truncation of schema collection: {}".format(self.truncate(json_event))) + self._submit_to_agent_queue(json_event) + def submit(self): if not self.db_to_schemas: return + self._total_columns_sent += self._columns_count self._columns_count = 0 event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000} for db, schemas_by_id in self.db_to_schemas.items(): @@ -143,8 +161,7 @@ def shut_down(self): self._data_submitter.submit() @tracked_method(agent_check_getter=agent_check_getter) - def _fetch_schema_data(self, cursor, db_name): - start_time = time.time() + def _fetch_schema_data(self, cursor, start_time, db_name): schemas = self._query_schema_information(cursor) for schema in schemas: tables = self._get_tables(schema, cursor) @@ -152,13 +169,8 @@ def _fetch_schema_data(self, cursor, db_name): for tables_chunk in tables_chunks: schema_collection_elapsed_time = time.time() - start_time if schema_collection_elapsed_time > self._max_execution_time: - # TODO Report truncation to the backend - self._log.warning( - """Truncated data due to the execution time reaching {}s, - stopped on db {} on schema {}""".format( - self._max_execution_time, db_name, schema["name"] - ) - ) + self._data_submitter.submit() + self._data_submitter.send_truncated_msg(db_name, schema_collection_elapsed_time) raise StopIteration( """Schema collection took {}s which is longer than allowed limit of {}s, stopped while collecting for db - {}""".format( @@ -173,6 +185,7 @@ def _fetch_schema_data(self, cursor, db_name): return False def _fetch_for_databases(self): + start_time = time.time() databases = self._check.get_databases() engine_edition = self._check.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) with self._check.connection.open_managed_default_connection(): @@ -181,7 +194,7 @@ def _fetch_for_databases(self): try: if not 
is_azure_sql_database(engine_edition): cursor.execute(SWITCH_DB_STATEMENT.format(db_name)) - self._fetch_schema_data(cursor, db_name) + self._fetch_schema_data(cursor, start_time, db_name) except StopIteration as e: self._log.error( "While executing fetch schemas for databse {}, the following exception occured {}".format( diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 2a42e1ca40e21..5193026f67bce 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -5,6 +5,7 @@ from __future__ import unicode_literals import logging +import re from copy import copy import pytest @@ -363,3 +364,25 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): # schema data also collects certain builtin default schemas which are ignored in the test if len(diff_keys) > 0 and diff_keys != ['iterable_item_removed']: raise AssertionError(Exception("found the following diffs: " + str(difference))) + + +def test_schemas_collection_truncated(aggregator, dd_run_check, dbm_instance): + dbm_instance['database_autodiscovery'] = True + dbm_instance['autodiscovery_include'] = ['datadog_test_schemas'] + dbm_instance['dbm'] = True + dbm_instance['schemas_collection'] = {"enabled": True, "max_execution_time": 0} + expected_pattern = r"^Truncated after fetching \d+ columns, elapsed time is \d+(\.\d+)?s$" + + check = SQLServer(CHECK_NAME, {}, [dbm_instance]) + dd_run_check(check) + dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") + found = False + for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): + for database_metadata in schema_event['metadata']: + if ( + "truncated" in database_metadata + and database_metadata['name'] == 'datadog_test_schemas' + and re.fullmatch(expected_pattern, database_metadata["truncated"]) + ): + found = True + assert found From fe7e780aae429d778b12b48b90c5b718a0953996 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 11 Jun 2024 22:06:16 +0000 Subject: [PATCH 108/132] Add db to the message --- sqlserver/datadog_checks/sqlserver/schemas.py | 4 ++-- sqlserver/tests/test_metadata.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 0eb77a67c27e3..87b097357189c 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -82,8 +82,8 @@ def send_truncated_msg(self, db_name, time_spent): db_info = self.db_info[db_name] else: db_info = {"name": db_name} - db_info["truncated"] = "Truncated after fetching {} columns, elapsed time is {}s".format( - self._total_columns_sent, time_spent + db_info["truncated"] = "Truncated after fetching {} columns, elapsed time is {}s, database is {}".format( + self._total_columns_sent, time_spent, db_name ) event["metadata"] = [{**(db_info)}] json_event = json.dumps(event, default=default_json_event_encoding) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 5193026f67bce..b38f6bd7ccc60 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -371,8 +371,7 @@ def test_schemas_collection_truncated(aggregator, dd_run_check, dbm_instance): dbm_instance['autodiscovery_include'] = ['datadog_test_schemas'] dbm_instance['dbm'] = True dbm_instance['schemas_collection'] = {"enabled": True, "max_execution_time": 0} - expected_pattern = r"^Truncated after fetching \d+ columns, elapsed time is \d+(\.\d+)?s$" - + 
expected_pattern = r"^Truncated after fetching \d+ columns, elapsed time is \d+(\.\d+)?s, database is .*" check = SQLServer(CHECK_NAME, {}, [dbm_instance]) dd_run_check(check) dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") From f6b2a518bb932b147229b1fc4ebd5dc6db79ef4b Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 12 Jun 2024 07:36:52 +0000 Subject: [PATCH 109/132] Fixed unit test --- sqlserver/tests/test_unit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index 2be38bfb85776..e0089884ff86e 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -5,6 +5,7 @@ import json import os import re +import time from collections import namedtuple import mock @@ -843,7 +844,7 @@ def test_fetch_throws(instance_docker): 'datadog_checks.sqlserver.schemas.Schemas._query_schema_information', return_value={"id": 1} ), mock.patch('datadog_checks.sqlserver.schemas.Schemas._get_tables', return_value=[1, 2]): with pytest.raises(StopIteration): - schemas._fetch_schema_data("dummy_cursor", "my_db") + schemas._fetch_schema_data("dummy_cursor", time.time(), "my_db") def test_submit_is_called_if_too_many_columns(instance_docker): @@ -857,7 +858,7 @@ def test_submit_is_called_if_too_many_columns(instance_docker): 'datadog_checks.sqlserver.schemas.Schemas._get_tables_data', return_value=(1000_000, {"id": 1}) ): with pytest.raises(StopIteration): - schemas._fetch_schema_data("dummy_cursor", "my_db") + schemas._fetch_schema_data("dummy_cursor", time.time(), "my_db") mocked_submit.called_once() From ca243ee2653390b65d28e77c1e52893a56bed362 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 12 Jun 2024 08:03:22 +0000 Subject: [PATCH 110/132] Applied linter --- sqlserver/datadog_checks/sqlserver/schemas.py | 60 +++++-------------- sqlserver/tests/test_unit.py | 29 --------- 2 files changed, 15 insertions(+), 74 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 87b097357189c..27f09e2fc7786 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -98,11 +98,7 @@ def submit(self): event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000} for db, schemas_by_id in self.db_to_schemas.items(): db_info = {} - if db not in self.db_info: - self._log.error("Couldn't find database info for {}".format(db)) - db_info["name"] = db - else: - db_info = self.db_info[db] + db_info = self.db_info[db] event["metadata"] = event["metadata"] + [{**(db_info), "schemas": list(schemas_by_id.values())}] json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting the following payload for schema collection: {}".format(self.truncate(json_event))) @@ -370,21 +366,13 @@ def _populate_with_columns_data(self, table_ids, name_to_id, id_to_table_data, s for row in rows: table_name = str(row.get("table_name")) table_id = name_to_id.get(table_name) - if table_id is not None: - row.pop("table_name", None) - if "nullable" in row: - if row["nullable"].lower() == "no" or row["nullable"].lower() == "false": - row["nullable"] = False - else: - row["nullable"] = True - if table_id in id_to_table_data: - id_to_table_data.get(table_id)["columns"] = id_to_table_data.get(table_id).get("columns", []) + [ - row - ] + row.pop("table_name", None) + if "nullable" in row: + if row["nullable"].lower() == "no" or row["nullable"].lower() == "false": + 
row["nullable"] = False else: - self._log.debug("Columns found for an unkown table with the object_id: {}".format(table_id)) - else: - self._log.debug("Couldn't find id of a table: {}".format(table_name)) + row["nullable"] = True + id_to_table_data.get(table_id)["columns"] = id_to_table_data.get(table_id).get("columns", []) + [row] return len(data) @tracked_method(agent_check_getter=agent_check_getter) @@ -392,41 +380,23 @@ def _populate_with_partitions_data(self, table_ids, table_id_to_table_data, curs rows = execute_query(PARTITIONS_QUERY.format(table_ids), cursor) for row in rows: table_id = row.pop("id", None) - if table_id is not None: - table_id_str = str(table_id) - if table_id_str in table_id_to_table_data: - table_id_to_table_data[table_id_str]["partitions"] = row - else: - self._log.debug("Partition found for an unkown table with the object_id: {}".format(table_id_str)) - else: - self._log.debug("Return rows of [{}] query should have id column".format(PARTITIONS_QUERY)) + table_id_str = str(table_id) + table_id_to_table_data[table_id_str]["partitions"] = row @tracked_method(agent_check_getter=agent_check_getter) def _populate_with_index_data(self, table_ids, table_id_to_table_data, cursor): rows = execute_query(INDEX_QUERY.format(table_ids), cursor) for row in rows: table_id = row.pop("id", None) - if table_id is not None: - table_id_str = str(table_id) - if table_id_str in table_id_to_table_data: - table_id_to_table_data[table_id_str].setdefault("indexes", []) - table_id_to_table_data[table_id_str]["indexes"].append(row) - else: - self._log.debug("Index found for an unkown table with the object_id: {}".format(table_id_str)) - else: - self._log.debug("Return rows of [{}] query should have id column".format(INDEX_QUERY)) + table_id_str = str(table_id) + table_id_to_table_data[table_id_str].setdefault("indexes", []) + table_id_to_table_data[table_id_str]["indexes"].append(row) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _populate_with_foreign_keys_data(self, table_ids, table_id_to_table_data, cursor): rows = execute_query(FOREIGN_KEY_QUERY.format(table_ids), cursor) for row in rows: table_id = row.pop("id", None) - if id is not None: - table_id_str = str(table_id) - if table_id_str in table_id_to_table_data: - table_id_to_table_data.get(table_id_str).setdefault("foreign_keys", []) - table_id_to_table_data.get(table_id_str)["foreign_keys"].append(row) - else: - self._log.debug("Foreign key found for an unkown table with the object_id: {}".format(table_id_str)) - else: - self._log.debug("Return rows of [{}] query should have id column".format(FOREIGN_KEY_QUERY)) + table_id_str = str(table_id) + table_id_to_table_data.get(table_id_str).setdefault("foreign_keys", []) + table_id_to_table_data.get(table_id_str)["foreign_keys"].append(row) diff --git a/sqlserver/tests/test_unit.py b/sqlserver/tests/test_unit.py index e0089884ff86e..35776ab816025 100644 --- a/sqlserver/tests/test_unit.py +++ b/sqlserver/tests/test_unit.py @@ -808,35 +808,6 @@ def test_submit_data(): assert len(difference) == 0 -def test_submit_data_for_db_without_info(): - - dataSubmitter, submitted_data = set_up_submitter_unit_test() - - schema1 = {"id": "1"} - dataSubmitter.store("test_db1", schema1, [1, 2], 5) - - dataSubmitter.submit() - expected_data = { - "host": "some", - "agent_version": 0, - "dbms": "sqlserver", - "kind": "sqlserver_databases", - "collection_interval": 1200, - "dbms_version": "some", - "tags": "some", - "cloud_metadata": "some", - "metadata": [ - {"name": 
"test_db1", "schemas": [{"id": "1", "tables": [1, 2]}]}, - ], - "timestamp": 1.1, - } - - difference = DeepDiff( - json.loads(submitted_data[0]), expected_data, exclude_paths="root['timestamp']", ignore_order=True - ) - assert len(difference) == 0 - - def test_fetch_throws(instance_docker): check = SQLServer(CHECK_NAME, {}, [instance_docker]) schemas = Schemas(check, check._config) From f905214cac2edaf4b53360b4c0e7d658557b4e25 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 12 Jun 2024 14:19:54 +0000 Subject: [PATCH 111/132] Changed truncation msg --- sqlserver/datadog_checks/sqlserver/schemas.py | 12 ++++-------- sqlserver/tests/test_metadata.py | 8 ++------ 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 27f09e2fc7786..c89039818a72b 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -76,16 +76,12 @@ def truncate(self, json_event): return json_event def send_truncated_msg(self, db_name, time_spent): - event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000} - db_info = {} - if db_name not in self.db_to_schemas: - db_info = self.db_info[db_name] - else: - db_info = {"name": db_name} - db_info["truncated"] = "Truncated after fetching {} columns, elapsed time is {}s, database is {}".format( + event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000, "collection_errors" : {"error" : "truncated", "message" : ""}} + db_info = self.db_info[db_name] + event["metadata"] = [{**(db_info)}] + event["collection_errors"]["message"] = "Truncated after fetching {} columns, elapsed time is {}s, database is {}".format( self._total_columns_sent, time_spent, db_name ) - event["metadata"] = [{**(db_info)}] json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting truncation of schema collection: {}".format(self.truncate(json_event))) self._submit_to_agent_queue(json_event) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index b38f6bd7ccc60..55871fb39aa2e 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -377,11 +377,7 @@ def test_schemas_collection_truncated(aggregator, dd_run_check, dbm_instance): dbm_metadata = aggregator.get_event_platform_events("dbm-metadata") found = False for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): - for database_metadata in schema_event['metadata']: - if ( - "truncated" in database_metadata - and database_metadata['name'] == 'datadog_test_schemas' - and re.fullmatch(expected_pattern, database_metadata["truncated"]) - ): + if "collection_errors" in schema_event: + if schema_event["collection_errors"]["error"] == "truncated" and re.fullmatch(expected_pattern, schema_event["collection_errors"]["message"]): found = True assert found From f1be7552a078d50723333f5dc165f8330cce7dd2 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 12 Jun 2024 14:21:39 +0000 Subject: [PATCH 112/132] applied linter --- sqlserver/datadog_checks/sqlserver/schemas.py | 13 ++++++++++--- sqlserver/tests/test_metadata.py | 4 +++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index c89039818a72b..0e8486f62bac7 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -76,11 +76,18 
@@ def truncate(self, json_event): return json_event def send_truncated_msg(self, db_name, time_spent): - event = {**self._base_event, "metadata": [], "timestamp": time.time() * 1000, "collection_errors" : {"error" : "truncated", "message" : ""}} + event = { + **self._base_event, + "metadata": [], + "timestamp": time.time() * 1000, + "collection_errors": {"error": "truncated", "message": ""}, + } db_info = self.db_info[db_name] event["metadata"] = [{**(db_info)}] - event["collection_errors"]["message"] = "Truncated after fetching {} columns, elapsed time is {}s, database is {}".format( - self._total_columns_sent, time_spent, db_name + event["collection_errors"]["message"] = ( + "Truncated after fetching {} columns, elapsed time is {}s, database is {}".format( + self._total_columns_sent, time_spent, db_name + ) ) json_event = json.dumps(event, default=default_json_event_encoding) self._log.debug("Reporting truncation of schema collection: {}".format(self.truncate(json_event))) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 55871fb39aa2e..f5561a0233944 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -378,6 +378,8 @@ def test_schemas_collection_truncated(aggregator, dd_run_check, dbm_instance): found = False for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): if "collection_errors" in schema_event: - if schema_event["collection_errors"]["error"] == "truncated" and re.fullmatch(expected_pattern, schema_event["collection_errors"]["message"]): + if schema_event["collection_errors"]["error"] == "truncated" and re.fullmatch( + expected_pattern, schema_event["collection_errors"]["message"] + ): found = True assert found From 72c61f834ae846016c15c6bc45e39bb000bf3d0e Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Thu, 13 Jun 2024 16:56:18 +0000 Subject: [PATCH 113/132] Require base package version --- sqlserver/datadog_checks/sqlserver/schemas.py | 4 ++-- sqlserver/pyproject.toml | 2 +- sqlserver/tests/test_metadata.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 0e8486f62bac7..064e996e85574 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -80,11 +80,11 @@ def send_truncated_msg(self, db_name, time_spent): **self._base_event, "metadata": [], "timestamp": time.time() * 1000, - "collection_errors": {"error": "truncated", "message": ""}, + "collection_errors": [{"error": "truncated", "message": ""}], } db_info = self.db_info[db_name] event["metadata"] = [{**(db_info)}] - event["collection_errors"]["message"] = ( + event["collection_errors"][0]["message"] = ( "Truncated after fetching {} columns, elapsed time is {}s, database is {}".format( self._total_columns_sent, time_spent, db_name ) diff --git a/sqlserver/pyproject.toml b/sqlserver/pyproject.toml index 1d04d0124de61..dccce892d132f 100644 --- a/sqlserver/pyproject.toml +++ b/sqlserver/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ "Private :: Do Not Upload", ] dependencies = [ - "datadog-checks-base>=36.5.0", + "datadog-checks-base>=36.8.0", ] dynamic = [ "version", diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index f5561a0233944..722244ab35d9d 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -378,8 +378,8 @@ def test_schemas_collection_truncated(aggregator, dd_run_check, 
dbm_instance): found = False for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): if "collection_errors" in schema_event: - if schema_event["collection_errors"]["error"] == "truncated" and re.fullmatch( - expected_pattern, schema_event["collection_errors"]["message"] + if schema_event["collection_errors"][0]["error"] == "truncated" and re.fullmatch( + expected_pattern, schema_event["collection_errors"][0]["message"] ): found = True assert found From 772a90fbf237a2d9a1140a582acb46083d5eb722 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 18 Jun 2024 10:13:50 +0000 Subject: [PATCH 114/132] Removed deepdiff from ddev hatch --- ddev/hatch.toml | 1 - sqlserver/datadog_checks/sqlserver/schemas.py | 2 +- sqlserver/tests/test_metadata.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ddev/hatch.toml b/ddev/hatch.toml index b39663cdf11e4..2f299a9ceb09c 100644 --- a/ddev/hatch.toml +++ b/ddev/hatch.toml @@ -10,7 +10,6 @@ e2e-env = false dependencies = [ "pyyaml", "vcrpy", - "deepdiff", ] # TODO: remove this when the old CLI is gone pre-install-commands = [ diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 064e996e85574..b0b1e6397f30d 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -80,7 +80,7 @@ def send_truncated_msg(self, db_name, time_spent): **self._base_event, "metadata": [], "timestamp": time.time() * 1000, - "collection_errors": [{"error": "truncated", "message": ""}], + "collection_errors": [{"error_type": "truncated", "message": ""}], } db_info = self.db_info[db_name] event["metadata"] = [{**(db_info)}] diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 722244ab35d9d..6378fc11c935d 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -378,7 +378,7 @@ def test_schemas_collection_truncated(aggregator, dd_run_check, dbm_instance): found = False for schema_event in (e for e in dbm_metadata if e['kind'] == 'sqlserver_databases'): if "collection_errors" in schema_event: - if schema_event["collection_errors"][0]["error"] == "truncated" and re.fullmatch( + if schema_event["collection_errors"][0]["error_type"] == "truncated" and re.fullmatch( expected_pattern, schema_event["collection_errors"][0]["message"] ): found = True From d88bc65db7f5e68b3b2393aa6d6caa2f9a09b1c0 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 18 Jun 2024 11:53:52 +0000 Subject: [PATCH 115/132] resolved errors after merge --- sqlserver/tests/compose/setup.sql | 2 +- sqlserver/tests/test_metadata.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 23fa756c303c4..3aaf14191bbf3 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -78,7 +78,7 @@ GO USE [datadog_test-1]; -- This table is pronounced "things" except we've replaced "th" with the greek lower case "theta" to ensure we -- correctly support unicode throughout the integration. 
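-- A small verification sketch for the DEFAULT 0 change just below: it surfaces the same column
-- attributes (name, data type, default, nullability) that the schema collector reports.
SELECT COLUMN_NAME, DATA_TYPE, COLUMN_DEFAULT, IS_NULLABLE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = N'ϑings';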
-CREATE TABLE [datadog_test-1].dbo.ϑings (id int, name varchar(255)); +CREATE TABLE [datadog_test-1].dbo.ϑings (id int DEFAULT 0, name varchar(255)); INSERT INTO [datadog_test-1].dbo.ϑings VALUES (1, 'foo'), (2, 'bar'); CREATE USER bob FOR LOGIN bob; CREATE USER fred FOR LOGIN fred; diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 6378fc11c935d..6bb48de2f9d12 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -96,10 +96,10 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): def test_collect_schemas(aggregator, dd_run_check, dbm_instance): - databases_to_find = ['datadog_test_schemas', 'datadog_test'] + databases_to_find = ['datadog_test_schemas', 'datadog_test-1'] exp_datadog_test = { 'id': '6', - 'name': 'datadog_test', + 'name': 'datadog_test-1', "collation": "SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', 'schemas': [ @@ -322,10 +322,10 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): } ], } - expected_data_for_db = {'datadog_test': exp_datadog_test, 'datadog_test_schemas': exp_datadog_test_schemas} + expected_data_for_db = {'datadog_test-1': exp_datadog_test, 'datadog_test_schemas': exp_datadog_test_schemas} dbm_instance['database_autodiscovery'] = True - dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test'] + dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test-1'] dbm_instance['dbm'] = True dbm_instance['schemas_collection'] = {"enabled": True} From ca28cc627b4ad89d06f587a689de672999a867ae Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 18 Jun 2024 11:59:18 +0000 Subject: [PATCH 116/132] remove modification from base --- .../datadog_checks/base/utils/db/utils.py | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py index 56f4a388b8368..1441846962a33 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/utils.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py @@ -117,13 +117,20 @@ def __init__(self, rate_limit_s): self.period_s = 1.0 / self.rate_limit_s if self.rate_limit_s > 0 else 0 self.last_event = 0 - def sleep(self): + def update_last_time_and_sleep(self): """ Sleeps long enough to enforce the rate limit """ elapsed_s = time.time() - self.last_event sleep_amount = max(self.period_s - elapsed_s, 0) time.sleep(sleep_amount) + self.update_last_time() + + def shall_execute(self): + elapsed_s = time.time() - self.last_event + return elapsed_s >= self.period_s + + def update_last_time(self): self.last_event = time.time() @@ -294,7 +301,7 @@ def run_job_loop(self, tags): self._last_check_run = time.time() if self._run_sync or is_affirmative(os.environ.get('DBM_THREADED_JOB_RUN_SYNC', "false")): self._log.debug("Running threaded job synchronously. 
job=%s", self._job_name) - self._run_job_rate_limited() + self._run_sync_job_rate_limited() elif self._job_loop_future is None or not self._job_loop_future.running(): self._job_loop_future = DBMAsyncJob.executor.submit(self._job_loop) else: @@ -358,7 +365,7 @@ def _job_loop(self): def _set_rate_limit(self, rate_limit): if self._rate_limiter.rate_limit_s != rate_limit: self._rate_limiter = ConstantRateLimiter(rate_limit) - + def _run_sync_job_rate_limited(self): if self._rate_limiter.shall_execute(): try: @@ -369,9 +376,15 @@ def _run_sync_job_rate_limited(self): self._rate_limiter.update_last_time() def _run_job_rate_limited(self): - self._run_job_traced() - if not self._cancel_event.isSet(): - self._rate_limiter.sleep() + try: + self._run_job_traced() + except: + raise + finally: + if not self._cancel_event.isSet(): + self._rate_limiter.update_last_time_and_sleep() + else: + self._rate_limiter.update_last_time() @_traced_dbm_async_job_method def _run_job_traced(self): From 260c6ce448d590835ddbba80b1c83cd6df9874ec Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 18 Jun 2024 12:05:34 +0000 Subject: [PATCH 117/132] removed white space --- datadog_checks_base/datadog_checks/base/utils/db/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py index 1441846962a33..5b4dbf5709df4 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/utils.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py @@ -365,7 +365,7 @@ def _job_loop(self): def _set_rate_limit(self, rate_limit): if self._rate_limiter.rate_limit_s != rate_limit: self._rate_limiter = ConstantRateLimiter(rate_limit) - + def _run_sync_job_rate_limited(self): if self._rate_limiter.shall_execute(): try: From 893875fc8eb872b522a8c8eb084e7b3fc12c6194 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Tue, 18 Jun 2024 12:06:53 +0000 Subject: [PATCH 118/132] removed white space again --- datadog_checks_base/datadog_checks/base/utils/db/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py index 5b4dbf5709df4..2a2d081b9de76 100644 --- a/datadog_checks_base/datadog_checks/base/utils/db/utils.py +++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py @@ -365,7 +365,7 @@ def _job_loop(self): def _set_rate_limit(self, rate_limit): if self._rate_limiter.rate_limit_s != rate_limit: self._rate_limiter = ConstantRateLimiter(rate_limit) - + def _run_sync_job_rate_limited(self): if self._rate_limiter.shall_execute(): try: From 2fae9279b3f14c762263e22b4eafe2ad19211d78 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 11:37:27 +0000 Subject: [PATCH 119/132] synced example --- .../sqlserver/data/conf.yaml.example | 44 +++++++++---------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example index 82f994d824c43..97199c413facd 100644 --- a/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example +++ b/sqlserver/datadog_checks/sqlserver/data/conf.yaml.example @@ -361,29 +361,6 @@ instances: # # collection_interval: 10 - ## Configure collection of database schemas - # - ## schemas_collection - - ## @param enabled - boolean - optional - default: true - ## Enable collection of schemas. Requires `dbm: true`. 
If `database_autodiscovery` is not enabled, - ## data is collected only for the database configured with `database` parameter. - # - # enabled: false - - ## @param collection_interval - number - optional - default: 600 - ## Set the database schema collection interval (in seconds). - ## If a non-default value is chosen, then that exact same value must be used for *every* check instance. - ## Running different instances with different collection intervals is not supported. - # - # collection_interval: 600 - - ## @param max_execution_time - number - optional - default: 10 - ## Set the maximum time for schema collection (in seconds). - ## Capped by `collection_interval`. - # - # max_execution_time: 10 - ## @param stored_procedure_characters_limit - integer - optional - default: 500 ## Limit the number of characters of the text of a stored procedure that is collected. ## The characters limit is applicable to both query metrics and query samples. @@ -682,6 +659,27 @@ instances: # # ignore_missing_database: false + ## Configure collection of schemas. If `database_autodiscovery` is not enabled, data is collected + ## only for the database configured with `database` parameter. + # + # schemas_collection: + + ## @param enabled - boolean - optional - default: false + ## Enable schema collection. Requires `dbm: true`. Defaults to false. + # + # enabled: false + + ## @param collection_interval - number - optional - default: 600 + ## Set the database schema collection interval (in seconds). Defaults to 600 seconds. + # + # collection_interval: 600 + + ## @param max_execution_time - number - optional - default: 10 + ## Set the maximum time for schema collection (in seconds). Defaults to 10 seconds. + ## Capped by `schemas_collection.collection_interval` + # + # max_execution_time: 10 + ## @param tags - list of strings - optional ## A list of tags to attach to every metric and service check emitted by this instance. ## From a783db6023c7fe47b6eff6550d681bf251e075eb Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 11:43:09 +0000 Subject: [PATCH 120/132] Added a license --- sqlserver/datadog_checks/sqlserver/schemas.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index b0b1e6397f30d..a26e811300e88 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -1,3 +1,7 @@ +# (C) Datadog, Inc. 2018-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + try: import datadog_agent except ImportError: From 12fe2fcc6f49ab2bd464332219cd9b16228f7edd Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 11:56:14 +0000 Subject: [PATCH 121/132] Put correct date in license --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index a26e811300e88..b0624663e007d 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -1,4 +1,4 @@ -# (C) Datadog, Inc. 2018-present +# (C) Datadog, Inc. 
2024-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) From ae640a0a38f1ded6ed5e1c2c27e9365bcaba1d5a Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 12:16:58 +0000 Subject: [PATCH 122/132] applied model sync --- .../sqlserver/config_models/instance.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/config_models/instance.py b/sqlserver/datadog_checks/sqlserver/config_models/instance.py index bdd5621c46a57..603d9a5da3955 100644 --- a/sqlserver/datadog_checks/sqlserver/config_models/instance.py +++ b/sqlserver/datadog_checks/sqlserver/config_models/instance.py @@ -51,9 +51,7 @@ class CustomQuery(BaseModel): arbitrary_types_allowed=True, frozen=True, ) - collection_interval: Optional[int] = None columns: Optional[tuple[MappingProxyType[str, Any], ...]] = None - metric_prefix: Optional[str] = None query: Optional[str] = None tags: Optional[tuple[str, ...]] = None @@ -139,6 +137,16 @@ class QueryMetrics(BaseModel): samples_per_hour_per_query: Optional[int] = None +class SchemasCollection(BaseModel): + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True, + ) + collection_interval: Optional[float] = None + enabled: Optional[bool] = None + max_execution_time: Optional[float] = None + + class InstanceConfig(BaseModel): model_config = ConfigDict( validate_default=True, @@ -199,6 +207,7 @@ class InstanceConfig(BaseModel): query_activity: Optional[QueryActivity] = None query_metrics: Optional[QueryMetrics] = None reported_hostname: Optional[str] = None + schemas_collection: Optional[SchemasCollection] = None server_version: Optional[str] = None service: Optional[str] = None stored_procedure: Optional[str] = None From 73561652a0105740d1e970e389116eb4bbd47adf Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 12:37:57 +0000 Subject: [PATCH 123/132] create a dedicated test db for schemas --- sqlserver/tests/compose/setup.sql | 15 ++++++++++++++- sqlserver/tests/test_metadata.py | 11 +++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/sqlserver/tests/compose/setup.sql b/sqlserver/tests/compose/setup.sql index 3aaf14191bbf3..aac5c217160f2 100644 --- a/sqlserver/tests/compose/setup.sql +++ b/sqlserver/tests/compose/setup.sql @@ -15,7 +15,7 @@ CREATE USER fred FOR LOGIN fred; GRANT CONNECT ANY DATABASE to fred; GO - +-- Create test database for integration schema tests CREATE DATABASE datadog_test_schemas; GO USE datadog_test_schemas; @@ -71,6 +71,19 @@ CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( ); GO +-- Create second test database for integration schema tests +CREATE DATABASE datadog_test_schemas_second; +GO +USE datadog_test_schemas_second; +-- This table is pronounced "things" except we've replaced "th" with the greek lower case "theta" to ensure we +-- correctly support unicode throughout the integration. 
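-- As a sanity check on the new database, the schema enumeration performed by the check
-- (its SCHEMA_QUERY) boils down to roughly the following; dbo is expected in the result.
SELECT s.name, s.schema_id, dp.name AS owner_name
FROM sys.schemas s
JOIN sys.database_principals dp ON dp.principal_id = s.principal_id;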
+CREATE TABLE datadog_test_schemas_second.dbo.ϑings (id int DEFAULT 0, name varchar(255)); +INSERT INTO datadog_test_schemas_second.dbo.ϑings VALUES (1, 'foo'), (2, 'bar'); +CREATE USER bob FOR LOGIN bob; +CREATE USER fred FOR LOGIN fred; +CREATE CLUSTERED INDEX thingsindex ON datadog_test_schemas_second.dbo.ϑings (name); +GO + -- Create test database for integration tests -- only bob and fred have read/write access to this database CREATE DATABASE [datadog_test-1]; diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 6bb48de2f9d12..83a18489ea718 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -93,13 +93,11 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): assert event['kind'] == "sqlserver_configs" assert len(event["metadata"]) > 0 - def test_collect_schemas(aggregator, dd_run_check, dbm_instance): - - databases_to_find = ['datadog_test_schemas', 'datadog_test-1'] + databases_to_find = ['datadog_test_schemas', 'datadog_test_schemas_second'] exp_datadog_test = { 'id': '6', - 'name': 'datadog_test-1', + 'name': 'datadog_test_schemas_second', "collation": "SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', 'schemas': [ @@ -322,10 +320,10 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): } ], } - expected_data_for_db = {'datadog_test-1': exp_datadog_test, 'datadog_test_schemas': exp_datadog_test_schemas} + expected_data_for_db = {'datadog_test_schemas_second': exp_datadog_test, 'datadog_test_schemas': exp_datadog_test_schemas} dbm_instance['database_autodiscovery'] = True - dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test-1'] + dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test_schemas_second'] dbm_instance['dbm'] = True dbm_instance['schemas_collection'] = {"enabled": True} @@ -352,6 +350,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): actual_payloads[db_name]['schemas'] = actual_payloads[db_name]['schemas'] + database_metadata[0]['schemas'] else: actual_payloads[db_name] = database_metadata[0] + assert len(actual_payloads) == len(expected_data_for_db) for db_name, actual_payload in actual_payloads.items(): From 026dd0fd9f86fc2ae523c270867baef73603e029 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 12:51:14 +0000 Subject: [PATCH 124/132] applied linter --- sqlserver/tests/test_metadata.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 83a18489ea718..fdc69712f9286 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -93,6 +93,7 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): assert event['kind'] == "sqlserver_configs" assert len(event["metadata"]) > 0 + def test_collect_schemas(aggregator, dd_run_check, dbm_instance): databases_to_find = ['datadog_test_schemas', 'datadog_test_schemas_second'] exp_datadog_test = { @@ -320,7 +321,10 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): } ], } - expected_data_for_db = {'datadog_test_schemas_second': exp_datadog_test, 'datadog_test_schemas': exp_datadog_test_schemas} + expected_data_for_db = { + 'datadog_test_schemas_second': exp_datadog_test, + 'datadog_test_schemas': exp_datadog_test_schemas, + } dbm_instance['database_autodiscovery'] = True dbm_instance['autodiscovery_include'] = ['datadog_test_schemas', 'datadog_test_schemas_second'] @@ -350,7 
+354,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): actual_payloads[db_name]['schemas'] = actual_payloads[db_name]['schemas'] + database_metadata[0]['schemas'] else: actual_payloads[db_name] = database_metadata[0] - + assert len(actual_payloads) == len(expected_data_for_db) for db_name, actual_payload in actual_payloads.items(): From 3816e00a618880ff68baa8f7a7c4207ce8c65ece Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 13:35:53 +0000 Subject: [PATCH 125/132] added test schema db to all envs --- .../tests/compose-ha/sql/aoag_primary.sql | 69 +++++++++++++++++++ .../setup.sql | 69 +++++++++++++++++++ .../tests/compose-high-cardinality/setup.sql | 69 +++++++++++++++++++ sqlserver/tests/compose-windows/setup.sql | 69 +++++++++++++++++++ sqlserver/tests/test_metadata.py | 6 +- 5 files changed, 280 insertions(+), 2 deletions(-) diff --git a/sqlserver/tests/compose-ha/sql/aoag_primary.sql b/sqlserver/tests/compose-ha/sql/aoag_primary.sql index 9ed17b021f6b6..07c79b03b6aa5 100644 --- a/sqlserver/tests/compose-ha/sql/aoag_primary.sql +++ b/sqlserver/tests/compose-ha/sql/aoag_primary.sql @@ -36,6 +36,75 @@ GO ALTER DATABASE restricted_db SET RESTRICTED_USER GO +-- Create test database for integration schema tests +CREATE DATABASE datadog_test_schemas; +GO +USE datadog_test_schemas; +GO + +CREATE SCHEMA test_schema; +GO + +-- Create the partition function +CREATE PARTITION FUNCTION CityPartitionFunction (INT) +AS RANGE LEFT FOR VALUES (100, 200, 300); -- Define your partition boundaries here + +-- Create the partition scheme +CREATE PARTITION SCHEME CityPartitionScheme +AS PARTITION CityPartitionFunction ALL TO ([PRIMARY]); -- Assign partitions to filegroups + +-- Create the partitioned table +CREATE TABLE datadog_test_schemas.test_schema.cities ( + id INT NOT NULL DEFAULT 0, + name VARCHAR(255), + population INT NOT NULL DEFAULT 0, + CONSTRAINT PK_Cities PRIMARY KEY (id) +) ON CityPartitionScheme(id); -- Assign the partition scheme to the table + +-- Create indexes +CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +CREATE INDEX single_column_index ON datadog_test_schemas.test_schema.cities (population); + +INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey', 100), (2, 'bar', 200); +GO + +-- Create table with a foreign key +CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); +GO +ALTER TABLE datadog_test_schemas.test_schema.landmarks ADD CONSTRAINT FK_CityId FOREIGN KEY (city_id) REFERENCES datadog_test_schemas.test_schema.cities(id); +GO + +-- Create table with unique constraint +CREATE TABLE datadog_test_schemas.test_schema.Restaurants ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Cuisine VARCHAR(100), + CONSTRAINT UC_RestaurantNameDistrict UNIQUE (RestaurantName, District) +); +GO + +-- Create table with a foreign key on two columns +CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Review VARCHAR(MAX), + CONSTRAINT FK_RestaurantNameDistrict FOREIGN KEY (RestaurantName, District) REFERENCES datadog_test_schemas.test_schema.Restaurants(RestaurantName, District) +); +GO + +-- Create second test database for integration schema tests +CREATE DATABASE datadog_test_schemas_second; +GO +USE datadog_test_schemas_second; +-- This table is pronounced "things" except we've replaced "th" with the greek lower case "theta" to ensure we +-- correctly support unicode throughout the 
integration. +CREATE TABLE datadog_test_schemas_second.dbo.ϑings (id int DEFAULT 0, name varchar(255)); +INSERT INTO datadog_test_schemas_second.dbo.ϑings VALUES (1, 'foo'), (2, 'bar'); +CREATE USER bob FOR LOGIN bob; +CREATE USER fred FOR LOGIN fred; +CREATE CLUSTERED INDEX thingsindex ON datadog_test_schemas_second.dbo.ϑings (name); +GO + -- Create test database for integration tests -- only bob and fred have read/write access to this database USE [datadog_test-1]; diff --git a/sqlserver/tests/compose-high-cardinality-windows/setup.sql b/sqlserver/tests/compose-high-cardinality-windows/setup.sql index fd4c0efa3d4cf..f33ceff2df42e 100644 --- a/sqlserver/tests/compose-high-cardinality-windows/setup.sql +++ b/sqlserver/tests/compose-high-cardinality-windows/setup.sql @@ -30,6 +30,75 @@ GO CREATE USER datadog FOR LOGIN datadog; GO +-- Create test database for integration schema tests +CREATE DATABASE datadog_test_schemas; +GO +USE datadog_test_schemas; +GO + +CREATE SCHEMA test_schema; +GO + +-- Create the partition function +CREATE PARTITION FUNCTION CityPartitionFunction (INT) +AS RANGE LEFT FOR VALUES (100, 200, 300); -- Define your partition boundaries here + +-- Create the partition scheme +CREATE PARTITION SCHEME CityPartitionScheme +AS PARTITION CityPartitionFunction ALL TO ([PRIMARY]); -- Assign partitions to filegroups + +-- Create the partitioned table +CREATE TABLE datadog_test_schemas.test_schema.cities ( + id INT NOT NULL DEFAULT 0, + name VARCHAR(255), + population INT NOT NULL DEFAULT 0, + CONSTRAINT PK_Cities PRIMARY KEY (id) +) ON CityPartitionScheme(id); -- Assign the partition scheme to the table + +-- Create indexes +CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +CREATE INDEX single_column_index ON datadog_test_schemas.test_schema.cities (population); + +INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey', 100), (2, 'bar', 200); +GO + +-- Create table with a foreign key +CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); +GO +ALTER TABLE datadog_test_schemas.test_schema.landmarks ADD CONSTRAINT FK_CityId FOREIGN KEY (city_id) REFERENCES datadog_test_schemas.test_schema.cities(id); +GO + +-- Create table with unique constraint +CREATE TABLE datadog_test_schemas.test_schema.Restaurants ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Cuisine VARCHAR(100), + CONSTRAINT UC_RestaurantNameDistrict UNIQUE (RestaurantName, District) +); +GO + +-- Create table with a foreign key on two columns +CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Review VARCHAR(MAX), + CONSTRAINT FK_RestaurantNameDistrict FOREIGN KEY (RestaurantName, District) REFERENCES datadog_test_schemas.test_schema.Restaurants(RestaurantName, District) +); +GO + +-- Create second test database for integration schema tests +CREATE DATABASE datadog_test_schemas_second; +GO +USE datadog_test_schemas_second; +-- This table is pronounced "things" except we've replaced "th" with the greek lower case "theta" to ensure we +-- correctly support unicode throughout the integration. 
+CREATE TABLE datadog_test_schemas_second.dbo.ϑings (id int DEFAULT 0, name varchar(255)); +INSERT INTO datadog_test_schemas_second.dbo.ϑings VALUES (1, 'foo'), (2, 'bar'); +CREATE USER bob FOR LOGIN bob; +CREATE USER fred FOR LOGIN fred; +CREATE CLUSTERED INDEX thingsindex ON datadog_test_schemas_second.dbo.ϑings (name); +GO + -- Create test database for integration tests -- only bob and fred have read/write access to this database -- the datadog user has only connect access but can't read any objects diff --git a/sqlserver/tests/compose-high-cardinality/setup.sql b/sqlserver/tests/compose-high-cardinality/setup.sql index f8c2cc506500b..839fd7c690679 100644 --- a/sqlserver/tests/compose-high-cardinality/setup.sql +++ b/sqlserver/tests/compose-high-cardinality/setup.sql @@ -123,6 +123,75 @@ GRANT EXECUTE on nullCharTest to bob; GRANT EXECUTE on nullCharTest to fred; GO +-- Create test database for integration schema tests +CREATE DATABASE datadog_test_schemas; +GO +USE datadog_test_schemas; +GO + +CREATE SCHEMA test_schema; +GO + +-- Create the partition function +CREATE PARTITION FUNCTION CityPartitionFunction (INT) +AS RANGE LEFT FOR VALUES (100, 200, 300); -- Define your partition boundaries here + +-- Create the partition scheme +CREATE PARTITION SCHEME CityPartitionScheme +AS PARTITION CityPartitionFunction ALL TO ([PRIMARY]); -- Assign partitions to filegroups + +-- Create the partitioned table +CREATE TABLE datadog_test_schemas.test_schema.cities ( + id INT NOT NULL DEFAULT 0, + name VARCHAR(255), + population INT NOT NULL DEFAULT 0, + CONSTRAINT PK_Cities PRIMARY KEY (id) +) ON CityPartitionScheme(id); -- Assign the partition scheme to the table + +-- Create indexes +CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +CREATE INDEX single_column_index ON datadog_test_schemas.test_schema.cities (population); + +INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey', 100), (2, 'bar', 200); +GO + +-- Create table with a foreign key +CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); +GO +ALTER TABLE datadog_test_schemas.test_schema.landmarks ADD CONSTRAINT FK_CityId FOREIGN KEY (city_id) REFERENCES datadog_test_schemas.test_schema.cities(id); +GO + +-- Create table with unique constraint +CREATE TABLE datadog_test_schemas.test_schema.Restaurants ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Cuisine VARCHAR(100), + CONSTRAINT UC_RestaurantNameDistrict UNIQUE (RestaurantName, District) +); +GO + +-- Create table with a foreign key on two columns +CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Review VARCHAR(MAX), + CONSTRAINT FK_RestaurantNameDistrict FOREIGN KEY (RestaurantName, District) REFERENCES datadog_test_schemas.test_schema.Restaurants(RestaurantName, District) +); +GO + +-- Create second test database for integration schema tests +CREATE DATABASE datadog_test_schemas_second; +GO +USE datadog_test_schemas_second; +-- This table is pronounced "things" except we've replaced "th" with the greek lower case "theta" to ensure we +-- correctly support unicode throughout the integration. 
+CREATE TABLE datadog_test_schemas_second.dbo.ϑings (id int DEFAULT 0, name varchar(255)); +INSERT INTO datadog_test_schemas_second.dbo.ϑings VALUES (1, 'foo'), (2, 'bar'); +CREATE USER bob FOR LOGIN bob; +CREATE USER fred FOR LOGIN fred; +CREATE CLUSTERED INDEX thingsindex ON datadog_test_schemas_second.dbo.ϑings (name); +GO + -- Create test database for integration tests. -- Only bob and fred have read/write access to this database. CREATE DATABASE [datadog_test-1]; diff --git a/sqlserver/tests/compose-windows/setup.sql b/sqlserver/tests/compose-windows/setup.sql index 3df6386c8b4f2..d0f7c7cf5409d 100644 --- a/sqlserver/tests/compose-windows/setup.sql +++ b/sqlserver/tests/compose-windows/setup.sql @@ -30,6 +30,75 @@ GO CREATE USER datadog FOR LOGIN datadog; GO +-- Create test database for integration schema tests +CREATE DATABASE datadog_test_schemas; +GO +USE datadog_test_schemas; +GO + +CREATE SCHEMA test_schema; +GO + +-- Create the partition function +CREATE PARTITION FUNCTION CityPartitionFunction (INT) +AS RANGE LEFT FOR VALUES (100, 200, 300); -- Define your partition boundaries here + +-- Create the partition scheme +CREATE PARTITION SCHEME CityPartitionScheme +AS PARTITION CityPartitionFunction ALL TO ([PRIMARY]); -- Assign partitions to filegroups + +-- Create the partitioned table +CREATE TABLE datadog_test_schemas.test_schema.cities ( + id INT NOT NULL DEFAULT 0, + name VARCHAR(255), + population INT NOT NULL DEFAULT 0, + CONSTRAINT PK_Cities PRIMARY KEY (id) +) ON CityPartitionScheme(id); -- Assign the partition scheme to the table + +-- Create indexes +CREATE INDEX two_columns_index ON datadog_test_schemas.test_schema.cities (id, name); +CREATE INDEX single_column_index ON datadog_test_schemas.test_schema.cities (population); + +INSERT INTO datadog_test_schemas.test_schema.cities VALUES (1, 'yey', 100), (2, 'bar', 200); +GO + +-- Create table with a foreign key +CREATE TABLE datadog_test_schemas.test_schema.landmarks (name varchar(255), city_id int DEFAULT 0); +GO +ALTER TABLE datadog_test_schemas.test_schema.landmarks ADD CONSTRAINT FK_CityId FOREIGN KEY (city_id) REFERENCES datadog_test_schemas.test_schema.cities(id); +GO + +-- Create table with unique constraint +CREATE TABLE datadog_test_schemas.test_schema.Restaurants ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Cuisine VARCHAR(100), + CONSTRAINT UC_RestaurantNameDistrict UNIQUE (RestaurantName, District) +); +GO + +-- Create table with a foreign key on two columns +CREATE TABLE datadog_test_schemas.test_schema.RestaurantReviews ( + RestaurantName VARCHAR(255), + District VARCHAR(100), + Review VARCHAR(MAX), + CONSTRAINT FK_RestaurantNameDistrict FOREIGN KEY (RestaurantName, District) REFERENCES datadog_test_schemas.test_schema.Restaurants(RestaurantName, District) +); +GO + +-- Create second test database for integration schema tests +CREATE DATABASE datadog_test_schemas_second; +GO +USE datadog_test_schemas_second; +-- This table is pronounced "things" except we've replaced "th" with the greek lower case "theta" to ensure we +-- correctly support unicode throughout the integration. 
+CREATE TABLE datadog_test_schemas_second.dbo.ϑings (id int DEFAULT 0, name varchar(255)); +INSERT INTO datadog_test_schemas_second.dbo.ϑings VALUES (1, 'foo'), (2, 'bar'); +CREATE USER bob FOR LOGIN bob; +CREATE USER fred FOR LOGIN fred; +CREATE CLUSTERED INDEX thingsindex ON datadog_test_schemas_second.dbo.ϑings (name); +GO + -- Create test database for integration tests -- only bob and fred have read/write access to this database -- the datadog user has only connect access but can't read any objects diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index fdc69712f9286..812738cbae7c5 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -97,7 +97,7 @@ def test_sqlserver_collect_settings(aggregator, dd_run_check, dbm_instance): def test_collect_schemas(aggregator, dd_run_check, dbm_instance): databases_to_find = ['datadog_test_schemas', 'datadog_test_schemas_second'] exp_datadog_test = { - 'id': '6', + 'id': 'normalized_value', 'name': 'datadog_test_schemas_second', "collation": "SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', @@ -144,7 +144,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): ], } exp_datadog_test_schemas = { - 'id': '5', + 'id': 'normalized_value', 'name': 'datadog_test_schemas', "collation": "SQL_Latin1_General_CP1_CI_AS", 'owner': 'dbo', @@ -361,6 +361,8 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): assert db_name in databases_to_find + #database id's a re different in different test envs + actual_payload['id'] = 'normalized_value' difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) diff_keys = list(difference.keys()) From 96d169705ecaa3ad1f3f5f1a987cf6d61c372d89 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 13:41:38 +0000 Subject: [PATCH 126/132] lint test --- sqlserver/tests/test_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 812738cbae7c5..598ebc0f31435 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -361,7 +361,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): assert db_name in databases_to_find - #database id's a re different in different test envs + # database id's a re different in different test envs actual_payload['id'] = 'normalized_value' difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) From c112f88bbb5e9fb591ba4342dba8a1a06d8c2a96 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 14:39:01 +0000 Subject: [PATCH 127/132] normalized ids --- sqlserver/tests/test_metadata.py | 19 ++++++++++--------- sqlserver/tests/utils.py | 8 ++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 598ebc0f31435..7655feb59d873 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -14,6 +14,7 @@ from datadog_checks.sqlserver import SQLServer from .common import CHECK_NAME +from .utils import normalize_ids try: import pyodbc @@ -104,11 +105,11 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): 'schemas': [ { 'name': 'dbo', - 'id': '1', + 'id': 'normalized_value', 'owner_name': 'dbo', 'tables': [ { - 'id': '885578193', + 'id': 'normalized_value', 'name': 'ϑings', 'columns': [ { @@ -151,11 +152,11 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): 
'schemas': [ { 'name': 'test_schema', - 'id': '5', + 'id': 'normalized_value', 'owner_name': 'dbo', 'tables': [ { - 'id': '885578193', + 'id': 'normalized_value', 'name': 'cities', 'columns': [ { @@ -221,7 +222,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): ], }, { - 'id': '949578421', + 'id': 'normalized_value', 'name': 'landmarks', 'columns': [ { @@ -242,7 +243,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): 'partitions': {'partition_count': 1}, }, { - 'id': '1029578706', + 'id': 'normalized_value', 'name': 'RestaurantReviews', 'columns': [ { @@ -270,7 +271,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): 'partitions': {'partition_count': 1}, }, { - 'id': '997578592', + 'id': 'normalized_value', 'name': 'Restaurants', 'columns': [ { @@ -361,8 +362,8 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): assert db_name in databases_to_find - # database id's a re different in different test envs - actual_payload['id'] = 'normalized_value' + normalize_ids(actual_payload) + difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) diff_keys = list(difference.keys()) diff --git a/sqlserver/tests/utils.py b/sqlserver/tests/utils.py index 1d009b47ed6f5..f479439def832 100644 --- a/sqlserver/tests/utils.py +++ b/sqlserver/tests/utils.py @@ -220,3 +220,11 @@ def run_query_and_ignore_exception(conn, query): @staticmethod def _create_rand_string(length=5): return ''.join(choice(string.ascii_lowercase + string.digits) for _ in range(length)) + + +def normalize_ids(actual_payload): + actual_payload['id'] = 'normalized_value' + for schema in actual_payload['schemas']: + schema['id'] = 'normalized_value' + for table in schema['tables']: + table['id'] = 'normalized_value' From f22ff21a7b803a190ad0f4964d87bf98b7d650f1 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 16:05:35 +0000 Subject: [PATCH 128/132] convert to bool windows value --- sqlserver/datadog_checks/sqlserver/schemas.py | 9 +++++++-- sqlserver/datadog_checks/sqlserver/utils.py | 5 +++++ sqlserver/tests/test_metadata.py | 6 ++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index b0624663e007d..7c4354efb28ca 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -28,7 +28,7 @@ SCHEMA_QUERY, TABLES_IN_SCHEMA_QUERY, ) -from datadog_checks.sqlserver.utils import execute_query, get_list_chunks, is_azure_sql_database +from datadog_checks.sqlserver.utils import convert_to_bool, execute_query, get_list_chunks, is_azure_sql_database class SubmitData: @@ -397,7 +397,12 @@ def _populate_with_index_data(self, table_ids, table_id_to_table_data, cursor): table_id = row.pop("id", None) table_id_str = str(table_id) table_id_to_table_data[table_id_str].setdefault("indexes", []) - table_id_to_table_data[table_id_str]["indexes"].append(row) + if "is_unique" in row: + row["is_unique"] = convert_to_bool(row["is_unique"]) + if "is_primary_key" in row: + row["is_primary_key"] = convert_to_bool(row["is_primary_key"]) + if "is_disabled" in row: + row["is_disabled"] = convert_to_bool(row["is_disabled"]) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _populate_with_foreign_keys_data(self, table_ids, table_id_to_table_data, cursor): diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py 
index 667b1f8d1dff5..904152abc4bc9 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -158,3 +158,8 @@ def get_list_chunks(lst, n): """Yield successive n-sized chunks from lst.""" for i in range(0, len(lst), n): yield lst[i : i + n] + + +def convert_to_bool(value): + if isinstance(value, int): + return bool(value) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 7655feb59d873..87e294f0bdb7f 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -11,6 +11,7 @@ import pytest from deepdiff import DeepDiff +from datadog_checks.dev.utils import running_on_windows_ci from datadog_checks.sqlserver import SQLServer from .common import CHECK_NAME @@ -322,6 +323,11 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): } ], } + + if running_on_windows_ci(): + exp_datadog_test['owner'] = 'None' + exp_datadog_test_schemas['owner'] = 'None' + expected_data_for_db = { 'datadog_test_schemas_second': exp_datadog_test, 'datadog_test_schemas': exp_datadog_test_schemas, From aba611e16cfc4c64c3b7769d59748ee0ca8e0759 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 16:52:14 +0000 Subject: [PATCH 129/132] fix convert function --- sqlserver/datadog_checks/sqlserver/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sqlserver/datadog_checks/sqlserver/utils.py b/sqlserver/datadog_checks/sqlserver/utils.py index 904152abc4bc9..a35106bd1ce09 100644 --- a/sqlserver/datadog_checks/sqlserver/utils.py +++ b/sqlserver/datadog_checks/sqlserver/utils.py @@ -163,3 +163,5 @@ def get_list_chunks(lst, n): def convert_to_bool(value): if isinstance(value, int): return bool(value) + else: + return value From 272b6846fd022cad7d797cb634a8082f73c441ad Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 16:57:59 +0000 Subject: [PATCH 130/132] fixed put back index row --- sqlserver/datadog_checks/sqlserver/schemas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py b/sqlserver/datadog_checks/sqlserver/schemas.py index 7c4354efb28ca..4594d927f596e 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -396,13 +396,14 @@ def _populate_with_index_data(self, table_ids, table_id_to_table_data, cursor): for row in rows: table_id = row.pop("id", None) table_id_str = str(table_id) - table_id_to_table_data[table_id_str].setdefault("indexes", []) if "is_unique" in row: row["is_unique"] = convert_to_bool(row["is_unique"]) if "is_primary_key" in row: row["is_primary_key"] = convert_to_bool(row["is_primary_key"]) if "is_disabled" in row: row["is_disabled"] = convert_to_bool(row["is_disabled"]) + table_id_to_table_data[table_id_str].setdefault("indexes", []) + table_id_to_table_data[table_id_str]["indexes"].append(row) @tracked_method(agent_check_getter=agent_check_getter, track_result_length=True) def _populate_with_foreign_keys_data(self, table_ids, table_id_to_table_data, cursor): From 6819b642415348d85b3e66a6b3c873b5cbad064d Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 18:10:08 +0000 Subject: [PATCH 131/132] Make test agnostic to order of index columns --- sqlserver/datadog_checks/sqlserver/schemas.py | 2 ++ sqlserver/tests/test_metadata.py | 10 +++++++--- sqlserver/tests/utils.py | 12 ++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/sqlserver/datadog_checks/sqlserver/schemas.py 
b/sqlserver/datadog_checks/sqlserver/schemas.py index 4594d927f596e..8888ea7c0e0bf 100644 --- a/sqlserver/datadog_checks/sqlserver/schemas.py +++ b/sqlserver/datadog_checks/sqlserver/schemas.py @@ -402,6 +402,8 @@ def _populate_with_index_data(self, table_ids, table_id_to_table_data, cursor): row["is_primary_key"] = convert_to_bool(row["is_primary_key"]) if "is_disabled" in row: row["is_disabled"] = convert_to_bool(row["is_disabled"]) + if "is_unique_constraint" in row: + row["is_unique_constraint"] = convert_to_bool(row["is_unique_constraint"]) table_id_to_table_data[table_id_str].setdefault("indexes", []) table_id_to_table_data[table_id_str]["indexes"].append(row) diff --git a/sqlserver/tests/test_metadata.py b/sqlserver/tests/test_metadata.py index 87e294f0bdb7f..361add055f9db 100644 --- a/sqlserver/tests/test_metadata.py +++ b/sqlserver/tests/test_metadata.py @@ -15,7 +15,7 @@ from datadog_checks.sqlserver import SQLServer from .common import CHECK_NAME -from .utils import normalize_ids +from .utils import normalize_ids, normalize_indexes_columns try: import pyodbc @@ -209,7 +209,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): 'is_primary_key': False, 'is_unique_constraint': False, 'is_disabled': False, - 'column_names': 'population,id', + 'column_names': 'id,population', }, { 'name': 'two_columns_index', @@ -315,7 +315,7 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): 'is_primary_key': False, 'is_unique_constraint': True, 'is_disabled': False, - 'column_names': 'RestaurantName,District', + 'column_names': 'District,RestaurantName', } ], }, @@ -368,8 +368,12 @@ def test_collect_schemas(aggregator, dd_run_check, dbm_instance): assert db_name in databases_to_find + # id's are env dependant normalize_ids(actual_payload) + # index columns may be in any order + normalize_indexes_columns(actual_payload) + difference = DeepDiff(actual_payload, expected_data_for_db[db_name], ignore_order=True) diff_keys = list(difference.keys()) diff --git a/sqlserver/tests/utils.py b/sqlserver/tests/utils.py index f479439def832..eac8dceebde69 100644 --- a/sqlserver/tests/utils.py +++ b/sqlserver/tests/utils.py @@ -228,3 +228,15 @@ def normalize_ids(actual_payload): schema['id'] = 'normalized_value' for table in schema['tables']: table['id'] = 'normalized_value' + + +def normalize_indexes_columns(actual_payload): + for schema in actual_payload['schemas']: + schema['id'] = 'normalized_value' + for table in schema['tables']: + if 'indexes' in table: + for index in table['indexes']: + column_names = index['column_names'] + columns = column_names.split(',') + sorted_columns = sorted(columns) + index['column_names'] = ','.join(sorted_columns) From 3a101e5f4ad452ac9878b3f15594a247a2b33fa8 Mon Sep 17 00:00:00 2001 From: Boris Kozlov Date: Wed, 19 Jun 2024 18:35:59 +0000 Subject: [PATCH 132/132] updated with latest ddev --- sqlserver/datadog_checks/sqlserver/config_models/instance.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sqlserver/datadog_checks/sqlserver/config_models/instance.py b/sqlserver/datadog_checks/sqlserver/config_models/instance.py index 603d9a5da3955..44d971fabc633 100644 --- a/sqlserver/datadog_checks/sqlserver/config_models/instance.py +++ b/sqlserver/datadog_checks/sqlserver/config_models/instance.py @@ -51,7 +51,9 @@ class CustomQuery(BaseModel): arbitrary_types_allowed=True, frozen=True, ) + collection_interval: Optional[int] = None columns: Optional[tuple[MappingProxyType[str, Any], ...]] = None + metric_prefix: Optional[str] = None query: 
Optional[str] = None tags: Optional[tuple[str, ...]] = None
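
For context on how the options introduced across these patches fit together, below is a minimal, illustrative Python sketch (not taken from any patch above) of an instance configuration that enables schema collection. The connection values are placeholder assumptions; the option names and defaults mirror the conf spec comments and the `SchemasCollection` config model added in this series.

# Hedged sketch: enabling the new schemas_collection options on an instance.
# Connection settings below are placeholders, not values from this PR.
instance = {
    'host': 'localhost,1433',                 # placeholder host,port
    'username': 'datadog',                    # placeholder credentials
    'password': '<PASSWORD>',
    'dbm': True,                              # schema collection requires dbm: true
    'database_autodiscovery': True,           # otherwise only the `database` database is collected
    'autodiscovery_include': ['datadog_test_schemas', 'datadog_test_schemas_second'],
    'schemas_collection': {
        'enabled': True,                      # default: false
        'collection_interval': 600,           # seconds; the same value must be used on every instance
        'max_execution_time': 10,             # seconds; capped by collection_interval
    },
}

This mirrors how the integration tests above configure `dbm_instance` before running the check. Similarly, a small sketch of why patches 128-131 route the index flags through `convert_to_bool`: on some platforms (the Windows CI environment targeted by "convert to bool windows value") the driver returns these columns as integers, so the check normalizes them before they are compared against the boolean values in the expected test payloads. The helper body is copied from the utils.py hunk above; the sample row is hypothetical.

# Sketch of the flag normalization added to _populate_with_index_data in schemas.py,
# using the convert_to_bool helper introduced in utils.py in this series.
def convert_to_bool(value):
    if isinstance(value, int):
        return bool(value)
    else:
        return value

index_row = {'name': 'two_columns_index', 'is_unique': 0, 'is_primary_key': 0,
             'is_unique_constraint': 0, 'is_disabled': 0}   # integer flags, as returned on Windows
for flag in ('is_unique', 'is_primary_key', 'is_unique_constraint', 'is_disabled'):
    if flag in index_row:
        index_row[flag] = convert_to_bool(index_row[flag])
# index_row now holds real booleans, matching the expected schema payloads in test_collect_schemas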