Commit 96dfdfe

Merge pull request #61 from digital-land/save_perf_tables_to_parquet
Save perf tables to parquet
2 parents ea9ae64 + f18e7c8 commit 96dfdfe

4 files changed: +19 -10 lines

bin/concat-column-field.py (+1 -1)

@@ -44,4 +44,4 @@ def process_column_fields(column_field_dir, input_dir):
            w.writerow(row)

if __name__ == "__main__":
-    process_column_fields()
+    process_column_fields()

(The removed and added lines render identically here; the change appears to be whitespace-only, which this page does not preserve.)

bin/load_performance.py (+10 -3)

@@ -31,12 +31,12 @@ def fetch_issue_data(db_path):
    query = """
    select
        count(*) as count_issues, strftime('%d-%m-%Y', 'now') as date,
-       i.issue_type as issue_type, it.severity, it.responsibility, i.dataset, i.resource, GROUP_CONCAT(DISTINCT i.field) as fields
+       i.issue_type as issue_type, it.severity, it.responsibility, i.dataset, i.resource, i.field
    from issue i
    inner join resource r on i.resource = r.resource
    inner join issue_type it on i.issue_type = it.issue_type
    where r.end_date = ''
-   group by i.dataset,i.resource,i.issue_type
+   group by i.dataset,i.resource,i.issue_type,i.field
    """
    df_issue = pd.read_sql_query(query, conn)
    return df_issue
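
This query change alters the grain of the issue summary: instead of one row per (dataset, resource, issue_type) with a comma-concatenated fields column, the table now carries one row, and one count, per individual field. A minimal sketch of the difference on a toy SQLite table (the rows are hypothetical):

import sqlite3

# Toy data standing in for the issue table; rows are hypothetical.
conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE issue (dataset TEXT, resource TEXT, issue_type TEXT, field TEXT);
    INSERT INTO issue VALUES
        ('tree', 'res1', 'invalid date', 'start-date'),
        ('tree', 'res1', 'invalid date', 'end-date');
""")

# Old shape: one row per issue type, counts pooled, fields concatenated.
print(conn.execute("""
    select count(*), issue_type, GROUP_CONCAT(DISTINCT field) as fields
    from issue group by dataset, resource, issue_type
""").fetchall())
# -> [(2, 'invalid date', 'start-date,end-date')] (concatenation order unspecified)

# New shape: one row, and one count, per individual field.
print(conn.execute("""
    select count(*), issue_type, field
    from issue group by dataset, resource, issue_type, field
""").fetchall())
# -> one row per field, each with count 1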
@@ -82,6 +82,7 @@ def fetch_endpoint_summary(perf_path):
        dataset,
        endpoint,
        endpoint_url,
+       documentation_url,
        resource,
        latest_status,
        latest_exception,
@@ -107,19 +108,25 @@ def create_performance_tables(merged_data, cf_merged_data, endpoint_summary_data
    cf_merged_data_filtered = cf_merged_data[cf_merged_data['resource'] != ""]
    cf_merged_data_filtered = cf_merged_data_filtered[cf_merged_data_filtered['endpoint'].notna(
    )]
+   cf_merged_data_filtered[column_field_table_fields].to_parquet(os.path.join(PARQUET_PERFORMANCE_DIR,
+                                                                 "endpoint_dataset_resource_summary.parquet"), engine="pyarrow")
    cf_merged_data_filtered[column_field_table_fields].to_sql(
        column_field_table_name, conn, if_exists="replace", index=False)

    issue_table_name = "endpoint_dataset_issue_type_summary"
    issue_table_fields = ["organisation", "organisation_name", "cohort", "dataset", "collection", "pipeline", "endpoint", "endpoint_url", "resource", "resource_start_date",
-                         "resource_end_date", "latest_log_entry_date", "count_issues", "date", "issue_type", "severity", "responsibility", "fields"]
+                         "resource_end_date", "latest_log_entry_date", "count_issues", "date", "issue_type", "severity", "responsibility", "field"]
    issue_data_filtered = merged_data[merged_data['resource'] != ""]
    issue_data_filtered = issue_data_filtered[issue_data_filtered['endpoint'].notna(
    )]
+   issue_data_filtered[issue_table_fields].to_parquet(os.path.join(PARQUET_PERFORMANCE_DIR,
+                                                      "endpoint_dataset_issue_type_summary.parquet"), engine="pyarrow")
    issue_data_filtered[issue_table_fields].to_sql(issue_table_name, conn, if_exists='replace', index=False, dtype={
        'count_issues': 'INTEGER'})

    endpoint_summary_table_name = "endpoint_dataset_summary"
+   endpoint_summary_data.to_parquet(os.path.join(PARQUET_PERFORMANCE_DIR,
+                                                 "endpoint_dataset_summary.parquet"), engine="pyarrow")
    endpoint_summary_data.to_sql(
        endpoint_summary_table_name, conn, if_exists='replace', index=False)
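
Each of the three performance tables is now written to two sinks: a parquet file via pandas' to_parquet with the pyarrow engine (so the pyarrow package must be installed), in addition to the existing SQLite table written by to_sql. A minimal sketch of the pattern; the directory value, database filename, and DataFrame contents here are stand-ins:

import os
import sqlite3
import pandas as pd

# Stand-in output locations; the real module defines its own.
PARQUET_PERFORMANCE_DIR = "performance"
os.makedirs(PARQUET_PERFORMANCE_DIR, exist_ok=True)

df = pd.DataFrame({"organisation": ["local-authority:XYZ"], "count_issues": [3]})
conn = sqlite3.connect("performance.sqlite3")

# Same table, two formats: parquet for columnar/analytical consumers,
# SQLite for the existing readers of the performance database.
df.to_parquet(os.path.join(PARQUET_PERFORMANCE_DIR, "endpoint_dataset_issue_type_summary.parquet"),
              engine="pyarrow")
df.to_sql("endpoint_dataset_issue_type_summary", conn, if_exists="replace", index=False)

# Round trip: parquet preserves dtypes, so count_issues comes back as an integer column.
print(pd.read_parquet(os.path.join(PARQUET_PERFORMANCE_DIR,
                                   "endpoint_dataset_issue_type_summary.parquet")).dtypes)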

bin/load_reporting_tables.py (+5 -3)

@@ -26,6 +26,7 @@ def fetch_historic_endpoints_data_from_dl(db_path):
        sp.pipeline,
        l.endpoint,
        e.endpoint_url,
+       s.documentation_url,
        s.licence,
        l.status as latest_status,
        l.exception as latest_exception,
@@ -46,7 +47,7 @@ def fetch_historic_endpoints_data_from_dl(db_path):
    LEFT JOIN resource r on l.resource = r.resource

    GROUP BY
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13

    ORDER BY
        s.organisation, o.name, o.dataset, s.collection, sp.pipeline, latest_log_entry_date DESC
@@ -134,6 +135,7 @@ def create_reporting_tables(historic_endpoints_data, latest_endpoint_data, perfo
        pipeline TEXT,
        endpoint TEXT,
        endpoint_url TEXT,
+       documentation_url TEXT,
        licence TEXT,
        latest_status TEXT,
        latest_exception TEXT,
@@ -147,9 +149,9 @@ def create_reporting_tables(historic_endpoints_data, latest_endpoint_data, perfo
    """)
    cursor.executemany("""
        INSERT INTO reporting_historic_endpoints (
-           organisation, name, organisation_name, dataset, collection, pipeline, endpoint, endpoint_url, licence, latest_status, latest_exception, resource,
+           organisation, name, organisation_name, dataset, collection, pipeline, endpoint, endpoint_url, documentation_url, licence, latest_status, latest_exception, resource,
            latest_log_entry_date, endpoint_entry_date, endpoint_end_date, resource_start_date, resource_end_date
-       ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+       ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, historic_endpoints_data)

    # Create the reporting_latest_endpoints table
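
Threading documentation_url through this file means four places have to agree: the SELECT list, the GROUP BY ordinals (12 to 13 columns), the CREATE TABLE definition, and the INSERT column list with its placeholder count (17 to 18 question marks). sqlite3 fails fast when the placeholder count drifts out of step; a small sketch using a hypothetical three-column table:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE t (endpoint_url TEXT, documentation_url TEXT, licence TEXT)")

rows = [("https://example.com/data.csv", "https://example.com/docs", "ogl3")]

# One placeholder per column: three columns, three question marks.
conn.executemany("INSERT INTO t VALUES (?, ?, ?)", rows)

# A stale placeholder count (the pre-commit 17 vs the new 18, in miniature):
try:
    conn.executemany("INSERT INTO t VALUES (?, ?)", rows)
except sqlite3.ProgrammingError as e:
    print(e)  # "Incorrect number of bindings supplied..."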

check/performance.py (+3 -3)

@@ -7,11 +7,11 @@
logger = logging.getLogger("__name__")

EXPECTED = {
-   "endpoint_dataset_issue_type_summary": ['organisation', 'organisation_name', 'cohort', 'dataset', 'collection', 'pipeline', 'endpoint', 'endpoint_url', 'resource', 'resource_start_date', 'resource_end_date', 'latest_log_entry_date', 'count_issues', 'date', 'issue_type', 'severity', 'responsibility', 'fields'],
+   "endpoint_dataset_issue_type_summary": ['organisation', 'organisation_name', 'cohort', 'dataset', 'collection', 'pipeline', 'endpoint', 'endpoint_url', 'resource', 'resource_start_date', 'resource_end_date', 'latest_log_entry_date', 'count_issues', 'date', 'issue_type', 'severity', 'responsibility', 'field'],
    "endpoint_dataset_resource_summary": ['organisation', 'organisation_name', 'cohort', 'dataset', 'collection', 'pipeline', 'endpoint', 'endpoint_url', 'resource', 'resource_start_date', 'resource_end_date', 'latest_log_entry_date', 'mapping_field', 'non_mapping_field'],
-   "endpoint_dataset_summary": ['organisation', 'dataset', 'endpoint', 'endpoint_url', 'resource', 'latest_status', 'latest_exception', 'latest_log_entry_date', 'entry_date', 'end_date', 'latest_resource_start_date', 'resource_end_date'],
+   "endpoint_dataset_summary": ['organisation', 'dataset', 'endpoint', 'endpoint_url', 'documentation_url', 'resource', 'latest_status', 'latest_exception', 'latest_log_entry_date', 'entry_date', 'end_date', 'latest_resource_start_date', 'resource_end_date'],
    "provision_summary": ['organisation', 'organisation_name', 'dataset', 'provision_reason', 'active_endpoint_count', 'error_endpoint_count', 'count_issue_error_internal', 'count_issue_error_external', 'count_issue_warning_internal', 'count_issue_warning_external', 'count_issue_notice_internal', 'count_issue_notice_external'],
-   "reporting_historic_endpoints": ['organisation', 'name', 'organisation_name', 'dataset', 'collection', 'pipeline', 'endpoint', 'endpoint_url', 'licence', 'latest_status', 'latest_exception', 'resource', 'latest_log_entry_date', 'endpoint_entry_date', 'endpoint_end_date', 'resource_start_date', 'resource_end_date'],
+   "reporting_historic_endpoints": ['organisation', 'name', 'organisation_name', 'dataset', 'collection', 'pipeline', 'endpoint', 'endpoint_url', 'documentation_url', 'licence', 'latest_status', 'latest_exception', 'resource', 'latest_log_entry_date', 'endpoint_entry_date', 'endpoint_end_date', 'resource_start_date', 'resource_end_date'],
    "reporting_latest_endpoints": ['organisation', 'name', 'organisation_name', 'dataset', 'collection', 'pipeline', 'endpoint', 'endpoint_url', 'licence', 'latest_status', 'days_since_200', 'latest_exception', 'resource', 'latest_log_entry_date', 'endpoint_entry_date', 'endpoint_end_date', 'resource_start_date', 'resource_end_date', 'rn']
}
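
check/performance.py pins the exact column list for every performance table, so both the fields-to-field rename and the new documentation_url columns must be mirrored in EXPECTED. A minimal sketch of how such a check can be written against the SQLite output; the check_columns helper and the trimmed mapping are illustrative, not the module's actual code:

import sqlite3

# Illustrative, trimmed-down mapping; the real module lists every column.
EXPECTED = {
    "endpoint_dataset_summary": ["organisation", "dataset", "endpoint",
                                 "endpoint_url", "documentation_url", "resource"],
}

def check_columns(db_path):
    """Return mismatch descriptions; an empty list means every table matches."""
    conn = sqlite3.connect(db_path)
    problems = []
    for table, expected in EXPECTED.items():
        # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
        actual = [row[1] for row in conn.execute(f"PRAGMA table_info('{table}')")]
        if actual != expected:
            problems.append(f"{table}: expected {expected}, got {actual}")
    return problems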
