|
6 | 6 | import logging
|
7 | 7 | import sqlite3
|
8 | 8 | import pandas as pd
|
| 9 | +import os |
9 | 10 |
|
indexes = {
    # Columns that receive a SQL index on the provision_summary table.
    # NOTE(review): the provision_summary groupby now also includes
    # 'provision_reason' — confirm whether that column should be indexed too.
    "provision_summary": ["organisation", "organisation_name", "dataset"]
}
|
13 | 14 |
|
# Directory where parquet snapshots of the performance tables are written
# (used by os.path.join when saving provision_summary.parquet).
# Defaults to the current working directory: os.getenv() with no default
# returns None when the variable is unset, and that None would make the
# downstream os.path.join(PARQUET_PERFORMANCE_DIR, ...) raise TypeError.
PARQUET_PERFORMANCE_DIR = os.getenv("PARQUET_PERFORMANCE_DIR", ".")
| 16 | + |
14 | 17 |
|
15 | 18 | def fetch_provision_data(db_path):
|
16 | 19 | conn = sqlite3.connect(db_path)
|
17 | 20 | query = """
|
18 |
| - select p.organisation, o.name as organisation_name, p.cohort, p.dataset from provision p |
| 21 | + select p.organisation, o.name as organisation_name, p.cohort, p.dataset,p.provision_reason from provision p |
19 | 22 | inner join organisation o on o.organisation = p.organisation
|
20 | 23 | order by p.organisation
|
21 | 24 | """
|
@@ -121,7 +124,7 @@ def create_performance_tables(merged_data, cf_merged_data, endpoint_summary_data
|
121 | 124 | endpoint_summary_table_name, conn, if_exists='replace', index=False)
|
122 | 125 |
|
123 | 126 | # Filter out endpoints with an end date as we don't want to count them in provision summary
|
124 |
| - final_result = merged_data.groupby(['organisation', 'organisation_name', 'dataset']).agg( |
| 127 | + final_result = merged_data.groupby(['organisation', 'organisation_name', 'dataset', 'provision_reason']).agg( |
125 | 128 | active_endpoint_count=pd.NamedAgg(
|
126 | 129 | column='endpoint',
|
127 | 130 | aggfunc=lambda x: x[(merged_data.loc[x.index,
|
@@ -191,6 +194,7 @@ def create_performance_tables(merged_data, cf_merged_data, endpoint_summary_data
|
191 | 194 | })
|
192 | 195 |
|
193 | 196 | provision_table_name = "provision_summary"
|
| 197 | + final_result.to_parquet(os.path.join(PARQUET_PERFORMANCE_DIR,"provision_summary.parquet"), engine="pyarrow") |
194 | 198 | final_result.to_sql(provision_table_name, conn,
|
195 | 199 | if_exists='replace', index=False)
|
196 | 200 | conn.close()
|
|
0 commit comments