
Commit 277517a

Merge pull request #50 from digital-land/upload_parquet_to_S3
Upload parquet to s3
2 parents 32fdc69 + 1e7a272

8 files changed: +86 −5 lines

.github/workflows/run.yml (+18)

@@ -63,6 +63,12 @@ jobs:
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
         run: make save-dataset
 
+      - name: Save Parquet files to Development S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
+
       # Staging
       - name: Configure Staging AWS Credentials
         uses: aws-actions/configure-aws-credentials@v1-node16
@@ -76,6 +82,12 @@ jobs:
           COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
         run: make save-dataset
+
+      - name: Save Parquet files to Staging S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
 
       # Production
       - name: Configure Production AWS Credentials
@@ -90,6 +102,12 @@ jobs:
           COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
         run: make save-dataset
+
+      - name: Save Parquet files to Prod S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
 
   check-digital-land-builder-errors:
     runs-on: ubuntu-latest
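
Each new step mirrors the existing save-dataset step: the bucket name is injected through the step's env: block, make save-tables-to-parquet inherits it, and any Python the target runs can read it back with os.getenv. A minimal sketch of that last link in the chain (the variable name is the one set above; the error handling is illustrative, not from the repo):

    import os

    # Set by the workflow step's `env:` block; absent when running locally,
    # so fail fast rather than trying to upload to a None bucket.
    bucket = os.getenv("COLLECTION_DATASET_BUCKET_NAME")
    if bucket is None:
        raise RuntimeError("COLLECTION_DATASET_BUCKET_NAME is not set")
    print(f"would upload parquet files to s3://{bucket}/")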

.github/workflows/run_dev.yml (+7, −1)

@@ -7,7 +7,7 @@ env:
   DLB_BOT_USERNAME: ${{ secrets.DLB_BOT_USERNAME }}
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
 
       - name: Free up disk space
@@ -61,4 +61,10 @@ jobs:
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
         run: make save-dataset
 
+      - name: Save Parquet files to Development S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
+

.github/workflows/run_performance.yml (+20, −1)

@@ -63,7 +63,13 @@ jobs:
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
         run: make save-dataset
 
-      # Staging
+      - name: Save Parquet files to Development S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
+
+      # Staging
       - name: Configure Staging AWS Credentials
         uses: aws-actions/configure-aws-credentials@v1-node16
         with:
@@ -76,6 +82,12 @@ jobs:
           COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
         run: make save-dataset
+
+      - name: Save Parquet files to Staging S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
 
       # Production
       - name: Configure Production AWS Credentials
@@ -90,3 +102,10 @@ jobs:
           COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
         run: make save-dataset
+
+      - name: Save Parquet files to Prod S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
+

.github/workflows/run_performance_dev.yml (+6)

@@ -63,4 +63,10 @@ jobs:
           HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
         run: make save-dataset
 
+      - name: Save Parquet files to Development S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-tables-to-parquet
+

Makefile (+18, −1)

@@ -11,6 +11,18 @@ include makerules/development.mk
 
 DB=dataset/digital-land.sqlite3
 DB_PERF = dataset/performance.sqlite3
+
+ifeq ($(PARQUET_DIR),)
+PARQUET_DIR=data/
+endif
+ifeq ($(PARQUET_SPECIFICATION_DIR),)
+export PARQUET_SPECIFICATION_DIR=$(PARQUET_DIR)specification/
+endif
+ifeq ($(PARQUET_PERFORMANCE_DIR),)
+export PARQUET_PERFORMANCE_DIR=$(PARQUET_DIR)performance/
+endif
+
+
 DATASTORE_URL = https://files.planning.data.gov.uk/
 
 first-pass::
@@ -35,11 +47,13 @@ third-pass:: $(DB_PERF)
 
 $(DB): bin/load.py
     @rm -f $@
+    mkdir -p $(PARQUET_SPECIFICATION_DIR)
     python3 bin/load.py $@
 
 $(DB_PERF): bin/load_reporting_tables.py bin/load_performance.py
     bin/download-digital-land.sh
     @rm -f $@
+    mkdir -p $(PARQUET_PERFORMANCE_DIR)
     python3 bin/load_reporting_tables.py $@ $(DB)
     python3 bin/load_performance.py $@ $(DB)
 
@@ -52,6 +66,9 @@ clobber::
     rm -rf dataset/
     rm -rf $(DB)
    	rm -rf $(DB_PERF)
+    rm -rf $(PARQUET_SPECIFICATION_DIR)
+    rm -rf $(PARQUET_PERFORMANCE_DIR)
+    rm -rf $(PARQUET_DIR)
 
 clobber-performance::
     rm -rf $(DB_PERF)
@@ -62,7 +79,7 @@ aws-build::
 push::
     aws s3 cp $(DB) s3://digital-land-collection/digital-land.sqlite3
     aws s3 cp $(DB_PERF) s3://digital-land-collection/performance.sqlite3
-
+
 specification::
     # additional
     curl -qfsL '$(SOURCE_URL)/specification/main/specification/issue-type.csv' > specification/issue-type.csv
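
Note that save-tables-to-parquet, the target the workflow steps call, is not defined in this Makefile; like save-dataset, it presumably comes from the included makerules. As a hedged sketch of the kind of upload the PR title implies — upload_parquet_dir is a hypothetical helper, though the boto3 calls are real — syncing PARQUET_DIR to the collection bucket might look like:

    import os
    from pathlib import Path

    import boto3

    def upload_parquet_dir(parquet_dir: str, bucket: str, prefix: str = "") -> None:
        """Upload every .parquet file under parquet_dir to the given S3 bucket."""
        s3 = boto3.client("s3")
        for path in Path(parquet_dir).rglob("*.parquet"):
            # Preserve the directory layout (specification/, performance/) as key prefixes.
            key = os.path.join(prefix, path.relative_to(parquet_dir).as_posix())
            s3.upload_file(str(path), bucket, key)

    if __name__ == "__main__":
        # PARQUET_DIR defaults to data/ above; the bucket name comes from the workflow env.
        upload_parquet_dir(os.getenv("PARQUET_DIR", "data/"),
                           os.environ["COLLECTION_DATASET_BUCKET_NAME"])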

bin/load.py (+10)

@@ -11,6 +11,8 @@
 import pandas as pd
 from digital_land.package.sqlite import SqlitePackage
 
+PARQUET_SPECIFICATION_DIR = os.getenv("PARQUET_SPECIFICATION_DIR")
+
 
 tables = {
     "organisation": "var/cache",
@@ -79,6 +81,11 @@
 }
 
 
+def create_parquet_from_table(df, name, output_dir):
+    parquet_file_path = os.path.join(output_dir, f"{name}.parquet")
+    df.to_parquet(parquet_file_path, engine="pyarrow")
+
+
 if __name__ == "__main__":
     level = logging.INFO
     logging.basicConfig(level=level, format="%(asctime)s %(levelname)s %(message)s")
@@ -88,6 +95,8 @@
 
     conn = sqlite3.connect(path)
 
+    specification_df = pd.read_sql_query("SELECT * FROM specification", conn)
+    create_parquet_from_table(specification_df, "specification", PARQUET_SPECIFICATION_DIR)
     operational_issue_log = pd.read_csv("performance/operational_issue/operational-issue.csv")
     operational_issue_log.to_sql("operational_issue", conn, if_exists="replace", index=False)
 
@@ -197,3 +206,4 @@
         where t1.rn = 1
     """)
     conn.close()
+
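
create_parquet_from_table is a thin wrapper around DataFrame.to_parquet, which is why pyarrow joins requirements.txt below. One caveat worth noting: PARQUET_SPECIFICATION_DIR is only set (and created) by the Makefile, so running bin/load.py directly leaves it as None and os.path.join will raise. A minimal round-trip check with toy data (nothing below is from the repo):

    import os
    import tempfile

    import pandas as pd

    # Toy frame standing in for the specification table.
    df = pd.DataFrame({"specification": ["a", "b"], "name": ["Alpha", "Beta"]})

    with tempfile.TemporaryDirectory() as out_dir:
        path = os.path.join(out_dir, "specification.parquet")
        df.to_parquet(path, engine="pyarrow")    # what create_parquet_from_table does
        assert pd.read_parquet(path).equals(df)  # pyarrow round-trips the frame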

bin/load_performance.py (+6, −2)

@@ -6,16 +6,19 @@
 import logging
 import sqlite3
 import pandas as pd
+import os
 
 indexes = {
     "provision_summary": ["organisation", "organisation_name", "dataset"]
 }
 
+PARQUET_PERFORMANCE_DIR = os.getenv("PARQUET_PERFORMANCE_DIR")
+
 
 def fetch_provision_data(db_path):
     conn = sqlite3.connect(db_path)
     query = """
-    select p.organisation, o.name as organisation_name, p.cohort, p.dataset from provision p
+    select p.organisation, o.name as organisation_name, p.cohort, p.dataset,p.provision_reason from provision p
     inner join organisation o on o.organisation = p.organisation
     order by p.organisation
     """
@@ -121,7 +124,7 @@ def create_performance_tables(merged_data, cf_merged_data, endpoint_summary_data
         endpoint_summary_table_name, conn, if_exists='replace', index=False)
 
     # Filter out endpoints with an end date as we don't want to count them in provision summary
-    final_result = merged_data.groupby(['organisation', 'organisation_name', 'dataset']).agg(
+    final_result = merged_data.groupby(['organisation', 'organisation_name', 'dataset', 'provision_reason']).agg(
         active_endpoint_count=pd.NamedAgg(
             column='endpoint',
             aggfunc=lambda x: x[(merged_data.loc[x.index,
@@ -191,6 +194,7 @@ def create_performance_tables(merged_data, cf_merged_data, endpoint_summary_data
     })
 
     provision_table_name = "provision_summary"
+    final_result.to_parquet(os.path.join(PARQUET_PERFORMANCE_DIR,"provision_summary.parquet"), engine="pyarrow")
     final_result.to_sql(provision_table_name, conn,
                         if_exists='replace', index=False)
     conn.close()
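
With provision_reason added to the select and to the groupby keys, provision_summary now carries one row per (organisation, dataset, provision_reason) rather than one per (organisation, dataset), and the same frame is written to Parquet alongside the SQLite table. A toy illustration of the pd.NamedAgg grouping pattern used here (fabricated rows, simplified aggregate):

    import pandas as pd

    # Fabricated rows: one organisation and dataset, two provision reasons.
    merged = pd.DataFrame({
        "organisation": ["org:1"] * 3,
        "organisation_name": ["Org One"] * 3,
        "dataset": ["tree"] * 3,
        "provision_reason": ["statutory", "statutory", "encouraged"],
        "endpoint": ["e1", "e2", "e3"],
    })

    summary = merged.groupby(
        ["organisation", "organisation_name", "dataset", "provision_reason"]
    ).agg(endpoint_count=pd.NamedAgg(column="endpoint", aggfunc="count")).reset_index()

    print(summary)  # two rows, one per provision_reason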

requirements.txt (+1)

@@ -7,4 +7,5 @@ chardet
 PyPDF2
 pandas
 tqdm
+pyarrow
 -e git+https://github.com/digital-land/digital-land-python.git#egg=digital-land
