Skip to content

Commit 593dacd

Browse files
committed
store provision_summary file in cache
1 parent eb6679e commit 593dacd

File tree

5 files changed

+62
-61
lines changed

5 files changed

+62
-61
lines changed

digital_land/cli.py

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
column_field_dir,
4545
converted_resource_dir,
4646
output_log_dir,
47+
provision_summary_dir,
4748
)
4849

4950

@@ -227,6 +228,7 @@ def dataset_dump_flattened_cmd(ctx, input_path, output_path):
227228
@dataset_resource_dir
228229
@converted_resource_dir
229230
@organisation_path
231+
@provision_summary_dir
230232
@collection_dir
231233
@operational_issue_dir
232234
@output_log_dir
@@ -250,6 +252,7 @@ def pipeline_command(
250252
config_path,
251253
resource,
252254
output_log_dir,
255+
provision_summary_dir,
253256
):
254257
dataset = ctx.obj["DATASET"]
255258
pipeline = ctx.obj["PIPELINE"]
@@ -279,6 +282,7 @@ def pipeline_command(
279282
config_path=config_path,
280283
resource=resource,
281284
output_log_dir=output_log_dir,
285+
provision_summary_dir=provision_summary_dir,
282286
)
283287

284288

digital_land/command_arguments.py

+8
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,11 @@ def organisation_path(f):
9797
type=click.Path(exists=True),
9898
default="var/cache/organisation.csv",
9999
)(f)
100+
101+
102+
def provision_summary_dir(f):
103+
return click.option(
104+
"--provision-summary-dir",
105+
type=click.Path(exists=True),
106+
default="var/cache/provision-summary/",
107+
)(f)

digital_land/commands.py

+2
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def pipeline_run(
216216
resource=None,
217217
output_log_dir=None,
218218
converted_path=None,
219+
provision_summary_dir="var/cache/provision-summary/",
219220
):
220221
# set up paths
221222
cache_dir = Path(cache_dir)
@@ -330,6 +331,7 @@ def pipeline_run(
330331
issue_log=issue_log,
331332
operational_issue_log=operational_issue_log,
332333
entity_range=[entity_range_min, entity_range_max],
334+
provision_summary_dir=provision_summary_dir,
333335
),
334336
SavePhase(
335337
default_output_path("harmonised", input_path),

digital_land/phase/lookup.py

+34-58
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import re
22
import logging
3-
import time
43
import pandas as pd
5-
import urllib
64

75
from .phase import Phase
86

@@ -36,13 +34,15 @@ def __init__(
3634
issue_log=None,
3735
operational_issue_log=None,
3836
entity_range=[],
37+
provision_summary_dir=None,
3938
):
4039
self.lookups = lookups
4140
self.redirect_lookups = redirect_lookups
4241
self.issues = issue_log
4342
self.operational_issues = operational_issue_log
4443
self.reverse_lookups = self.build_reverse_lookups()
4544
self.entity_range = entity_range
45+
self.provision_summary_dir = provision_summary_dir
4646

4747
def build_reverse_lookups(self):
4848
reverse_lookups = {}
@@ -167,67 +167,43 @@ def process(self, stream):
167167
row[self.entity_field]
168168
)
169169

170+
linked_datasets = ["article-4-direction", "tree-preservation-order"]
170171
if row[self.entity_field]:
171-
if (
172-
row.get("article-4-direction", "")
173-
or row.get("tree-preservation-order", "").strip()
174-
):
175-
linked_dataset = (
176-
"article-4-direction"
177-
if "article-4-direction" in row
178-
else "tree-preservation-order"
179-
)
180-
181-
# check applied for organisations that have provided a document dataset
182-
if not hasattr(
183-
self, "lpa_list"
184-
): # check if data fetched already
185-
params = urllib.parse.urlencode(
186-
{
187-
"sql": f"""select organisation from provision_summary where active_endpoint_count > 0 and dataset == '{linked_dataset}'""",
188-
"_size": "max",
189-
}
172+
for linked_dataset in linked_datasets:
173+
if (
174+
row.get(linked_dataset, "")
175+
or row.get(linked_dataset, "").strip()
176+
):
177+
get_organisations = pd.read_csv(
178+
self.provision_summary_dir + linked_dataset + ".csv"
190179
)
191-
base_url = f"https://datasette.planning.data.gov.uk/performance.csv?{params}"
192-
193-
max_retries = 60 # Retry for an hour
194-
for attempt in range(max_retries):
195-
try:
196-
get_lpa = pd.read_csv(base_url)
197-
self.lpa_list = get_lpa["organisation"].to_list()
198-
break
199-
except urllib.error.HTTPError:
200-
if attempt < max_retries - 1:
201-
time.sleep(60)
202-
else:
203-
raise Exception(
204-
"Failed to fetch datasette after multiple attempts"
205-
)
206-
207-
if row.get("organisation", "") in self.lpa_list:
208-
reference = row.get(linked_dataset, "")
209180

210-
find_entity = self.lookup(
211-
prefix=linked_dataset,
212-
organisation=row.get("organisation", ""),
213-
reference=row.get(linked_dataset, ""),
214-
)
215-
# raise issue if the found entity is retired in old-entity.csv
216-
if not find_entity or (
217-
str(find_entity) in self.redirect_lookups
218-
and int(
219-
self.redirect_lookups[str(find_entity)].get(
220-
"status", 0
221-
)
222-
)
223-
== 410
181+
if (
182+
row.get("organisation", "")
183+
in get_organisations["organisation"].values
224184
):
225-
self.issues.log_issue(
226-
linked_dataset,
227-
"no associated documents found for this area",
228-
reference,
229-
line_number=line_number,
185+
reference = row.get(linked_dataset, "")
186+
find_entity = self.lookup(
187+
prefix=linked_dataset,
188+
organisation=row.get("organisation", ""),
189+
reference=reference,
230190
)
191+
# raise issue if the found entity is retired in old-entity.csv
192+
if not find_entity or (
193+
str(find_entity) in self.redirect_lookups
194+
and int(
195+
self.redirect_lookups[str(find_entity)].get(
196+
"status", 0
197+
)
198+
)
199+
== 410
200+
):
201+
self.issues.log_issue(
202+
linked_dataset,
203+
"no associated documents found for this area",
204+
reference,
205+
line_number=line_number,
206+
)
231207
yield block
232208

233209

tests/unit/phase/test_lookup.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,11 @@ def test_no_associated_documents_issue(
151151
}
152152
issues = IssueLog()
153153

154-
phase = LookupPhase(lookups=lookups, issue_log=issues)
154+
phase = LookupPhase(
155+
lookups=lookups,
156+
issue_log=issues,
157+
provision_summary_dir="var/cache/provision-summary/",
158+
)
155159
phase.entity_field = "entity"
156160
mock_df = pd.DataFrame({"organisation": ["local-authority:ABC"]})
157161
mocker.patch("pandas.read_csv", return_value=mock_df)
@@ -175,7 +179,11 @@ def test_no_associated_documents_issue_for_missing_dataset(
175179
}
176180
issues = IssueLog()
177181

178-
phase = LookupPhase(lookups=lookups, issue_log=issues)
182+
phase = LookupPhase(
183+
lookups=lookups,
184+
issue_log=issues,
185+
provision_summary_dir="var/cache/provision-summary/",
186+
)
179187
phase.entity_field = "entity"
180188
mock_df = pd.DataFrame({"organisation": ["local-authority:XYZ"]})
181189
mocker.patch("pandas.read_csv", return_value=mock_df)
@@ -197,7 +205,10 @@ def test_no_associated_documents_issue_for_retired_entity(
197205
redirect_lookups = {"1": {"entity": "", "status": "410"}}
198206

199207
phase = LookupPhase(
200-
lookups=lookups, redirect_lookups=redirect_lookups, issue_log=issues
208+
lookups=lookups,
209+
redirect_lookups=redirect_lookups,
210+
issue_log=issues,
211+
provision_summary_dir="var/cache/provision-summary/",
201212
)
202213
phase.entity_field = "entity"
203214
mock_df = pd.DataFrame({"organisation": ["local-authority:ABC"]})

0 commit comments

Comments
 (0)