Skip to content

Commit 593dacd

Browse files
committed
store provision_summary file in cache
1 parent eb6679e commit 593dacd

File tree

5 files changed

+62
-61
lines changed

5 files changed

+62
-61
lines changed

digital_land/cli.py

+4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
column_field_dir,
4545
converted_resource_dir,
4646
output_log_dir,
47+
provision_summary_dir,
4748
)
4849

4950

@@ -227,6 +228,7 @@ def dataset_dump_flattened_cmd(ctx, input_path, output_path):
227228
@dataset_resource_dir
228229
@converted_resource_dir
229230
@organisation_path
231+
@provision_summary_dir
230232
@collection_dir
231233
@operational_issue_dir
232234
@output_log_dir
@@ -250,6 +252,7 @@ def pipeline_command(
250252
config_path,
251253
resource,
252254
output_log_dir,
255+
provision_summary_dir,
253256
):
254257
dataset = ctx.obj["DATASET"]
255258
pipeline = ctx.obj["PIPELINE"]
@@ -279,6 +282,7 @@ def pipeline_command(
279282
config_path=config_path,
280283
resource=resource,
281284
output_log_dir=output_log_dir,
285+
provision_summary_dir=provision_summary_dir,
282286
)
283287

284288

digital_land/command_arguments.py

+8
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,11 @@ def organisation_path(f):
9797
type=click.Path(exists=True),
9898
default="var/cache/organisation.csv",
9999
)(f)
100+
101+
102+
def provision_summary_dir(f):
103+
return click.option(
104+
"--provision-summary-dir",
105+
type=click.Path(exists=True),
106+
default="var/cache/provision-summary/",
107+
)(f)

digital_land/commands.py

+2
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def pipeline_run(
216216
resource=None,
217217
output_log_dir=None,
218218
converted_path=None,
219+
provision_summary_dir="var/cache/provision-summary/",
219220
):
220221
# set up paths
221222
cache_dir = Path(cache_dir)
@@ -330,6 +331,7 @@ def pipeline_run(
330331
issue_log=issue_log,
331332
operational_issue_log=operational_issue_log,
332333
entity_range=[entity_range_min, entity_range_max],
334+
provision_summary_dir=provision_summary_dir,
333335
),
334336
SavePhase(
335337
default_output_path("harmonised", input_path),

digital_land/phase/lookup.py

+34-58
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import re
22
import logging
3-
import time
43
import pandas as pd
5-
import urllib
64

75
from .phase import Phase
86

@@ -36,13 +34,15 @@ def __init__(
3634
issue_log=None,
3735
operational_issue_log=None,
3836
entity_range=[],
37+
provision_summary_dir=None,
3938
):
4039
self.lookups = lookups
4140
self.redirect_lookups = redirect_lookups
4241
self.issues = issue_log
4342
self.operational_issues = operational_issue_log
4443
self.reverse_lookups = self.build_reverse_lookups()
4544
self.entity_range = entity_range
45+
self.provision_summary_dir = provision_summary_dir
4646

4747
def build_reverse_lookups(self):
4848
reverse_lookups = {}
@@ -167,67 +167,43 @@ def process(self, stream):
167167
row[self.entity_field]
168168
)
169169

170+
linked_datasets = ["article-4-direction", "tree-preservation-order"]
170171
if row[self.entity_field]:
171-
if (
172-
row.get("article-4-direction", "")
173-
or row.get("tree-preservation-order", "").strip()
174-
):
175-
linked_dataset = (
176-
"article-4-direction"
177-
if "article-4-direction" in row
178-
else "tree-preservation-order"
179-
)
180-
181-
# check applied for organisations that have provided a document dataset
182-
if not hasattr(
183-
self, "lpa_list"
184-
): # check if data fetched already
185-
params = urllib.parse.urlencode(
186-
{
187-
"sql": f"""select organisation from provision_summary where active_endpoint_count > 0 and dataset == '{linked_dataset}'""",
188-
"_size": "max",
189-
}
172+
for linked_dataset in linked_datasets:
173+
if (
174+
row.get(linked_dataset, "")
175+
or row.get(linked_dataset, "").strip()
176+
):
177+
get_organisations = pd.read_csv(
178+
self.provision_summary_dir + linked_dataset + ".csv"
190179
)
191-
base_url = f"https://datasette.planning.data.gov.uk/performance.csv?{params}"
192-
193-
max_retries = 60 # Retry for an hour
194-
for attempt in range(max_retries):
195-
try:
196-
get_lpa = pd.read_csv(base_url)
197-
self.lpa_list = get_lpa["organisation"].to_list()
198-
break
199-
except urllib.error.HTTPError:
200-
if attempt < max_retries - 1:
201-
time.sleep(60)
202-
else:
203-
raise Exception(
204-
"Failed to fetch datasette after multiple attempts"
205-
)
206-
207-
if row.get("organisation", "") in self.lpa_list:
208-
reference = row.get(linked_dataset, "")
209180

210-
find_entity = self.lookup(
211-
prefix=linked_dataset,
212-
organisation=row.get("organisation", ""),
213-
reference=row.get(linked_dataset, ""),
214-
)
215-
# raise issue if the found entity is retired in old-entity.csv
216-
if not find_entity or (
217-
str(find_entity) in self.redirect_lookups
218-
and int(
219-
self.redirect_lookups[str(find_entity)].get(
220-
"status", 0
221-
)
222-
)
223-
== 410
181+
if (
182+
row.get("organisation", "")
183+
in get_organisations["organisation"].values
224184
):
225-
self.issues.log_issue(
226-
linked_dataset,
227-
"no associated documents found for this area",
228-
reference,
229-
line_number=line_number,
185+
reference = row.get(linked_dataset, "")
186+
find_entity = self.lookup(
187+
prefix=linked_dataset,
188+
organisation=row.get("organisation", ""),
189+
reference=reference,
230190
)
191+
# raise issue if the found entity is retired in old-entity.csv
192+
if not find_entity or (
193+
str(find_entity) in self.redirect_lookups
194+
and int(
195+
self.redirect_lookups[str(find_entity)].get(
196+
"status", 0
197+
)
198+
)
199+
== 410
200+
):
201+
self.issues.log_issue(
202+
linked_dataset,
203+
"no associated documents found for this area",
204+
reference,
205+
line_number=line_number,
206+
)
231207
yield block
232208

233209

tests/unit/phase/test_lookup.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,11 @@ def test_no_associated_documents_issue(
151151
}
152152
issues = IssueLog()
153153

154-
phase = LookupPhase(lookups=lookups, issue_log=issues)
154+
phase = LookupPhase(
155+
lookups=lookups,
156+
issue_log=issues,
157+
provision_summary_dir="var/cache/provision-summary/",
158+
)
155159
phase.entity_field = "entity"
156160
mock_df = pd.DataFrame({"organisation": ["local-authority:ABC"]})
157161
mocker.patch("pandas.read_csv", return_value=mock_df)
@@ -175,7 +179,11 @@ def test_no_associated_documents_issue_for_missing_dataset(
175179
}
176180
issues = IssueLog()
177181

178-
phase = LookupPhase(lookups=lookups, issue_log=issues)
182+
phase = LookupPhase(
183+
lookups=lookups,
184+
issue_log=issues,
185+
provision_summary_dir="var/cache/provision-summary/",
186+
)
179187
phase.entity_field = "entity"
180188
mock_df = pd.DataFrame({"organisation": ["local-authority:XYZ"]})
181189
mocker.patch("pandas.read_csv", return_value=mock_df)
@@ -197,7 +205,10 @@ def test_no_associated_documents_issue_for_retired_entity(
197205
redirect_lookups = {"1": {"entity": "", "status": "410"}}
198206

199207
phase = LookupPhase(
200-
lookups=lookups, redirect_lookups=redirect_lookups, issue_log=issues
208+
lookups=lookups,
209+
redirect_lookups=redirect_lookups,
210+
issue_log=issues,
211+
provision_summary_dir="var/cache/provision-summary/",
201212
)
202213
phase.entity_field = "entity"
203214
mock_df = pd.DataFrame({"organisation": ["local-authority:ABC"]})

0 commit comments

Comments
 (0)