Commit 320ef2e

Merge branch 'main' into feat/parquet_owen
Merge changes made to the main branch into this branch.
2 parents bf3566b + 3b8fcb4 commit 320ef2e

File tree

11 files changed (+166, -28 lines)

digital_land/cli.py (+11 -4)

@@ -84,11 +84,17 @@ def fetch_cmd(ctx, url):
     type=click.Path(exists=True),
     default="collection/endpoint.csv",
 )
+@click.option("--refill-todays-logs", default=False)
 @collection_dir
 @click.pass_context
-def collect_cmd(ctx, endpoint_path, collection_dir):
+def collect_cmd(ctx, endpoint_path, collection_dir, refill_todays_logs):
     """fetch resources from collection endpoints"""
-    return collect(endpoint_path, collection_dir, ctx.obj["PIPELINE"])
+    return collect(
+        endpoint_path,
+        collection_dir,
+        ctx.obj["PIPELINE"],
+        refill_todays_logs=refill_todays_logs,
+    )


 #
@@ -110,9 +116,10 @@ def collection_pipeline_makerules_cmd(collection_dir):


 @cli.command("collection-save-csv", short_help="save collection as CSV package")
+@click.option("--refill-todays-logs", default=False)
 @collection_dir
-def collection_save_csv_cmd(collection_dir):
-    return collection_save_csv(collection_dir)
+def collection_save_csv_cmd(collection_dir, refill_todays_logs):
+    return collection_save_csv(collection_dir, refill_todays_logs)


 @cli.command(
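
Note: because the new option is declared with default=False and no explicit type or is_flag, click infers a BOOL parameter from the default, so it takes a value rather than acting as a bare flag. A minimal usage sketch via click's test runner (the cli group import path comes from this file; the prepared collection directory is assumed):

    from click.testing import CliRunner
    from digital_land.cli import cli

    runner = CliRunner()
    # click converts "True" to a boolean because the option's default is False
    result = runner.invoke(
        cli, ["collection-save-csv", "--refill-todays-logs", "True"]
    )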

digital_land/collect.py (+18 -11)

@@ -51,18 +51,23 @@ def log_path(self, log_datetime, endpoint):
         log_date = log_datetime.isoformat()[:10]
         return os.path.join(self.log_dir, log_date, endpoint + ".json")

-    def save_log(self, path, log):
-        self.save(path, canonicaljson.encode_canonical_json(log))
+    def save_log(self, path, log, refill_todays_logs=False):
+        self.save(
+            path,
+            canonicaljson.encode_canonical_json(log),
+            refill_todays_logs=refill_todays_logs,
+        )

     def save_content(self, content):
         resource = hashlib.sha256(content).hexdigest()
         path = os.path.join(self.resource_dir, resource)
         self.save(path, content)
         return resource

-    def save(self, path, data):
+    def save(self, path, data, refill_todays_logs=False):
         os.makedirs(os.path.dirname(path), exist_ok=True)
-        if not os.path.exists(path):
+        # if refill_todays_logs=True then files in log_path need to be overwritten
+        if not os.path.exists(path) or refill_todays_logs:
             logging.info(path)
             with open(path, "wb") as f:
                 f.write(data)
@@ -126,6 +131,7 @@ def fetch(
         log_datetime=datetime.utcnow(),
         end_date="",
         plugin="",
+        refill_todays_logs=False,
     ):
         if end_date and datetime.strptime(end_date, "%Y-%m-%d") < log_datetime:
             return FetchStatus.EXPIRED
@@ -139,11 +145,12 @@ def fetch(
             )
             return FetchStatus.HASH_FAILURE

-        # fetch each source at most once per-day
+        # fetch each source at most once per-day, though with an option to re-collect the latest day's sources
         log_path = self.log_path(log_datetime, endpoint)
-        if os.path.isfile(log_path):
-            logging.debug(f"{log_path} exists")
-            return FetchStatus.ALREADY_FETCHED
+        if not refill_todays_logs:
+            if os.path.isfile(log_path):
+                logging.debug(f"{log_path} exists")
+                return FetchStatus.ALREADY_FETCHED

         log = {
             "endpoint-url": url,
@@ -167,8 +174,7 @@ def fetch(
         log["elapsed"] = str(round(timer() - start, 3))

         status = self.save_resource(content, log_path, log)
-
-        self.save_log(log_path, log)
+        self.save_log(log_path, log, refill_todays_logs=refill_todays_logs)
         return status

     def save_resource(self, content, url, log):
@@ -182,7 +188,7 @@ def save_resource(self, content, url, log):

         return FetchStatus.FAILED

-    def collect(self, endpoint_path):
+    def collect(self, endpoint_path, refill_todays_logs=False):
         for row in csv.DictReader(open(endpoint_path, newline="")):
             endpoint = row["endpoint"]
             url = row["endpoint-url"]
@@ -197,4 +203,5 @@ def collect(self, endpoint_path):
                 endpoint=endpoint,
                 end_date=row.get("end-date", ""),
                 plugin=plugin,
+                refill_todays_logs=refill_todays_logs,
             )
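
Taken together, a sketch of the resulting behaviour (constructor arguments mirror digital_land/commands.py; the dataset name and URL are placeholders, and the HTTP call would need to be mocked as the unit tests do with the responses library):

    from pathlib import Path
    from digital_land.collect import Collector, FetchStatus

    collector = Collector("my-dataset", Path("collection"))

    # the first fetch of the day writes collection/log/YYYY-MM-DD/<endpoint>.json
    assert collector.fetch("http://some.url") == FetchStatus.OK

    # a repeat fetch the same day is normally short-circuited
    assert collector.fetch("http://some.url") == FetchStatus.ALREADY_FETCHED

    # refill_todays_logs skips that check and lets save() overwrite today's log
    assert collector.fetch("http://some.url", refill_todays_logs=True) == FetchStatus.OK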

digital_land/collection.py (+4 -2)

@@ -354,7 +354,7 @@ def save_csv(self, directory=None):
         self.log.save_csv(directory=directory)
         self.resource.save_csv(directory=directory)

-    def load(self, directory=None):
+    def load(self, directory=None, refill_todays_logs=False):
         directory = directory or self.dir
         self.source.load(directory=directory)
         self.endpoint.load(directory=directory)
@@ -363,7 +363,9 @@

         # Try to load log store from csv first
         try:
-            self.log.load_csv(directory=directory)
+            self.log.load_csv(
+                directory=directory, refill_todays_logs=refill_todays_logs
+            )
             logging.info(f"Log loaded from CSV - {len(self.log.entries)} entries")
         except FileNotFoundError:
             logging.info("No log.csv - building from log items")

digital_land/commands.py (+4 -4)

@@ -86,10 +86,10 @@ def fetch(url, pipeline):
     collector.fetch(url)


-def collect(endpoint_path, collection_dir, pipeline):
+def collect(endpoint_path, collection_dir, pipeline, refill_todays_logs=False):
     """fetch the sources listed in the endpoint-url column of the ENDPOINT_PATH CSV file"""
     collector = Collector(pipeline.name, Path(collection_dir))
-    collector.collect(endpoint_path)
+    collector.collect(endpoint_path, refill_todays_logs=refill_todays_logs)


 #
@@ -109,9 +109,9 @@ def collection_pipeline_makerules(collection_dir):
     collection.pipeline_makerules()


-def collection_save_csv(collection_dir):
+def collection_save_csv(collection_dir, refill_todays_logs=False):
     collection = Collection(name=None, directory=collection_dir)
-    collection.load()
+    collection.load(refill_todays_logs=refill_todays_logs)
     collection.update()
     collection.save_csv()

digital_land/expectations/operation.py (+14 -2)

@@ -2,6 +2,7 @@
 import pandas as pd
 import urllib
 import os
+import time


 # # TODO is there a way to represent this in a generalised count or not
@@ -39,7 +40,7 @@ def count_lpa_boundary(
         lpa_geometry = data["geometry"]
     except requests.exceptions.RequestException as err:
         passed = False
-        message = f"An error occured when retrieving lpa geometry from platform {err}"
+        message = f"An error occurred when retrieving lpa geometry from platform {err}"
         details = {}
         return passed, message, details

@@ -142,7 +143,18 @@ def count_deleted_entities(
         }
     )
     base_url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
-    get_resource = pd.read_csv(base_url)
+
+    # fetching from datasette can fail intermittently; if so, wait a minute and retry
+    max_retries = 60  # Retry for an hour
+    for attempt in range(max_retries):
+        try:
+            get_resource = pd.read_csv(base_url)
+            break
+        except urllib.error.HTTPError:
+            time.sleep(60)
+    else:
+        raise Exception("Failed to fetch datasette after multiple attempts")
+
     resource_list = get_resource["resource"].to_list()

     # use resource list to get current entities
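
The retry relies on Python's for/else: the else branch runs only when the loop finishes without hitting break, i.e. when every attempt raised HTTPError. A standalone sketch of the same pattern (the function name and parameters are illustrative, not part of this commit):

    import time
    import urllib.error

    import pandas as pd


    def read_csv_with_retry(url, max_retries=60, wait_seconds=60):
        for attempt in range(max_retries):
            try:
                return pd.read_csv(url)
            except urllib.error.HTTPError:
                time.sleep(wait_seconds)
        # reached only when all attempts failed; with an early return this
        # trailing raise is equivalent to the diff's for/else
        raise Exception("Failed to fetch datasette after multiple attempts")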

digital_land/phase/patch.py (+2)

@@ -16,6 +16,8 @@ def __init__(
     def apply_patch(self, fieldname, value):
         patches = {**self.patch.get(fieldname, {}), **self.patch.get("", {})}
         for pattern, replacement in patches.items():
+            if pattern == value:
+                pattern = f"^{re.escape(pattern)}$"
             match = re.match(pattern, value, flags=re.IGNORECASE)
             if match:
                 newvalue = match.expand(replacement)
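
The guard handles patch keys that are meant as literal strings but contain regex metacharacters: a URL key with "?" would otherwise fail to match itself, because re.match treats "?" as a quantifier. A small sketch using the value from the new unit test below:

    import re

    pattern = "https://example.com/search?query=data&filter=name%20contains%20test"
    value = pattern

    # unescaped, the "?" makes the preceding "h" optional and the match fails
    assert re.match(pattern, value, flags=re.IGNORECASE) is None

    # escaping and anchoring turns the key into an exact, case-insensitive match
    escaped = f"^{re.escape(pattern)}$"
    assert re.match(escaped, value, flags=re.IGNORECASE) is not None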

digital_land/store/csv.py (+14 -4)

@@ -4,19 +4,29 @@
 import csv
 import logging
 from pathlib import Path
+from datetime import datetime
 from .memory import MemoryStore


 class CSVStore(MemoryStore):
     def csv_path(store, directory=""):
         return Path(directory) / (store.schema.name + ".csv")

-    def load_csv(self, path=None, directory=""):
+    def load_csv(self, path=None, directory="", refill_todays_logs=False):
         path = path or self.csv_path(directory)
-        logging.debug("loading %s" % (path))
+        today = datetime.now().date()
+        logging.debug("loading %s" % path)
         reader = csv.DictReader(open(path, newline=""))
         for row in reader:
-            self.add_entry(row)
+            if not refill_todays_logs:
+                self.add_entry(row)
+            else:
+                # Don't load today's log entries so they can be overwritten
+                if (
+                    "entry-date" in row
+                    and datetime.fromisoformat(row["entry-date"]).date() < today
+                ):
+                    self.add_entry(row)

     def load(self, *args, **kwargs):
         self.load_csv(*args, **kwargs)
@@ -28,7 +38,7 @@ def save_csv(self, path=None, directory="", entries=None):
             entries = self.entries

         os.makedirs(os.path.dirname(path), exist_ok=True)
-        logging.debug("saving %s" % (path))
+        logging.debug("saving %s" % path)
         f = open(path, "w", newline="")
         writer = csv.DictWriter(
             f, fieldnames=self.schema.fieldnames, extrasaction="ignore"
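
The date filter keeps only rows strictly older than today, so a re-run can regenerate today's entries. A quick sketch of the comparison (the entry-date values are illustrative; datetime.fromisoformat accepts the ISO strings the collector writes):

    from datetime import datetime

    today = datetime.now().date()

    old_row = {"entry-date": "2019-01-01T00:00:00"}
    new_row = {"entry-date": datetime.now().isoformat()}

    # kept: dated before today
    assert datetime.fromisoformat(old_row["entry-date"]).date() < today
    # skipped: dated today, so a fresh collection run can rewrite it
    assert not datetime.fromisoformat(new_row["entry-date"]).date() < today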

setup.py (+1)

@@ -55,6 +55,7 @@ def get_long_description():
         "json-stream",
         "duckdb",
         "dask",
+        "dask[dataframe]",
         "pyarrow",
         "pygit2",
     ],

tests/integration/test_collection.py (+69)

@@ -426,6 +426,75 @@ def test_collection_update_today(test_collection_update_fixture):
     assert collection.resource.entries[0]["end-date"] == ""


+def test_collection_update_refill_todays_logs(tmp_path):
+    collection_dir = os.path.join(tmp_path, "collection")
+    os.makedirs(collection_dir, exist_ok=True)
+
+    # Write the existing log and resource file
+    _write_csv(
+        dir=collection_dir,
+        log={
+            "bytes": "2",
+            "content-type": "",
+            "elapsed": "0.5",
+            "endpoint": "test",
+            "resource": "test",
+            "status": "200",
+            "entry-date": datetime.datetime.now().isoformat(),
+            "start-date": datetime.datetime.now().isoformat(),
+            "end-date": "",
+            "exception": "",
+        },
+        resource={
+            "resource": "test",
+            "bytes": "2",
+            "organisations": "test",
+            "datasets": "test",
+            "endpoints": "test",
+            "start-date": "2019-01-01",
+            "end-date": "",
+        },
+    )
+
+    # Write the endpoint/source for the new log item
+    _write_csv(
+        dir=collection_dir,
+        endpoint={
+            "endpoint": "test",
+            "endpoint-url": "test.com",
+            "parameters": "",
+            "plugin": "",
+            "entry-date": "2019-01-01",
+            "start-date": "2019-01-01",
+            "end-date": "",
+        },
+        source={
+            "source": "test1",
+            "attribution": "",
+            "collection": "test",
+            "documentation-url": "testing.com",
+            "endpoint": "test",
+            "licence": "test",
+            "organisation": "test-org",
+            "pipelines": "test",
+            "entry-date": "2019-01-01",
+            "start-date": "2019-01-01",
+            "end-date": "",
+        },
+    )
+
+    collection = Collection(directory=collection_dir)
+
+    # Load from CSVs
+    # With refill_todays_logs=True it shouldn't load today's log
+    collection.load(refill_todays_logs=True)
+    assert len(collection.log.entries) == 0
+
+    # While False it should load today's log as normal
+    collection.load(refill_todays_logs=False)
+    assert len(collection.log.entries) == 1
+
+
 def test_collection_retire_endpoints_and_sources(tmp_path):

     # Create a temporary directory for the test collection

tests/unit/test_collect.py (+9)

@@ -59,6 +59,15 @@ def test_already_fetched(collector, prepared_response):
     assert new_status == FetchStatus.ALREADY_FETCHED


+@responses.activate
+def test_refill_todays_logs(collector, prepared_response):
+    status = collector.fetch("http://some.url")
+    assert status == FetchStatus.OK
+
+    new_status = collector.fetch("http://some.url", refill_todays_logs=True)
+    assert new_status == FetchStatus.OK
+
+
 @responses.activate
 def test_expired(collector):
     yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

tests/unit/test_patch.py (+20 -1)

@@ -14,7 +14,10 @@ def test_patch_regex():
             "^2\\*$": "II*",
             "^2 Star$": "II*",
             "^3$": "III",
-        }
+        },
+        "OrganisationURI": {
+            "https://example.com/search?query=data&filter=name%20contains%20test": "patch_organisation",
+        },
     }

     p = PatchPhase(patches=patches, issues=issues)
@@ -48,3 +51,19 @@ def test_patch_regex():
     assert issue["issue-type"] == "patch"
     assert issue["value"] == "2 Star"
     assert issues.rows == []
+
+    assert (
+        p.apply_patch(
+            "OrganisationURI",
+            "https://example.com/search?query=data&filter=name%20contains%20test",
+        )
+        == "patch_organisation"
+    )
+    issue = issues.rows.pop()
+    assert issue["field"] == "OrganisationURI"
+    assert issue["issue-type"] == "patch"
+    assert (
+        issue["value"]
+        == "https://example.com/search?query=data&filter=name%20contains%20test"
+    )
+    assert issues.rows == []
