digital-land · CarlosCoelhoSL · Feb 26, 2025 · Dec 9, 2024 · Dec 12, 2024 · Feb 14, 2025
diff --git a/Makefile b/Makefile
@@ -26,14 +26,16 @@ endif
 DATASTORE_URL = https://files.planning.data.gov.uk/
 
+# Each recipe line runs in its own shell, so PYTHONPATH is set on the python3
+# commands themselves; the bin/*.py scripts import from the "bin" package.
 first-pass::
 	mkdir -p dataset/
 	bin/download-collection.sh
 	bin/download-pipeline.sh
 	bin/concat.sh
-	bin/download-issues.sh
+	PYTHONPATH="$(CURDIR)" python3 bin/download-issues.py
 	bin/download-operational-issues.sh
-	bin/download-column-field.sh
-	bin/download-converted-resources.sh
+	PYTHONPATH="$(CURDIR)" python3 bin/download-column-field.py
+	PYTHONPATH="$(CURDIR)" python3 bin/download-converted-resources.py
 	#bin/download-resources.sh
 	./bin/concat-issues.py
 	./bin/concat-column-field.py

diff --git a/bin/__init__.py b/bin/__init__.py
diff --git a/bin/download-column-field.py b/bin/download-column-field.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Download the column-field CSV of every resource/pipeline pair found under collection/."""
+
+import logging
+import click
+from datetime import datetime
+
+from bin.resources import get_resources
+from file_downloader import download_urls
+
+logger = logging.getLogger(__name__)
+
+@click.command()
+@click.option(
+    "--timestamp",
+    default=None,
+    help="Value sent as the ?version= query parameter (default: current hour as a Unix timestamp)",
+)
+def download_column_field(timestamp=None):
+    """Queue one column-field CSV download per resource/pipeline pair.
+
+    Resources are read with get_resources("collection/"); each URL is mapped
+    to its local path under var/column-field/ and the whole map is passed to
+    download_urls.
+
+    timestamp: value appended to every URL as ?version=...; when omitted, the
+    current local time truncated to the hour is used (as a Unix timestamp).
+    """
+    resources = get_resources("collection/")
+    url_map = {}
+    if timestamp is None:
+        # Only fall back to the current hour when --timestamp was not supplied.
+        now = datetime.now()
+        timestamp = int(now.replace(minute=0, second=0, microsecond=0).timestamp())
+
+    for resource, details in resources.items():
+        collection = details["collection"]
+        for pipeline in details["pipelines"]:
+            if not pipeline:
+                # No pipeline name for this resource: log it and skip the download.
+                logger.error(f"no pipeline for {resource} in {collection} so cannot download")
+            else:
+                url = f"https://files.planning.data.gov.uk/{collection}-collection/var/column-field/{pipeline}/{resource}.csv?version={timestamp}"
+                output_path = f"var/column-field/{pipeline}/{resource}.csv"
+                url_map[url] = output_path
+
+    download_urls(url_map)
+
+
+if __name__ == "__main__":
+    download_column_field()
diff --git a/bin/download-column-field.sh b/bin/download-column-field.sh
diff --git a/bin/download-converted-resources.py b/bin/download-converted-resources.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Download the converted-resource CSV of every resource/pipeline pair found under collection/."""
+
+import logging
+import click
+from datetime import datetime
+
+from bin.resources import get_resources
+from file_downloader import download_urls
+
+logger = logging.getLogger(__name__)
+
+@click.command()
+@click.option(
+    "--timestamp",
+    default=None,
+    help="Value sent as the ?version= query parameter (default: current hour as a Unix timestamp)",
+)
+def download_converted_resource(timestamp=None):
+    """Queue one converted-resource CSV download per resource/pipeline pair.
+
+    Resources are read with get_resources("collection/"); each URL is mapped
+    to its local path under var/converted-resource/ and the whole map is
+    passed to download_urls.
+
+    timestamp: value appended to every URL as ?version=...; when omitted, the
+    current local time truncated to the hour is used (as a Unix timestamp).
+    """
+    resources = get_resources("collection/")
+    url_map = {}
+    if timestamp is None:
+        # Only fall back to the current hour when --timestamp was not supplied.
+        now = datetime.now()
+        timestamp = int(now.replace(minute=0, second=0, microsecond=0).timestamp())
+
+    for resource, details in resources.items():
+        collection = details["collection"]
+        for pipeline in details["pipelines"]:
+            if not pipeline:
+                # No pipeline name for this resource: log it and skip the download.
+                logger.error(f"no pipeline for {resource} in {collection} so cannot download")
+            else:
+                url = f"https://files.planning.data.gov.uk/{collection}-collection/var/converted-resource/{pipeline}/{resource}.csv?version={timestamp}"
+                output_path = f"var/converted-resource/{pipeline}/{resource}.csv"
+                url_map[url] = output_path
+
+    download_urls(url_map)
+
+
+if __name__ == "__main__":
+    download_converted_resource()
diff --git a/bin/download-converted-resources.sh b/bin/download-converted-resources.sh
diff --git a/bin/download-issues.py b/bin/download-issues.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""Download the issue CSV of every resource/pipeline pair found under collection/."""
+
+import logging
+import click
+from datetime import datetime
+
+from bin.resources import get_resources
+from file_downloader import download_urls
+
+logger = logging.getLogger(__name__)
+
+@click.command()
+@click.option(
+    "--timestamp",
+    default=None,
+    help="Value sent as the ?version= query parameter (default: current hour as a Unix timestamp)",
+)
+def download_issues(timestamp=None):
+    """Queue one issue CSV download per resource/pipeline pair.
+
+    Resources are read with get_resources("collection/"); each URL is mapped
+    to its local path under var/issue/ and the whole map is passed to
+    download_urls.
+
+    timestamp: value appended to every URL as ?version=...; when omitted, the
+    current local time truncated to the hour is used (as a Unix timestamp).
+    """
+    resources = get_resources("collection/")
+    url_map = {}
+    if timestamp is None:
+        # Only fall back to the current hour when --timestamp was not supplied.
+        now = datetime.now()
+        timestamp = int(now.replace(minute=0, second=0, microsecond=0).timestamp())
+
+    for resource, details in resources.items():
+        collection = details["collection"]
+        for pipeline in details["pipelines"]:
+            if not pipeline:
+                # No pipeline name for this resource: log it and skip the download.
+                logger.error(f"no pipeline for {resource} in {collection} so cannot download")
+            else:
+                url = f"https://files.planning.data.gov.uk/{collection}-collection/issue/{pipeline}/{resource}.csv?version={timestamp}"
+                output_path = f"var/issue/{pipeline}/{resource}.csv"
+                url_map[url] = output_path
+
+    download_urls(url_map)
+
+
+if __name__ == "__main__":
+    download_issues()
diff --git a/bin/download-issues.sh b/bin/download-issues.sh
diff --git a/bin/file_downloader.py b/bin/file_downloader.py
@@ -10,17 +10,21 @@
 
 logger = logging.getLogger("__name__")
 
-def download_file(url, output_path,raise_error=False):
+def download_file(url, output_path,raise_error=False, max_retries=5):
     """Downloads a file using urllib and saves it to the output directory."""
     output_path = Path(output_path)
     output_path.parent.mkdir(parents=True,exist_ok=True)
-    try:
-        urlretrieve(url, output_path)
-    except Exception as e:
-        if raise_error:
-            raise e
-        else:
-            logger.error(f"error downloading file {e}")
+    # Try up to max_retries times; stop at the first successful download.
+    for attempt in range(1, max_retries + 1):
+        try:
+            urlretrieve(url, output_path)
+            return
+        except Exception as e:
+            logger.error(f"error downloading file from url {url}: {e}")
+            # Propagate only once every attempt has failed, so callers using
+            # raise_error=True also get the retries.
+            if raise_error and attempt == max_retries:
+                raise
 
 def download_urls(url_map, max_threads=4):
     """Downloads multiple files concurrently using threads."""

diff --git a/bin/resources.py b/bin/resources.py
@@ -4,15 +4,7 @@
 import csv
 import click
 
-@click.command()
-@click.option(
-    "--input-dir", 
-    default="collection/", 
-    help="Directory containing the CSV files"
-)
-def process_data(input_dir):
-    """Process CSV files to map resources, endpoints, and pipelines."""
-
+def get_resources(input_dir):
     endpoints = {}
     resources = {}
 
@@ -74,12 +66,22 @@ def process_data(input_dir):
     except FileNotFoundError:
         print("Error: resource.csv not found", file=sys.stderr)
         sys.exit(1)
+    return resources
 
+@click.command()
+@click.option(
+    "--input-dir", 
+    default="collection/", 
+    help="Directory containing the CSV files"
+)
+def process_data(input_dir):
+    """Process CSV files to map resources, endpoints, and pipelines."""
+    resources = get_resources(input_dir)
     # Print results
     for resource in resources:
         collection = resources[resource]["collection"]
         for pipeline in resources[resource]["pipelines"]:
             print(collection, pipeline, resource)
 
 if __name__ == "__main__":
-    process_data()
+    process_data()
diff --git a/makerules/makerules.mk b/makerules/makerules.mk
@@ -35,6 +35,9 @@ ifeq ($(CACHE_DIR),)
 CACHE_DIR=$(VAR_DIR)cache/
 endif
 
+ifeq ($(DATASETTE_URL),)
+DATASETTE_URL=https://datasette.planning.data.gov.uk/
+endif
 
 .PHONY: \
 	makerules\
@@ -154,6 +157,13 @@ else
 	curl -qfs "$(DATASTORE_URL)organisation-collection/dataset/organisation.csv" > $(CACHE_DIR)organisation.csv
 endif
 
+# Download to a temporary file first so a failed curl never leaves an empty,
+# up-to-date-looking provision_summary.csv behind.
+$(CACHE_DIR)provision_summary.csv:
+	@mkdir -p $(@D)
+	curl -qfs "$(DATASETTE_URL)performance.csv?sql=select%20organisation%2C%20dataset%2C%20active_endpoint_count%20from%20provision_summary%20where%20active_endpoint_count%20%3E%200%20order%20by%20organisation%3B&_size=max" > $@.tmp
+	mv -f $@.tmp $@
+
 init:: config
 
 config::;

diff --git a/makerules/pipeline.mk b/makerules/pipeline.mk
@@ -121,7 +121,7 @@ endif
 
 define run-pipeline
 	mkdir -p $(@D) $(ISSUE_DIR)$(notdir $(@D)) $(OPERATIONAL_ISSUE_DIR) $(OUTPUT_LOG_DIR) $(COLUMN_FIELD_DIR)$(notdir $(@D)) $(DATASET_RESOURCE_DIR)$(notdir $(@D)) $(CONVERTED_RESOURCE_DIR)$(notdir $(@D))
-	digital-land ${DIGITAL_LAND_OPTS} --dataset $(notdir $(@D)) --pipeline-dir $(PIPELINE_DIR) $(DIGITAL_LAND_FLAGS) pipeline $(1) --issue-dir $(ISSUE_DIR)$(notdir $(@D)) --column-field-dir $(COLUMN_FIELD_DIR)$(notdir $(@D)) --dataset-resource-dir $(DATASET_RESOURCE_DIR)$(notdir $(@D)) --converted-resource-dir $(CONVERTED_RESOURCE_DIR)$(notdir $(@D)) --config-path $(CACHE_DIR)config.sqlite3 --organisation-path $(CACHE_DIR)organisation.csv $(PIPELINE_FLAGS) $< $@
+	digital-land ${DIGITAL_LAND_OPTS} --dataset $(notdir $(@D)) --pipeline-dir $(PIPELINE_DIR) $(DIGITAL_LAND_FLAGS) pipeline $(1) --issue-dir $(ISSUE_DIR)$(notdir $(@D)) --column-field-dir $(COLUMN_FIELD_DIR)$(notdir $(@D)) --dataset-resource-dir $(DATASET_RESOURCE_DIR)$(notdir $(@D)) --converted-resource-dir $(CONVERTED_RESOURCE_DIR)$(notdir $(@D)) --config-path $(CACHE_DIR)config.sqlite3 --organisation-path $(CACHE_DIR)organisation.csv --provision-summary-dir $(CACHE_DIR)provision_summary.csv $(PIPELINE_FLAGS) $< $@
 endef
 
 define build-dataset =
@@ -172,7 +172,7 @@ clean::
 
 # local copy of the organisation dataset
 # Download historic operational issue log data for relevant datasets
-init:: $(CACHE_DIR)organisation.csv
+init:: $(CACHE_DIR)organisation.csv $(CACHE_DIR)provision_summary.csv
 ifeq ($(COLLECTION_DATASET_BUCKET_NAME),)
 	@datasets=$$(awk -F , '$$2 == "$(COLLECTION_NAME)" {print $$4}' specification/dataset.csv); \
 	for dataset in $$datasets; do \

diff --git a/requirements.txt b/requirements.txt
@@ -7,6 +7,7 @@ chardet
 PyPDF2
 pandas
 tqdm
+click
 pyarrow
 click==8.1.8
 -e git+https://github.com/digital-land/digital-land-python.git#egg=digital-land
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,6 +7,7 @@ chardet @@
     PyPDF2
     pandas
     tqdm
+    click
     pyarrow
     click==8.1.8
     -e git+https://github.com/digital-land/digital-land-python.git#egg=digital-land