diff --git a/Makefile b/Makefile index edb6d7c..37afe11 100644 --- a/Makefile +++ b/Makefile @@ -30,17 +30,16 @@ first-pass:: bin/download-collection.sh bin/download-pipeline.sh bin/concat.sh - bin/download-issues.sh + python bin/download_issues.py bin/download-operational-issues.sh - bin/download-column-field.sh - bin/download-converted-resources.sh + python bin/download_column_field.py + python bin/download_converted_resources.py #bin/download-resources.sh ./bin/concat-issues.py ./bin/concat-column-field.py ./bin/concat-converted-resource.py python3 bin/download_expectations.py - second-pass:: $(DB) third-pass:: $(DB_PERF) diff --git a/bin/__init__.py b/bin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bin/concat-column-field.py b/bin/concat-column-field.py index 8ba8366..ed0b92c 100755 --- a/bin/concat-column-field.py +++ b/bin/concat-column-field.py @@ -32,14 +32,16 @@ def process_column_fields(column_field_dir, input_dir): w.writeheader() - for path in glob.glob(f"{input_dir}/*/*.csv"): - m = re.search(r"/([a-zA-Z0-9_-]+)/([a-f0-9]+).csv$", path) - pipeline = m.group(1) - resource = m.group(2) - - with open(path, newline="") as infile: - for row in csv.DictReader(infile): - row["resource"] = resource - row["dataset"] = pipeline - w.writerow(row) - + for path in glob.glob(f"{input_dir}/*/*.csv"): + m = re.search(r"/([a-zA-Z0-9_-]+)/([a-f0-9]+).csv$", path) + pipeline = m.group(1) + resource = m.group(2) + + with open(path, newline="") as infile: + for row in csv.DictReader(infile): + row["resource"] = resource + row["dataset"] = pipeline + w.writerow(row) + +if __name__ == "__main__": + process_column_fields() \ No newline at end of file diff --git a/bin/download-column-field.sh b/bin/download-column-field.sh deleted file mode 100755 index 084b342..0000000 --- a/bin/download-column-field.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh - -s3="https://files.planning.data.gov.uk/" -column_field_dir="column-field/" -timestamp=`date +%s` - -python3 bin/resources.py | -while read collection pipeline resource -do - dir=var/column-field/$pipeline - path=$dir/$resource.csv - - echo collection: $collection pipeline: $pipeline resource: $resource - - if [ ! -f $path ] ; then - mkdir -p $dir - set -x - curl -qsfL $flags "$s3$collection-collection/var/column-field/$pipeline/$resource.csv?version=$timestamp" > $path - set +x - fi -done \ No newline at end of file diff --git a/bin/download-converted-resources.sh b/bin/download-converted-resources.sh deleted file mode 100755 index b3dbe4e..0000000 --- a/bin/download-converted-resources.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh -set -e - -s3="https://files.planning.data.gov.uk/" -dir=var/converted-resource -timestamp=`date +%s` - -python3 bin/resources.py | -while read collection pipeline resource -do - path=$dir/$pipeline/$resource.csv - - echo collection: $collection pipeline: $pipeline resource: $resource - - if [ ! -f $path ] ; then - mkdir -p $dir/$pipeline - set -x - curl -qsfL $flags "$s3$collection-collection/$dir/$pipeline/$resource.csv?version=$timestamp" -o $path ||: - set +x - fi -done \ No newline at end of file diff --git a/bin/download-issues.sh b/bin/download-issues.sh deleted file mode 100755 index 8029b7d..0000000 --- a/bin/download-issues.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh - -#set -e - -s3="https://files.planning.data.gov.uk/" -timestamp=`date +%s` - -python3 bin/resources.py | -while read collection pipeline resource -do - # https://digital-land-production-collection-dataset.s3.eu-west-2.amazonaws.com/{COLLECTION}-collection/issue/{PIPELINE}/{RESOURCE}.csv - dir=var/issue/$pipeline - path=$dir/$resource.csv - - if [ ! -f $path ] ; then - mkdir -p $dir - set -x - curl -qsfL $flags "$s3$collection-collection/issue/$pipeline/$resource.csv?version=$timestamp" > $path - set +x - fi -done diff --git a/bin/download_column_field.py b/bin/download_column_field.py new file mode 100644 index 0000000..8315a6f --- /dev/null +++ b/bin/download_column_field.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import logging +import click +from datetime import datetime + +from resources import get_resources +from file_downloader import download_urls + +logger = logging.getLogger("__name__") + +@click.command() +def download_column_field(timestamp=None): + resources = get_resources("collection/") + url_map = {} + now = datetime.now() + timestamp = int(now.replace(minute=0, second=0, microsecond=0).timestamp()) + + for resource in resources: + collection = resources[resource]["collection"] + for pipeline in resources[resource]["pipelines"]: + if not pipeline: + logger.error(f"no pipeline for {resource} in {collection} so cannot download") + else: + url = f"https://files.planning.data.gov.uk/{collection}-collection/var/column-field/{pipeline}/{resource}.csv?version={timestamp}" + output_path = f"var/column-field/{pipeline}/{resource}.csv" + url_map[url]=output_path + + download_urls(url_map) + + + +if __name__ == "__main__": + download_column_field() \ No newline at end of file diff --git a/bin/download_converted_resources.py b/bin/download_converted_resources.py new file mode 100644 index 0000000..e6f501c --- /dev/null +++ b/bin/download_converted_resources.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import logging +import click +from datetime import datetime + +from resources import get_resources +from file_downloader import download_urls + +logger = logging.getLogger("__name__") + +@click.command() +def download_converted_resource(timestamp=None): + resources = get_resources("collection/") + url_map = {} + now = datetime.now() + timestamp = int(now.replace(minute=0, second=0, microsecond=0).timestamp()) + + for resource in resources: + collection = resources[resource]["collection"] + for pipeline in resources[resource]["pipelines"]: + if not pipeline: + logger.error(f"no pipeline for {resource} in {collection} so cannot download") + else: + url = f"https://files.planning.data.gov.uk/{collection}-collection/var/converted-resource/{pipeline}/{resource}.csv?version={timestamp}" + output_path = f"var/converted-resource/{pipeline}/{resource}.csv" + url_map[url]=output_path + + download_urls(url_map) + + + +if __name__ == "__main__": + download_converted_resource() \ No newline at end of file diff --git a/bin/download_issues.py b/bin/download_issues.py new file mode 100755 index 0000000..acba8e3 --- /dev/null +++ b/bin/download_issues.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +import logging +import click +from datetime import datetime + +from resources import get_resources +from file_downloader import download_urls + + +logger = logging.getLogger("__name__") + +@click.command() +def download_issues(timestamp=None): + resources = get_resources("collection/") + url_map = {} + now = datetime.now() + timestamp = int(now.replace(minute=0, second=0, microsecond=0).timestamp()) + + for resource in resources: + collection = resources[resource]["collection"] + for pipeline in resources[resource]["pipelines"]: + if not pipeline: + logger.error(f"no pipeline for {resource} in {collection} so cannot download") + else: + url = f"https://files.planning.data.gov.uk/{collection}-collection/issue/{pipeline}/{resource}.csv?version={timestamp}" + output_path = f"var/issue/{pipeline}/{resource}.csv" + url_map[url]=output_path + download_urls(url_map) + + + +if __name__ == "__main__": + download_issues() \ No newline at end of file diff --git a/bin/file_downloader.py b/bin/file_downloader.py index a7516fc..eaaea94 100644 --- a/bin/file_downloader.py +++ b/bin/file_downloader.py @@ -10,17 +10,21 @@ logger = logging.getLogger("__name__") -def download_file(url, output_path,raise_error=False): +def download_file(url, output_path,raise_error=False, max_retries=5): """Downloads a file using urllib and saves it to the output directory.""" output_path = Path(output_path) output_path.parent.mkdir(parents=True,exist_ok=True) - try: - urlretrieve(url, output_path) - except Exception as e: - if raise_error: - raise e - else: - logger.error(f"error downloading file {e}") + retries = 0 + while retries < max_retries: + try: + urlretrieve(url, output_path) + break + except Exception as e: + if raise_error: + raise e + else: + logger.error(f"error downloading file from url {url}: {e}") + retries += 1 def download_urls(url_map, max_threads=4): """Downloads multiple files concurrently using threads.""" diff --git a/bin/resources.py b/bin/resources.py old mode 100755 new mode 100644 index 194d0ce..83566c7 --- a/bin/resources.py +++ b/bin/resources.py @@ -4,15 +4,7 @@ import csv import click -@click.command() -@click.option( - "--input-dir", - default="collection/", - help="Directory containing the CSV files" -) -def process_data(input_dir): - """Process CSV files to map resources, endpoints, and pipelines.""" - +def get_resources(input_dir): endpoints = {} resources = {} @@ -74,7 +66,17 @@ def process_data(input_dir): except FileNotFoundError: print("Error: resource.csv not found", file=sys.stderr) sys.exit(1) + return resources +@click.command() +@click.option( + "--input-dir", + default="collection/", + help="Directory containing the CSV files" +) +def process_data(input_dir): + """Process CSV files to map resources, endpoints, and pipelines.""" + resources = get_resources(input_dir) # Print results for resource in resources: collection = resources[resource]["collection"] @@ -82,4 +84,4 @@ def process_data(input_dir): print(collection, pipeline, resource) if __name__ == "__main__": - process_data() + process_data() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 55fcd00..21ee0ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ chardet PyPDF2 pandas tqdm +click pyarrow click==8.1.8 -e git+https://github.com/digital-land/digital-land-python.git#egg=digital-land