From 89d17453de23ae69e6ca422be98af7777b8c8fe8 Mon Sep 17 00:00:00 2001
From: Samriti Sadhu
Date: Mon, 10 Feb 2025 09:43:46 +0000
Subject: [PATCH 1/6] Adding specification API

---
 README.md                      | 27 ++++++++++++++++++++++++++-
 src/db.py                      | 34 +++++++++++++++++++++++++++++-----
 tests/integration/test_main.py | 27 ++++++++++++++++++++++++---
 3 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index ecff8dd..24e92d7 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ In order to build and test the software outside of Docker, you will need
 
 You can run the API locally by running either `make compose-up` or `docker compose up -d --build`.
 
-The docker compose setup runs the S3 locally using Localstack as well as the API. An S3 bucket called local-collection-data is created and seeded with example issue log data.
+The docker compose setup runs the API together with a local S3 provided by Localstack. An S3 bucket called local-collection-data is created and seeded with the example files in the collection-data directory.
 
 ## Swagger UI
 
@@ -69,3 +69,28 @@ Request for issues for a specific dataset and resource:
 curl http://localhost:8000/log/issue?dataset=border&resource=4a57239e3c1174c80b6d4a0278ab386a7c3664f2e985b2e07a66bbec84988b30&field=geometry
 ```
 
+### provision_summary endpoint
+
+This endpoint can be accessed via
+```
+http://localhost:8000/performance/provision_summary?organisation=local-authority:LBH&offset=50&limit=100
+```
+
+Optional parameters:
+ * `offset`
+ * `limit`
+ * `organisation`
+ * `dataset`
+
+
+### specification endpoint
+
+This endpoint can be accessed via
+```
+http://localhost:8000/specification/specification?offset=0&limit=10
+```
+
+Optional parameters:
+ * `offset`
+ * `limit`
+ * `dataset`
\ No newline at end of file
diff --git a/src/db.py b/src/db.py
index 671ae1e..34eebf6 100644
--- a/src/db.py
+++ b/src/db.py
@@ -3,7 +3,7 @@
 from schema import IssuesParams, ProvisionParams, SpecificationsParams
 from pagination_model import PaginationParams, PaginatedResult
 from config import config
-
+import json
 
 logger = get_logger(__name__)
 
@@ -101,12 +101,23 @@ def get_specification(params: SpecificationsParams):
     pagination = f"LIMIT {params.limit} OFFSET {params.offset}"
 
     where_clause = ""
+
     if params.dataset:
-        where_clause += _add_condition(where_clause, f"dataset = '{params.dataset}'")
+        where_clause += _add_condition(
+            where_clause,
+            f"TRIM(BOTH '\"' FROM json_extract(json(value), '$.dataset')) = '{params.dataset}'",
+        )
 
-    sql_count = f"SELECT COUNT(*) FROM '{s3_uri}' {where_clause}"
+    sql_count = f"""
+    SELECT COUNT(*) FROM (SELECT unnest(CAST(json AS VARCHAR[])) AS value FROM '{s3_uri}')
+    as parsed_json {where_clause}
+    """
     logger.debug(sql_count)
-    sql_results = f"SELECT * FROM '{s3_uri}' {where_clause} {pagination}"
+    sql_results = f"""
+    SELECT value as json FROM
+    (SELECT unnest(CAST(json AS VARCHAR[])) AS value FROM '{s3_uri}')
+    as parsed_json {where_clause} {pagination}
+    """
     logger.debug(sql_results)
 
     with duckdb.connect() as conn:
@@ -118,14 +129,27 @@
                 ).fetchall()
             )
             logger.debug(conn.execute("FROM duckdb_secrets();").fetchall())
+
             count = conn.execute(sql_count).fetchone()[
                 0
             ]  # Count is first item in Tuple
             results = conn.execute(sql_results).arrow().to_pylist()
+
+            # Extract and parse the JSON field
+            json_results = []
+            for item in results:
+                logger.error(item)
+                if "json" in item and isinstance(item["json"], str):
+                    try:
+                        parsed_json = json.loads(item["json"])
+                        json_results.append(parsed_json)
+                    except json.JSONDecodeError:
+                        logger.warning(f"Invalid JSON format in row: {item['json']}")
+
             return PaginatedResult(
                 params=PaginationParams(offset=params.offset, limit=params.limit),
                 total_results_available=count,
-                data=results,
+                data=json_results,
             )
         except Exception as e:
             logger.exception(
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index 1c32878..dde277c 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -1,6 +1,5 @@
 from fastapi.testclient import TestClient
 from main import app
-import json
 
 # Create a test client for the FastAPI app
 client = TestClient(app)
@@ -88,8 +87,30 @@ def test_specification(s3_bucket):
 
     response_data = response.json()
     assert "X-Pagination-Total-Results" in response.headers
-    assert response.headers["X-Pagination-Total-Results"] == str(16)
+    assert response.headers["X-Pagination-Total-Results"] == str(36)
     assert response.headers["X-Pagination-Limit"] == "8"
 
     assert len(response_data) > 0
-    assert response_data[0]["name"] == "Article 4 direction"
+
+def test_specification_with_dataset(s3_bucket):
+    # Prepare test params
+    params = {
+        "offset": 0,
+        "limit": 8,
+        "dataset": "article-4-direction-area",
+    }
+
+    response = client.get("/specification/specification", params=params)
+
+    # Validate the results from the search
+    assert response.status_code == 200
+
+    response_data = response.json()
+    assert "X-Pagination-Total-Results" in response.headers
+    assert response.headers["X-Pagination-Total-Results"] == str(1)
+    assert response.headers["X-Pagination-Limit"] == "8"
+
+    assert len(response_data) > 0
+    assert response_data[0]["dataset"] == "article-4-direction-area"
+    assert response_data[0]["fields"]
+    assert len(response_data[0]["fields"]) > 1
\ No newline at end of file
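As with the issue-log examples earlier in the README, the two new endpoints can be exercised with curl once the compose stack is up. The organisation and dataset values below are illustrative, taken from the examples and tests in this series rather than a fixed contract:

```
curl "http://localhost:8000/performance/provision_summary?organisation=local-authority:LBH&offset=0&limit=10"
curl "http://localhost:8000/specification/specification?dataset=article-4-direction-area&offset=0&limit=8"
```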
From dabfa4ddcfb6c7d0ad7c713be595dc7771daf432 Mon Sep 17 00:00:00 2001
From: Samriti Sadhu
Date: Mon, 10 Feb 2025 09:45:13 +0000
Subject: [PATCH 2/6] black fix

---
 tests/integration/test_main.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index dde277c..a74701c 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -92,6 +92,7 @@ def test_specification(s3_bucket):
 
     assert len(response_data) > 0
 
+
 def test_specification_with_dataset(s3_bucket):
     # Prepare test params
     params = {
@@ -113,4 +114,4 @@
     assert len(response_data) > 0
     assert response_data[0]["dataset"] == "article-4-direction-area"
     assert response_data[0]["fields"]
-    assert len(response_data[0]["fields"]) > 1
\ No newline at end of file
+    assert len(response_data[0]["fields"]) > 1

From adaae69aa44b63c8d9690a3c5bc705e6447d5d66 Mon Sep 17 00:00:00 2001
From: Samriti Sadhu
Date: Mon, 10 Feb 2025 11:00:41 +0000
Subject: [PATCH 3/6] flake checks

---
 src/db.py                      | 10 +++++-----
 tests/integration/test_main.py |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/db.py b/src/db.py
index 34eebf6..831c246 100644
--- a/src/db.py
+++ b/src/db.py
@@ -109,15 +109,15 @@ def get_specification(params: SpecificationsParams):
         )
 
     sql_count = f"""
-    SELECT COUNT(*) FROM (SELECT unnest(CAST(json AS VARCHAR[])) AS value FROM '{s3_uri}')
-    as parsed_json {where_clause}
+    SELECT COUNT(*) FROM (SELECT unnest(CAST(json AS VARCHAR[])) AS value
+    FROM '{s3_uri}') as parsed_json {where_clause}
     """
     logger.debug(sql_count)
     sql_results = f"""
     SELECT value as json FROM
-    (SELECT unnest(CAST(json AS VARCHAR[])) AS value FROM '{s3_uri}')
-    as parsed_json {where_clause} {pagination}
+    (SELECT unnest(CAST(json AS VARCHAR[])) AS value FROM '{s3_uri}') AS parsed_json
+    {where_clause} {pagination}
     """
     logger.debug(sql_results)
 
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index a74701c..286b3dd 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -1,5 +1,6 @@
 from fastapi.testclient import TestClient
 from main import app
+import json
 
 # Create a test client for the FastAPI app
 client = TestClient(app)
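The query shape these patches settle on is worth seeing in isolation: the specification parquet holds the whole list of specifications in a single `json` column, `unnest(CAST(json AS VARCHAR[]))` expands that list into one row per specification, and `json_extract` filters on the extracted dataset name. A minimal sketch against an in-memory DuckDB table standing in for the parquet file (the table name and sample rows are invented for illustration):

```python
import duckdb
import json

conn = duckdb.connect()  # in-memory database
# Stand-in for the specification parquet: one "json" column holding a list of specs.
conn.execute(
    """
    CREATE TABLE specs AS SELECT [
        '{"dataset": "article-4-direction-area", "fields": ["geometry", "name"]}',
        '{"dataset": "border", "fields": ["geometry"]}'
    ] AS json
    """
)

# Same shape as the patch: unnest the list into rows, then filter on the
# dataset name pulled out of each JSON string.
rows = conn.execute(
    """
    SELECT value AS json FROM
    (SELECT unnest(CAST(json AS VARCHAR[])) AS value FROM specs) AS parsed_json
    WHERE TRIM(BOTH '"' FROM json_extract(json(value), '$.dataset'))
        = 'article-4-direction-area'
    """
).fetchall()

print([json.loads(r[0]) for r in rows])  # parsed dicts, as in get_specification
```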
From ab504f3c0b9b29a3a7f63e76346fffe502af1477 Mon Sep 17 00:00:00 2001
From: Samriti Sadhu
Date: Mon, 10 Feb 2025 11:33:51 +0000
Subject: [PATCH 4/6] removing logger statements

---
 src/db.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/db.py b/src/db.py
index 831c246..37e0e37 100644
--- a/src/db.py
+++ b/src/db.py
@@ -138,7 +138,6 @@ def get_specification(params: SpecificationsParams):
             # Extract and parse the JSON field
             json_results = []
             for item in results:
-                logger.error(item)
                 if "json" in item and isinstance(item["json"], str):
                     try:
                         parsed_json = json.loads(item["json"])
From 4ec3ab80524afbe7d3b196f63e2fe4fd31be6eb8 Mon Sep 17 00:00:00 2001
From: Samriti Sadhu
Date: Tue, 11 Feb 2025 13:50:15 +0000
Subject: [PATCH 5/6] parametrised queries

---
 src/db.py | 78 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 29 deletions(-)

diff --git a/src/db.py b/src/db.py
index 37e0e37..218a0bd 100644
--- a/src/db.py
+++ b/src/db.py
@@ -56,19 +56,22 @@ def search_provision_summary(params: ProvisionParams):
     s3_uri = f"s3://{config.collection_bucket}/{config.performance_base_path}/*.parquet"  # noqa
 
-    pagination = f"LIMIT {params.limit} OFFSET {params.offset}"
     where_clause = ""
+    query_params = []
+
     if params.dataset:
-        where_clause += _add_condition(where_clause, f"dataset = '{params.dataset}'")
+        where_clause += _add_condition(where_clause, "dataset = ?")
+        query_params.append(params.dataset)
+
     if params.organisation:
-        where_clause += _add_condition(
-            where_clause, f"organisation = '{params.organisation}'"
-        )
+        where_clause += _add_condition(where_clause, "organisation = ?")
+        query_params.append(params.organisation)
 
     sql_count = f"SELECT COUNT(*) FROM '{s3_uri}' {where_clause}"
+    sql_results = f"SELECT * FROM '{s3_uri}' {where_clause} LIMIT ? OFFSET ?"
+
     logger.debug(sql_count)
-    sql_results = f"SELECT * FROM '{s3_uri}' {where_clause} {pagination}"
     logger.debug(sql_results)
 
     with duckdb.connect() as conn:
@@ -80,44 +83,56 @@
                 ).fetchall()
             )
             logger.debug(conn.execute("FROM duckdb_secrets();").fetchall())
-            count = conn.execute(sql_count).fetchone()[
-                0
-            ]  # Count is first item in Tuple
-            results = conn.execute(sql_results).arrow().to_pylist()
+
+            # Execute parameterized queries
+            count = conn.execute(sql_count, query_params).fetchone()[0]
+            results = (
+                conn.execute(sql_results, query_params + [params.limit, params.offset])
+                .arrow()
+                .to_pylist()
+            )
+
             return PaginatedResult(
                 params=PaginationParams(offset=params.offset, limit=params.limit),
                 total_results_available=count,
                 data=results,
             )
         except Exception as e:
-            logger.exception(
-                "Failure executing DuckDB queries",
-            )
+            logger.exception("Failure executing DuckDB queries")
             raise e
 
 
 def get_specification(params: SpecificationsParams):
     s3_uri = f"s3://{config.collection_bucket}/{config.specification_base_path}/*.parquet"  # noqa
 
-    pagination = f"LIMIT {params.limit} OFFSET {params.offset}"
     where_clause = ""
+    query_params = {}
 
     if params.dataset:
         where_clause += _add_condition(
             where_clause,
-            f"TRIM(BOTH '\"' FROM json_extract(json(value), '$.dataset')) = '{params.dataset}'",
+            "TRIM(BOTH '\"' FROM json_extract(json(value), '$.dataset')) = ?",
        )
+        query_params["dataset"] = params.dataset
 
     sql_count = f"""
-    SELECT COUNT(*) FROM (SELECT unnest(CAST(json AS VARCHAR[])) AS value
-    FROM '{s3_uri}') as parsed_json {where_clause}
+        SELECT COUNT(*) FROM (
+            SELECT unnest(CAST(json AS VARCHAR[])) AS value
+            FROM '{s3_uri}'
+        ) AS parsed_json {where_clause}
     """
-    logger.debug(sql_count)
+
     sql_results = f"""
-    SELECT value as json FROM
-    (SELECT unnest(CAST(json AS VARCHAR[])) AS value FROM '{s3_uri}') AS parsed_json
-    {where_clause} {pagination}
+        SELECT value AS json FROM (
+            SELECT unnest(CAST(json AS VARCHAR[])) AS value
+            FROM '{s3_uri}'
+        ) AS parsed_json
+        {where_clause}
+        LIMIT ? OFFSET ?
     """
+
+    logger.debug(sql_count)
     logger.debug(sql_results)
 
     with duckdb.connect() as conn:
@@ -130,12 +145,19 @@ def get_specification(params: SpecificationsParams):
             )
             logger.debug(conn.execute("FROM duckdb_secrets();").fetchall())
 
-            count = conn.execute(sql_count).fetchone()[
-                0
-            ]  # Count is first item in Tuple
-            results = conn.execute(sql_results).arrow().to_pylist()
+            # Execute queries with parameters
+            count = conn.execute(
+                sql_count, list(query_params.values())
+            ).fetchone()[0]
+            results = (
+                conn.execute(
+                    sql_results, [*query_params.values(), params.limit, params.offset]
+                )
+                .arrow()
+                .to_pylist()
+            )
 
-            # Extract and parse the JSON field
+            # Convert JSON strings to actual JSON objects
             json_results = []
             for item in results:
                 if "json" in item and isinstance(item["json"], str):
                     try:
                         parsed_json = json.loads(item["json"])
                         json_results.append(parsed_json)
                     except json.JSONDecodeError:
                         logger.warning(f"Invalid JSON format in row: {item['json']}")
 
             return PaginatedResult(
                 params=PaginationParams(offset=params.offset, limit=params.limit),
                 total_results_available=count,
                 data=json_results,
             )
         except Exception as e:
-            logger.exception(
-                "Failure executing DuckDB queries",
-            )
+            logger.exception("Failure executing DuckDB queries")
            raise e
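The pattern in this patch, interpolating only the S3 URI into the SQL string while user-supplied values travel as positional `?` parameters, can be sanity-checked standalone. DuckDB binds the list passed to `execute` to the placeholders in order, which is why the limit and offset are appended after the filter values. A small sketch (the in-memory table and values are invented for illustration):

```python
import duckdb

conn = duckdb.connect()  # in-memory stand-in for the provision parquet
conn.execute(
    """
    CREATE TABLE provision AS
    SELECT * FROM (VALUES
        ('border', 'local-authority:LBH'),
        ('article-4-direction-area', 'local-authority:LBH')
    ) AS t(dataset, organisation)
    """
)

# Filter values are bound to the ? placeholders in order, never interpolated
# into the SQL string, so quoting issues and injection are avoided.
where_clause = "WHERE dataset = ? AND organisation = ?"
query_params = ["border", "local-authority:LBH"]
rows = conn.execute(
    f"SELECT * FROM provision {where_clause} LIMIT ? OFFSET ?",
    query_params + [10, 0],
).fetchall()
print(rows)  # [('border', 'local-authority:LBH')]
```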
""" + + logger.debug(sql_count) logger.debug(sql_results) with duckdb.connect() as conn: @@ -130,12 +145,19 @@ def get_specification(params: SpecificationsParams): ) logger.debug(conn.execute("FROM duckdb_secrets();").fetchall()) - count = conn.execute(sql_count).fetchone()[ - 0 - ] # Count is first item in Tuple - results = conn.execute(sql_results).arrow().to_pylist() + # Execute queries with parameters + count = conn.execute( + sql_count, [*query_params.values(), params.limit, params.offset] + ).fetchone()[0] + results = ( + conn.execute( + sql_results, [*query_params.values(), params.limit, params.offset] + ) + .arrow() + .to_pylist() + ) - # Extract and parse the JSON field + # Convert JSON strings to actual JSON objects json_results = [] for item in results: if "json" in item and isinstance(item["json"], str): @@ -151,9 +173,7 @@ def get_specification(params: SpecificationsParams): data=json_results, ) except Exception as e: - logger.exception( - "Failure executing DuckDB queries", - ) + logger.exception("Failure executing DuckDB queries") raise e From d9cc01ff80ef66032c5f948e7bb04b97fd778a4b Mon Sep 17 00:00:00 2001 From: Samriti Sadhu Date: Tue, 11 Feb 2025 13:54:23 +0000 Subject: [PATCH 6/6] flake check --- src/db.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/db.py b/src/db.py index 218a0bd..b953627 100644 --- a/src/db.py +++ b/src/db.py @@ -117,18 +117,15 @@ def get_specification(params: SpecificationsParams): sql_count = f""" SELECT COUNT(*) FROM ( - SELECT unnest(CAST(json AS VARCHAR[])) AS value - FROM '{s3_uri}' - ) AS parsed_json {where_clause} + SELECT unnest(CAST(json AS VARCHAR[])) AS value + FROM '{s3_uri}') AS parsed_json {where_clause} LIMIT ? OFFSET ? """ sql_results = f""" SELECT value AS json FROM ( - SELECT unnest(CAST(json AS VARCHAR[])) AS value - FROM '{s3_uri}' - ) AS parsed_json - {where_clause} + SELECT unnest(CAST(json AS VARCHAR[])) AS value + FROM '{s3_uri}') AS parsed_json {where_clause} LIMIT ? OFFSET ? """