import datetime
import logging

import pandas as pd
import requests
from requests import adapters
from urllib3 import Retry


def get_datasette_http():
    """
    Return a requests Session for querying Datasette, with a retry strategy
    attached to cope with larger queries.
    """
    # Retry up to three times on HTTP 400 responses, with no backoff between attempts
    retry_strategy = Retry(total=3, status_forcelist=[400], backoff_factor=0)

    adapter = adapters.HTTPAdapter(max_retries=retry_strategy)

    http = requests.Session()
    http.mount("https://", adapter)
    http.mount("http://", adapter)

    return http


def get_datasette_query(
    db, sql, filter=None, url="https://datasette.planning.data.gov.uk"
):
    url = f"{url}/{db}.json"
    # _shape=array returns rows as a JSON array of objects; _size=max asks for
    # the maximum page size Datasette allows
    params = {"sql": sql, "_shape": "array", "_size": "max"}
    if filter:
        params.update(filter)
    try:
        http = get_datasette_http()
        resp = http.get(url, params=params)
        resp.raise_for_status()
        df = pd.DataFrame.from_dict(resp.json())
        return df
    except Exception as e:
        logging.warning(e)
        return None
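

# Illustrative usage of get_datasette_query (not executed here). The database
# name and SQL below are assumptions for the sake of the example, not values
# used elsewhere in this module:
#
#   df = get_datasette_query(
#       "digital-land",
#       "select dataset, name from dataset limit 10",
#   )
#   if df is not None:
#       print(df.head())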


def get_datasette_query_issue_summary(
    db, filter=None, url="https://datasette.planning.data.gov.uk"
):
    url = f"{url}/{db}.json"
    params = {}

    if filter:
        params.update(filter)

    try:
        http = get_datasette_http()
        all_rows = []

        # Datasette returns at most 1,000 rows per response (a configurable
        # limit). When there are more rows, the response includes a "next"
        # pagination token, which is passed back via the "_next" parameter to
        # fetch the following page, repeating until all rows have been
        # accumulated.
        while True:
            resp = http.get(url, params=params)
            resp.raise_for_status()
            response_json = resp.json()
            rows = response_json.get("rows", [])

            # Accumulate rows
            all_rows.extend(rows)

            # Check if there's a "next" token for pagination
            next_token = response_json.get("next")
            if not next_token:
                break

            params["_next"] = next_token

        if all_rows and response_json.get("columns"):
            df = pd.DataFrame(all_rows, columns=response_json["columns"])
            return df
        else:
            logging.error("No rows or columns available to create a DataFrame")
            return None

    except Exception as e:
        logging.warning(f"Exception occurred: {e}")
        return None
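

# Illustrative usage of get_datasette_query_issue_summary (not executed here).
# The database name and filter below are assumptions for the sake of the
# example; any query-string parameters accepted by the Datasette endpoint can
# be passed through `filter`:
#
#   df = get_datasette_query_issue_summary(
#       "digital-land",
#       filter={"sql": "select * from issue limit 5000"},
#   )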


# def get_datasets_summary():
#     # get all the datasets listed with their active status
#     all_datasets = index_by("dataset", get_datasets())
#     missing = []

#     # add the publisher coverage numbers
#     dataset_coverage = publisher_coverage()
#     for d in dataset_coverage:
#         if all_datasets.get(d["pipeline"]):
#             all_datasets[d["pipeline"]] = {**all_datasets[d["pipeline"]], **d}
#         else:
#             missing.append(d["pipeline"])

#     # add the total resource count
#     dataset_resource_counts = resources_by_dataset()
#     for d in dataset_resource_counts:
#         if all_datasets.get(d["pipeline"]):
#             all_datasets[d["pipeline"]] = {**all_datasets[d["pipeline"]], **d}
#         else:
#             missing.append(d["pipeline"])

#     # add the first and last resource dates
#     dataset_resource_dates = first_and_last_resource()
#     for d in dataset_resource_dates:
#         if all_datasets.get(d["pipeline"]):
#             all_datasets[d["pipeline"]] = {**all_datasets[d["pipeline"]], **d}
#         else:
#             missing.append(d["pipeline"])

#     return all_datasets


def generate_weeks(number_of_weeks=None, date_from=None):
    """
    Build a list of week records ({"date", "week_number", "year_number"}).

    If date_from (YYYY-MM-DD) is given, walk forward a week at a time from that
    date until now. Otherwise, if number_of_weeks is given, return that many
    weeks ending with the Monday of the current week, in chronological order.
    """
    now = datetime.datetime.now()
    monday = now - datetime.timedelta(days=now.weekday())
    dates = []

    if date_from:
        date = datetime.datetime.strptime(date_from, "%Y-%m-%d")
        while date < now:
            week_number = int(date.strftime("%W"))
            year_number = int(date.year)
            dates.append(
                {"date": date, "week_number": week_number, "year_number": year_number}
            )
            date = date + datetime.timedelta(days=7)
        return dates
    elif number_of_weeks:
        for week in range(number_of_weeks):
            date = monday - datetime.timedelta(weeks=week)
            week_number = int(date.strftime("%W"))
            year_number = int(date.year)
            dates.append(
                {"date": date, "week_number": week_number, "year_number": year_number}
            )
        return list(reversed(dates))
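

# Illustrative usage of generate_weeks (not executed here):
#
#   # The eight most recent weeks, oldest first; each "date" is that week's Monday
#   recent_weeks = generate_weeks(number_of_weeks=8)
#
#   # Every week from the given start date up to now, stepping seven days at a time
#   weeks_since = generate_weeks(date_from="2024-01-01")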