|
1 | 1 | import re
|
2 | 2 | import logging
|
3 |
| -import time |
4 | 3 | import pandas as pd
|
5 |
| -import urllib |
6 | 4 |
|
7 | 5 | from .phase import Phase
|
8 | 6 |
|
@@ -36,13 +34,15 @@ def __init__(
|
36 | 34 | issue_log=None,
|
37 | 35 | operational_issue_log=None,
|
38 | 36 | entity_range=[],
|
| 37 | + provision_summary_dir=None, |
39 | 38 | ):
|
40 | 39 | self.lookups = lookups
|
41 | 40 | self.redirect_lookups = redirect_lookups
|
42 | 41 | self.issues = issue_log
|
43 | 42 | self.operational_issues = operational_issue_log
|
44 | 43 | self.reverse_lookups = self.build_reverse_lookups()
|
45 | 44 | self.entity_range = entity_range
|
| 45 | + self.provision_summary_dir = provision_summary_dir |
46 | 46 |
|
47 | 47 | def build_reverse_lookups(self):
|
48 | 48 | reverse_lookups = {}
|
@@ -167,67 +167,43 @@ def process(self, stream):
|
167 | 167 | row[self.entity_field]
|
168 | 168 | )
|
169 | 169 |
|
| 170 | + linked_datasets = ["article-4-direction", "tree-preservation-order"] |
170 | 171 | if row[self.entity_field]:
|
171 |
| - if ( |
172 |
| - row.get("article-4-direction", "") |
173 |
| - or row.get("tree-preservation-order", "").strip() |
174 |
| - ): |
175 |
| - linked_dataset = ( |
176 |
| - "article-4-direction" |
177 |
| - if "article-4-direction" in row |
178 |
| - else "tree-preservation-order" |
179 |
| - ) |
180 |
| - |
181 |
| - # check applied for organisations that have provided a document dataset |
182 |
| - if not hasattr( |
183 |
| - self, "lpa_list" |
184 |
| - ): # check if data fetched already |
185 |
| - params = urllib.parse.urlencode( |
186 |
| - { |
187 |
| - "sql": f"""select organisation from provision_summary where active_endpoint_count > 0 and dataset == '{linked_dataset}'""", |
188 |
| - "_size": "max", |
189 |
| - } |
| 172 | + for linked_dataset in linked_datasets: |
| 173 | + if ( |
| 174 | + row.get(linked_dataset, "") |
| 175 | + or row.get(linked_dataset, "").strip() |
| 176 | + ): |
| 177 | + get_organisations = pd.read_csv( |
| 178 | + self.provision_summary_dir + linked_dataset + ".csv" |
190 | 179 | )
|
191 |
| - base_url = f"https://datasette.planning.data.gov.uk/performance.csv?{params}" |
192 |
| - |
193 |
| - max_retries = 60 # Retry for an hour |
194 |
| - for attempt in range(max_retries): |
195 |
| - try: |
196 |
| - get_lpa = pd.read_csv(base_url) |
197 |
| - self.lpa_list = get_lpa["organisation"].to_list() |
198 |
| - break |
199 |
| - except urllib.error.HTTPError: |
200 |
| - if attempt < max_retries - 1: |
201 |
| - time.sleep(60) |
202 |
| - else: |
203 |
| - raise Exception( |
204 |
| - "Failed to fetch datasette after multiple attempts" |
205 |
| - ) |
206 |
| - |
207 |
| - if row.get("organisation", "") in self.lpa_list: |
208 |
| - reference = row.get(linked_dataset, "") |
209 | 180 |
|
210 |
| - find_entity = self.lookup( |
211 |
| - prefix=linked_dataset, |
212 |
| - organisation=row.get("organisation", ""), |
213 |
| - reference=row.get(linked_dataset, ""), |
214 |
| - ) |
215 |
| - # raise issue if the found entity is retired in old-entity.csv |
216 |
| - if not find_entity or ( |
217 |
| - str(find_entity) in self.redirect_lookups |
218 |
| - and int( |
219 |
| - self.redirect_lookups[str(find_entity)].get( |
220 |
| - "status", 0 |
221 |
| - ) |
222 |
| - ) |
223 |
| - == 410 |
| 181 | + if ( |
| 182 | + row.get("organisation", "") |
| 183 | + in get_organisations["organisation"].values |
224 | 184 | ):
|
225 |
| - self.issues.log_issue( |
226 |
| - linked_dataset, |
227 |
| - "no associated documents found for this area", |
228 |
| - reference, |
229 |
| - line_number=line_number, |
| 185 | + reference = row.get(linked_dataset, "") |
| 186 | + find_entity = self.lookup( |
| 187 | + prefix=linked_dataset, |
| 188 | + organisation=row.get("organisation", ""), |
| 189 | + reference=reference, |
230 | 190 | )
|
| 191 | + # raise issue if the found entity is retired in old-entity.csv |
| 192 | + if not find_entity or ( |
| 193 | + str(find_entity) in self.redirect_lookups |
| 194 | + and int( |
| 195 | + self.redirect_lookups[str(find_entity)].get( |
| 196 | + "status", 0 |
| 197 | + ) |
| 198 | + ) |
| 199 | + == 410 |
| 200 | + ): |
| 201 | + self.issues.log_issue( |
| 202 | + linked_dataset, |
| 203 | + "no associated documents found for this area", |
| 204 | + reference, |
| 205 | + line_number=line_number, |
| 206 | + ) |
231 | 207 | yield block
|
232 | 208 |
|
233 | 209 |
|
|
0 commit comments