"""
Cli for scraping Scottish Parliament 2024
Use `python -m pyscraper.sp_2024 --help` to see available commands
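
Each command runs up to three stages, controlled by flags:
    --download  fetch raw pages into parldata/cmpages/sp_2024/raw
    --parse     tidy the downloaded HTML into parldata/cmpages/sp_2024/parsed
    --convert   write TheyWorkForYou-format XML into parldata/scrapedxml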
"""

from __future__ import annotations

import datetime
from pathlib import Path

import click

from .convert import convert_xml_to_twfy
from .convert_wrans import convert_wrans_xml_to_twfy
from .download import fetch_debates_for_dates, fetch_wrans_for_dates
from .parse import tidy_up_html
from .parse_wrans import tidy_up_wrans_html

file_dir = Path(__file__).parent
parldata = Path(file_dir, "..", "..", "..", "parldata")

download_dir = parldata / "cmpages" / "sp_2024" / "raw"
parsed_dir = parldata / "cmpages" / "sp_2024" / "parsed"
output_dir = parldata / "scrapedxml" / "sp-new"
output_dir_wrans = parldata / "scrapedxml" / "sp-written"


@click.group()
def cli():
    pass


def cache_dir_iterator(
    cache_dir: Path,
    start_date: datetime.date,
    end_date: datetime.date,
    partial_file_name: str | None,
):
    """
    Return an iterator of files in the cache_dir that are between the start and end date
    """
    for file in cache_dir.glob("*.xml"):
        if partial_file_name:
            if not file.name.startswith(partial_file_name):
                continue
        # date is an iso date at the start of the filename
        date = datetime.date.fromisoformat(file.stem[:10])
        if start_date <= date <= end_date:
            yield file


@cli.command()
@click.option(
    "--start-date", help="isodate to start fetching debates from", required=True
)
@click.option("--end-date", help="isodate to end fetching debates at", required=True)
@click.option(
    "--download",
    is_flag=True,
    help="Download the debates, pair with 'override' to redownload all files",
)
@click.option("--parse", is_flag=True, help="Parse the downloaded debates")
@click.option("--convert", is_flag=True, help="Convert the parsed debates")
@click.option("--verbose", is_flag=True, help="Print verbose output")
@click.option("--override", is_flag=True, help="Override existing files")
@click.option(
    "--partial-file-name", help="Only parse/convert files that match this string"
)
def debates(
    start_date: str,
    end_date: str,
    download: bool = False,
    parse: bool = False,
    convert: bool = False,
    verbose: bool = False,
    override: bool = False,
    partial_file_name: str | None = None,
):
"""
Download transcripts from Scottish Parliament between a start and end date.
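
    e.g. python -m pyscraper.sp_2024 debates --start-date 2024-06-01 --end-date 2024-06-07 --download --parse --convert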
"""
start = datetime.date.fromisoformat(start_date)
end = datetime.date.fromisoformat(end_date)
# if none of the flags are set, error that at least one flag must be set
if not any([download, parse, convert]):
click.echo("At least one of the flags must be set")
return
# iterate through downloaded files if we're downloading them
# otherwise go find the relevant files based on name
if download:
file_iterator = fetch_debates_for_dates(
start.isoformat(),
end.isoformat(),
verbose=verbose,
cache_dir=download_dir,
override=override,
)
for file in file_iterator:
pass
if parse:
file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name)
for file in file_iterator:
if verbose:
print(f"Parsing up {file}")
tidy_up_html(file, parsed_dir)
if convert:
file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name)
for file in file_iterator:
if verbose:
print(f"Converting {file} to TheyWorkForYou format")
convert_xml_to_twfy(file, output_dir, verbose=verbose)


@cli.command()
@click.option(
    "--start-date", help="isodate to start fetching wrans from", required=True
)
@click.option("--end-date", help="isodate to end fetching wrans at", required=True)
@click.option(
    "--download",
    is_flag=True,
    help="Download the wrans, pair with 'override' to redownload all files",
)
@click.option("--parse", is_flag=True, help="Parse the downloaded wrans")
@click.option("--convert", is_flag=True, help="Convert the parsed wrans")
@click.option("--verbose", is_flag=True, help="Print verbose output")
@click.option("--override", is_flag=True, help="Override existing files")
@click.option(
    "--partial-file-name", help="Only parse/convert files that match this string"
)
def wrans(
    start_date: str,
    end_date: str,
    download: bool = False,
    parse: bool = False,
    convert: bool = False,
    verbose: bool = False,
    override: bool = False,
    partial_file_name: str | None = None,
):
"""
Download written answers from Scottish Parliament between a start and end date.
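
    e.g. python -m pyscraper.sp_2024 wrans --start-date 2024-06-01 --end-date 2024-06-07 --download --parse --convert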
"""
start = datetime.date.fromisoformat(start_date)
end = datetime.date.fromisoformat(end_date)
# if none of the flags are set, error that at least one flag must be set
if not any([download, parse, convert]):
click.echo("At least one of the flags must be set")
return
# iterate through downloaded files if we're downloading them
# otherwise go find the relevant files based on name
if download:
file_iterator = fetch_wrans_for_dates(
start.isoformat(),
end.isoformat(),
verbose=verbose,
cache_dir=download_dir,
override=override,
)
for file in file_iterator:
pass
if parse:
file_iterator = cache_dir_iterator(download_dir, start, end, partial_file_name)
for file in file_iterator:
if verbose:
print(f"Parsing up {file}")
tidy_up_wrans_html(file, parsed_dir)
if convert:
file_iterator = cache_dir_iterator(parsed_dir, start, end, partial_file_name)
for file in file_iterator:
if verbose:
print(f"Converting {file} to TheyWorkForYou format")
convert_wrans_xml_to_twfy(file, output_dir_wrans, verbose=verbose)


if __name__ == "__main__":
    cli(prog_name="python -m pyscraper.sp_2024")