"""
Scrape legislator names and contact info from https://malegislature.gov/Legislators.
TODO: Replace this with a Scrapy project.
"""

import functools
import json
import logging
import pathlib
import sys
import warnings
from collections import OrderedDict

import backoff
import requests
import requests_cache
import urllib3
from bs4 import BeautifulSoup
from tqdm import tqdm
BASE_URL = "https://malegislature.gov"
FILE_DIR = pathlib.Path(__file__).parent.resolve()
CACHE_DIR = FILE_DIR.parent / "cache"
requests_cache.install_cache(str(CACHE_DIR / "ma_legislators_requests"))
debug = functools.partial(print, file=sys.stderr)
logging.getLogger("backoff").addHandler(logging.StreamHandler())
@backoff.on_exception(
    backoff.expo,
    exception=requests.exceptions.RequestException,
    max_tries=8,
)
def get_soup(url):
    # debug(url)
    # HACK: Work around SSLError "unable to get local issuer certificate"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", urllib3.exceptions.InsecureRequestWarning)
        response = requests.get(url, verify=False)
    response.raise_for_status()
    # debug(f"Status: {response.status_code}, Cached: {response.from_cache}")
    return BeautifulSoup(response.text, "lxml")
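

# Helper: the stripped text of the first element matching `selector`, or ""
# when the element is missing or has no string content.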
def select_string(soup, selector):
    try:
        return (soup.select_one(selector).string or "").strip()
    except AttributeError:
        return ""
def parse_chamber(soup):
    for row_soup in tqdm(soup.select("#legislatorTable tbody tr")):
        profile = OrderedDict()
        first_name = select_string(row_soup, "td:nth-of-type(3)")
        last_name = select_string(row_soup, "td:nth-of-type(4)")
        profile_url = BASE_URL + row_soup.select_one("td:nth-of-type(3) a")["href"]
        if first_name == "Vacant":
            # The district sits where the last name normally appears.
            district = last_name
            title = full_name = last_name = ""
        else:
            district = select_string(row_soup, "td:nth-of-type(5)")
            # Title and full name come from the member's profile page.
            profile_soup = get_soup(profile_url)
            title_tag = profile_soup.select_one("h1 span")
            title = title_tag.string.strip()
            full_name = title_tag.next_sibling.string.strip()
        profile["chamber"] = select_string(soup, "h1").split()[0]
        profile["district"] = district
        profile["title"] = title
        profile["full_name"] = full_name
        profile["first_name"] = first_name
        profile["last_name"] = last_name
        profile["party"] = select_string(row_soup, "td:nth-of-type(6)")
        profile["url"] = profile_url
        profile["email"] = select_string(row_soup, "td:nth-of-type(9) a")
        profile["phone"] = select_string(row_soup, "td:nth-of-type(8)")
        profile["room"] = select_string(row_soup, "td:nth-of-type(7)")
        profile["photo"] = BASE_URL + row_soup.select_one(".thumb img")["src"]
        yield profile
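

# Fetch a chamber's roster page and parse every row; the first and last
# profiles are echoed to stderr as a quick sanity check.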
def get_chamber(chamber_name):
    chamber_url = f"{BASE_URL}/Legislators/Members/{chamber_name.title()}"
    debug(chamber_url)
    chamber_soup = get_soup(chamber_url)
    chamber = list(parse_chamber(chamber_soup))
    debug(json.dumps([chamber[0], "...", chamber[-1]], indent=2))
    return chamber


def main():
    representatives = get_chamber("house")
    senators = get_chamber("senate")
    json.dump(representatives + senators, sys.stdout)


if __name__ == "__main__":
    main()