-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcovid_data_scraper.py
88 lines (75 loc) · 5.47 KB
/
covid_data_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas
from html.parser import HTMLParser
from urllib.request import urlopen, urljoin, Request
import re
# GLOBAL VARIABLES
url = 'http://www.usnews.com/news/best-states/articles/covid-19-vaccine-eligibility-by-state'
target_file = "state_vaccine_eligibility.csv"
delimiter = "|"
state_list = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida',
'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina',
'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia',
'Wisconsin', 'Wyoming']
class WebScraper(HTMLParser):
def __init__(self, url):
"""given a URL will scrape all links and state covid vaccine eligibility text"""
HTMLParser.__init__(self)
self.url = url # store url
self.HTMLcontent = self.retrieveHTML() # store contents of HTML content
self.links = [] # list accumulator for links
self.site_updated_date = [] # date website was last updated
self.state_eligibility = {} # dictionary with current and next phase eligibility by state
def retrieveHTML(self):
"""when called, will retrieve HTML content from url"""
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64)" # sets user agent
request = Request(self.url) # creates request
request.add_header("User-Agent", user_agent) # adds header to request
content = urlopen(request).read().decode() # open url and get HTML content
return content # return HTML content
def handle_starttag(self, tag, attrs):
"""extends original function in HTMLParser to look for all ancher tabs and find links as href attributes"""
if tag == 'a': # if tag is an anchor tag
for attr in attrs: # for each attribute in this anchor tags list of attributes
if attr[0] == 'href': # if the attribute is href (attribute that contains hyperlink)
absolute = urljoin(self.url, attr[1]) # combines relative reference with original url
if absolute[:4] == 'http': # if link is http
if absolute.find('?') != -1: # chop off specific search queries at end of link (this prevents duplicate pages accessed from different links)
absolute = absolute[:absolute.find('?')] # changes link to exclude search query
self.links.append(absolute) # adds link to list of links created earlier
def handle_data(self, data):
"""extends original function from HTMLParser to collect text from HTML content"""
for item in re.findall('"dateModified":"([0-9]{4}-[0-9]{2}-[0-9]{2})T', data.strip()): # find last update date
self.site_updated_date.append(item) # append update date to list
for state in state_list: # for each state
current_phase = re.findall(f"{state} Vaccine Eligibility(?::)?\s?Current Phase(?::)?\s?([\w+\s\.,!\-']+)(?:(?:Next Phase)|(?:More information))", data.strip()) # get current phase eligibility
next_phase = re.findall(f"{state} Vaccine Eligibility(?::)?\s?Current Phase(?::)?\s?[\w+\s\.,!\-']+\s?Next Phase(?::)?\s?([\w+\s\.,!\-']+)(?:More information)", data.strip()) # get next phase eligibility
if state not in self.state_eligibility: # if state is not already included in dict
self.state_eligibility[state] = {"current_phase": "", "next_Phase": "", "last_update": ""} # add to dict
if len(current_phase) > 0: # if current phase text found
self.state_eligibility[state]["current_phase"] = current_phase[0] # add to dict
if len(next_phase) > 0: # if next phase text found
self.state_eligibility[state]["next_phase"] = next_phase[0] # add to dict
if len(self.site_updated_date) > 0: # if site updated was found
self.state_eligibility[state]["last_update"] = self.site_updated_date[0] # add to dict
def getLinks(self):
"""when called, will return a list of links that are contained on the current website"""
return self.links # return links list
def getStateEligibility(self):
"""when called will return a list of all the words/text that are on the current website"""
df = pandas.DataFrame.from_dict(self.state_eligibility, orient="index", columns=['current_phase', 'next_phase', 'last_update']) # convert dict to dataframe
df.index.name = "state" # name the index of the data frame
return df # return the data frame with state eligibility
def getHTML(self):
"""when called, will return the HTML content frim the url"""
return self.HTMLcontent
def main():
"""when called, will scrape website and save state vaccine eligibility to csv with specified delimiter"""
scraper = WebScraper(url) # create scraper obj
scraper.feed(scraper.getHTML()) # feed HTML file to scraper obj
data = scraper.getStateEligibility() # get state eligibility text
data.to_csv(target_file, sep=delimiter, quoting=0) # save dataframe to csv
if __name__ == "__main__":
main()