-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscrape.py
153 lines (121 loc) · 4.68 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import sys
# Usage:
#   $ python scrape.py gender_int max_place output_filename
#
# gender_int selects which results dropdown on marathonguide.com to scrape:
#   0 = both, 1 = men, 2 = women
gender_int = int(sys.argv[1])
# maximum place in any race to keep
max_place = int(sys.argv[2])
# output filename for the final CSV
output_filename = sys.argv[3]
# generic marathon guide browse url and makelinks url
browse_url = "http://www.marathonguide.com/results/browse.cfm"
makelinks_url = "http://www.marathonguide.com/results/makelinks.cfm"
# form data and headers POSTed to the site; MIDD/RaceRange/Referer are
# filled in per race inside the scraping loop
post_data = {
    "RaceRange" : "",
    "MIDD" : "",
    "SubmitButton" : "View"
}
headers = {
    "Referer" : "",
    "User-Agent" : ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/34.0.1847.116 Safari/537.36")
}
# union of all column headers seen across races' result tables
all_fields = [
    'Last Name, First Name(Sex/Age)', 'Time',
    'OverAllPlace', 'Sex Place', 'Sex Place/Div Place', 'DIV', 'Net Time',
    'City, State, Country', 'State, Country', 'Country', 'City', 'City, Country',
    'AG Time*', 'BQ*']
race_results = pd.DataFrame(columns = all_fields)
# read in race codes; csv.DictReader under Python 3 requires text mode with
# newline="" (the original "rb" mode raises on the first read), and the
# context manager closes the handle instead of leaking it
with open("race_codes.csv", "r", newline="") as codes_file:
    race_codes = list(csv.DictReader(codes_file))
# loop over all races, accumulating result rows in a plain list; a single
# pd.concat at the end replaces the per-row DataFrame.append, which was
# quadratic and has been removed in pandas >= 2.0
collected_rows = []
for race in race_codes:
    # midd identifies the race on marathonguide.com
    midd = race["midd"]
    mar_name = race["race"]
    mar_year = race["year"]
    mar_date = race["date"]
    mar_url = browse_url + "?MIDD=" + midd
    print(mar_name + " " + mar_year)
    # update the midd and referer used by the POST requests below
    post_data["MIDD"] = midd
    headers["Referer"] = mar_url
    # a session keeps cookies between the browse GET and the results POSTs
    sesh = requests.Session()
    # request browse page and soup it
    browse_page = sesh.get(mar_url)
    browse_soup = BeautifulSoup(browse_page.content, "html5lib")
    # the gender-specific <select> lists the available place ranges;
    # options_gender[0] is skipped (placeholder option)
    select_gender = browse_soup.find_all("select")[gender_int]
    options_gender = select_gender.find_all("option")
    all_race_ranges = [opt["value"] for opt in options_gender[1:]]
    # keep only ranges below max_place; the option value is comma-separated
    # and index 1 appears to be the range's place bound -- TODO confirm format
    race_ranges = [rr for rr in all_race_ranges
                   if int(rr.split(",")[1]) < max_place]
    # loop over the surviving place ranges
    for race_range in race_ranges:
        post_data["RaceRange"] = race_range
        # POST for this range of places and soup the response
        page = sesh.post(makelinks_url, data=post_data, headers=headers)
        soup = BeautifulSoup(page.content, "html5lib")
        # results sit in the 9th table of the page
        table = soup.find_all("table")[8]
        results_rows = table.find_all("tr")
        # locate the header row (<th> cells) to learn this race's column
        # names; defaulting first_row past the end means "no data rows"
        # instead of the NameError the original raised when no header
        # row was present. get_text() already yields str on Python 3 --
        # the old .encode("utf-8").strip("\n") raised TypeError on bytes.
        field_names = []
        first_row = len(results_rows)
        for i, tr in enumerate(results_rows):
            header_cells = tr.find_all("th")
            if len(header_cells) > 1:
                field_names = [th.get_text().strip("\n") for th in header_cells]
                first_row = i + 1
        # rows after the (last) header row are data rows
        for tr in results_rows[first_row:]:
            cells = tr.find_all("td")
            # start every known field at "NA", then fill what this row has;
            # zip truncates safely if the row has extra/missing cells
            this_result = dict.fromkeys(all_fields, "NA")
            for name, cell in zip(field_names, cells):
                this_result[name] = cell.get_text()
            # attach race-level metadata to each row
            this_result["marathon"] = mar_name
            this_result["year"] = mar_year
            this_result["date"] = mar_date
            this_result["midd"] = midd
            collected_rows.append(this_result)
# build the DataFrame once from all collected rows
if collected_rows:
    race_results = pd.concat([race_results, pd.DataFrame(collected_rows)],
                             ignore_index=True)
# map the raw marathonguide column headers onto snake_case names, then
# dump everything to CSV without the index column
race_results = race_results.rename(columns={
    'Last Name, First Name(Sex/Age)' : "name_age",
    'Time' : "time",
    'OverAllPlace' : "overall_place",
    'Sex Place/Div Place' : "sex_div_place",
    'Sex Place' : "sex_place",
    'DIV' : "div",
    'Net Time' : "net_time",
    'City, State, Country' : "city_state_country",
    'State, Country' : "state_country",
    'Country' : "country",
    'City' : "city",
    'City, Country' : "city_country",
    'AG Time*' : "ag_time",
    'BQ*' : "bq",
})
race_results.to_csv(output_filename, index=False)