-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnndss_scrape_detail.py
73 lines (55 loc) · 2.33 KB
/
nndss_scrape_detail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import datetime
from bs4 import BeautifulSoup
import csv
import os
# Scrape one report per (year, month) from the report-selection form at
# SELECTION_URL with headless Chrome and collect the first table of each
# report page into a single CSV file.

SELECTION_URL = "http://www9.health.gov.au/cda/source/rpt_1_sel.cfm"
OUTPUT_PATH = "data/detail_count.csv"
# Fixed pause (seconds) after every navigation / form submit, giving the page
# time to finish rendering before it is read or interacted with.
PAGE_WAIT_SECONDS = 3

col_headings = ['condition', 'ACT', 'NSW', 'NT', 'Qld', 'SA', 'Tas', 'Vic', 'WA', 'Aust', 'Aust YTD', "month", "year", 'updated']
years = [str(y) for y in range(1991, 2021)]
months = [str(m) for m in range(1, 13)]


def build_csv_rows(cell_rows, month, year, updated):
    """Return the CSV rows for one report page.

    Args:
        cell_rows: list of rows, each a list of cell strings.
        month: month value appended to every row.
        year: year value appended to every row.
        updated: "publish-date" text appended to every row.

    Returns:
        A new list with ``[month, year, updated]`` appended to each row.
        Rows that contain no cells are dropped: writing them would produce a
        three-column line that does not line up with ``col_headings``.
    """
    return [list(cells) + [month, year, updated] for cells in cell_rows if cells]


def parse_report(html):
    """Extract the table cells and the publish date from a report page.

    Args:
        html: page source of the report page.

    Returns:
        ``(cell_rows, updated)``. ``cell_rows`` holds the stripped text of the
        ``<td>`` cells of every row after the first row of the first
        ``<table>`` (empty list when the page has no table). ``updated`` is
        the stripped text of ``<p class="publish-date">`` or ``""`` when that
        element is missing.
    """
    soup = BeautifulSoup(html, 'html.parser')

    stamp = soup.find("p", class_="publish-date")
    updated = stamp.get_text().strip() if stamp is not None else ""

    table = soup.find('table')
    if table is None:
        return [], updated

    # The first <tr> is skipped, as in the original script (presumably the
    # header row -- confirm against the live page).
    cell_rows = [
        [cell.text.replace('\n', '').strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')[1:]
    ]
    return cell_rows, updated


def fetch_report_html(driver, month, year):
    """Load the selection form, request one month/year and return the HTML.

    Args:
        driver: a Selenium WebDriver instance.
        month: month value typed into the ``Sel_Month`` form element.
        year: year value typed into the ``Sel_Year`` form element.

    Returns:
        ``driver.page_source`` after the form has been submitted.
    """
    # Local import keeps this block self-contained; ``find_element(By...)``
    # is available in both Selenium 3 and 4, whereas the ``find_element_by_*``
    # helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver.get(SELECTION_URL)
    time.sleep(PAGE_WAIT_SECONDS)

    driver.find_element(By.ID, "REPORT_TYPE_b").click()
    # NOTE(review): values are typed into the month/year elements rather than
    # selected explicitly -- confirm this always picks the intended option.
    driver.find_element(By.NAME, "Sel_Month").send_keys(month)
    driver.find_element(By.NAME, "Sel_Year").send_keys(year)
    driver.find_element(By.NAME, "submit").click()
    time.sleep(PAGE_WAIT_SECONDS)

    return driver.page_source


def main():
    """Scrape every (year, month) report and write them to OUTPUT_PATH."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

    # No positional driver path: 'chromedriver' is already the default in
    # Selenium 3, and the positional form raises on recent Selenium 4.
    driver = webdriver.Chrome(options=options)
    try:
        # newline="" is required by the csv module to avoid blank lines on
        # platforms that translate "\n" on write.
        with open(OUTPUT_PATH, "w", newline="", encoding="utf-8") as f:
            wr = csv.writer(f)
            wr.writerow(col_headings)
            for y in years:
                for m in months:
                    html = fetch_report_html(driver, m, y)
                    cell_rows, updated = parse_report(html)
                    if not cell_rows:
                        print("No report rows found for month", m, "year", y)
                    wr.writerows(build_csv_rows(cell_rows, m, y, updated))
                    # Keep partial results on disk if a later page fails.
                    f.flush()
    finally:
        # Always shut the browser down, even when scraping raises.
        driver.quit()


if __name__ == "__main__":
    main()