-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_monitoring_comments.py
113 lines (103 loc) · 5 KB
/
update_monitoring_comments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from urllib.request import urlopen
from bs4 import BeautifulSoup
import sys
import datetime
import pandas as pd
import tabula
import re
from janome.tokenizer import Tokenizer
import numpy as np
def getLastMeetingNo(csv_path='data/monitoring_comments.csv'):
    """Return the meeting number stored in the last row of the comments CSV.

    Args:
        csv_path: Path of the CSV file to inspect. The file is read with a
            header row and must contain a ``meeting_no`` column.

    Returns:
        The ``meeting_no`` value of the final data row as an ``int``, or ``0``
        when the file contains no data rows.
    """
    df = pd.read_csv(csv_path)
    if df.empty:
        return 0
    # Extract the scalar with .iloc instead of calling int() on a one-row
    # Series: that conversion is deprecated in pandas 2.1+ and is slated to
    # raise TypeError.
    return int(df['meeting_no'].iloc[-1])
def getLatestMeetingInfo():
    """Scrape the meeting index page and describe its first listed meeting.

    The first ``<a>`` inside the first ``<li>`` of ``ul.listlink`` is used;
    it is taken to be the most recent meeting. The meeting number and the
    date are parsed out of the link text.

    Returns:
        A ``(no, date, url)`` tuple where ``no`` is the meeting number as an
        ``int``, ``date`` is a ``'YYYY-MM-DD'`` string and ``url`` is the
        absolute URL of the meeting page.

    Raises:
        ValueError: If the page has no such link, or the link text does not
            contain a meeting number and a Reiwa-era date.
    """
    from urllib.parse import urljoin

    index_url = 'https://www.bousai.metro.tokyo.lg.jp/taisaku/saigai/1013388/index.html'
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(index_url) as html:
        bsObj = BeautifulSoup(html, 'html.parser')

    ul = bsObj.select_one('ul.listlink')
    li = ul.select_one('li') if ul is not None else None
    a = li.select_one('a') if li is not None else None
    if a is None:
        raise ValueError('no meeting link found in ul.listlink on ' + index_url)

    title = a.text
    no_match = re.search(r'第(\d+?)回', title)
    date_match = re.search(r'令和(\d+?)年(\d+?)月(\d+?)日', title)
    if no_match is None or date_match is None:
        raise ValueError('cannot parse meeting number/date from title: ' + repr(title))

    no = int(no_match.group(1))
    # Reiwa year N corresponds to Gregorian year 2018 + N.
    y = 2018 + int(date_match.group(1))
    m = int(date_match.group(2))
    d = int(date_match.group(3))
    date = datetime.date(y, m, d).isoformat()

    # Resolve the (relative) href against the page it was found on.
    url = urljoin(index_url, a.get('href'))
    return no, date, url
def getCommentPdfUrl(meeting_url):
    """Find the expert monitoring-comment PDF link on a meeting page.

    Scans the ``<li>`` items of ``ul.objectlink`` and picks the first link
    whose text contains '専門家によるモニタリングコメント・意見'.

    Args:
        meeting_url: Absolute URL of the meeting page to scan.

    Returns:
        The absolute URL of the matching link, or ``None`` when the page has
        no ``ul.objectlink`` list or no link with that text.
    """
    from urllib.parse import urljoin

    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(meeting_url) as html:
        bsObj = BeautifulSoup(html, 'html.parser')

    ul = bsObj.select_one('ul.objectlink')
    if ul is None:
        return None
    for li in ul.select('li'):
        a = li.select_one('a')
        if a is None:
            continue
        if '専門家によるモニタリングコメント・意見' in a.text:
            # Resolve the (relative) href against the page it was found on.
            return urljoin(meeting_url, a.get('href'))
    return None
def getPdfDataFrame(pdf_url):
    """Extract the monitoring comments of a PDF as one row per index.

    Every table found on every page is read with tabula in lattice mode and
    concatenated. The result is expected to have exactly three columns, which
    are named ``monitoring_index``, ``graph`` and ``monitoring_comment``.

    Args:
        pdf_url: Location of the PDF, passed straight to ``tabula.read_pdf``.

    Returns:
        A DataFrame with the columns ``monitoring_index`` (first character of
        the index label) and ``monitoring_comment`` (all comment fragments of
        that index joined into a single string).
    """
    tables = tabula.read_pdf(pdf_url, lattice=True, pages='all')
    df = pd.concat(tables, ignore_index=True)
    df.columns = ['monitoring_index', 'graph', 'monitoring_comment']

    # Where the comment cell is empty, fall back to the text of the graph cell.
    df['monitoring_comment'] = df['monitoring_comment'].fillna(df['graph'])
    df = df.drop('graph', axis=1)

    # Remove line breaks embedded in the cell text.
    for column in ('monitoring_index', 'monitoring_comment'):
        df[column] = df[column].replace(r'[\r\n]', '', regex=True)

    # Rows without an index label inherit the label of the row above.
    # Series.ffill() replaces fillna(method='ffill'), deprecated in pandas 2.1+.
    df['monitoring_index'] = df['monitoring_index'].ffill()
    df['monitoring_index'] = df['monitoring_index'].apply(lambda x: str(x)[0])

    # Labels that are still NaN stringify to 'nan', so their first character
    # is 'n'; drop those rows.
    df = df[df['monitoring_index'] != 'n']

    df = df.groupby('monitoring_index')['monitoring_comment'].apply(''.join).reset_index()
    return df
def splitComment(df):
    """Split each monitoring comment into one row per sentence.

    Comments are split on the '。' delimiter. The segment after the last
    delimiter is discarded (it is empty when the comment ends with '。'),
    and the delimiter is re-attached to every kept sentence.

    Args:
        df: DataFrame whose first four columns are, in order, the meeting
            number, meeting date, monitoring index and comment text.

    Returns:
        A DataFrame with the columns ``meeting_no``, ``meeting_date``,
        ``monitoring_index``, ``line_number`` (1-based position of the
        sentence within its comment) and ``monitoring_comment``.
    """
    columns = ['meeting_no', 'meeting_date', 'monitoring_index', 'line_number', 'monitoring_comment']
    # Collect plain rows and build the frame once; growing a DataFrame one
    # .loc assignment at a time copies it on every append.
    rows = []
    for row in df.itertuples(name=None):
        # row[0] is the index label; the data columns follow.
        meeting_no, meeting_date, monitoring_index, monitoring_comment = row[1:5]
        sentences = monitoring_comment.split('。')[:-1]
        for line_number, sentence in enumerate(sentences, start=1):
            rows.append([meeting_no, meeting_date, monitoring_index, line_number, sentence + '。'])
    return pd.DataFrame(rows, columns=columns)
def generateToken(df):
    """Tokenize each sentence and keep the noun tokens of interest.

    Every sentence is tokenized with janome. A token is kept only when its
    surface contains none of the excluded symbol characters, is not one of
    the excluded single-kana surfaces, its part of speech is '名詞', and its
    sub-category is none of '数', '代名詞', '非自立', '接尾'.

    Args:
        df: DataFrame whose first five columns are, in order, the meeting
            number, meeting date, monitoring index, line number and the
            sentence text.

    Returns:
        A DataFrame with one row per kept token and the columns
        ``meeting_no``, ``meeting_date``, ``monitoring_index``,
        ``line_number``, ``token``, ``part_of_speech`` ..
        ``part_of_speech4``, ``infl_type`` and ``base_form``. Cells equal to
        '*' are replaced with NaN.
    """
    columns = ['meeting_no', 'meeting_date', 'monitoring_index', 'line_number', 'token', 'part_of_speech', 'part_of_speech2', 'part_of_speech3', 'part_of_speech4', 'infl_type', 'base_form']
    symbol_pattern = re.compile(r'[、。I,%%~~##※\\\(\)\.\-\/]')
    excluded_surfaces = ['ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ']
    excluded_types = ['数', '代名詞', '非自立', '接尾']

    # Building a Tokenizer is expensive, so create it once and reuse it for
    # every sentence instead of once per input row.
    tokenizer = Tokenizer()

    # Collect plain rows and build the frame once; growing a DataFrame one
    # .loc assignment at a time copies it on every append.
    rows = []
    for row in df.itertuples(name=None):
        # row[0] is the index label; the data columns follow.
        meeting_no, meeting_date, monitoring_index, line_number, sentence = row[1:6]
        for token in tokenizer.tokenize(sentence):
            surface = token.surface
            if symbol_pattern.search(surface) or surface in excluded_surfaces:
                continue
            part_of_speech = token.part_of_speech.split(',')
            if part_of_speech[0] != '名詞' or part_of_speech[1] in excluded_types:
                continue
            rows.append(
                [meeting_no, meeting_date, monitoring_index, line_number, surface]
                + part_of_speech
                + [token.infl_type, token.base_form]
            )

    df_token = pd.DataFrame(rows, columns=columns)
    df_token = df_token.replace('*', np.nan)
    return df_token
def main():
    """Append the newest meeting's comments to the CSV data files.

    Compares the latest meeting number on the web site with the last one
    recorded in 'data/monitoring_comments.csv'. When the site has a newer
    meeting, its comment PDF is parsed and the comments, their per-sentence
    split and their tokens are appended to the three CSV files. Otherwise
    both numbers are printed and the process exits with status 1.
    """
    last_meeting_no = getLastMeetingNo()
    meeting_no, meeting_date, meeting_url = getLatestMeetingInfo()
    if meeting_no <= last_meeting_no:
        print('error')
        print('meeting_no = ', meeting_no)
        print('last_meeting_no = ', last_meeting_no)
        sys.exit(1)

    comment_pdf_url = getCommentPdfUrl(meeting_url)
    if comment_pdf_url is None:
        # Fail with a clear message instead of handing None to the PDF reader.
        sys.exit('monitoring comment PDF link not found on ' + meeting_url)

    df = getPdfDataFrame(comment_pdf_url)
    df['meeting_no'] = meeting_no
    df['meeting_date'] = meeting_date
    df = df[['meeting_no', 'meeting_date', 'monitoring_index', 'monitoring_comment']]

    # Derive every output before writing anything: if splitting or
    # tokenizing fails, no file has been touched, so the meeting is not
    # recorded as processed while its derived rows are missing.
    df_split = splitComment(df)
    df_token = generateToken(df_split)

    outputs = (
        (df, 'data/monitoring_comments.csv'),
        (df_split, 'data/monitoring_comments_split.csv'),
        (df_token, 'data/monitoring_comments_token.csv'),
    )
    for frame, path in outputs:
        frame.to_csv(path, mode='a', header=False, index=False, encoding='utf-8-sig')
# Run the update only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()