parse_hp_release.py
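"""Scrape release data from helloproject.com.

Crawls the single, album and digital-distribution release search pages,
collects per-track metadata (artist, release, date, label, length, credits)
into a pandas DataFrame, filters out alternate versions and duplicates,
and writes the result to hp.xlsx.
"""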
import pprint
from urllib import parse

import pandas
from bs4 import BeautifulSoup

# Project-local helper; assumed to GET a URL and return the response body as text.
from search_on_itunes import safe_request_get_as_text

# Widen pandas display limits so full rows print while crawling.
pandas.options.display.max_rows = None
pandas.options.display.max_columns = None
pandas.options.display.width = 6000
pandas.options.display.max_colwidth = 6000
pandas.options.display.colheader_justify = 'left'
# Paginated release-search endpoints on helloproject.com; the page number is appended after 'p='.
single_release_url_base = 'http://www.helloproject.com/release/search/?-s=1&g=single&p='
album_release_url_base = 'http://www.helloproject.com/release/search/?-s=1&g=album&p='
distribution_release_url_base = 'http://www.helloproject.com/release/search/?-s=1&g=distribution&p='
# uf_single_release_url_base = 'http://www.up-front-works.jp/release/search/?-s=1&g=single'
# uf_album_release_url_base = 'http://www.up-front-works.jp/release/search/?-s=1&g=album'
# uf_distribution_release_url_base = 'http://www.up-front-works.jp/release/search/?-s=1&g=distribution'
# One row per track: release-level metadata plus per-song credits.
dataframe = pandas.DataFrame(
    columns=['url', 'artist_name', 'release_name', 'release_type', 'release_date',
             'record_label', 'song_name',
             'length', 'lyrics', 'composition', 'arrangement'])
print(dataframe)  # show the (empty) column layout before crawling
df_index = 0


def crawl(base_url):
    """Walk the paginated search results and process every release page found."""
    page_range = 1
    while True:
        page = BeautifulSoup(safe_request_get_as_text(base_url + str(page_range)), 'html.parser')
        contents = page.find_all('section')
        if not contents:
            # An empty result page means we have run past the last page.
            break
        for content in contents:
            # Result links are relative, so rebuild an absolute URL from the base URL's scheme and host.
            scheme = parse.urlparse(base_url).scheme + '://'
            hostname = parse.urlparse(base_url).hostname
            print(scheme + hostname + content.find('a')['href'])
            process_page(scheme + hostname + content.find('a')['href'])
        print()
        page_range += 1


def process_page(page_url):
    """Parse a single release page and append one dataframe row per track."""
    global dataframe
    global df_index
    page = BeautifulSoup(safe_request_get_as_text(page_url), 'html.parser')
    if page.find('a', {'class': 'modal'})['href'] == 'nowprinting.jpg':
        # Placeholder cover art means the release page is not populated yet.
        return
    content = page.find('div', {'id': 'rd_right'})
    release_name = content.find_all('h2')[0].text
    artist_name = content.find('p', {'id': 'artist_name'}).text
    # The metadata table lists release type, release date and record label in that order.
    release_type = content.find('div', {'id': 'table_wrapper'}).find_all('td', {'class': 'item02'})[0].text
    release_date = content.find('div', {'id': 'table_wrapper'}).find_all('td', {'class': 'item02'})[1].text
    record_label = content.find('div', {'id': 'table_wrapper'}).find_all('td', {'class': 'item02'})[2].text
    print('\t' + release_name, end='')
    print('\t' + artist_name)
    print('\t' + release_type)
    print('\t' + release_date)
    print('\t' + record_label)
    # 盤数 = number of editions (limited/regular pressings) listed on the page.
    print('\t盤数: ' + str(len(content.find_all('div', {'class': 'release_edition'}))))
    for tables in content.find_all('table', {'class': 'typeB'}):
        heading = tables.find('th', {'colspan': '7'}).text
        # Only parse CD or digital (配信) track listings; skip other media tables.
        if 'CD' in heading or '配信' in heading:
            print('\t\t' + heading)
        else:
            continue
        music_list = tables.find_all('tr')[2:]
        for list_col in music_list:
            if list_col.find('td', {'class': 'hide_cell'}):
                continue
            try:
                data = [page_url, artist_name, release_name, release_type, release_date, record_label,
                        list_col.find('td', {'class': 'item02'}).text,  # song name
                        list_col.find('td', {'class': 'item03'}).text,  # length
                        list_col.find('td', {'class': 'item04'}).text,  # lyrics
                        list_col.find('td', {'class': 'item05'}).text,  # composition
                        list_col.find('td', {'class': 'item06'}).text]  # arrangement
                escaped_data = [s.replace('\r', '').replace('\t', '').replace('\n', '') for s in data]
                pprint.pprint(escaped_data, width=1000)
                dataframe.loc[df_index] = escaped_data
                df_index += 1
            except AttributeError:
                # Rows without the expected cells (headers, separators) are skipped.
                continue


# Crawl singles, albums and digital-distribution releases in turn.
crawl(single_release_url_base)
print()
crawl(album_release_url_base)
print()
crawl(distribution_release_url_base)
print()
dataframe.sort_values('release_date', inplace=True)

# Drop alternate versions (別verを削除): titles with 【...】 brackets and titles
# containing "inst", "ver" or "mix"; then report how many rows matched.
print('別verを削除: ' +
      str(len(dataframe[dataframe['song_name'].str.contains(r'【.*?】')].index)
          + len(dataframe[dataframe['song_name'].str.contains(r'\(*?inst|ver|mix.*?\)', case=False)].index)))
dataframe = dataframe[dataframe['song_name'].str.contains(r'【.*?】') == False]
dataframe = dataframe[dataframe['song_name'].str.contains(r'\(*?inst|ver|mix.*?\)', case=False) == False]

# Deduplicate tracks that appear on multiple editions, then export.
dataframe.drop_duplicates(subset=['song_name', 'release_name', 'artist_name'], inplace=True)
dataframe.reset_index(drop=True, inplace=True)
dataframe.to_excel('hp.xlsx')