# main.py
from datasets import load_dataset
import pyarrow.compute
import os
import shutil
from .utils import get_current_ym, get_num_proc
from .fetchMonthArticle import monthArticlesFetch
from .fetchYMD import ymdFetch
# ymdFetch takes no arguments and returns a dict mapping each year available in the archive
# to a list of the months available for that year.
from .fetchArticle import articleFetch
# articleFetch takes a dataset row (dict), fetches ['author', 'dateTime', 'text'] using ['url'],
# updates the dict, and returns it, setting 'needs_collection' to False on a successful fetch.
# (It still re-fetches even when needs_collection is already False!)

package_path = os.path.dirname(os.path.realpath(__file__))
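
# Illustrative sketch only (collect_articles is a hypothetical helper, not part of the original
# module): articleFetch is row-in/row-out, so it slots straight into Dataset.map. Because it
# re-fetches even when needs_collection is already False, filtering first avoids wasted requests.
def collect_articles(dataset):
    todo = dataset.filter(lambda row: row['needs_collection'])
    return todo.map(articleFetch, num_proc=get_num_proc())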

def db_init():
    """Build the local database from scratch: one folder per year/month under <package_path>/database."""
    print('Fetching list of available years and months from the sitemap...')
    ymd = ymdFetch()
    print('Finished fetching years and months. Making folders locally...')
    os.mkdir(os.path.join(package_path, 'database'))
    current_year, current_month = get_current_ym()
    for year in ymd.keys():
        print(f'Getting data for {year}')
        os.mkdir(os.path.join(package_path, 'database', str(year)))
        for month in ymd[year]:
            print(f'\tMonth {month}')
            month_dir = os.path.join(package_path, 'database', str(year), str(month))
            os.mkdir(month_dir)
            mafDataset = monthArticlesFetch(year, month)
            mafDataset.save_to_disk(month_dir, num_shards=1, num_proc=min(get_num_proc(), len(mafDataset)))
            # The current month is still being appended to upstream, so mark it for re-download by db_update().
            if year == current_year and month == current_month:
                with open(os.path.join(month_dir, 'INCOMPLETE'), 'w') as marker:
                    marker.write('This portion of the dataset was still being appended to at the time it was downloaded.')
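
# For reference, db_init() leaves a tree like the following (illustrative; the years, months,
# and shard file names depend on the archive and on save_to_disk's output format):
#   <package_path>/database/<year>/<month>/data-00000-of-00001.arrow
#   <package_path>/database/<year>/<month>/dataset_info.json
#   <package_path>/database/<year>/<month>/state.json
#   <package_path>/database/<current_year>/<current_month>/INCOMPLETE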

def db_update():
    """Fetch any year/month folders missing locally, re-downloading months previously marked INCOMPLETE."""
    print('Fetching list of available years and months from the sitemap...')
    ymd = ymdFetch()
    print('Finished fetching years and months.')
    local_ymd = {}
    for year in os.listdir(os.path.join(package_path, 'database')):
        local_ymd[int(year)] = []
        for month in os.listdir(os.path.join(package_path, 'database', year)):
            # A month marked INCOMPLETE was downloaded mid-month and must be re-fetched,
            # so don't count it as already present.
            if 'INCOMPLETE' not in os.listdir(os.path.join(package_path, 'database', year, month)):
                local_ymd[int(year)].append(int(month))
    ymd_needs_download = {}
    for year in ymd.keys():
        if year not in local_ymd:
            local_ymd[year] = []
        ymd_needs_download[year] = [x for x in ymd[year] if x not in local_ymd[year]]
        if not ymd_needs_download[year]:
            del ymd_needs_download[year]
    current_year, current_month = get_current_ym()
    for year in ymd_needs_download.keys():
        print(f'Getting data for {year}')
        year_dir = os.path.join(package_path, 'database', str(year))
        if not os.path.exists(year_dir):
            os.mkdir(year_dir)
        for month in ymd_needs_download[year]:
            print(f'\tMonth {month}')
            month_dir = os.path.join(year_dir, str(month))
            if os.path.exists(month_dir):
                shutil.rmtree(month_dir)  # clear out a stale INCOMPLETE download
            os.mkdir(month_dir)
            mafDataset = monthArticlesFetch(year, month)
            mafDataset.save_to_disk(month_dir, num_shards=1, num_proc=min(get_num_proc(), len(mafDataset)))
            if year == current_year and month == current_month:
                with open(os.path.join(month_dir, 'INCOMPLETE'), 'w') as marker:
                    marker.write('This portion of the dataset was still being appended to at the time it was downloaded.')
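
# Note: db_update() is safe to run repeatedly; a month is only re-downloaded when it is missing
# locally or carries an INCOMPLETE marker, so a cron-style periodic call keeps the local database
# current at the cost of one sitemap fetch per run.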

def db_fetch():
    """Load every downloaded .arrow shard as one dataset, sorted newest-first by dateTime."""
    local_files_list = []
    for year in os.listdir(os.path.join(package_path, 'database')):
        for month in os.listdir(os.path.join(package_path, 'database', year)):
            month_dir = os.path.join(package_path, 'database', year, month)
            data_files = [x for x in os.listdir(month_dir) if x.endswith('.arrow')]
            local_files_list.extend(os.path.join(month_dir, x) for x in data_files)
    dataset = load_dataset('arrow', data_files=local_files_list, split='train')
    old_features = dataset.features
    old_features.pop('author')  # feature that didn't pan out
    old_features.pop('needs_collection')  # internal bookkeeping flag, not useful to consumers
    # Parse the ISO-8601 strings into real timestamps; values that fail to parse become null.
    parsed = pyarrow.compute.strptime(dataset.data.table['dateTime'],
                                      format="%Y-%m-%dT%H:%M:%S.000Z", unit="s", error_is_null=True)
    dataset = dataset.add_column('dateTime2', parsed).remove_columns('dateTime').rename_column('dateTime2', 'dateTime')
    dataset = dataset.select_columns(list(old_features.keys()))  # restore the original column order
    dataset = dataset.sort('dateTime', reverse=True)
    return dataset
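
# Minimal end-to-end usage sketch (assumes this module is run inside the package, e.g. via
# `python -m <package>.main`; the __main__ guard keeps it from executing on import):
if __name__ == '__main__':
    if not os.path.exists(os.path.join(package_path, 'database')):
        db_init()    # first run: download every available year/month
    else:
        db_update()  # subsequent runs: fetch only missing or INCOMPLETE months
    ds = db_fetch()
    print(ds)  # datasets prints the schema and row count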