-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsetup.py
153 lines (127 loc) · 5.6 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import gzip
import time
import shutil
import pandas as pd
from dateutil import relativedelta
from datetime import datetime, timedelta
import ImportListing
import OptimizeCalendar
import LabelizePeriods
import DownloadDatasets
import Validation
import MergeCalendar
import DatabaseConnector
import ConvertReviews
import Proba
from dotenv import load_dotenv
load_dotenv('./dev.env')
# Root folder that holds the downloaded/unzipped dataset files.
# NOTE(review): env key is spelled "PATTH" — presumably it matches the key in
# dev.env; confirm against that file before renaming either side.
DatasetsFolderPath = os.getenv("DATASETS_FOLDER_PATTH")
# Dataset families downloaded and processed for each month.
datasets = ['listings','reviews','calendar']
# Column order of the rows inserted into the `calendars` table
# (must match the order of columns in the probaCalendar dataframe).
DATAFRAME_CALENDARS_COLUMNS = [
"listing_id",
"available",
"start_date",
"end_date",
"num_day",
"minimum_nights",
"maximum_nights",
"label",
"proba",
"validation",
"ext_validation",
"cal_key"
]
def CreateFolder(folderName):
    """Create `folderName` under the datasets folder for saving .csv files.

    Succeeds silently when the folder already exists. Unlike the original
    bare `except: pass`, any other OS error (bad DatasetsFolderPath,
    permission denied, ...) is raised instead of being swallowed, so
    failures surface at the point of the problem.
    """
    os.makedirs(f'{DatasetsFolderPath}/{folderName}', exist_ok=True)
def UnzipFiles(date):
    """Decompress the month's dataset archives ahead of processing.

    Unzips listings/reviews/calendar for `date`'s month, plus the previous
    month's reviews, from `<folder>/<folder>-YYYY-MM.csv.gz` to the matching
    `.csv`. Also restores `saved/YYYY-MM.zip` into a working folder when a
    previous run left one behind.
    """
    # (month offset, dataset folder) pairs: all three datasets for the
    # current month, reviews alone for the month before.
    targets = [(0, name) for name in datasets] + [(1, 'reviews')]
    for offset, folderName in targets:
        month = str(date - relativedelta.relativedelta(months=offset))[:7]
        basePath = f'{DatasetsFolderPath}/{folderName}/{folderName}-{month}'
        with gzip.open(f'{basePath}.csv.gz', 'rb') as packed:
            with open(f'{basePath}.csv', 'wb') as unpacked:
                shutil.copyfileobj(packed, unpacked)
    month = str(date)[:7]
    savedZip = f'{DatasetsFolderPath}/saved/{month}.zip'
    if os.path.isfile(savedZip):
        shutil.unpack_archive(savedZip, f'{DatasetsFolderPath}/saved/{month}', 'zip')
def CleanProcess(date):
    """Remove the decompressed .csv files created by UnzipFiles, then zip up
    and delete the `saved/YYYY-MM` working folder for `date`'s month.
    """
    # Same (month offset, dataset folder) pairs that UnzipFiles decompresses.
    targets = [(0, name) for name in datasets] + [(1, 'reviews')]
    for offset, folderName in targets:
        month = str(date - relativedelta.relativedelta(months=offset))[:7]
        os.remove(f'{DatasetsFolderPath}/{folderName}/{folderName}-{month}.csv')
    month = str(date)[:7]
    savedFolder = f'{DatasetsFolderPath}/saved/{month}'
    shutil.make_archive(savedFolder, 'zip', savedFolder)
    shutil.rmtree(savedFolder)
'''
Process all the datasets and save te results in the database
'''
def ProcessDatasets(date):
fileNameDate = str(date)[:7]
CreateFolder('saved')
CreateFolder(f'saved/{fileNameDate}')
UnzipFiles(date)
start_time = time.time()
print('------- Start of listings process -------')
ImportListing.ImportListings(fileNameDate)
print('------- End of listings process -------')
print("------------ %s seconds ------------" % (time.time() - start_time))
start_time = time.time()
print('------- Start of calendar process -------')
optimizedCalendar = OptimizeCalendar.ProcessAndSave(fileNameDate,'optimized_calendar')
mergedCalendar = MergeCalendar.ProcessAndSave(fileNameDate,'merged_calendar',optimizedCalendar)
labelizedCalendar = LabelizePeriods.ProcessAndSave(fileNameDate,'labelized_calendar',mergedCalendar)
probaCalendar = Proba.ProcessAndSave(fileNameDate,'probalized_calendar',labelizedCalendar)
probaCalendar['cal_key'] = probaCalendar['listing_id'].astype(str) + '_' + probaCalendar['start_date'].astype(str)
DatabaseConnector.Insert(probaCalendar.values.tolist(),'calendars',DATAFRAME_CALENDARS_COLUMNS)
print('------- End of calendar process -------')
print("------------ %s seconds ------------" % (time.time() - start_time))
start_time = time.time()
print('------- Start of reviews process -------')
validatedCalendar = Validation.ProcessAndSave(fileNameDate,'validated_calendar',date)
extValidatedCalendar = ConvertReviews.ProcessAndSave(fileNameDate,'ext_validated_calendar',validatedCalendar)
DatabaseConnector.UpdateValidation(extValidatedCalendar)
print('------- End of reviews process -------')
print("------------ %s seconds ------------" % (time.time() - start_time))
CleanProcess(date)
def ProcessDate(date):
    """Process a specific date, downloading the datasets first if they don't exist.

    NOTE(review): this function was originally (mis)named `CreateFolder`,
    which silently shadowed the folder-creation helper defined earlier and
    would have broken `ProcessDatasets`; renamed to match its purpose.
    """
    # Verify if datasets exist locally (reviews deliberately not required).
    isListings = DownloadDatasets.VerifyDatasetExists('listings',date)
    isCalendar = DownloadDatasets.VerifyDatasetExists('calendar',date)
    isReviews = DownloadDatasets.VerifyDatasetExists('reviews',date)
    hasDatasets = isListings and isCalendar # and isReviews
    if not hasDatasets: # Download if datasets don't exist
        hasDatasets = DownloadDatasets.DownloadDate(date)
    # Original if/elif never processed a freshly downloaded date and made the
    # failure branch unreachable; a downloaded date now falls through here.
    if hasDatasets:
        ProcessDatasets(date)
    else:
        print(f'Cannot process this date : {date.strftime("%Y-%m-%d")}')
def ProcessDateRange(startDate,endDate):
    """Process every month from `startDate` (inclusive) to `endDate` (exclusive).

    Both bounds are "%Y-%m-%d" strings; ProcessDatasets is invoked once per
    month and the per-month wall-clock time is printed.
    """
    current = datetime.strptime(startDate,"%Y-%m-%d").date()
    limit = datetime.strptime(endDate,"%Y-%m-%d").date()
    oneMonth = relativedelta.relativedelta(months=1)
    while current < limit:
        began = time.time()
        ProcessDatasets(current)
        elapsed = time.time() - began
        print(f'------------------------ {current.strftime("%Y-%m-%d")} processing time : {elapsed} seconds -------------------------')
        current = current + oneMonth
if __name__ == "__main__":
    # Process a single month of data (May 2017); the end date is exclusive.
    ProcessDateRange('2017-05-01','2017-06-01')
    # Leftover manual-run snippet: re-runs only the reviews/validation phase
    # for one period without touching listings/calendar.
    # period = "2017-02"
    # date = datetime.strptime(period, "%Y-%m")
    # validatedCalendar = Validation.ProcessAndSave(period,'validated_calendar',date)
    # extValidatedCalendar = ConvertReviews.ProcessAndSave(period,'ext_validated_calendar',validatedCalendar)
    # DatabaseConnector.UpdateValidation(extValidatedCalendar)