-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataExtraction.py
223 lines (195 loc) · 7.53 KB
/
DataExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
"""
@FileName: 001. Data Extraction.py
@Description: The main program file for commodity data extraction. Its main capability is to extract the latest price
and the historical prices for a specified time period.
@Author: Satsawat N.
@Version:
01/03/2019 (SN): Initial version
@Note:
The first version supports only the London Gas Oil futures price.
"""
import datetime
import io
import urllib
import urllib.error
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Form data describing the London Gas Oil Futures instrument on
# https://www.investing.com/commodities/london-gas-oil-historical-data
# The identifiers below can be obtained by inspecting the HTML elements of that page.
CFD = {
    "name": "CFD",
    "curr_id": 8861,
    "smlID": 300084,
    "header": "London Gas Oil Futures",
    "sort_col": "date",
    "action": "historical_data",
    "url": "https://www.investing.com/commodities/london-gas-oil-historical-data",
}
# HTTP header parameters sent with every request to investing.com
# NOTE(review): 'X-Requested-With: XMLHttpRequest' conventionally marks a request as an AJAX call;
# presumably the historical-data endpoint expects it - confirm before removing.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "referer": "https://www.investing.com",
    "host": "www.investing.com",
    "X-Requested-With": "XMLHttpRequest",
}
class IndexHistoricalData:
    """
    A Class to represent the index data from investing.com

    Attributes
    ----------
    api_url: str
        The string of API URL that receives the historical-data POST request
    headers: dict or None
        The dictionary contains the header information sent with every request
    data: dict or None
        The dictionary contains the detail data for index scraping; it is a private copy of the
        dictionary handed to set_form_data, so the setters never modify the caller's dictionary
    query_time: datetime.datetime
        The time at which the object was created; used in the console output and the csv file name
    response: bytes or None
        The raw body returned by the POST request to the API URL
    observations: pandas.DataFrame or None
        The pandas data frame of historical index price of the specified commodity
    latest_price: float
        The latest price of the selected index / commodity; stays 0 until get_latest_price succeeds

    Methods
    -------
    set_headers(headers)
        Method to set up the header in making API requests
    set_form_data(data)
        Method to set up the data
    set_time_frame_frequency(frequency)
        Method to set up the time frame for the web to scrape
    set_start_end_date(starting_date, ending_date=None)
        Method to set the desired time period to scrape from/to
    set_sort_order(sorting_order)
        Method to set the data sorting order
    get_historical_price()
        Method to make the post request and retrieve the historical price from the designated API url
    get_latest_price()
        Method to get the latest price for the current run time
    print_data()
        Method to print the query time, latest price for that time, and historical prices for the specified starting
        and ending time onto the console
    save_historical_data_to_csv()
        Method to save the historical to the csv file
    """

    def __init__(self, api_url):
        """
        Initialize the class variables
        :param api_url: the string of API URL
        :return:
        """
        self.api_url = api_url
        self.headers = None
        self.data = None
        self.query_time = datetime.datetime.now()
        self.response = None
        self.observations = None
        self.latest_price = 0

    def set_headers(self, headers):
        """
        Method to set up the header in making API requests
        :param headers: dictionary contains the header information
        :return:
        """
        self.headers = headers

    def set_form_data(self, data):
        """
        Method to set up the data
        :param data: the dictionary contains the detail data for index scraping; it must provide the
                     'url' and 'name' keys read by get_latest_price, print_data and save_historical_data_to_csv
        :return:
        """
        # Keep a shallow copy: the other setters add keys to self.data, and writing them into the
        # caller's dictionary (e.g. a module-level constant) would leak settings between instances.
        self.data = None if data is None else dict(data)

    def set_time_frame_frequency(self, frequency):
        """
        Method to set up the time frame for the web to scrape
        :param frequency: time frame period of the index price; possible values are Monthly, Weekly, or Daily
        :return:
        """
        self.data['frequency'] = frequency

    def set_start_end_date(self, starting_date, ending_date=None):
        """
        Method to set the desired time period to scrape from/to
        :param starting_date: the starting date to scrape; format is MM/DD/YYYY
        :param ending_date: the ending date to scrape; format is MM/DD/YYYY, default is the current run date
        :return:
        """
        # Resolve the default at call time; a default computed in the signature is evaluated only
        # once, when the class is defined, and would go stale in a long-running process.
        if ending_date is None:
            ending_date = datetime.datetime.now().strftime("%m/%d/%Y")
        self.data['st_date'] = starting_date
        self.data['end_date'] = ending_date

    def set_sort_order(self, sorting_order):
        """
        Method to set the data sorting order
        :param sorting_order: the sorting order of the returned historical data; possible values are DESC, or ASC
        :return:
        """
        self.data['sort_ord'] = sorting_order

    def get_historical_price(self):
        """
        Method to make the post request and retrieve the historical price from the designated API url
        :return: the pandas data frame parsed from the first html table of the response
        """
        self.response = requests.post(self.api_url, data=self.data, headers=self.headers).content
        # parse tables with pandas - [0] probably there is only one html table in response
        # The raw bytes are wrapped in a file-like object, which is a documented input type of read_html.
        self.observations = pd.read_html(io.BytesIO(self.response))[0]
        return self.observations

    @staticmethod
    def _parse_price(text):
        """
        Convert a displayed price such as '1,234.50' into a float
        :param text: the price text, optionally containing ',' as thousands separator
        :except ValueError: if the text without separators is not a valid number
        :return: the price as float
        """
        return float(text.replace(',', ''))

    def get_latest_price(self):
        """
        Method to get the latest price for the current run time

        On success the price is stored in latest_price. On any failure a message is printed and
        latest_price keeps its previous value.
        :except URLError: if the web page cannot be requested (handled: the reason is printed)
        :except ValueError: if the current price cannot be scraped (handled: a message is printed)
        :return:
        """
        # urllib rejects headers=None, so fall back to an empty mapping when no headers were set
        req = urllib.request.Request(self.data['url'], headers=self.headers or {})
        try:
            # the context manager closes the connection even if read() fails
            with urllib.request.urlopen(req) as page:
                web_page = page.read()
        except urllib.error.URLError as err:
            print(err.reason)
            return
        try:
            bs = BeautifulSoup(web_page, 'lxml')
            spans = bs.find_all('span', id='last_last')
            # spans[0] raises IndexError when the page has no 'last_last' element
            current_price = self._parse_price(spans[0].get_text())
        except (ValueError, IndexError):
            print('Fail to parse the latest price')
            return
        self.latest_price = current_price

    def print_data(self):
        """
        Method to print the query time, latest price for that time, and historical prices for the specified starting
        and ending time onto the console (top 10 records only)

        get_historical_price must have been called before, otherwise there are no observations to print.
        :return:
        """
        print("%s: The latest price of %s is %s" % (self.query_time.strftime("%Y-%m-%d %H:%M:%S"), self.data["name"],
                                                    self.latest_price))
        print("The retrieved historical prices (top 10) are shown below:")
        print(self.observations.head(10))

    def save_historical_data_to_csv(self):
        """
        Method to save the historical to the csv file

        The file is written to the current working directory and is named <name>_<YYYYmmdd_HHMMSS>.csv, built from
        the 'name' entry of the form data and the query time.
        :return:
        """
        self.observations.to_csv(self.data['name'] + '_' + self.query_time.strftime('%Y%m%d_%H%M%S') + '.csv',
                                 sep=',', index=False)
        print("Save completed!")
if __name__ == "__main__":
    # Build the extractor for the historical-data endpoint and give it the request headers
    # and the London Gas Oil form data defined at the top of the file.
    historical_api = 'https://www.investing.com/instruments/HistoricalDataAjax'
    extractor = IndexHistoricalData(historical_api)
    extractor.set_headers(HEADERS)
    extractor.set_form_data(CFD)

    # Query settings: daily prices, newest first, from 1 January 2017 up to the run date.
    extractor.set_time_frame_frequency('Daily')
    extractor.set_sort_order('DESC')
    extractor.set_start_end_date('1/1/2017')

    # Fetch the current quote first, then the historical table.
    extractor.get_latest_price()
    extractor.get_historical_price()

    # Show the result on the console and persist the historical prices as csv.
    extractor.print_data()
    extractor.save_historical_data_to_csv()