#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 21 22:45:46 2021
@author: nsa
"""
import time
#import lxml
import requests
import numpy as np
import pandas as pd
from lxml import html
from tqdm.auto import tqdm
#from datetime import datetime
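# Overview: this script takes an iShares ETF holdings export (.xlsx or .csv), maps each
# ISIN to a Yahoo Finance ticker via Yahoo's search endpoint, scrapes balance-sheet
# fundamentals for every ticker, and writes the merged result to an Excel file next to
# the source data.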
print('''Welcome to the iShares Scraper.
Please follow the instructions and you will be on your way to becoming a millionaire.''')
inp = input('1. Enter the path of your iShares holdings file. Make sure it is an "xlsx" Excel or "csv" file: ')
if inp.endswith('.xlsx'):
    source = inp[:-5]
    postfix = inp[-5:]
elif inp.endswith('.csv'):
    source = inp[:-4]
    postfix = inp[-4:]
else:
    raise SystemExit('Process stopped: please only use Excel (.xlsx) or CSV (.csv) files as your source')
class IsharesScraper:
    # ishares_xlsx = path to the iShares export (without extension), cleaned_csv = path to save the cleaned iShares data, fundamentals_xlsx = path to save the fundamentals
    def __init__(self, ishares_xlsx, cleaned_csv=f'{source}_cleaned.csv', fundamentals_xlsx=f'{source}_fundamentals.xlsx', postfix=postfix):
        self.postfix = postfix
        if self.postfix == '.xlsx':
            self.ishares_xlsx = pd.read_excel(f'{ishares_xlsx}.xlsx')
        elif self.postfix == '.csv':
            self.ishares_xlsx = pd.read_csv(f'{ishares_xlsx}.csv', skiprows=2)
        else:
            self.ishares_xlsx = None
        self.cleaned_csv = cleaned_csv
        self.fundamentals_xlsx = fundamentals_xlsx
    def file_controll(self):
        if self.ishares_xlsx is None:
            return 'Process stopped: please only use Excel (.xlsx) or CSV (.csv) files as your source'
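    # Note: the mapping below relies on Yahoo Finance's unofficial search endpoint
    # (query2.finance.yahoo.com/v1/finance/search). It is undocumented, so its response
    # format and rate limits may change; the 1-second pause between requests is a
    # courtesy delay, not a guarantee against throttling.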
    def isin_to_ticker(self):
        ishares = self.ishares_xlsx
        ticker = []
        url = "https://query2.finance.yahoo.com/v1/finance/search"
        # if the export already contains a ticker column, reuse it directly
        if 'Emittententicker' in ishares.columns:
            ishares['ticker'] = ishares['Emittententicker']
            return ishares
        print('Retrieving ticker symbols based on the ISINs in the provided iShares file. Please wait.')
        for i in tqdm(ishares['ISIN']):
            # if the ISIN is '-', the position is not a stock (cash, futures etc.), so no fundamentals are available
            if i == '-':
                ticker.append(np.nan)
                continue
            params = {'q': f'{i}', 'quotesCount': 1, 'newsCount': 0}
            r = requests.get(url, params=params)
            data = r.json()
            # if data['quotes'] is empty, the stock is unknown to Yahoo Finance or no matching ticker was found; skip it
            if not data['quotes']:
                ticker.append(np.nan)
            else:
                ticker.append(data['quotes'][0]['symbol'])
            time.sleep(1)
        ishares['ticker'] = ticker
        ishares.dropna(subset=['ticker'], inplace=True)
        ishares.to_csv(self.cleaned_csv)
        return ishares
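    # Note: fundamentals() scrapes the server-rendered balance-sheet HTML and locates the
    # table rows via Yahoo's CSS utility class 'D(tbr)'. This matched the page layout as of
    # 2021; Yahoo may have changed its markup since, in which case the XPath queries below
    # find nothing and an empty DataFrame is returned.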
    def fundamentals(self, symbol, sleep):
        url = 'https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol
        # Set up the request headers that we're going to use, to simulate
        # a request by the Chrome browser. Simulating a request from a browser
        # is generally good practice when building a scraper
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Cache-Control': 'max-age=0',
            'Pragma': 'no-cache',
            'Referrer': 'https://google.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
        }
        # Fetch the page that we're going to parse, using the request headers
        # defined above
        page = requests.get(url, headers=headers)
        # Parse the page with lxml, so that we can start doing some XPath queries
        # to extract the data that we want
        tree = html.fromstring(page.content)
        # Smoke test that we fetched the page by querying the H1 element
        tree.xpath("//h1/text()")
        table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
        # Ensure that some table rows are found; if none are found, it's possible
        # that Yahoo Finance has changed their page layout, or has detected
        # that you're scraping the page.
        # Edit: if scraping is detected, the caller doubles the delay and retries.
        if len(table_rows) == 0:
            time.sleep(sleep)
            return pd.DataFrame()
        parsed_rows = []
        for table_row in table_rows:
            parsed_row = []
            el = table_row.xpath("./div")
            none_count = 0
            for rs in el:
                try:
                    (text,) = rs.xpath('.//span/text()[1]')
                    parsed_row.append(text)
                except ValueError:
                    parsed_row.append(np.nan)
                    none_count += 1
            if none_count < 4:
                parsed_rows.append(parsed_row)
        df = pd.DataFrame(parsed_rows)
        df = df.set_index(0)  # Set the index to the first column: 'Period Ending'.
        df = df.transpose()   # Transpose the DataFrame, so that the header contains the account names
        # Rename the "Breakdown" column to "Date"
        cols = list(df.columns)
        cols[0] = 'Date'
        df.columns = cols
        numeric_columns = list(df.columns)[1:]  # Take all columns except the first (the 'Date' column)
        for column_name in numeric_columns:
            df[column_name] = df[column_name].str.replace(',', '')  # Remove the thousands separator
            df[column_name] = df[column_name].astype(np.float64)    # Convert the column to float64
        df['ticker'] = symbol
        time.sleep(sleep)
        return df
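    # Note: when Yahoo returns an empty page (likely bot detection or a layout change),
    # the loop below doubles the per-request delay and retries the same ticker after a
    # 60-second pause, a simple exponential backoff.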
    def run_scrapper(self):
        # get the table of tickers
        sleep = 1
        ishares = self.isin_to_ticker()
        # fetch one ticker first, only to learn the column layout of the fundamentals table
        symbol = ishares.ticker.iloc[0]
        df = self.fundamentals(symbol, sleep)
        # define an empty DataFrame with the fitting columns
        yahoo = pd.DataFrame(columns=df.columns)
        print('Scraping Yahoo Finance. This will take some time, please grab some coffee.')
        for i in tqdm(ishares.ticker):
            df = self.fundamentals(i, sleep)
            while df.empty:
                sleep += sleep
                print(f'Yahoo Finance detected scraping. Continuing after a break of 60 seconds with an updated delay of {sleep} seconds.')
                time.sleep(60)
                df = self.fundamentals(i, sleep)
            yahoo = pd.concat([yahoo, df])
        fundamentals = pd.merge(ishares, yahoo, how="left", on=["ticker"])
        fundamentals.set_index(['Name', 'Date'], inplace=True)
        fundamentals.to_excel(self.fundamentals_xlsx)
        print("Process finished - you're almost a millionaire. Your file is located in the folder of the source data.")
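# Example session (hypothetical file name): entering 'holdings.xlsx' at the prompt above
# produces 'holdings_cleaned.csv' and 'holdings_fundamentals.xlsx' next to the source file
# once the run below completes.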
if input('Start scraping? y/n ') == 'y':
    IsharesScraper(f'{source}', f'{source}_cleaned.csv', f'{source}_fundamentals.xlsx').run_scrapper()
else:
    print('process ended')
    exit(5)