-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtable-parser.py
354 lines (315 loc) · 12.8 KB
/
table-parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
#!/usr/bin/python3
# coding: UTF-8
# Copyright (c) 2021, Alexander Vankov
# Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted provided that redistributions of source code retain the above copyright notice and this condition.
# THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
# Tested with:
# Python 3.8 or 3.9
# package versions:
# beautifulsoup4 4.9
# requests 2.24
import sys
import re
import time
import datetime
from datetime import datetime
from datetime import timedelta
import os.path
import sqlite3
import requests
from bs4 import BeautifulSoup
# Initial variables:
prog_name = "Table Parser"
prog_ver = "0.9"
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
start_url = "http://127.0.0.1"  # place your address here
myheaders = {"User-Agent": user_agent}
mycookies = {}
username = "your_login_name"  # place your login here
password_file = "authdata.dat"  # store your password in this file
text_to_understand_not_logged_in = "Some text that shows your are not logged in"  # insert your text

# Load auth data:
# The password defaults to an empty string so that `authdata` below can always
# be built, even when the password file is missing or unreadable.
password = ""
try:
    with open(password_file, "r") as f:
        # Only the first line is used; the line terminator is not part of the password.
        password = f.readline().rstrip("\r\n")
except (OSError, UnicodeError) as e:
    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "Loading: unable to get authdata" + "\n" + str(e))

# authdata - login and password, if required to log in.
# Use your browser's developer tools to see the complete set of form fields:
authdata = {"username": username, "password": password}

# Some codes:
req_timeout = 100  # time to wait for a response (requests package)
conn_error = 801  # error code returned by this script for connection problems
force_encd = False  # set this to True if you need the encd value below
encd = 'utf-8'  # web page encoding, only applied to responses if force_encd = True

# Dicts and lists:
# From the local DB:
elements_list_db = []  # List from DB [element ID, element ID, ...]
elements_dict_db = {}  # Dict from DB {element ID: {name: value}}
# From the web page:
elements_list = []  # List from the page [element ID, element ID, ...]
elements_dict = {}  # Dict from the page {element ID: {name: value}}
# New elements found on the page:
new_elements_list = []  # New elements from the page

# Some colors for terminal output (ANSI escape sequences):
BLUE = '\033[1;34m'
GREEN = '\033[1;32m'
RED = '\033[1;31;48m'
END = '\033[0;37m'
# Functions definitions
# Save cookies:
def saveCookies():
    """Write the module-level ``mycookies`` dict to ``cookies.dat``.

    Each cookie is stored on its own line as ``name = value``. Any failure
    is reported with a timestamped message and otherwise ignored.
    """
    try:
        # The context manager closes the file even if a write fails part-way.
        with open("cookies.dat", "w") as cookie_file:
            for name, value in mycookies.items():
                cookie_file.write(name + ' = ' + value + '\n')
    except Exception as e:
        print(curtime(), "saveCookies: unable to save cookies file" + "\n" + str(e))
# Load cookies:
def loadCookies():
    """Replace the contents of ``mycookies`` with the cookies in ``cookies.dat``.

    The file is expected to hold one ``name = value`` pair per line. Lines
    without the ``' = '`` separator are skipped. If the file cannot be read,
    a timestamped message is printed and ``mycookies`` is left empty.
    """
    mycookies.clear()
    try:
        # The context manager closes the file even if reading fails.
        with open("cookies.dat", "r") as cookie_file:
            for line in cookie_file:
                # Strip only the line terminator so that empty values and
                # values ending in spaces survive a save/load round trip.
                name, separator, value = line.rstrip('\r\n').partition(' = ')
                if not separator:
                    continue  # blank or malformed line: nothing to load
                mycookies[name] = value
    except OSError as e:
        print(curtime(), "loadCookies: unable to open cookies file" + "\n" + str(e))
# Check if the user is logged in, and login if not:
def checkLogin():
    """Fetch the start page, logging in first if the session is not authenticated.

    The start page is requested with the stored cookies. If it contains
    ``text_to_understand_not_logged_in``, ``authdata`` is POSTed to
    ``start_url`` in a fresh session; on success the session cookies replace
    ``mycookies`` and are saved to disk.

    Returns:
        str: the text of the page (start page, or the login response).
        int: the HTTP status code if it is >= 400, or ``conn_error`` if the
            request itself failed.
    """
    try:
        r = requests.get(start_url, headers=myheaders, cookies=mycookies, timeout=req_timeout)
    except Exception as e:
        print(curtime(), "checkLogin: connection failure" + "\n" + str(e))
        return conn_error
    if r.status_code >= 400:
        print(curtime(), 'checkLogin: HTTP_Error:', r.status_code)
        return r.status_code

    # Apply the forced encoding before r.text is read for the first time,
    # so the "not logged in" marker is searched in correctly decoded text.
    if force_encd:
        r.encoding = encd
    if text_to_understand_not_logged_in not in r.text:
        return r.text

    print(curtime(), "User is not logged in")
    with requests.Session() as s:
        try:
            r = s.post(start_url, data=authdata, headers=myheaders, timeout=req_timeout)
            # Check the status first: cookies from a failed login must not
            # replace the stored ones, and success must not be reported.
            if r.status_code >= 400:
                print(curtime(), 'checkLogin: HTTP_Error:', r.status_code)
                return r.status_code
            mycookies.clear()
            mycookies.update(requests.utils.dict_from_cookiejar(s.cookies))
            print(curtime(), "User now is logged in")
            saveCookies()
            if force_encd:
                r.encoding = encd
            return r.text
        except Exception as e:
            print(curtime(), "checkLogin: login failure" + "\n" + str(e))
            return conn_error
# Build elements dict and list from the raw html page:
def buildElemsDict(raw_html):
    """Parse the table rows of *raw_html* into ``elements_dict`` / ``elements_list``.

    Every ``<tr>`` that contains ``<td>`` cells is read as one element:
    cell 0 is the integer ID, cells 1-3 are text fields and cell 4 is a
    ``YYYY-MM-DD HH:MM`` timestamp. Rows that have too few cells or whose
    ID/timestamp cannot be parsed are reported and skipped. The ID list is
    left sorted in ascending order.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue  # rows without <td> cells carry no element data
        try:
            element = {
                # below is an integer extraction example:
                'elem_ID': int(cells[0].text.strip()),
                # below are some text data extraction examples:
                'name1': cells[1].text.strip(),
                'name2': cells[2].text.strip(),
                'name3': cells[3].text.strip(),
                # could be more table data cells ...
                # below is a date and time extraction example:
                'date_and_time': datetime.strptime(cells[4].text.strip(), '%Y-%m-%d %H:%M'),
                # could be more table data cells ...
            }
        except (IndexError, ValueError) as e:
            # One malformed row should not abort parsing of the whole table.
            print(curtime(), "buildElemsDict: skipping malformed row" + "\n" + str(e))
            continue
        elem_id = element['elem_ID']
        # Keep the ID list free of duplicates: every listed ID is later
        # inserted into a table whose primary key is this ID.
        if elem_id not in elements_dict:
            elements_list.append(elem_id)
        elements_dict[elem_id] = element
    elements_list.sort()
# Let's suppose each element has additional data on its page:
def addElemsDetails(elemslist):
    """Fetch each element's own page and add its details to ``elements_dict``.

    For every ID in *elemslist* the keys ``'somecontent'``, ``'somelinks'``
    and ``'somedata'`` are set on ``elements_dict[ID]``. A piece of data that
    is missing from the page is stored as an empty string (or an empty result
    list for the links).

    Returns:
        int: the error code from ``getElemPage`` as soon as one page cannot
            be fetched; the remaining elements are then left without details.
        None: when every element was processed.
    """
    # The code of this function could be somewhat different, surely.
    # Example of regex search: non-greedy, and the dot also matches newlines.
    # Compiled once, outside the loop; the group captures the paragraph body.
    elem_data_regex = re.compile(r'<p id="someid">(.*?)</p>', flags=re.DOTALL)
    for elem_ID in elemslist:
        raw_page = getElemPage(elem_ID)
        if isinstance(raw_page, int):
            print(curtime(), 'addElemsDetails: get element failure ' + str(elem_ID))
            return raw_page
        match = elem_data_regex.search(raw_page)
        elem_data = match.group(1).strip() if match is not None else ''
        # Make the soup from the element page:
        tsoup = BeautifulSoup(raw_page, 'html.parser')
        content_tag = tsoup.find('div', id="some_content_id")
        # Update the elements dict with examples of beautifulsoup searches:
        details = elements_dict[elem_ID]
        details['somecontent'] = content_tag.text.strip() if content_tag is not None else ''
        details['somelinks'] = tsoup.find_all('a', class_="some_link_class")
        details['somedata'] = elem_data
    return None
# Get element page:
def getElemPage(elem_ID):
    """Download the page of a single element.

    The URL is ``start_url`` followed by ``/<elem_ID>.html``; adapt it to the
    real site layout. *elem_ID* is converted to a string first.

    Returns:
        str: the page text when the request succeeds.
        int: the HTTP status code if it is >= 400, or ``conn_error`` if the
            request raised an exception.
    """
    elem_ID = str(elem_ID)
    complete_url = f"{start_url}/{elem_ID}.html"
    try:
        response = requests.get(complete_url, headers=myheaders, cookies=mycookies, timeout=req_timeout)
        status = response.status_code
        if status >= 400:
            print(curtime(), 'getElemPage: ' + elem_ID + ' HTTP_Error:', status)
            return status
        if force_encd:
            response.encoding = encd
        return response.text
    except Exception as e:
        print(curtime(), "getElemPage: connection to element failed " + elem_ID + "\n" + str(e))
        return conn_error
# Load files from element page:
def getElemFiles(elem_ID):
    """Placeholder for loading files from an element page.

    Not implemented in this version of the script: the function does
    nothing and returns ``None``.
    """
    return None
# Find new elements which are not in DB:
def findNewElems():
    """Append to ``new_elements_list`` every page element that is not in the DB.

    An element is new when its ID is in ``elements_list`` but not in
    ``elements_list_db``. The result list is extended in place and left
    sorted in ascending order.
    """
    # Build the lookup set once: membership tests against the list would
    # rescan it for every element on the page.
    known_ids = set(elements_list_db)
    new_elements_list.extend(
        elem_ID for elem_ID in elements_list if elem_ID not in known_ids
    )
    new_elements_list.sort()
# Add elements to DB:
def addElemsIntoDB(elemslist):
    """Insert the elements whose IDs are in *elemslist* into the ``elements`` table.

    The values are taken from ``elements_dict``; the timestamp is stored as
    ``YYYY-MM-DD HH:MM`` text and the links are stored as their string
    representation. All rows are inserted with one ``executemany`` call and
    then committed.
    """
    def as_row(elem):
        # Column order must match the table definition.
        return (
            elem['elem_ID'],
            elem['name1'],
            elem['name2'],
            elem['name3'],
            elem['date_and_time'].strftime('%Y-%m-%d %H:%M'),
            elem['somecontent'],
            str(elem['somelinks']),
            elem['somedata'],
        )

    rows = [as_row(elements_dict[key]) for key in elemslist]
    c.executemany('INSERT INTO elements VALUES (?,?,?,?,?,?,?,?)', rows)
    conn.commit()
# Delete elements from DB:
def deleteElemsFromDB(elemslist):
    """Delete the rows whose ``elem_ID`` is in *elemslist* from the ``elements`` table.

    The IDs are used directly as the deletion keys, so elements that are not
    (or no longer) present in ``elements_dict`` can be deleted as well.
    All deletions are committed together.
    """
    # elements_dict is keyed by elem_ID, so the list items already are the
    # values to match; no dictionary lookup is needed.
    ids_to_delete = [(elem_ID,) for elem_ID in elemslist]
    c.executemany('DELETE FROM elements WHERE elem_ID=?', ids_to_delete)
    conn.commit()
# Update somedata value:
def updateDataInBD(elem_ID, performer_fullname):
    """Store the current ``'somedata'`` value of an element in the DB.

    Args:
        elem_ID: ID of the element; it must be a key of ``elements_dict``.
        performer_fullname: not used by this function; kept so that existing
            calls keep working.
    """
    somedata = elements_dict[elem_ID]['somedata']
    # No comma before WHERE: a trailing comma after the last assignment is
    # an SQL syntax error.
    c.execute('UPDATE elements SET somedata=? WHERE elem_ID=?', (somedata, elem_ID))
    conn.commit()
# Analyze element using parsed and extracted data:
def analyzeElem(elem_ID):
    """Placeholder for analysing one element from the parsed and extracted data.

    Currently performs no work and always returns ``None``.
    """
    # Place your code here to do something with the extracted data.
    return None
# Return current time in string human readable form:
def curtime():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    return f"{datetime.now():%Y-%m-%d %H:%M:%S}"
# Main code
# -------------------
# Greeting on launch:
print(prog_name)
print('Version: ' + prog_ver)
# The backslash in the banner is doubled: a lone "\_" is an invalid escape
# sequence in a normal string literal. The printed text is unchanged.
print(GREEN + '''
TABLE PARSER
_--------_
|[0] [0]|
| .. |
\\__====__/
| |
ANALYZER BOT
''' + END)
# Open the monthly log file (named <year>-<month>-log.txt) in append mode:
try:
    logfilename = f"{datetime.now():%Y-%m}-log.txt"
    l = open(logfilename, 'a', encoding=encd)
except Exception as e:
    print(curtime(), RED, "Cannot open log file", END, "\n", str(e))
else:
    # From here on everything printed goes to the log file;
    # comment the next line out when testing.
    sys.stdout = l
# Print current time, uncomment for testing or if necessary:
# print('Bot run at:', curtime())

# Restore the cookies saved by a previous run:
loadCookies()

# Make sure the user is logged in and fetch the start page.
# checkLogin() returns the page text, or an integer error code on failure.
resp = checkLogin()

# Only a successfully fetched page is parsed into the elements dict and list:
if not isinstance(resp, int):
    buildElemsDict(resp)
else:
    print(curtime(), 'Server error while trying to connect:', resp)
# Load the elements table from the DB, creating the DB first if necessary.
# sqlite3.connect() creates the file when it does not exist, and
# CREATE TABLE IF NOT EXISTS covers both a brand-new file and an existing
# file that does not contain the table yet.
conn = sqlite3.connect('tablep.db')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS elements
             (elem_ID INTEGER PRIMARY KEY,
              name1 TEXT,
              name2 TEXT,
              name3 TEXT,
              date_and_time TEXT,
              somecontent TEXT,
              somelinks TEXT,
              somedata TEXT)''')
conn.commit()

c.execute('SELECT * FROM elements ORDER BY elem_ID DESC')
# comment the line above and uncomment below if you need to set SQL limit
# c.execute('SELECT * FROM elements ORDER BY elem_ID DESC LIMIT 1024')
for result in c:
    element = {
        'elem_ID': result[0],
        'name1': result[1],
        'name2': result[2],
        'name3': result[3],
        # The timestamp is stored as 'YYYY-MM-DD HH:MM' text:
        'date_and_time': datetime.strptime(result[4], '%Y-%m-%d %H:%M'),
        'somecontent': result[5],
        # The links are stored as text; parse them back into <a> tags:
        'somelinks': BeautifulSoup(result[6], 'html.parser').find_all('a'),
        'somedata': result[7],
    }
    # Add to list and to dict:
    elements_list_db.append(element['elem_ID'])
    elements_dict_db[element['elem_ID']] = element
elements_list_db.sort()
# Looking for new elements:
findNewElems()

# Elements that are already stored reuse the details saved in the DB
# instead of being downloaded again:
for each_elem in elements_list:
    stored_elem = elements_dict_db.get(each_elem)
    if stored_elem is not None:
        for detail_key in ('somecontent', 'somelinks', 'somedata'):
            elements_dict[each_elem][detail_key] = stored_elem[detail_key]

if new_elements_list:
    # Printing new elements found:
    print(curtime(), 'Found new elements:', new_elements_list)
    # Download the details of the new elements. addElemsDetails() stops at
    # the first page it cannot fetch and returns the error code, leaving the
    # remaining elements without details.
    details_status = addElemsDetails(new_elements_list)
    if details_status is not None:
        print(curtime(), 'Details could not be loaded for all new elements, error code:', details_status)
    # Only elements with all details can be stored; the others stay out of
    # the DB and therefore count as new again on the next run.
    complete_elements = [
        elem_ID for elem_ID in new_elements_list
        if all(detail_key in elements_dict[elem_ID]
               for detail_key in ('somecontent', 'somelinks', 'somedata'))
    ]
    if complete_elements:
        # Updating DB with new elements found:
        addElemsIntoDB(complete_elements)
        print(curtime(), 'Added to DB:', complete_elements)
    # Here is some placeholder code:
    for each_elem in complete_elements:
        some_return_value = analyzeElem(each_elem)
    # You can add some code to send POST requests, etc.
# Close connection to DB, and close log:
conn.close()
# The log handle is bound to the module-level name `l` only if the log file
# was opened successfully, so look it up defensively.
log_file = globals().get('l')
if log_file is not None:
    # Point stdout back at the real stream before closing the log, so that
    # nothing is printed to a closed file afterwards.
    if sys.stdout is log_file:
        sys.stdout = sys.__stdout__
    log_file.close()
# # Print debug info: all elements in the dict:
# # uncomment if required for testing
# print(elements_list)
# print(elements_dict)
# print(elements_list_db)
# print(elements_dict_db)