-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetchArticle.py
55 lines (48 loc) · 1.55 KB
/
fetchArticle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import requests
from bs4 import BeautifulSoup as bs
import random
from func_timeout import func_set_timeout, FunctionTimedOut
from .utils import parsed_from_url
def getDateTime(parsedpage):
    """Return the article's publication timestamp, or "" when absent.

    Looks up the element carrying the 'published-date' class in the
    parsed page and reads its 'datetime' attribute.
    """
    tag = parsedpage.find(class_='published-date')
    return tag['datetime'] if tag is not None else ""
def getText(parsedpage):
    """Extract the article body text from a parsed page.

    Concatenates the text of every <p> inside the element tagged
    data-element="story-body", with embedded newlines stripped and
    each paragraph followed by a blank line.  Returns "" when the
    story body element is missing.
    """
    body = parsedpage.find(attrs={'data-element': 'story-body'})
    if body is None:
        return ""
    # str.join is linear; the original `text +=` loop is quadratic
    # in the worst case on long articles.
    return ''.join(
        p.getText().replace('\n', "") + '\n\n'
        for p in body.findAll('p')
    )
def getArticleInfo(url):
    """Fetch the page at *url* and extract article metadata.

    Attempts the fetch up to 3 times before giving up.

    Returns a dict with keys 'author' (always "" — author extraction
    is not implemented), 'dateTime', 'text', and 'needs_collection'
    (always False on success).

    Raises RuntimeError when all 3 fetch attempts fail.
    """
    parsed = None
    for _ in range(3):
        try:
            parsed = parsed_from_url(url)
            break
        except Exception:
            # Bug fix: the original never incremented its retry counter,
            # so a persistently failing URL looped forever.  A bounded
            # for-loop makes the retry cap structural.  `except Exception`
            # (not bare `except:`) lets KeyboardInterrupt/SystemExit through.
            print('Function errored. Retrying.')
    else:
        # for/else: reached only when no attempt broke out of the loop.
        print('Function failed after 3 attempts.')
        raise RuntimeError(f'Failed to fetch article after 3 attempts: {url}')
    dateTime = getDateTime(parsed)
    text = getText(parsed)
    return {
        'author': "",
        'dateTime': dateTime,
        'text': text,
        'needs_collection': False
    }
def articleFetch(rowDict: dict) -> dict:
    """Populate *rowDict* in place with info fetched from rowDict['url'].

    On any fetch/parse failure the article fields are filled with empty
    strings and the row is flagged needs_collection=True so it can be
    collected again later.  Returns the mutated rowDict.
    """
    try:
        newinfo = getArticleInfo(rowDict['url'])
    except Exception:
        # `except Exception` (not bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate naturally, replacing the original's
        # explicit re-raise clause.
        print(f"Got an exception. url: {rowDict['url']}")
        newinfo = {'author': "", 'dateTime': "", 'text': "", 'needs_collection': True}  # if it fails, it still must be collected (although perhaps later)
    rowDict.update(newinfo)
    return rowDict