-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpdf_saver.py
72 lines (48 loc) · 1.65 KB
/
pdf_saver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import requests
from lxml import html
import urllib
import os
# Directory in which the downloaded files are saved.
# HOME is not set on every platform (e.g. Windows), so fall back to the
# home directory resolved by os.path.expanduser when it is missing.
HOME_DIR = os.getenv("HOME") or os.path.expanduser("~")
# os.chdir() returns None, so keep the path itself in file_dir and change
# into it as a separate step.
file_dir = os.path.join(HOME_DIR, "Downloads", "test")
# Create the target directory on first use instead of failing in chdir.
os.makedirs(file_dir, exist_ok=True)
os.chdir(file_dir)

# URL of the page whose linked files are downloaded.
url = "http://www.mobile.ifi.lmu.de/lehrveranstaltungen/bs-ws1819/"
# Absolute PDF links collected from the page at `url`.
links_array = []
# Namespace map that makes the EXSLT regular-expression functions
# (the "re:" prefix) available in XPath queries.
ns = {'re': 'http://exslt.org/regular-expressions'}
def get_file_links():
    '''
    Collect the hyperlinks to all PDF files referenced on the page at ``url``.

    The page is fetched and parsed as HTML. Every ``<a>`` element whose
    ``href`` ends in ".pdf" (case-insensitive) is resolved against ``url``
    and added to the module-level ``links_array`` unless it is already there.

    :return: the module-level ``links_array`` with the absolute PDF links
    :rtype: list
    :raises requests.HTTPError: if the page request returns an error status
    '''
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import urljoin

    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, timeout=30)
    # Fail loudly instead of silently parsing an HTTP error page.
    r.raise_for_status()
    root = html.fromstring(r.content)
    # Raw string: "\." must reach the EXSLT regex engine unchanged, and "\."
    # is an invalid escape sequence in a normal Python string literal.
    pdf_anchors = root.xpath(r'//a[re:test(@href, "\.pdf$", "i")]', namespaces=ns)
    for node in pdf_anchors:
        # hrefs may be relative, so resolve them against the page URL.
        link = urljoin(url, node.attrib['href'])
        # Skip duplicates so that repeated anchors or repeated calls do not
        # cause the same file to be downloaded more than once.
        if link not in links_array:
            links_array.append(link)
    return links_array
# download pdf files
def download_files(links):
    '''
    Download every file in ``links`` into the current working directory.

    Each file is stored under the last path segment of its URL. Progress is
    reported on standard output.

    :type links: list
    :param links: list with pdf hyperlinks
    :raises requests.HTTPError: if a download request returns an error status
    '''
    for link in links:
        # The file name is the last "/"-separated segment of the URL.
        file_name = link.split('/')[-1]
        print("Downloading file: %s" % file_name)
        # stream=True defers fetching the body; using the response as a
        # context manager releases the connection even if writing fails.
        with requests.get(link, stream=True, timeout=30) as r:
            # Check the status before opening the file so an error page is
            # never saved under a .pdf name.
            r.raise_for_status()
            with open(file_name, 'wb') as pdf_file:
                # Write in chunks so large files are not held in memory.
                for chunk in r.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
        print("%s was downloaded!\n" % file_name)
    print("All files were saved in directory:%s" % os.getcwd())
if __name__ == "__main__":
    # Script entry point: gather the PDF links from the page, then fetch
    # each of them. get_file_links() hands back the module-level list it
    # fills, so the result is passed straight on to the downloader.
    download_files(get_file_links())