-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwget.py
74 lines (61 loc) · 1.94 KB
/
wget.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/local/bin/python
import argparse
from bs4 import BeautifulSoup
import os
import os.path
from random import randint
import requests
import select
import shutil
import subprocess
import sys
from sys import argv, exit
import time
import urlparse
from urlparse import urlparse
from urlparse import urlsplit
def get_siteFiles(url, wget_path="/usr/local/Cellar/wget/1.14/bin/wget", save_dir=None):
    """Mirror the site at *url* with wget and gather its pages into one folder.

    wget is run as a child process in the current working directory.  The
    directory it creates (named after the URL's host) is renamed to
    ``whole_site``; every ``.php`` / ``.html`` file found under it is then
    copied, flattened, into the destination folder with an ``.html``
    extension.

    Args:
        url: Address of the site to mirror.
        wget_path: Path of the wget executable to run.  Defaults to the
            location the script has always used.
        save_dir: Folder that receives the collected pages.  When ``None``
            the module-level ``save`` name is used, as in earlier versions.

    Returns:
        ``True`` when wget could not be started or its output directory
        could not be renamed, ``False`` otherwise.  Callers treat a truthy
        result as an error.

    Note:
        Files are copied by base name only, so pages with the same name in
        different sub-directories overwrite one another in the destination.
    """
    print(" ---------------------------------- ")
    print("| this function is slow on purpose |")
    print(" ---------------------------------- ")

    if save_dir is None:
        # Backward compatibility: the script entry point publishes the
        # destination through the module-level ``save`` variable.
        save_dir = save
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    host_dir = urlsplit(url).hostname
    if host_dir is None:
        # A URL given without a scheme ("example.com/page") has no network
        # location according to urlsplit; re-parse it as one.
        host_dir = urlsplit("//" + url).hostname
    location = "whole_site"

    command = [
        wget_path,
        "-r",
        "--wait=7",        # together with --random-wait: pause between requests
        "--random-wait",
        "--no-check-certificate",
        "--convert-links",
        "--mirror",
        "--trust-server-names",
        "--adjust-extension",
        url,
    ]

    failed = False
    if not host_dir:
        print("could not determine host name from url: %s" % url)
        failed = True
    else:
        try:
            subprocess.call(command)
            os.rename(host_dir, location)
        except OSError as err:
            # Raised when wget cannot be executed, or when the mirrored
            # directory is missing / cannot be renamed.  Report it and keep
            # going so that an existing "whole_site" tree is still
            # harvested, as before.
            print(err)
            failed = True

    for dirpath, _dirnames, filenames in os.walk(os.path.abspath(location)):
        for filename in filenames:
            root, ext = os.path.splitext(filename)
            if ext in (".php", ".html"):
                print(filename)
                shutil.copyfile(
                    os.path.join(dirpath, filename),
                    os.path.join(save_dir, root + ".html"),
                )
    return failed
if __name__ == "__main__":
    # Command-line entry point: check that the site answers, then mirror it.
    # The process exits with a non-zero status whenever something fails so
    # that calling scripts can detect the failure.
    parser = argparse.ArgumentParser(description="wget ALL the pages")
    parser.add_argument("url", help="url of site")
    args = parser.parse_args()

    try:
        # requests never times out unless told to; bound the probe so an
        # unresponsive host cannot hang the script.
        resp = requests.get(args.url, timeout=30)
    except requests.exceptions.RequestException as err:
        # Connection failures, malformed URLs, TLS errors and timeouts.
        print("Sorry, site not reachable, error occurred.")
        print(err)
        sys.exit(1)
    if resp.status_code >= 400:
        print("Sorry, site not reachable, error occurred.")
        sys.exit(1)

    # get_siteFiles reads this module-level name as its destination folder.
    save = "raw_files"
    if get_siteFiles(args.url):
        print("Error")
        sys.exit(1)
    else:
        print("Done")