# google-images-download.py
#coding: UTF-8
import time
import sys
import os
# Note: urllib2 only exists on Python 2, so it is imported conditionally
# inside download_page() and the download loop below; importing it at the
# top level would break the script on Python 3.
keywords = ['']   #optional suffix keywords appended to every search term ('' = no suffix)
search_keyword = [
'Paxinus involutus',
'sarcosphaera',
'Inonatus tamaricis',
'Lactarius torminousus',
'Omphalotus olearius',
'Agaricus campestre',
'Agrocybe aegerita',
'Amanita caesa',
'boletus aestivalis',
'boletus pinophilus',
'boletus edulis',
'Calocybe gambosa',
'Cantharellus cibbarius',
'Hidnum repandum o Lengua de vaca',
'Higrophorus limacinus',
'Hygrophorus marzuolus',
'Lactarius deliciosus',
'Marasmius oreades, Senderuela',
'Morchella esculenta',
'Pleurotus erygii',
'Russula virescens',
'Russula ilicis',
'Tricholoma georgil',
'Tuber melanosporum',
'Aleuria auranta',
'Boletus aereus',
'Cantaharellus lutecens',
'Chroogomplus rutilus',
'Coprinus comatus',
'craterellus cornucopioides',
'Hygrophorus russula',
'Lactarius fuliginosus',
'macrolepiota procera',
'Pleorutus ostreatus',
'Russula cyanoxantha',
'Suillus bellini',
'Tricholoma terreum',
'Clitocibe nebularis',
'Giromintra esculenta',
'Phallus impudicus',
'Romaria aurea',
'Suillus granulatus',
'amanita muscaria',
'amanita panteherina',
'amanita phalloides',
'amanita proxima',
'boletus satanás',
'democybe especiossisimus',
'entoloma lividum',
'galerina marginata',
'Lepiota bruneoincarnata'
]
########### End of Editing ###########
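# How the two lists above combine: each entry in search_keyword is paired
# with every suffix in keywords (empty by default) to build one image-search
# URL per pair. For example, 'boletus edulis' with the empty suffix becomes:
#   https://www.google.com/search?q=boletus%20edulis&tbm=isch&...
# (query-string tail shortened here; the full template is in the main loop below)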
#Downloading entire web document (raw page content)
def download_page(url):
    version = (3, 0)
    cur_version = sys.version_info
    if cur_version >= version:  #If the current version of Python is 3.0 or above
        import urllib.request   #urllib library for extracting web pages
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            respData = str(resp.read())
            return respData
        except Exception as e:
            print(str(e))
    else:  #If the current version of Python is 2.x
        import urllib2
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            page = response.read()
            return page
        except Exception:
            return "Page Not found"
#Finding 'Next Image' from the given raw page
def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:  #If no links are found then give an error!
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line+1)
        end_content = s.find(',"ow"', start_content+1)
        content_raw = str(s[start_content+6:end_content-1])
        return content_raw, end_content
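# How the slicing above works: each result in Google's image-search markup
# (as it looked when this script was written; the format changes over time)
# carries a JSON blob such as
#   {"ou":"http://example.com/a.jpg","ow":800, ...}
# start_content points at the opening quote of "ou", so +6 skips past '"ou":"',
# and end_content-1 stops just before the closing quote, leaving the bare URL.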
#Getting all links with the help of '_images_get_next_item'
def _images_get_all_items(page):
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            items.append(item)  #Append all the links to the list named 'items'
            time.sleep(0.1)     #Timer could be used to slow down the requests for image downloads
            page = page[end_content:]
    return items
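# Minimal sanity check (hypothetical input string mirroring the markup sketch
# above; uncomment to try):
#   fake_page = 'rg_di ... "class="rg_meta" {"ou":"http://example.com/a.jpg","ow":800}'
#   print(_images_get_all_items(fake_page))   # -> ['http://example.com/a.jpg']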
############## Main Program ############
t0 = time.time()   #start the timer

#Download image links
i = 0
while i < len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i+1) + " --> Item name = " + str(search_keyword[i])
    print(iteration)
    print("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ', '%20')

    #make a search keyword directory
    try:
        os.makedirs(search_keywords)
    except OSError as e:
        if e.errno != 17:   #17 == EEXIST; anything else is a real error
            raise
        #the directory already exists; sleeping here also spaces out
        #repeated runs so requests are less likely to be throttled
        time.sleep(30)
    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&tbm=isch&tbas=0&source=lnt&sa=X&ved=0ahUKEwjQuY_vx6PYAhVIvxQKHTMrDJgQpwUIHg&biw=1536&bih=734&dpr=1.25'
        raw_html = download_page(url)
        time.sleep(0.1)
        items = items + _images_get_all_items(raw_html)
        j = j + 1
    #print ("Image Links = "+str(items))
    print("Total Image Links = " + str(len(items)))
    print("\n")
    #This writes all the links into a text file created in the same directory
    #as the script. Comment out the three lines below to stop writing the
    #output to the text file.
    info = open('output.txt', 'a')   #Open the text file called output.txt
    info.write(str(i) + ': ' + str(search_keyword[i]) + ": " + str(items) + "\n\n\n")   #Write the links for the current keyword
    info.close()   #Close the file

    t1 = time.time()        #stop the timer
    total_time = t1 - t0    #total time taken so far to crawl and collect the links
    print("Total time taken: " + str(total_time) + " Seconds")
    print("Starting Download...")

    ## To save images to the same directory
    # In this saving process we simply skip the URL if there is any error
    k = 0
    errorCount = 0
    #urllib2 only exists on Python 2; pick the matching Request/urlopen and
    #error classes so the download loop also runs on Python 3
    if sys.version_info >= (3, 0):
        from urllib.request import Request, urlopen
        from urllib.error import URLError, HTTPError
    else:
        from urllib2 import Request, urlopen
        from urllib2 import URLError, HTTPError
    while k < len(items):
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)   #15-second timeout
            output_file = open(search_keywords + "/" + str(k+1) + ".jpg", 'wb')
            data = response.read()
            output_file.write(data)
            output_file.close()
            response.close()
            print("completed ====> " + str(k+1))
            k = k + 1
        except HTTPError as e:   #HTTPError subclasses IOError, so catch it first
            errorCount += 1
            print("HTTPError on image " + str(k+1))
            k = k + 1
        except IOError:          #covers URLError and file errors
            errorCount += 1
            print("IOError on image " + str(k+1))
            k = k + 1
        except Exception as e:
            errorCount += 1
            print("URLError on image " + str(k+1))
            k = k + 1
    i = i + 1

print("\n")
print("Everything downloaded!")
print("\n" + str(errorCount) + " ----> total Errors")