Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mergetest #96

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions GoogleScraper/config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,6 @@ check_proxies: True
; response when something fails.
raise_exceptions_while_scraping: False

; The following two options only make sense when search_engine is set to "googleimg"
; do NOT use them unless you are sure what you are going to do
image_type: None
image_size: None

; Global configuration parameters that apply on all modes.
[GLOBAL]
; The proxy file. If this is a valid file path, each line will represent a proxy.
Expand Down
4 changes: 2 additions & 2 deletions GoogleScraper/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,15 +962,15 @@ def get_parser_by_search_engine(search_engine):
Raises:
NoParserForSearchEngineException if no parser could be found for the name.
"""
if search_engine == 'google' or search_engine == 'googleimg':
if search_engine == 'google':
return GoogleParser
elif search_engine == 'yandex':
return YandexParser
elif search_engine == 'bing':
return BingParser
elif search_engine == 'yahoo':
return YahooParser
elif search_engine == 'baidu' or search_engine == 'baiduimg':
elif search_engine == 'baidu':
return BaiduParser
elif search_engine == 'duckduckgo':
return DuckduckgoParser
Expand Down
84 changes: 2 additions & 82 deletions GoogleScraper/selenium_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': '.n',
'ask': '#paging div a.txt3.l_nu',
'blekko': '',
'duckduckgo': '',
'googleimg': '#pnnext',
'baiduimg': '.n',
'duckduckgo': ''
}

input_field_selectors = {
Expand All @@ -76,23 +74,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'duckduckgo': (By.NAME, 'q'),
'ask': (By.NAME, 'q'),
'blekko': (By.NAME, 'q'),
'google': (By.NAME, 'q'),
'googleimg': (By.NAME, 'as_q'),
'baiduimg': (By.NAME, 'word'),
}

param_field_selectors = {
'googleimg': {
'image_type': (By.ID, 'imgtype_input'),
'image_size': (By.ID, 'imgsz_input'),
},
}

search_params = {
'googleimg': {
'image_type': None,
'image_size': None,
},
}

normal_search_locations = {
Expand All @@ -103,7 +84,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': 'http://baidu.com/',
'duckduckgo': 'https://duckduckgo.com/',
'ask': 'http://ask.com/',
'blekko': 'http://blekko.com/',
'blekko': 'http://blekko.com/'
}

image_search_locations = {
Expand All @@ -115,8 +96,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'duckduckgo': None, # duckduckgo doesn't support direct image search
'ask': 'http://www.ask.com/pictures/',
'blekko': None,
'googleimg':'https://www.google.com/advanced_image_search',
'baiduimg': 'http://image.baidu.com/',
}

def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
Expand All @@ -139,8 +118,6 @@ def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):

self.xvfb_display = Config['SELENIUM'].get('xvfb_display', None)

self.search_param_values = self._get_search_param_values()

# get the base search url based on the search engine.
self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, self.scrape_method)
super().instance_creation_info(self.__class__.__name__)
Expand Down Expand Up @@ -325,15 +302,6 @@ def build_search(self):

self.webdriver.get(starting_point)

def _get_search_param_values(self):
    """Collect configured values for this engine's extra search parameters.

    Looks up every parameter key registered for the current search engine
    in ``search_params`` and, when the SCRAPING config section provides a
    truthy value for that key, records it.

    Returns:
        dict: Mapping of parameter key to its configured value; empty when
        the engine has no extra parameters or none are configured.
    """
    values = {}
    # Fall back to an empty tuple so engines without extra params yield {}.
    for key in self.search_params.get(self.search_engine_name, ()):
        configured = Config['SCRAPING'].get(key, None)
        if configured:
            values[key] = configured
    return values

def _get_search_input_field(self):
    """Look up the locator of the search input box for the active engine.

    Returns:
        The locator registered for ``self.search_engine_name`` in
        ``input_field_selectors``.
    """
    engine = self.search_engine_name
    return self.input_field_selectors[engine]

def _get_search_param_fields(self):
    """Return the extra parameter field locators for the active engine.

    Returns:
        dict: Locators keyed by parameter name, or an empty dict when the
        current engine has no extra parameter fields registered.
    """
    # dict.get with a default collapses the original if/else into one lookup.
    return self.param_field_selectors.get(self.search_engine_name, {})

def _wait_until_search_input_field_appears(self, max_wait=5):
"""Waits until the search input field can be located for the current search engine

Expand All @@ -369,26 +331,6 @@ def find_visible_search_input(driver):
logger.error('{}: TimeoutException waiting for search input field: {}'.format(self.name, e))
return False

def _wait_until_search_param_fields_appears(self, max_wait=5):
    """Waits until the search input field contains the query.

    Polls (via WebDriverWait) until every extra search-parameter field
    registered for the current engine can be located in the page.

    Args:
        max_wait: How long to wait maximally before returning False.

    Returns:
        The truthy result of the wait on success, False on timeout.
    """
    def find_visible_search_param(driver):
        # NOTE(review): when there are no param fields the loop body never
        # runs and this returns True immediately — presumably intentional,
        # since callers only invoke this when param fields exist; confirm.
        for param, field in self._get_search_param_fields().items():
            # NOTE(review): Selenium's find_element typically raises
            # NoSuchElementException instead of returning None, so this
            # falsy check may be dead and a missing element would propagate
            # out of the wait predicate — TODO confirm intended behavior.
            input_field = driver.find_element(*field)
            if not input_field:
                return False
        return True

    try:
        # WebDriverWait re-invokes the predicate until it returns truthy
        # or max_wait seconds elapse.
        fields = WebDriverWait(self.webdriver, max_wait).until(find_visible_search_param)
        return fields
    except TimeoutException as e:
        # Only timeouts are swallowed; other exceptions bubble up.
        logger.error('{}: TimeoutException waiting for search param field: {}'.format(self.name, e))
        return False

def _goto_next_page(self):
"""Click the next page element.
"""
Expand Down Expand Up @@ -492,28 +434,6 @@ def search(self):
self.search_input.clear()
time.sleep(.25)

self.search_param_fields = self._get_search_param_fields()

if self.search_param_fields:
wait_res = self._wait_until_search_param_fields_appears()
if wait_res is False:
raise Exception('Waiting search param input fields time exceeds')
for param, field in self.search_param_fields.items():
if field[0] == By.ID:
js_tpl = '''
var field = document.getElementById("%s");
field.setAttribute("value", "%s");
'''
elif field[0] == By.NAME:
js_tpl = '''
var fields = document.getElementsByName("%s");
for (var f in fields) {
f.setAttribute("value", "%s");
}
'''
js_str = js_tpl % (field[1], self.search_param_values[param])
self.webdriver.execute_script(js_str)

try:
self.search_input.send_keys(self.query + Keys.ENTER)
except ElementNotVisibleException:
Expand Down