From b9bcd0a520e14da8d327a80df2c07a8cb92f5d63 Mon Sep 17 00:00:00 2001 From: jffifa Date: Fri, 3 Apr 2015 21:45:20 +0800 Subject: [PATCH] Revert "modified" This reverts commit fd0b920c9c8758ef54b22d2084928ac5de53114a. --- GoogleScraper/config.cfg | 5 --- GoogleScraper/parsing.py | 4 +- GoogleScraper/selenium_mode.py | 78 +--------------------------------- 3 files changed, 4 insertions(+), 83 deletions(-) diff --git a/GoogleScraper/config.cfg b/GoogleScraper/config.cfg index d711a517..840f43dd 100644 --- a/GoogleScraper/config.cfg +++ b/GoogleScraper/config.cfg @@ -106,11 +106,6 @@ check_proxies: True ; response when something fails. raise_exceptions_while_scraping: False -; The following two options only make sense when search_engine is set to "googleimg" -; do NOT use them unless you are sure what you are goint to do -image_type: None -image_size: None - ; Global configuration parameters that apply on all modes. [GLOBAL] ; The proxy file. If this is a valid file path, each line will represent a proxy. diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index bfa2a2b9..58e62f59 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -953,7 +953,7 @@ def get_parser_by_search_engine(search_engine): Raises: NoParserForSearchEngineException if no parser could be found for the name. """ - if search_engine == 'google' or search_engine == 'googleimg': + if search_engine == 'google': return GoogleParser elif search_engine == 'yandex': return YandexParser @@ -961,7 +961,7 @@ def get_parser_by_search_engine(search_engine): return BingParser elif search_engine == 'yahoo': return YahooParser - elif search_engine == 'baidu' or search_engine == 'baiduimg': + elif search_engine == 'baidu': return BaiduParser elif search_engine == 'duckduckgo': return DuckduckgoParser diff --git a/GoogleScraper/selenium_mode.py b/GoogleScraper/selenium_mode.py index 1180d851..b0a358b3 100644 --- a/GoogleScraper/selenium_mode.py +++ b/GoogleScraper/selenium_mode.py @@ -62,9 +62,7 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'baidu': '.n', 'ask': '#paging div a.txt3.l_nu', 'blekko': '', - 'duckduckgo': '', - 'googleimg': '#pnnext', - 'baiduimg': '.n', + 'duckduckgo': '' } input_field_selectors = { @@ -76,23 +74,6 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'duckduckgo': (By.NAME, 'q'), 'ask': (By.NAME, 'q'), 'blekko': (By.NAME, 'q'), - 'google': (By.NAME, 'q'), - 'googleimg': (By.NAME, 'as_q'), - 'baiduimg': (By.NAME, 'word'), - } - - param_field_selectors = { - 'googleimg': { - 'image_type': (By.ID, 'imgtype_input'), - 'image_size': (By.ID, 'imgsz_input'), - }, - } - - search_params = { - 'googleimg': { - 'image_type': None, - 'image_size': None, - }, } normal_search_locations = { @@ -103,7 +84,7 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'baidu': 'http://baidu.com/', 'duckduckgo': 'https://duckduckgo.com/', 'ask': 'http://ask.com/', - 'blekko': 'http://blekko.com/', + 'blekko': 'http://blekko.com/' } image_search_locations = { @@ -115,8 +96,6 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'duckduckgo': None, # duckduckgo doesnt't support direct image search 'ask': 'http://www.ask.com/pictures/', 'blekko': None, - 'googleimg':'https://www.google.com/advanced_image_search', - 'baiduimg': 'http://image.baidu.com/', } def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs): @@ -139,8 +118,6 @@ def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs): self.xvfb_display = Config['SELENIUM'].get('xvfb_display', None) - self.search_param_values = self._get_search_param_values() - # get the base search url based on the search engine. self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, self.scrape_method) super().instance_creation_info(self.__class__.__name__) @@ -323,15 +300,6 @@ def build_search(self): self.webdriver.get(self.starting_point) - def _get_search_param_values(self): - search_param_values = {} - if self.search_engine_name in self.search_params: - for param_key in self.search_params[self.search_engine_name]: - cfg = Config['SCRAPING'].get(param_key, None) - if cfg: - search_param_values[param_key] = cfg - return search_param_values - def _get_search_input_field(self): """Get the search input field for the current search_engine. @@ -340,12 +308,6 @@ def _get_search_input_field(self): """ return self.input_field_selectors[self.search_engine_name] - def _get_search_param_fields(self): - if self.search_engine_name in self.param_field_selectors: - return self.param_field_selectors[self.search_engine_name] - else: - return {} - def _wait_until_search_input_field_appears(self, max_wait=5): """Waits until the search input field can be located for the current search engine @@ -367,20 +329,6 @@ def find_visible_search_input(driver): logger.error('{}: TimeoutException waiting for search input field: {}'.format(self.name, e)) return False - def _wait_until_search_param_fields_appears(self, max_wait=5): - def find_visible_search_param(driver): - for param, field in self._get_search_param_fields().items(): - input_field = driver.find_element(*field) - if not input_field: - return False - return True - - try: - fields = WebDriverWait(self.webdriver, max_wait).until(find_visible_search_param) - return fields - except TimeoutException as e: - logger.error('{}: TimeoutException waiting for search param field: {}'.format(self.name, e)) - return False def _wait_until_search_input_field_contains_query(self, max_wait=5): """Waits until the search input field contains the query. @@ -499,28 +447,6 @@ def search(self): self.search_input.clear() time.sleep(.25) - self.search_param_fields = self._get_search_param_fields() - - if self.search_param_fields: - wait_res = self._wait_until_search_param_fields_appears() - if wait_res is False: - raise Exception('Waiting search param input fields time exceeds') - for param, field in self.search_param_fields.items(): - if field[0] == By.ID: - js_tpl = ''' - var field = document.getElementById("%s"); - field.setAttribute("value", "%s"); - ''' - elif field[0] == By.NAME: - js_tpl = ''' - var fields = document.getElementsByName("%s"); - for (var f in fields) { - f.setAttribute("value", "%s"); - } - ''' - js_str = js_tpl % (field[1], self.search_param_values[param]) - self.webdriver.execute_script(js_str) - try: self.search_input.send_keys(self.query + Keys.ENTER) except ElementNotVisibleException as e: