From 6a77750a72368c41084a86730d471d260e26d1fd Mon Sep 17 00:00:00 2001
From: sbmzhcn
Date: Mon, 12 Jan 2015 06:52:11 -0800
Subject: [PATCH 1/3] fix for phantomjs send_keys

---
 GoogleScraper/scraping.py | 4 ++++
 GoogleScraper/selenium.py | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/GoogleScraper/scraping.py b/GoogleScraper/scraping.py
index 672ab03d..7e23253e 100644
--- a/GoogleScraper/scraping.py
+++ b/GoogleScraper/scraping.py
@@ -275,7 +275,11 @@ def blocking_search(self, callback, *args, **kwargs):
                 # Leave search when search engines detected us
                 # add the rest of the keywords as missed one
                 logger.critical(e)
+<<<<<<< HEAD
                 self.missed_keywords.add(self.keywords[i:])
+=======
+                self.missed_keywords.add(self.keywords[i])
+>>>>>>> some fix for http scraping
                 continue
 
     @abc.abstractmethod
diff --git a/GoogleScraper/selenium.py b/GoogleScraper/selenium.py
index 980703fb..33bca0ce 100644
--- a/GoogleScraper/selenium.py
+++ b/GoogleScraper/selenium.py
@@ -364,7 +364,10 @@ def search(self):
             if self.search_input:
                 self.search_input.clear()
                 time.sleep(.25)
-                self.search_input.send_keys(self.current_keyword + Keys.ENTER)
+                self.search_input.send_keys(self.current_keyword)
+                if self.browser_type == 'phantomjs':
+                    time.sleep(1) # Phantomjs are much faster than firefox, chrome
+                self.search_input.send_keys(Keys.ENTER)
                 self.current_request_time = datetime.datetime.utcnow()
             else:
                 logger.warning('Cannot get handle to the input form for keyword {}.'.format(self.current_keyword))

From ec54e7fa3c399372833e66564a1cc2ccd5ba0923 Mon Sep 17 00:00:00 2001
From: sbmzhcn
Date: Mon, 12 Jan 2015 06:58:10 -0800
Subject: [PATCH 2/3] some fix

---
 GoogleScraper/scraping.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/GoogleScraper/scraping.py b/GoogleScraper/scraping.py
index 7e23253e..1c97a985 100644
--- a/GoogleScraper/scraping.py
+++ b/GoogleScraper/scraping.py
@@ -275,11 +275,7 @@ def blocking_search(self, callback, *args, **kwargs):
                 # Leave search when search engines detected us
                 # add the rest of the keywords as missed one
                 logger.critical(e)
-<<<<<<< HEAD
-                self.missed_keywords.add(self.keywords[i:])
-=======
                 self.missed_keywords.add(self.keywords[i])
->>>>>>> some fix for http scraping
                 continue
 
     @abc.abstractmethod

From 3448cc6d1f018281412c3be9202371f8dd7598cc Mon Sep 17 00:00:00 2001
From: sbmzhcn
Date: Mon, 12 Jan 2015 07:11:32 -0800
Subject: [PATCH 3/3] http requests timeout support

---
 GoogleScraper/config.cfg  | 3 +++
 GoogleScraper/core.py     | 5 +++++
 GoogleScraper/http.py     | 3 ++-
 GoogleScraper/scraping.py | 6 +++++-
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/GoogleScraper/config.cfg b/GoogleScraper/config.cfg
index ea217da5..3ced8f11 100644
--- a/GoogleScraper/config.cfg
+++ b/GoogleScraper/config.cfg
@@ -99,6 +99,9 @@ use_own_ip: True
 ; Whether to check proxies before starting the scrape
 check_proxies: True
 
+; Set HTTP requests to stop waiting for a response after a given number of seconds
+timeout: 10
+
 ; Global configuration parameters that apply on all modes.
 [GLOBAL]
 ; The proxy file. If this is a valid file path, each line will represent a proxy.
diff --git a/GoogleScraper/core.py b/GoogleScraper/core.py
index 2ac70ad1..b6c6eef3 100755
--- a/GoogleScraper/core.py
+++ b/GoogleScraper/core.py
@@ -259,6 +259,10 @@ def main(return_results=False, parse_cmd_line=True):
 
     if Config['SCRAPING'].getboolean('use_own_ip'):
         proxies.append(None)
+
+    request_timeout = Config['SCRAPING'].getint('timeout', 10)
+    if request_timeout < 10:
+        request_timeout = 10
 
     if not proxies:
         raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.")
@@ -398,6 +402,7 @@ def main(return_results=False, parse_cmd_line=True):
                         db_lock=db_lock,
                         proxy=proxy_to_use,
                         progress_queue=q,
+                        request_timeout=request_timeout
                     )
                 )
diff --git a/GoogleScraper/http.py b/GoogleScraper/http.py
index dbcb1dd7..88f2036b 100644
--- a/GoogleScraper/http.py
+++ b/GoogleScraper/http.py
@@ -250,7 +250,8 @@ def search(self, *args, rand=False, **kwargs):
         super().detection_prevention_sleep()
         super().keyword_info()
 
-        request = self.requests.get(self.base_search_url + urlencode(self.search_params), headers=self.headers, timeout=5)
+        request = self.requests.get(self.base_search_url + urlencode(self.search_params), headers=self.headers,
+                                    timeout=self.request_timeout)
         self.current_request_time = datetime.datetime.utcnow()
 
         self.html = request.text
diff --git a/GoogleScraper/scraping.py b/GoogleScraper/scraping.py
index 1c97a985..7eda989b 100644
--- a/GoogleScraper/scraping.py
+++ b/GoogleScraper/scraping.py
@@ -136,7 +136,8 @@ class SearchEngineScrape(metaclass=abc.ABCMeta):
     }
 
     def __init__(self, keywords=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
-                 start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
+                 start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None,
+                 request_timeout=10):
         """Instantiate an SearchEngineScrape object.
 
         Args:
@@ -240,6 +241,9 @@ def __init__(self, keywords=None, scraper_search=None, session=None, db_lock=Non
         # the default timeout
        self.timeout = 5
 
+        # http request timeout
+        self.request_timeout = request_timeout
+
     @abc.abstractmethod
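
For reference, a minimal standalone sketch of the behaviour the timeout support in PATCH 3/3 relies on (not part of the patch series; the inline config text and the example URL are illustrative): a configparser section is read with a fallback, clamped to the same 10-second floor used in core.py, and passed to requests.get, which raises requests.exceptions.Timeout when no response arrives in time.

    import configparser

    import requests

    # Illustrative inline config mirroring the [SCRAPING] option added to config.cfg
    config = configparser.ConfigParser()
    config.read_string("[SCRAPING]\ntimeout: 10\n")

    # Read the option with a fallback and apply the same 10 second floor as core.py
    request_timeout = config['SCRAPING'].getint('timeout', 10)
    if request_timeout < 10:
        request_timeout = 10

    try:
        # requests stops waiting and raises Timeout after request_timeout seconds
        response = requests.get('https://www.google.com/search?q=example',
                                timeout=request_timeout)
        print(response.status_code)
    except requests.exceptions.Timeout:
        print('no response within {} seconds'.format(request_timeout))

Note that requests applies a single numeric timeout to both the connect and read phases of a request, so it bounds how long each phase may stall rather than the total duration of the whole transfer.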