Skip to content

Commit

Permalink
Fetching - Custom browser on experimental/puppeteer fetcher - Don't s…
Browse files Browse the repository at this point in the history
…witch to custom puppeteer mode if external browser URL is active (#2068)
  • Loading branch information
dgtlmoon authored Jan 1, 2024
1 parent 3d1e102 commit 273bd45
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 16 deletions.
33 changes: 20 additions & 13 deletions changedetectionio/content_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,19 +91,20 @@ def __init__(self, status_code, url, screenshot=None, has_filters=False, html_co


class Fetcher():
browser_connection_is_custom = None
browser_connection_url = None
browser_steps = None
browser_steps_screenshot_path = None
content = None
error = None
fetcher_description = "No description"
browser_connection_url = None
headers = {}
instock_data = None
instock_data_js = ""
status_code = None
webdriver_js_execute_code = None
xpath_data = None
xpath_element_js = ""
instock_data = None
instock_data_js = ""

# Will be needed in the future by the VisualSelector, always get this where possible.
screenshot = False
Expand Down Expand Up @@ -252,16 +253,19 @@ class base_html_playwright(Fetcher):

proxy = None

def __init__(self, proxy_override=None, browser_connection_url=None):
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()

self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')

# .strip('"') is going to save someone a lot of time when they accidentally wrap the env value in quotes
if not browser_connection_url:
self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')
if custom_browser_connection_url:
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url
else:
self.browser_connection_url = browser_connection_url
# Fallback to fetching from system
# .strip('"') is going to save someone a lot of time when they accidentally wrap the env value in quotes
self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')


# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
Expand Down Expand Up @@ -421,8 +425,10 @@ def run(self,
current_include_filters=None,
is_binary=False):


# For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
# browser_connection_is_custom doesn't work with the puppeteer-style fetch (use native playwright in this case too)
if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
# Temporary backup solution until we rewrite the playwright code
return self.run_fetch_browserless_puppeteer(
Expand Down Expand Up @@ -569,15 +575,16 @@ class base_html_webdriver(Fetcher):
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None

def __init__(self, proxy_override=None, browser_connection_url=None):
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

# .strip('"') is going to save someone a lot of time when they accidentally wrap the env value in quotes
if not browser_connection_url:
if not custom_browser_connection_url:
self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
else:
self.browser_connection_url = browser_connection_url
self.browser_connection_is_custom = True
self.browser_connection_url = custom_browser_connection_url

# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
Expand Down Expand Up @@ -674,7 +681,7 @@ def quit(self):
class html_requests(Fetcher):
fetcher_description = "Basic fast Plaintext/HTTP Client"

def __init__(self, proxy_override=None, browser_connection_url=None):
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
self.proxy_override = proxy_override
# browser_connection_url is None because it's always 'launched locally'
Expand Down
6 changes: 3 additions & 3 deletions changedetectionio/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,14 @@ def call_browser(self):

# In the case that the preferred fetcher was a browser config with custom connection URL..
# @todo - on save watch, if it's extra_browser_ then it should be obvious it will use playwright (like if it's requests now..)
browser_connection_url = None
custom_browser_connection_url = None
if prefer_fetch_backend.startswith('extra_browser_'):
(t, key) = prefer_fetch_backend.split('extra_browser_')
connection = list(
filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
if connection:
prefer_fetch_backend = 'base_html_playwright'
browser_connection_url = connection[0].get('browser_connection_url')
custom_browser_connection_url = connection[0].get('browser_connection_url')

# PDF should be html_requests because playwright will serve it up (so far) in a embedded page
# @todo https://github.com/dgtlmoon/changedetection.io/issues/2019
Expand All @@ -74,7 +74,7 @@ def call_browser(self):
# Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
# When custom_browser_connection_url is None, the fetcher should fall back to working out the best defaults itself (OS env vars etc)
self.fetcher = fetcher_obj(proxy_override=proxy_url,
browser_connection_url=browser_connection_url
custom_browser_connection_url=custom_browser_connection_url
)

if self.watch.has_browser_steps:
Expand Down

0 comments on commit 273bd45

Please sign in to comment.