diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index 959b4c65750..e505da4d3d2 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -292,8 +292,8 @@ jobs: - name: Specific tests in built container for Selenium run: | - docker run --rm -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py' - + docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py' + docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py' # SMTP tests smtp-tests: diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 8a0cc88626a..742c05346bf 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -10,7 +10,6 @@ from loguru import logger import getopt import logging -import os import platform import signal import threading diff --git a/changedetectionio/api/Import.py b/changedetectionio/api/Import.py index 9b0cec4e45e..a27d06219a8 100644 --- a/changedetectionio/api/Import.py +++ b/changedetectionio/api/Import.py @@ -154,11 +154,10 @@ def post(self): if extras['processor'] not in available: return f"Invalid processor '{extras['processor']}'. 
Available processors: {', '.join(available)}", 400 - # Validate fetch_backend if provided + # Validate fetch_backend if provided (legacy API compat — still accepted, stored as-is) if 'fetch_backend' in extras: from changedetectionio.content_fetchers import available_fetchers available = [f[0] for f in available_fetchers()] - # Also allow 'system' and extra_browser_* patterns is_valid = ( extras['fetch_backend'] == 'system' or extras['fetch_backend'] in available or @@ -167,6 +166,14 @@ def post(self): if not is_valid: return f"Invalid fetch_backend '{extras['fetch_backend']}'. Available: system, {', '.join(available)}", 400 + # Validate browser_profile if provided + if 'browser_profile' in extras: + from changedetectionio.model.browser_profile import get_builtin_profiles, RESERVED_MACHINE_NAMES + store_profiles = self.datastore.data['settings']['application'].get('browser_profiles', {}) + known = set(get_builtin_profiles().keys()) | set(store_profiles.keys()) | {'system', None} + if extras['browser_profile'] not in known: + return f"Invalid browser_profile '{extras['browser_profile']}'. 
Available: {', '.join(str(k) for k in known)}", 400 + # Validate notification_urls if provided if 'notification_urls' in extras: from wtforms import ValidationError diff --git a/changedetectionio/api/Tags.py b/changedetectionio/api/Tags.py index 1142cd9d606..0e3e6e0542a 100644 --- a/changedetectionio/api/Tags.py +++ b/changedetectionio/api/Tags.py @@ -85,6 +85,9 @@ def queue_watches_background(): # Create clean tag dict without Watch-specific fields clean_tag = {k: v for k, v in tag.items() if k not in watch_only_fields} + # fetch_backend is a legacy field superseded by browser_profile — omit from API response + clean_tag.pop('fetch_backend', None) + return clean_tag @auth.check_token diff --git a/changedetectionio/api/Watch.py b/changedetectionio/api/Watch.py index 18d1d30e099..4c1df01cdf6 100644 --- a/changedetectionio/api/Watch.py +++ b/changedetectionio/api/Watch.py @@ -105,6 +105,9 @@ def get(self, uuid): watch['viewed'] = watch_obj.viewed watch['link'] = watch_obj.link, + # fetch_backend is a legacy field superseded by browser_profile — omit from API response + watch.pop('fetch_backend', None) + return watch @auth.check_token diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py index 92289ec2a47..431beb000e7 100644 --- a/changedetectionio/blueprint/browser_steps/__init__.py +++ b/changedetectionio/blueprint/browser_steps/__init__.py @@ -208,28 +208,23 @@ async def start_browsersteps_session(watch_uuid): browsersteps_start_session = {'start_time': time.time()} # Build proxy dict first — needed by both the CDP path and fetcher-specific launchers - proxy_id = datastore.get_preferred_proxy_for_watch(uuid=watch_uuid) + proxy_url = datastore.get_proxy_url_for_watch(uuid=watch_uuid) proxy = None - if proxy_id: - proxy_url = datastore.proxy_list.get(proxy_id, {}).get('url') - if proxy_url: - from urllib.parse import urlparse - parsed = urlparse(proxy_url) - proxy = {'server': proxy_url} - if 
parsed.username: - proxy['username'] = parsed.username - if parsed.password: - proxy['password'] = parsed.password - logger.debug(f"Browser Steps: UUID {watch_uuid} selected proxy {proxy_url}") + if proxy_url: + from urllib.parse import urlparse + parsed = urlparse(proxy_url) + proxy = {'server': proxy_url} + if parsed.username: + proxy['username'] = parsed.username + if parsed.password: + proxy['password'] = parsed.password + logger.debug(f"Browser Steps: UUID {watch_uuid} selected proxy {proxy_url}") # Resolve the fetcher class for this watch so we can ask it to launch its own browser # if it supports that (e.g. CloakBrowser, which runs locally rather than via CDP) watch = datastore.data['watching'][watch_uuid] from changedetectionio import content_fetchers - fetcher_name = watch.get_fetch_backend or 'system' - if fetcher_name == 'system': - fetcher_name = datastore.data['settings']['application'].get('fetch_backend', 'html_requests') - fetcher_class = getattr(content_fetchers, fetcher_name, None) + fetcher_class = content_fetchers.get_fetcher(watch.effective_browser_profile.fetch_backend) browser = None playwright_context = None @@ -241,7 +236,7 @@ async def start_browsersteps_session(watch_uuid): result = await fetcher_class.get_browsersteps_browser(proxy=proxy, keepalive_ms=keepalive_ms) if result is not None: browser, playwright_context = result - logger.debug(f"Browser Steps: using fetcher-specific browser for '{fetcher_name}'") + logger.debug(f"Browser Steps: using fetcher-specific browser for '{fetcher_class.__name__}'") # Default: connect to the remote Playwright/sockpuppetbrowser via CDP if browser is None: diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py index 2a07222f035..24b2e34ce5f 100644 --- a/changedetectionio/blueprint/check_proxies/__init__.py +++ b/changedetectionio/blueprint/check_proxies/__init__.py @@ -46,7 +46,8 @@ def long_task(uuid, preferred_proxy): watch_uuid=uuid 
) - asyncio.run(update_handler.call_browser(preferred_proxy_id=preferred_proxy)) + update_handler.preferred_proxy_override = preferred_proxy + asyncio.run(update_handler.call_browser()) # title, size is len contents not len xfer except content_fetcher_exceptions.Non200ErrorCodeReceived as e: if e.status_code == 404: diff --git a/changedetectionio/blueprint/imports/importer.py b/changedetectionio/blueprint/imports/importer.py index 663a5c42804..fd9abacb3dc 100644 --- a/changedetectionio/blueprint/imports/importer.py +++ b/changedetectionio/blueprint/imports/importer.py @@ -175,9 +175,9 @@ def run(self, dynamic_wachet = str(data.get('dynamic wachet', '')).strip().lower() # Convert bool to str to cover all cases # libreoffice and others can have it as =FALSE() =TRUE(), or bool(true) if 'true' in dynamic_wachet or dynamic_wachet == '1': - extras['fetch_backend'] = 'html_webdriver' + extras['browser_profile'] = 'browser_chromeplaywright' elif 'false' in dynamic_wachet or dynamic_wachet == '0': - extras['fetch_backend'] = 'html_requests' + extras['browser_profile'] = 'direct_http_requests' if data.get('xpath'): # @todo split by || ? 
diff --git a/changedetectionio/blueprint/settings/__init__.py b/changedetectionio/blueprint/settings/__init__.py index 53389213738..655bba03743 100644 --- a/changedetectionio/blueprint/settings/__init__.py +++ b/changedetectionio/blueprint/settings/__init__.py @@ -15,6 +15,9 @@ def construct_blueprint(datastore: ChangeDetectionStore): settings_blueprint = Blueprint('settings', __name__, template_folder="templates") + from changedetectionio.blueprint.settings.browser_profile import construct_blueprint as construct_browser_profile_blueprint + settings_blueprint.register_blueprint(construct_browser_profile_blueprint(datastore), url_prefix='/browsers') + @settings_blueprint.route("", methods=['GET', "POST"]) @login_optionally_required def settings_page(): diff --git a/changedetectionio/blueprint/settings/browser_profile/__init__.py b/changedetectionio/blueprint/settings/browser_profile/__init__.py new file mode 100644 index 00000000000..028e08b216e --- /dev/null +++ b/changedetectionio/blueprint/settings/browser_profile/__init__.py @@ -0,0 +1,200 @@ +import flask_login +from flask import Blueprint, render_template, request, redirect, url_for, flash +from flask_babel import gettext + +from changedetectionio.store import ChangeDetectionStore +from changedetectionio.auth_decorator import login_optionally_required + + +def construct_blueprint(datastore: ChangeDetectionStore): + settings_browser_profile_blueprint = Blueprint( + 'settings_browsers', + __name__, + template_folder="templates" + ) + + def _render_index(browser_profile_form=None, editing_machine_name=None): + from changedetectionio import forms + from changedetectionio import content_fetchers as cf + from changedetectionio.model.browser_profile import BrowserProfile, RESERVED_MACHINE_NAMES + + # Only browser-capable fetchers are valid profile types + fetcher_choices = cf.available_browser_fetchers() + if browser_profile_form is None: + browser_profile_form = forms.BrowserProfileForm() + 
browser_profile_form.fetch_backend.choices = fetcher_choices + + fetcher_supports_screenshots = {name: True for name, _ in fetcher_choices} + fetcher_requires_connection_url = {name: True for name, cls in cf.FETCHERS.items() + if getattr(cls, 'requires_connection_url', False)} + + # Table shows default built-in profiles first, then user-created profiles + store_profiles = datastore.data['settings']['application'].get('browser_profiles', {}) + user_profiles = dict(cf.DEFAULT_BROWSER_PROFILES) + for machine_name, raw in store_profiles.items(): + try: + user_profiles[machine_name] = BrowserProfile(**raw) if isinstance(raw, dict) else raw + except Exception: + pass + + current_default = datastore.data['settings']['application'].get('browser_profile') or 'direct_http_requests' + + return render_template( + "browser_profiles.html", + browser_profiles=user_profiles, + browser_profile_form=browser_profile_form, + reserved_browser_profile_names=RESERVED_MACHINE_NAMES, + fetcher_choices=fetcher_choices, + fetcher_supports_screenshots=fetcher_supports_screenshots, + fetcher_requires_connection_url=fetcher_requires_connection_url, + current_default_profile=current_default, + editing_machine_name=editing_machine_name, + ) + + @settings_browser_profile_blueprint.route("", methods=['GET']) + @login_optionally_required + def index(): + return _render_index() + + @settings_browser_profile_blueprint.route("//edit", methods=['GET']) + @login_optionally_required + def edit(machine_name): + from changedetectionio import forms + from changedetectionio.model.browser_profile import BrowserProfile, RESERVED_MACHINE_NAMES + + if machine_name in RESERVED_MACHINE_NAMES: + flash(gettext("Built-in browser profiles cannot be edited."), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + store_profiles = datastore.data['settings']['application'].get('browser_profiles', {}) + raw = store_profiles.get(machine_name) + if raw is None: + flash(gettext("Browser profile not 
found."), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + profile = BrowserProfile(**raw) if isinstance(raw, dict) else raw + form = forms.BrowserProfileForm(data=profile.model_dump()) + return _render_index(browser_profile_form=form, editing_machine_name=machine_name) + + @settings_browser_profile_blueprint.route("/save", methods=['POST']) + @login_optionally_required + def save(): + from changedetectionio import forms + from changedetectionio import content_fetchers as cf + from changedetectionio.model.browser_profile import BrowserProfile, RESERVED_MACHINE_NAMES + + fetcher_choices = [(name, desc) for name, desc in cf.available_fetchers()] + browser_profile_form = forms.BrowserProfileForm(formdata=request.form) + browser_profile_form.fetch_backend.choices = fetcher_choices + + if not browser_profile_form.validate(): + flash(gettext("Browser profile error: {}").format( + '; '.join(str(e) for errs in browser_profile_form.errors.values() for e in errs) + ), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + name = browser_profile_form.name.data.strip() + machine_name = BrowserProfile.machine_name_from_str(name) + + if machine_name in RESERVED_MACHINE_NAMES: + flash(gettext("Cannot use reserved profile name '{}'. 
Please choose a different name.").format(name), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + original_machine_name = request.form.get('original_machine_name', '').strip() + store_profiles = datastore.data['settings']['application'].setdefault('browser_profiles', {}) + + if machine_name != original_machine_name and machine_name in store_profiles: + flash(gettext("A browser profile named '{}' already exists.").format(name), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + profile_data = { + 'name': name, + 'fetch_backend': browser_profile_form.fetch_backend.data, + 'browser_connection_url': browser_profile_form.browser_connection_url.data or None, + 'viewport_width': browser_profile_form.viewport_width.data or 1280, + 'viewport_height': browser_profile_form.viewport_height.data or 1000, + 'block_images': bool(browser_profile_form.block_images.data), + 'block_fonts': bool(browser_profile_form.block_fonts.data), + 'ignore_https_errors': bool(browser_profile_form.ignore_https_errors.data), + 'user_agent': browser_profile_form.user_agent.data or None, + 'locale': browser_profile_form.locale.data or None, + 'custom_headers': browser_profile_form.custom_headers.data or '', + 'is_builtin': False, + } + + try: + BrowserProfile(**profile_data) + except Exception as e: + flash(gettext("Browser profile validation error: {}").format(str(e)), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + # Handle rename: remove old key, cascade-update watches and tags + if original_machine_name and original_machine_name != machine_name and original_machine_name in store_profiles: + del store_profiles[original_machine_name] + for watch in datastore.data['watching'].values(): + if watch.get('browser_profile') == original_machine_name: + watch['browser_profile'] = machine_name + for tag in datastore.data.get('settings', {}).get('application', {}).get('tags', {}).values(): + if tag.get('browser_profile') == 
original_machine_name: + tag['browser_profile'] = machine_name + + store_profiles[machine_name] = profile_data + datastore.commit() + flash(gettext("Browser profile '{}' saved.").format(name), 'notice') + return redirect(url_for('settings.settings_browsers.index')) + + @settings_browser_profile_blueprint.route("//delete", methods=['GET']) + @login_optionally_required + def delete(machine_name): + from changedetectionio.model.browser_profile import RESERVED_MACHINE_NAMES + + if machine_name in RESERVED_MACHINE_NAMES: + flash(gettext("Built-in browser profiles cannot be deleted."), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + store_profiles = datastore.data['settings']['application'].get('browser_profiles', {}) + if machine_name not in store_profiles: + flash(gettext("Browser profile not found."), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + raw = store_profiles[machine_name] + profile_name = raw.get('name', machine_name) if isinstance(raw, dict) else machine_name + + for watch in datastore.data['watching'].values(): + if watch.get('browser_profile') == machine_name: + watch['browser_profile'] = None + + for tag in datastore.data.get('settings', {}).get('application', {}).get('tags', {}).values(): + if tag.get('browser_profile') == machine_name: + tag['browser_profile'] = None + + if datastore.data['settings']['application'].get('browser_profile') == machine_name: + datastore.data['settings']['application']['browser_profile'] = None + + del store_profiles[machine_name] + datastore.commit() + flash(gettext("Browser profile '{}' deleted.").format(profile_name), 'notice') + return redirect(url_for('settings.settings_browsers.index')) + + @settings_browser_profile_blueprint.route("/set-default", methods=['POST']) + @login_optionally_required + def set_default(): + from changedetectionio import content_fetchers as cf + + machine_name = request.form.get('machine_name', '').strip() + if not machine_name: + 
flash(gettext("No profile specified."), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + from changedetectionio.model.browser_profile import get_profile + store_profiles = datastore.data['settings']['application'].get('browser_profiles', {}) + if get_profile(machine_name, store_profiles) is None: + flash(gettext("Unknown browser profile '{}'.").format(machine_name), 'error') + return redirect(url_for('settings.settings_browsers.index')) + + datastore.data['settings']['application']['browser_profile'] = machine_name + datastore.commit() + flash(gettext("Default browser profile set to '{}'.").format(machine_name), 'notice') + return redirect(url_for('settings.settings_browsers.index')) + + return settings_browser_profile_blueprint diff --git a/changedetectionio/blueprint/settings/browser_profile/templates/browser_profiles.html b/changedetectionio/blueprint/settings/browser_profile/templates/browser_profiles.html new file mode 100644 index 00000000000..d62c4080a10 --- /dev/null +++ b/changedetectionio/blueprint/settings/browser_profile/templates/browser_profiles.html @@ -0,0 +1,163 @@ +{% extends 'base.html' %} +{% block content %} +{% from '_helpers.html' import render_field, render_checkbox_field, render_button %} + +
+
+

{{ _('Browser Profiles') }}

+

{{ _('Create named profiles to configure browser settings — viewport size, connection URL, image/font blocking, and more. Each profile is based on an available browser type.') }}

+ +
+ + +
+ {% if browser_profiles %} + + + + + + + + + + + + + + {% for machine_name, profile in browser_profiles.items() %} + + + + + + + + + + {% endfor %} + +
{{ _('Default') }}{{ _('Name') }}{{ _('Type') }}{{ _('Viewport') }}{{ _('Options') }}
+ + {{ profile.name }}{{ profile.fetch_backend }}{{ profile.get_fetcher_class_name()|fetcher_status_icons }}{{ profile.viewport_width }}×{{ profile.viewport_height }} + {% if profile.block_images %}{{ _('No images') }}
{% endif %} + {% if profile.block_fonts %}{{ _('No fonts') }}
{% endif %} + {% if profile.ignore_https_errors %}{{ _('Ignore TLS') }}
{% endif %} + {% if profile.browser_connection_url %}{{ _('Custom URL') }}{% endif %} +
+ {% if not profile.is_builtin %} + {{ _('Edit') }} + {{ _('Delete') }} + {% endif %} +
+ {% else %} +

{{ _('No browser profiles configured yet. Add one below.') }}

+ {% endif %} + +
+

{{ _('Edit browser profile') if editing_machine_name else _('Add new browser profile') }}

+ {% if not editing_machine_name %} +

{{ _('Choose a browser type, give it a name, and configure its settings. You can create multiple profiles from the same type with different connection URLs or options.') }}

+ {% endif %} +
+ + +
+
+ {{ render_field(browser_profile_form.name) }} +
+
+ {{ render_field(browser_profile_form.fetch_backend, id="profile-fetch-backend") }} +
+
+ {{ render_field(browser_profile_form.browser_connection_url) }} + {{ _('Optional — override the system CDP/WebSocket URL for this profile only (e.g.') }} ws://my-chrome:3000). +
+
+
{{ render_field(browser_profile_form.viewport_width) }}
+
{{ render_field(browser_profile_form.viewport_height) }}
+
+
+ {{ render_checkbox_field(browser_profile_form.block_images) }} + {{ _('Block image downloads — speeds up loads on image-heavy pages.') }} +
+
+ {{ render_checkbox_field(browser_profile_form.block_fonts) }} + {{ _('Block web font downloads.') }} +
+
+ {{ render_checkbox_field(browser_profile_form.ignore_https_errors) }} + {{ _('Ignore TLS/HTTPS certificate errors (useful for self-signed certs on staging sites).') }} +
+
+ {{ render_field(browser_profile_form.user_agent) }} + {{ _("Leave blank to use the fetcher's default User-Agent.") }} +
+
+ {{ render_field(browser_profile_form.locale) }} + {{ _('Sets Accept-Language and navigator.language (e.g. en-US, de-DE).') }} +
+
+ {{ render_field(browser_profile_form.custom_headers) }} + {{ _('Extra HTTP headers for all requests using this profile (one per line, Key: Value). Applied before per-watch headers.') }} +
+
+ + {% if editing_machine_name %} + {{ _('Cancel') }} + {% endif %} + {{ _('Back to Settings') }} +
+
+
+
+
+
+ + +{% endblock %} diff --git a/changedetectionio/blueprint/settings/templates/settings.html b/changedetectionio/blueprint/settings/templates/settings.html index 81a28754981..6839355c349 100644 --- a/changedetectionio/blueprint/settings/templates/settings.html +++ b/changedetectionio/blueprint/settings/templates/settings.html @@ -28,6 +28,7 @@
  • {{ _('Backups') }}
  • {{ _('Time & Date') }}
  • {{ _('CAPTCHA & Proxies') }}
  • +
  • {{ _('Browsers') }}
  • {% if plugin_tabs %} {% for tab in plugin_tabs %}
  • {{ tab.tab_label }}
  • @@ -115,14 +116,7 @@
    -
    - {{ render_field(form.application.form.fetch_backend, class="fetch-backend") }} - -

    {{ _('Use the') }} {{ _('Basic') }} {{ _('method (default) where your watched sites don\'t need Javascript to render.') }}

    -

    {{ _('The') }} {{ _('Chrome/Javascript') }} {{ _('method requires a network connection to a running WebDriver+Chrome server, set by the ENV var') }} 'WEBDRIVER_URL'.

    -
    -
    -
    +
    {{ _('If you\'re having trouble waiting for the page to be fully rendered (text missing etc), try increasing the \'wait\' time here.') }}
    @@ -146,13 +140,6 @@ {{ render_field(form.requests.form.timeout) }} {{ _('For regular plain requests (not chrome based), maximum number of seconds until timeout, 1-999.') }}
    -
    - {{ render_field(form.requests.form.default_ua) }} - - {{ _('Applied to all requests.') }}

    - {{ _('Note: Simply changing the User-Agent often does not defeat anti-robot technologies, it\'s important to consider') }} {{ _('all of the ways that the browser is detected') }}. -
    -

    {{ _('Tip:') }} {{ _('Connect using Bright Data proxies, find out more here.') }} diff --git a/changedetectionio/blueprint/ui/edit.py b/changedetectionio/blueprint/ui/edit.py index 26f43d238da..e474e1a755e 100644 --- a/changedetectionio/blueprint/ui/edit.py +++ b/changedetectionio/blueprint/ui/edit.py @@ -67,6 +67,10 @@ def edit_page(uuid): default['proxy'] = '' # proxy_override set to the json/text list of the items + # browser_profile: None means "use system default" — map to 'system' so the radio pre-selects correctly + if not default.get('browser_profile'): + default['browser_profile'] = 'system' + # Does it use some custom form? does one exist? processor_name = datastore.data['watching'][uuid].get('processor', '') processor_classes = next((tpl for tpl in processors.find_processors() if tpl[1] == processor_name), None) @@ -139,10 +143,37 @@ def edit_page(uuid): except Exception as e: logger.warning(f"Failed to load processor config: {e}") - for p in datastore.extra_browsers: - form.fetch_backend.choices.append(p) - - form.fetch_backend.choices.append(("system", 'System settings default')) + from changedetectionio.model.browser_profile import BrowserProfile + from changedetectionio import content_fetchers as cf + store_profiles = datastore.data['settings']['application'].get('browser_profiles', {}) + + # Resolve the name of the system-level default profile for the label + from changedetectionio.model.browser_profile import get_profile + _system_default_machine_name = datastore.data['settings']['application'].get('browser_profile') or 'direct_http_requests' + _all_store_profiles = datastore.data['settings']['application'].get('browser_profiles', {}) + _default_profile = get_profile(_system_default_machine_name, _all_store_profiles) + if _default_profile: + _system_label = gettext('System settings default') + ' \u2013 ' + _default_profile.name + else: + _system_label = gettext('System settings default') + + # Choices: system default + always-present defaults 
(requests) + user-created profiles + form.browser_profile.choices = [('system', _system_label)] + [ + (p.get_machine_name(), p.name) + for p in cf.DEFAULT_BROWSER_PROFILES.values() + ] + [ + (machine_name, raw.get('name', machine_name) if isinstance(raw, dict) else getattr(raw, 'name', machine_name)) + for machine_name, raw in store_profiles.items() + ] + + # Build a map of machine_name → fetcher class name for the JS visibility system + all_profiles = dict(cf.DEFAULT_BROWSER_PROFILES) + for machine_name, raw in store_profiles.items(): + try: + all_profiles[machine_name] = BrowserProfile(**raw) if isinstance(raw, dict) else raw + except Exception: + pass + browser_profile_fetchers = {mn: p.get_fetcher_class_name() for mn, p in all_profiles.items()} # form.browser_steps[0] can be assumed that we 'goto url' first @@ -210,7 +241,7 @@ def edit_page(uuid): # Recast it if need be to right data Watch handler watch_class = processors.get_custom_watch_obj_for_processor(form.data.get('processor')) - datastore.data['watching'][uuid] = watch_class(datastore_path=datastore.datastore_path, __datastore=datastore.data, default=datastore.data['watching'][uuid]) + datastore.data['watching'][uuid] = watch_class(datastore_path=datastore.datastore_path, __datastore=datastore, default=datastore.data['watching'][uuid]) # Save the watch immediately datastore.data['watching'][uuid].commit() @@ -296,6 +327,7 @@ def edit_page(uuid): template_args = { 'available_processors': processors.available_processors(), 'available_timezones': sorted(available_timezones()), + 'browser_profile_fetchers': browser_profile_fetchers, 'browser_steps_config': browser_step_ui_config, 'emailprefix': os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False), 'extra_classes': ' '.join(c), diff --git a/changedetectionio/blueprint/ui/preview.py b/changedetectionio/blueprint/ui/preview.py index 5a68b9a3591..06c195b6432 100644 --- a/changedetectionio/blueprint/ui/preview.py +++ b/changedetectionio/blueprint/ui/preview.py @@ 
-61,7 +61,7 @@ def preview_page(uuid): timestamp = None extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')] - is_html_webdriver = watch.fetcher_supports_screenshots + fetcher_supports_screenshots = watch.fetcher_supports_screenshots triggered_line_numbers = [] ignored_line_numbers = [] @@ -112,7 +112,7 @@ def preview_page(uuid): highlight_triggered_line_numbers=triggered_line_numbers, highlight_blocked_line_numbers=blocked_line_numbers, history_n=watch.history_n, - is_html_webdriver=is_html_webdriver, + fetcher_supports_screenshots=fetcher_supports_screenshots, last_error=watch['last_error'], last_error_screenshot=watch.get_error_snapshot(), last_error_text=watch.get_error_text(), diff --git a/changedetectionio/blueprint/ui/templates/diff.html b/changedetectionio/blueprint/ui/templates/diff.html index 4a89e24c8f0..fb61c77b3b5 100644 --- a/changedetectionio/blueprint/ui/templates/diff.html +++ b/changedetectionio/blueprint/ui/templates/diff.html @@ -143,7 +143,7 @@
    {{ _('For now, Differences are performed on text, not graphically, only the latest screenshot is available.') }}
    - {% if is_html_webdriver %} + {% if fetcher_supports_screenshots %} {% if screenshot %}
    {{watch_a.snapshot_screenshot_ctime|format_timestamp_timeago}}
    {{ _('Current screenshot from most recent request') }} diff --git a/changedetectionio/blueprint/ui/templates/edit.html b/changedetectionio/blueprint/ui/templates/edit.html index 45ea60e2642..fa4801bbdb1 100644 --- a/changedetectionio/blueprint/ui/templates/edit.html +++ b/changedetectionio/blueprint/ui/templates/edit.html @@ -27,7 +27,8 @@ const proxy_recheck_status_url="{{url_for('check_proxies.get_recheck_status', uuid=uuid)}}"; const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}"; const watch_visual_selector_data_url="{{url_for('static_content', group='visual_selector_data', filename=uuid)}}"; - const default_system_fetch_backend="{{ settings_application['fetch_backend'] }}"; + const default_system_fetch_backend = {{ (browser_profile_fetchers.get(settings_application.get('browser_profile') or 'direct_http_requests', 'requests')) | tojson }}; + const browserProfileFetcherMap = {{ browser_profile_fetchers | tojson }}; @@ -131,11 +132,19 @@ {% if capabilities.supports_request_type %}
    - {{ render_field(form.fetch_backend, class="fetch-backend") }} +
    +
      + {%- for subfield in form.browser_profile %} +
    • + {{ subfield() }} + {{ browser_profile_fetchers.get(subfield.data, '')|fetcher_status_icons }} + +
    • + {%- endfor %} +
    -

    {{ _('Use the') }} {{ _('Basic') }} {{ _('method (default) where your watched site doesn\'t need Javascript to render.') }}

    -

    {{ _('The') }} {{ _('Chrome/Javascript') }} {{ _('method requires a network connection to a running WebDriver+Chrome server, set by the ENV var \'WEBDRIVER_URL\'.') }}

    - {{ _('Tip:') }} {{ _('Connect using Bright Data and Oxylabs Proxies, find out more here.') }} +

    {{ _('Choose how this watch fetches its target URL. \'System settings default\' inherits the global setting.') }}

    +

    {{ _('Manage browser profiles in') }} {{ _('Settings → Browsers') }}.

    {% if form.proxy %} @@ -149,7 +158,7 @@ {% endif %} -
    +
    {{ render_field(form.webdriver_delay) }}
    @@ -172,8 +181,8 @@
    - -
    + +
    @@ -210,7 +219,7 @@ ({{ _('Not supported by Selenium browser') }})
    -
    +
    diff --git a/changedetectionio/blueprint/watchlist/__init__.py b/changedetectionio/blueprint/watchlist/__init__.py index ebdfa627baf..830eee131e4 100644 --- a/changedetectionio/blueprint/watchlist/__init__.py +++ b/changedetectionio/blueprint/watchlist/__init__.py @@ -105,7 +105,7 @@ def index(): search_q=request.args.get('q', '').strip(), sort_attribute=request.args.get('sort') if request.args.get('sort') else request.cookies.get('sort'), sort_order=request.args.get('order') if request.args.get('order') else request.cookies.get('order'), - system_default_fetcher=datastore.data['settings']['application'].get('fetch_backend'), + system_default_fetcher=datastore.data['settings']['application'].get('browser_profile'), tags=sorted_tags, unread_changes_count=datastore.unread_changes_count, watches=sorted_watches diff --git a/changedetectionio/blueprint/watchlist/templates/watch-overview.html b/changedetectionio/blueprint/watchlist/templates/watch-overview.html index 55fbb9e0c18..a12d380b15f 100644 --- a/changedetectionio/blueprint/watchlist/templates/watch-overview.html +++ b/changedetectionio/blueprint/watchlist/templates/watch-overview.html @@ -285,10 +285,7 @@
    - {%- set effective_fetcher = watch.get_fetch_backend if watch.get_fetch_backend != "system" else system_default_fetcher -%} - {%- if effective_fetcher and ("html_webdriver" in effective_fetcher or "html_" in effective_fetcher or "extra_browser_" in effective_fetcher) -%} - {{ effective_fetcher|fetcher_status_icons }} - {%- endif -%} + {{ watch.effective_browser_profile.get_fetcher_class_name()|fetcher_status_icons }} {%- if watch.is_pdf -%}Converting PDF to text{%- endif -%} {%- if watch.has_browser_steps -%}Browser Steps is enabled{%- endif -%} diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index e6e6420ea48..bc285105c45 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -1,5 +1,4 @@ import sys -from changedetectionio.strtobool import strtobool from loguru import logger from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException import os @@ -25,87 +24,71 @@ # Most modern GPUs support 16384x16384 textures, so 1280x10000 is safe SCREENSHOT_SIZE_STITCH_THRESHOLD = int(os.getenv("SCREENSHOT_CHUNK_HEIGHT", 10000)) -# available_fetchers() will scan this implementation looking for anything starting with html_ -# this information is used in the form selections -from changedetectionio.content_fetchers.requests import fetcher as html_requests - - import importlib.resources XPATH_ELEMENT_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8') INSTOCK_DATA_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8') FAVICON_FETCHER_JS = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('favicon-fetcher.js').read_text(encoding='utf-8') +# Registry: clean fetcher name → fetcher class (e.g. 
'requests', 'playwright', 'cloakbrowser') +FETCHERS: dict = {} + + +def register_fetcher(name: str, cls) -> None: + """Register a fetcher class under its clean name (no html_ prefix).""" + FETCHERS[name] = cls + + +def get_fetcher(name: str): + """Return the fetcher class for a clean name, or None.""" + return FETCHERS.get(name) + + def available_fetchers(): - # See the if statement at the bottom of this file for how we switch between playwright and webdriver - import inspect - p = [] - - # Get built-in fetchers (but skip plugin fetchers that were added via setattr) - for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass): - if inspect.isclass(obj): - # @todo html_ is maybe better as fetcher_ or something - # In this case, make sure to edit the default one in store.py and fetch_site_status.py - if name.startswith('html_'): - # Skip plugin fetchers that were already registered - if name not in _plugin_fetchers: - t = tuple([name, obj.fetcher_description]) - p.append(t) - - # Get plugin fetchers from cache (already loaded at module init) - for name, fetcher_class in _plugin_fetchers.items(): - if hasattr(fetcher_class, 'fetcher_description'): - t = tuple([name, fetcher_class.fetcher_description]) - p.append(t) - else: - logger.warning(f"Plugin fetcher '{name}' does not have fetcher_description attribute") - - return p - - -def get_plugin_fetchers(): - """Load and return all plugin fetchers from the centralized plugin manager.""" - from changedetectionio.pluggy_interface import plugin_manager - - fetchers = {} + """Return list of (name, description) for all registered fetchers.""" + return [(name, cls.fetcher_description) for name, cls in FETCHERS.items() + if hasattr(cls, 'fetcher_description')] + + +def available_browser_fetchers(): + """Return list of (name, description) for fetchers that support screenshots (browser-type fetchers).""" + return [(name, cls.fetcher_description) for name, cls in FETCHERS.items() + if cls.supports_screenshots] + + 
+def _load_fetchers(): + """Load all fetchers (built-ins + plugins) into the FETCHERS registry.""" + from changedetectionio.pluggy_interface import plugin_manager, register_builtin_fetchers + + # Built-ins must be registered first + register_builtin_fetchers() + + # Then external plugins try: - # Call the register_content_fetcher hook from all registered plugins results = plugin_manager.hook.register_content_fetcher() for result in results: if result: name, fetcher_class = result - fetchers[name] = fetcher_class - # Register in current module so hasattr() checks work - setattr(sys.modules[__name__], name, fetcher_class) - logger.info(f"Registered plugin fetcher: {name} - {getattr(fetcher_class, 'fetcher_description', 'No description')}") + register_fetcher(name, fetcher_class) + logger.info(f"Registered fetcher: {name} - {getattr(fetcher_class, 'fetcher_description', '?')}") except Exception as e: logger.error(f"Error loading plugin fetchers: {e}") - return fetchers -# Initialize plugins at module load time -_plugin_fetchers = get_plugin_fetchers() +# Default browser profiles always shown in the browser profiles table (keyed by machine name) +DEFAULT_BROWSER_PROFILES: dict = {} + +def _register_default_browser_profiles(): + """Register browser profiles that are always present in the profiles table.""" + from changedetectionio.model.browser_profile import BUILTIN_REQUESTS + DEFAULT_BROWSER_PROFILES[BUILTIN_REQUESTS.get_machine_name()] = BUILTIN_REQUESTS -# Decide which is the 'real' HTML webdriver, this is more a system wide config -# rather than site-specific. 
-use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) -if use_playwright_as_chrome_fetcher: - # @note - For now, browser steps always uses playwright - if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')): - logger.debug('Using Playwright library as fetcher') - from .playwright import fetcher as html_webdriver - else: - logger.debug('Using direct Python Puppeteer library as fetcher') - from .puppeteer import fetcher as html_webdriver -else: - logger.debug("Falling back to selenium as fetcher") - from .webdriver_selenium import fetcher as html_webdriver +# Populate the registry at module load time +_load_fetchers() -# Register built-in fetchers as plugins after all imports are complete -from changedetectionio.pluggy_interface import register_builtin_fetchers -register_builtin_fetchers() +_register_default_browser_profiles() diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index 8ec5364bdee..35786542f4d 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -70,37 +70,41 @@ class Fetcher(): supports_screenshots = False # Can capture page screenshots supports_xpath_element_data = False # Can extract xpath element positions/data for visual selector + # Icon shown in the watch list when this fetcher is the effective fetcher. + # Set to a dict with 'filename', 'alt', 'title' keys (image served from static/images/). + # None means no icon is shown (e.g. plain HTTP requests fetcher). 
+ status_icon = None + # Screenshot element locking - prevents layout shifts during screenshot capture # Only needed for visual comparison (image_ssim_diff processor) # Locks element dimensions in the first viewport to prevent headers/ads from resizing lock_viewport_elements = False # Default: disabled for performance + # BrowserProfile-derived settings — applied by browser fetchers, ignored by html_requests + viewport_width: int = 1280 + viewport_height: int = 1000 + block_images: bool = False + block_fonts: bool = False + profile_user_agent: str = None # Profile-level UA; lower priority than request_headers User-Agent + ignore_https_errors: bool = False + locale: str = None + service_workers: str = 'allow' + extra_delay: int = 0 + def __init__(self, **kwargs): if kwargs and 'screenshot_format' in kwargs: self.screenshot_format = kwargs.get('screenshot_format') - # Allow lock_viewport_elements to be set via kwargs if kwargs and 'lock_viewport_elements' in kwargs: self.lock_viewport_elements = kwargs.get('lock_viewport_elements') + # BrowserProfile fields — store whatever was passed, subclasses use them + for field in ('viewport_width', 'viewport_height', 'block_images', 'block_fonts', + 'profile_user_agent', 'ignore_https_errors', 'locale', + 'service_workers', 'extra_delay'): + if field in kwargs: + setattr(self, field, kwargs[field]) - @classmethod - def get_status_icon_data(cls): - """Return data for status icon to display in the watch overview. - - This method can be overridden by subclasses to provide custom status icons. 
- - Returns: - dict or None: Dictionary with icon data: - { - 'filename': 'icon-name.svg', # Icon filename - 'alt': 'Alt text', # Alt attribute - 'title': 'Tooltip text', # Title attribute - 'style': 'height: 1em;' # Optional inline CSS - } - Or None if no icon - """ - return None def clear_content(self): """ @@ -198,6 +202,16 @@ async def iterate_browser_steps(self, start_url=None): # Stop processing here raise BrowserStepsStepException(step_n=step_n, original_e=e) + def disk_cleanup_after_fetch(self): + """Remove any temporary files written to disk during a fetch. + + The default implementation is a no-op. Browser-based fetchers + override this to delete browser-step screenshots and any other + ephemeral files they create. Called by the processor after + ``quit()`` regardless of whether the fetch succeeded or failed. + """ + pass + # It's always good to reset these def delete_browser_steps_screenshots(self): import glob diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py deleted file mode 100644 index 4002fbbf15c..00000000000 --- a/changedetectionio/content_fetchers/playwright.py +++ /dev/null @@ -1,471 +0,0 @@ -import asyncio -import gc -import json -import os -from urllib.parse import urlparse - -from loguru import logger - -from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \ - SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, FAVICON_FETCHER_JS -from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent -from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable, \ - BrowserStepsStepException - - -async def capture_full_page_async(page, screenshot_format='JPEG', watch_uuid=None, lock_viewport_elements=False): - import os - import time - - start = time.time() - watch_info = f"[{watch_uuid}] " if 
watch_uuid else "" - - setup_start = time.time() - page_height = await page.evaluate("document.documentElement.scrollHeight") - page_width = await page.evaluate("document.documentElement.scrollWidth") - original_viewport = page.viewport_size - dimensions_time = time.time() - setup_start - - logger.debug(f"{watch_info}Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width} (got dimensions in {dimensions_time:.2f}s)") - - # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks - step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow - screenshot_chunks = [] - y = 0 - elements_locked = False - - # Only lock viewport elements if explicitly enabled (for image_ssim_diff processor) - # This prevents headers/ads from resizing when viewport changes - if lock_viewport_elements and page_height > page.viewport_size['height']: - lock_start = time.time() - lock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'lock-elements-sizing.js') - with open(lock_elements_js_path, 'r') as f: - lock_elements_js = f.read() - await page.evaluate(lock_elements_js) - elements_locked = True - lock_time = time.time() - lock_start - logger.debug(f"{watch_info}Viewport element locking enabled (took {lock_time:.2f}s)") - - if page_height > page.viewport_size['height']: - if page_height < step_size: - step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size - viewport_start = time.time() - logger.debug(f"{watch_info}Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size") - # Set viewport to a larger size to capture more content at once - await page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size}) - viewport_time = time.time() - viewport_start - logger.debug(f"{watch_info}Viewport changed to 
{page.viewport_size['width']}x{step_size} (took {viewport_time:.2f}s)") - - # Capture screenshots in chunks up to the max total height - capture_start = time.time() - chunk_times = [] - # Use PNG for better quality (no compression artifacts), JPEG for smaller size - screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg' - # PNG should use quality 100, JPEG uses configurable quality - screenshot_quality = 100 if screenshot_type == 'png' else int(os.getenv("SCREENSHOT_QUALITY", 72)) - - while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT): - # Only scroll if not at the top (y > 0) - if y > 0: - await page.evaluate(f"window.scrollTo(0, {y})") - - # Request GC only before screenshot (not 3x per chunk) - await page.request_gc() - - screenshot_kwargs = { - 'type': screenshot_type, - 'full_page': False - } - # Only pass quality parameter for jpeg (PNG doesn't support it in Playwright) - if screenshot_type == 'jpeg': - screenshot_kwargs['quality'] = screenshot_quality - - chunk_start = time.time() - screenshot_chunks.append(await page.screenshot(**screenshot_kwargs)) - chunk_time = time.time() - chunk_start - chunk_times.append(chunk_time) - logger.debug(f"{watch_info}Chunk {len(screenshot_chunks)} captured in {chunk_time:.2f}s") - y += step_size - - # Restore original viewport size - await page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']}) - - # Unlock element dimensions if they were locked - if elements_locked: - unlock_elements_js_path = os.path.join(os.path.dirname(__file__), 'res', 'unlock-elements-sizing.js') - with open(unlock_elements_js_path, 'r') as f: - unlock_elements_js = f.read() - await page.evaluate(unlock_elements_js) - logger.debug(f"{watch_info}Element dimensions unlocked after screenshot capture") - - capture_time = time.time() - capture_start - total_capture_time = sum(chunk_times) - logger.debug(f"{watch_info}All {len(screenshot_chunks)} chunks captured in {capture_time:.2f}s 
(total chunk time: {total_capture_time:.2f}s)") - - # If we have multiple chunks, stitch them together - if len(screenshot_chunks) > 1: - stitch_start = time.time() - logger.debug(f"{watch_info}Starting stitching of {len(screenshot_chunks)} chunks") - - # Always use spawn subprocess for ANY stitching (2+ chunks) - # PIL allocates at C level and Python GC never releases it - subprocess exit forces OS to reclaim - # Trade-off: 35MB resource_tracker vs 500MB+ PIL leak in main process - from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker_raw_bytes - import multiprocessing - import struct - - ctx = multiprocessing.get_context('spawn') - parent_conn, child_conn = ctx.Pipe() - p = ctx.Process(target=stitch_images_worker_raw_bytes, args=(child_conn, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT)) - p.start() - - # Send via raw bytes (no pickle) - parent_conn.send_bytes(struct.pack('I', len(screenshot_chunks))) - for chunk in screenshot_chunks: - parent_conn.send_bytes(chunk) - - screenshot = parent_conn.recv_bytes() - p.join() - - parent_conn.close() - child_conn.close() - del p, parent_conn, child_conn - - stitch_time = time.time() - stitch_start - total_time = time.time() - start - setup_time = total_time - capture_time - stitch_time - logger.debug( - f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | " - f"Setup: {setup_time:.2f}s, Capture: {capture_time:.2f}s, Stitching: {stitch_time:.2f}s, Total: {total_time:.2f}s") - return screenshot - - total_time = time.time() - start - setup_time = total_time - capture_time - logger.debug( - f"{watch_info}Screenshot complete - Page height: {page_height}px, Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT}px | " - f"Setup: {setup_time:.2f}s, Single chunk: {capture_time:.2f}s, Total: {total_time:.2f}s") - - return screenshot_chunks[0] - -class fetcher(Fetcher): - fetcher_description = "Playwright {}/Javascript".format( - 
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() - ) - if os.getenv("PLAYWRIGHT_DRIVER_URL"): - fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) - - browser_type = '' - command_executor = '' - - # Configs for Proxy setup - # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" - playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password'] - - proxy = None - - # Capability flags - supports_browser_steps = True - supports_screenshots = True - supports_xpath_element_data = True - - @classmethod - def get_status_icon_data(cls): - """Return Chrome browser icon data for Playwright fetcher.""" - return { - 'filename': 'google-chrome-icon.png', - 'alt': 'Using a Chrome browser', - 'title': 'Using a Chrome browser' - } - - def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): - super().__init__(**kwargs) - - self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') - - if custom_browser_connection_url: - self.browser_connection_is_custom = True - self.browser_connection_url = custom_browser_connection_url - else: - # Fallback to fetching from system - # .strip('"') is going to save someone a lot of time when they accidently wrap the env value - self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"') - - # If any proxy settings are enabled, then we should setup the proxy object - proxy_args = {} - for k in self.playwright_proxy_settings_mappings: - v = os.getenv('playwright_proxy_' + k, False) - if v: - proxy_args[k] = v.strip('"') - - if proxy_args: - self.proxy = proxy_args - - # allow per-watch proxy selection override - if proxy_override: - self.proxy = {'server': proxy_override} - - if self.proxy: - # Playwright needs separate username and password values - parsed = urlparse(self.proxy.get('server')) - if parsed.username: - self.proxy['username'] = 
parsed.username - self.proxy['password'] = parsed.password - - async def screenshot_step(self, step_n=''): - super().screenshot_step(step_n=step_n) - watch_uuid = getattr(self, 'watch_uuid', None) - screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements) - - # Request GC immediately after screenshot to free memory - # Screenshots can be large and browser steps take many of them - await self.page.request_gc() - - if self.browser_steps_screenshot_path is not None: - destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) - logger.debug(f"Saving step screenshot to {destination}") - with open(destination, 'wb') as f: - f.write(screenshot) - # Clear local reference to allow screenshot bytes to be collected - del screenshot - gc.collect() - - async def save_step_html(self, step_n): - super().save_step_html(step_n=step_n) - content = await self.page.content() - - # Request GC after getting page content - await self.page.request_gc() - - destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) - logger.debug(f"Saving step HTML to {destination}") - with open(destination, 'w', encoding='utf-8') as f: - f.write(content) - # Clear local reference - del content - gc.collect() - - async def run(self, - fetch_favicon=True, - current_include_filters=None, - empty_pages_are_a_change=False, - ignore_status_codes=False, - is_binary=False, - request_body=None, - request_headers=None, - request_method=None, - screenshot_format=None, - timeout=None, - url=None, - watch_uuid=None, - ): - - from playwright.async_api import async_playwright - import playwright._impl._errors - import time - self.delete_browser_steps_screenshots() - self.watch_uuid = watch_uuid # Store for use in screenshot_step - response = None - - async with async_playwright() as p: - browser_type = getattr(p, self.browser_type) - 
- # Seemed to cause a connection Exception even tho I can see it connect - # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) - # 60,000 connection timeout only - browser = await browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000) - - # SOCKS5 with authentication is not supported (yet) - # https://github.com/microsoft/playwright/issues/10567 - - # Set user agent to prevent Cloudflare from blocking the browser - # Use the default one configured in the App.py model that's passed from fetch_site_status.py - context = await browser.new_context( - accept_downloads=False, # Should never be needed - bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others - extra_http_headers=request_headers, - ignore_https_errors=True, - proxy=self.proxy, - service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers - user_agent=manage_user_agent(headers=request_headers), - ) - - self.page = await context.new_page() - - # Listen for all console events and handle errors - self.page.on("console", lambda msg: logger.debug(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) - - # Re-use as much code from browser steps as possible so its the same - from changedetectionio.browser_steps.browser_steps import steppable_browser_interface - browsersteps_interface = steppable_browser_interface(start_url=url) - browsersteps_interface.page = self.page - - response = await browsersteps_interface.action_goto_url(value=url) - - if response is None: - await context.close() - await browser.close() - logger.debug("Content Fetcher > Response object from the browser communication was none") - raise EmptyReply(url=url, status_code=None) - - # In async_playwright, all_headers() returns a coroutine - try: - self.headers = await response.all_headers() - except TypeError: - # Fallback for sync 
version - self.headers = response.all_headers() - - try: - if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): - await browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None) - except playwright._impl._errors.TimeoutError as e: - await context.close() - await browser.close() - # This can be ok, we will try to grab what we could retrieve - pass - except Exception as e: - logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}") - await context.close() - await browser.close() - raise PageUnloadable(url=url, status_code=None, message=str(e)) - - extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay - await self.page.wait_for_timeout(extra_wait * 1000) - - try: - self.status_code = response.status - except Exception as e: - # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962 - logger.critical(f"Response from the browser/Playwright did not have a status_code! 
Response follows.") - logger.critical(response) - await context.close() - await browser.close() - raise PageUnloadable(url=url, status_code=None, message=str(e)) - - if fetch_favicon: - try: - self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS) - await self.page.request_gc() - except Exception as e: - logger.error(f"Error fetching FavIcon info {str(e)}, continuing.") - - if self.status_code != 200 and not ignore_status_codes: - screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements) - # Finally block will handle cleanup - raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) - - if not empty_pages_are_a_change and len((await self.page.content()).strip()) == 0: - logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False") - await context.close() - await browser.close() - raise EmptyReply(url=url, status_code=response.status) - - # Wrap remaining operations in try/finally to ensure cleanup - try: - # Run Browser Steps here - if self.browser_steps: - try: - await self.iterate_browser_steps(start_url=url) - except BrowserStepsStepException: - # Finally block will handle cleanup - raise - - await self.page.wait_for_timeout(extra_wait * 1000) - - now = time.time() - # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) - if current_include_filters is not None: - await self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) - else: - await self.page.evaluate("var include_filters=''") - await self.page.request_gc() - - # request_gc before and after evaluate to free up memory - # @todo browsersteps etc - MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) - self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, { - "visualselector_xpath_selectors": 
visualselector_xpath_selectors, - "max_height": MAX_TOTAL_HEIGHT - }) - await self.page.request_gc() - - self.instock_data = await self.page.evaluate(INSTOCK_DATA_JS) - await self.page.request_gc() - - self.content = await self.page.content() - await self.page.request_gc() - logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s") - - - # Bug 3 in Playwright screenshot handling - # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it - # JPEG is better here because the screenshots can be very very large - - # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded - # which will significantly increase the IO size between the server and client, it's recommended to use the lowest - # acceptable screenshot quality here - # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage - self.screenshot = await capture_full_page_async(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements) - - # Force aggressive memory cleanup - screenshots are large and base64 decode creates temporary buffers - await self.page.request_gc() - gc.collect() - - except ScreenshotUnavailable: - # Re-raise screenshot unavailable exceptions - raise ScreenshotUnavailable(url=url, status_code=self.status_code) - - finally: - # Clean up resources properly with timeouts to prevent hanging - try: - if hasattr(self, 'page') and self.page: - await self.page.request_gc() - await asyncio.wait_for(self.page.close(), timeout=5.0) - logger.debug(f"Successfully closed page for {url}") - except asyncio.TimeoutError: - logger.warning(f"Timed out closing page for {url} (5s)") - except Exception as e: - logger.warning(f"Error closing page for {url}: {e}") - finally: - self.page = None - - try: - if context: - await asyncio.wait_for(context.close(), timeout=5.0) - 
logger.debug(f"Successfully closed context for {url}") - except asyncio.TimeoutError: - logger.warning(f"Timed out closing context for {url} (5s)") - except Exception as e: - logger.warning(f"Error closing context for {url}: {e}") - finally: - context = None - - try: - if browser: - await asyncio.wait_for(browser.close(), timeout=5.0) - logger.debug(f"Successfully closed browser connection for {url}") - except asyncio.TimeoutError: - logger.warning(f"Timed out closing browser connection for {url} (5s)") - except Exception as e: - logger.warning(f"Error closing browser for {url}: {e}") - finally: - browser = None - - # Force Python GC to release Playwright resources immediately - # Playwright objects can have circular references that delay cleanup - gc.collect() - - -# Plugin registration for built-in fetcher -class PlaywrightFetcherPlugin: - """Plugin class that registers the Playwright fetcher as a built-in plugin.""" - - def register_content_fetcher(self): - """Register the Playwright fetcher""" - return ('html_webdriver', fetcher) - - -# Create module-level instance for plugin registration -playwright_plugin = PlaywrightFetcherPlugin() - - - diff --git a/changedetectionio/content_fetchers/playwright/CDP.py b/changedetectionio/content_fetchers/playwright/CDP.py new file mode 100644 index 00000000000..1a1d234f255 --- /dev/null +++ b/changedetectionio/content_fetchers/playwright/CDP.py @@ -0,0 +1,41 @@ +""" +Playwright CDP fetcher — connects to a remote browser via Chrome DevTools Protocol. + +browser_connection_url must be supplied via the resolved BrowserProfile +(set by preconfigure_browser_profiles_based_on_env at startup or edited in the UI). 
+""" +from loguru import logger +from changedetectionio.pluggy_interface import hookimpl +from changedetectionio.content_fetchers.playwright import PlaywrightBaseFetcher + + +class fetcher(PlaywrightBaseFetcher): + fetcher_description = "Playwright Chrome (CDP/Remote)" + requires_connection_url = True + + def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): + super().__init__(proxy_override=proxy_override, custom_browser_connection_url=custom_browser_connection_url, **kwargs) + + if custom_browser_connection_url: + self.browser_connection_is_custom = True + self.browser_connection_url = custom_browser_connection_url + else: + logger.critical("Playwright CDP fetcher has no browser_connection_url — browser profile was not configured. " + "Set PLAYWRIGHT_DRIVER_URL or configure a browser profile in Settings.") + self.browser_connection_url = None + + # CDP always connects to Chromium + self.browser_type = 'chromium' + + async def _connect_browser(self, p): + browser_type = getattr(p, self.browser_type) + return await browser_type.connect_over_cdp(self.browser_connection_url, timeout=60_000) + + +class PlaywrightCDPPlugin: + @hookimpl + def register_content_fetcher(self): + return ('playwright_cdp', fetcher) + + +cdp_plugin = PlaywrightCDPPlugin() diff --git a/changedetectionio/content_fetchers/playwright/__init__.py b/changedetectionio/content_fetchers/playwright/__init__.py new file mode 100644 index 00000000000..d30e995ca7d --- /dev/null +++ b/changedetectionio/content_fetchers/playwright/__init__.py @@ -0,0 +1,403 @@ +""" +Playwright-based content fetchers. 
+ +Submodules: + cdp — connect to a remote browser via Chrome DevTools Protocol (CDP/WebSocket) + chrome — launch a local Chromium browser + firefox — launch a local Firefox browser + webkit — launch a local WebKit (Safari-engine) browser +""" + +import asyncio +import gc +import json +import os +import re +from urllib.parse import urlparse + +from loguru import logger + +from changedetectionio.content_fetchers import ( + SCREENSHOT_MAX_HEIGHT_DEFAULT, + SCREENSHOT_MAX_TOTAL_HEIGHT, + SCREENSHOT_SIZE_STITCH_THRESHOLD, + FAVICON_FETCHER_JS, + INSTOCK_DATA_JS, + XPATH_ELEMENT_JS, + visualselector_xpath_selectors, +) +from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent +from changedetectionio.content_fetchers.exceptions import ( + BrowserStepsStepException, + EmptyReply, + Non200ErrorCodeReceived, + PageUnloadable, + ScreenshotUnavailable, +) + + +async def capture_full_page_async(page, screenshot_format='JPEG', watch_uuid=None, lock_viewport_elements=False): + import time + + start = time.time() + watch_info = f"[{watch_uuid}] " if watch_uuid else "" + + setup_start = time.time() + page_height = await page.evaluate("document.documentElement.scrollHeight") + page_width = await page.evaluate("document.documentElement.scrollWidth") + original_viewport = page.viewport_size + dimensions_time = time.time() - setup_start + + logger.debug(f"{watch_info}Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width} (got dimensions in {dimensions_time:.2f}s)") + + step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD + screenshot_chunks = [] + y = 0 + elements_locked = False + + if lock_viewport_elements and page_height > page.viewport_size['height']: + lock_start = time.time() + lock_elements_js_path = os.path.join(os.path.dirname(__file__), '..', 'res', 'lock-elements-sizing.js') + with open(lock_elements_js_path, 'r') as f: + lock_elements_js = f.read() + await page.evaluate(lock_elements_js) + elements_locked = True + 
logger.debug(f"{watch_info}Viewport element locking enabled (took {time.time() - lock_start:.2f}s)") + + if page_height > page.viewport_size['height']: + if page_height < step_size: + step_size = page_height + await page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size}) + + capture_start = time.time() + chunk_times = [] + screenshot_type = screenshot_format.lower() if screenshot_format else 'jpeg' + screenshot_quality = 100 if screenshot_type == 'png' else int(os.getenv("SCREENSHOT_QUALITY", 72)) + + while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT): + if y > 0: + await page.evaluate(f"window.scrollTo(0, {y})") + + await _safe_request_gc(page) + + screenshot_kwargs = {'type': screenshot_type, 'full_page': False} + if screenshot_type == 'jpeg': + screenshot_kwargs['quality'] = screenshot_quality + + chunk_start = time.time() + screenshot_chunks.append(await page.screenshot(**screenshot_kwargs)) + chunk_time = time.time() - chunk_start + chunk_times.append(chunk_time) + logger.debug(f"{watch_info}Chunk {len(screenshot_chunks)} captured in {chunk_time:.2f}s") + y += step_size + + await page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']}) + + if elements_locked: + unlock_elements_js_path = os.path.join(os.path.dirname(__file__), '..', 'res', 'unlock-elements-sizing.js') + with open(unlock_elements_js_path, 'r') as f: + unlock_elements_js = f.read() + await page.evaluate(unlock_elements_js) + + capture_time = time.time() - capture_start + + if len(screenshot_chunks) > 1: + stitch_start = time.time() + from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker_raw_bytes + import multiprocessing + import struct + + ctx = multiprocessing.get_context('spawn') + parent_conn, child_conn = ctx.Pipe() + p = ctx.Process(target=stitch_images_worker_raw_bytes, args=(child_conn, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT)) + p.start() + + 
parent_conn.send_bytes(struct.pack('I', len(screenshot_chunks))) + for chunk in screenshot_chunks: + parent_conn.send_bytes(chunk) + + screenshot = parent_conn.recv_bytes() + p.join() + parent_conn.close() + child_conn.close() + del p, parent_conn, child_conn + + stitch_time = time.time() - stitch_start + total_time = time.time() - start + setup_time = total_time - capture_time - stitch_time + logger.debug( + f"{watch_info}Screenshot complete - Page height: {page_height}px | " + f"Setup: {setup_time:.2f}s, Capture: {capture_time:.2f}s, Stitching: {stitch_time:.2f}s, Total: {total_time:.2f}s") + return screenshot + + total_time = time.time() - start + logger.debug( + f"{watch_info}Screenshot complete - Page height: {page_height}px | " + f"Setup: {total_time - capture_time:.2f}s, Single chunk: {capture_time:.2f}s, Total: {total_time:.2f}s") + return screenshot_chunks[0] + + +async def _safe_request_gc(page): + """Request browser GC — Chromium-specific, silently ignored on Firefox/WebKit.""" + try: + await page.request_gc() + except Exception: + pass + + +class PlaywrightBaseFetcher(Fetcher): + """ + Shared base for all Playwright fetchers. + + Subclasses implement ``_connect_browser(playwright_instance)`` to return a + connected-or-launched browser object. Everything else — context creation, + page interaction, screenshot capture, browser-steps execution — lives here. + """ + + playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password'] + + proxy = None + + # Capability flags + supports_browser_steps = True + supports_screenshots = True + supports_xpath_element_data = True + + status_icon = {'filename': 'google-chrome-icon.png', 'alt': 'Using a Chrome browser', 'title': 'Using a Chrome browser'} + + def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): + super().__init__(**kwargs) + + # Subclasses may use this (e.g. 
CDP); others ignore it + self._custom_browser_connection_url = custom_browser_connection_url + + proxy_args = {} + for k in self.playwright_proxy_settings_mappings: + v = os.getenv('playwright_proxy_' + k, False) + if v: + proxy_args[k] = v.strip('"') + + if proxy_args: + self.proxy = proxy_args + + if proxy_override: + self.proxy = {'server': proxy_override} + + if self.proxy: + parsed = urlparse(self.proxy.get('server', '')) + if parsed.username: + self.proxy['username'] = parsed.username + self.proxy['password'] = parsed.password + + def disk_cleanup_after_fetch(self): + """Delete browser-step screenshots written during this fetch.""" + self.delete_browser_steps_screenshots() + + async def _connect_browser(self, playwright_instance): + """Return an open browser object. Must be overridden by each subclass.""" + raise NotImplementedError(f"{type(self).__name__} must implement _connect_browser()") + + async def screenshot_step(self, step_n=''): + super().screenshot_step(step_n=step_n) + watch_uuid = getattr(self, 'watch_uuid', None) + screenshot = await capture_full_page_async( + page=self.page, + screenshot_format=self.screenshot_format, + watch_uuid=watch_uuid, + lock_viewport_elements=self.lock_viewport_elements, + ) + await _safe_request_gc(self.page) + + if self.browser_steps_screenshot_path is not None: + destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) + logger.debug(f"Saving step screenshot to {destination}") + with open(destination, 'wb') as f: + f.write(screenshot) + del screenshot + gc.collect() + + async def save_step_html(self, step_n): + super().save_step_html(step_n=step_n) + content = await self.page.content() + await _safe_request_gc(self.page) + + destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) + logger.debug(f"Saving step HTML to {destination}") + with open(destination, 'w', encoding='utf-8') as f: + f.write(content) + del content + gc.collect() + + async 
def run(self, + fetch_favicon=True, + current_include_filters=None, + empty_pages_are_a_change=False, + ignore_status_codes=False, + is_binary=False, + request_body=None, + request_headers=None, + request_method=None, + screenshot_format=None, + timeout=None, + url=None, + watch_uuid=None, + ): + from playwright.async_api import async_playwright + import playwright._impl._errors + import time + + self.delete_browser_steps_screenshots() + self.watch_uuid = watch_uuid + response = None + + async with async_playwright() as p: + browser = await self._connect_browser(p) + + ua = manage_user_agent(headers=request_headers) or self.profile_user_agent or None + + context_kwargs = dict( + accept_downloads=False, + bypass_csp=True, + extra_http_headers=request_headers, + ignore_https_errors=self.ignore_https_errors, + proxy=self.proxy, + service_workers=self.service_workers, + user_agent=ua, + viewport={'width': self.viewport_width, 'height': self.viewport_height}, + ) + if self.locale: + context_kwargs['locale'] = self.locale + + context = await browser.new_context(**context_kwargs) + + if self.block_images: + await context.route( + re.compile(r'\.(png|jpe?g|gif|svg|ico|webp|avif|bmp)(\?.*)?$', re.IGNORECASE), + lambda route: route.abort() + ) + if self.block_fonts: + await context.route( + re.compile(r'\.(woff2?|ttf|otf|eot)(\?.*)?$', re.IGNORECASE), + lambda route: route.abort() + ) + + self.page = await context.new_page() + self.page.on("console", lambda msg: logger.debug(f"Playwright console: {url} {msg.type}: {msg.text}")) + + from changedetectionio.browser_steps.browser_steps import steppable_browser_interface + browsersteps_interface = steppable_browser_interface(start_url=url) + browsersteps_interface.page = self.page + + response = await browsersteps_interface.action_goto_url(value=url) + + if response is None: + await context.close() + await browser.close() + raise EmptyReply(url=url, status_code=None) + + try: + self.headers = await response.all_headers() + except 
TypeError: + self.headers = response.all_headers() + + try: + if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): + await browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None) + except playwright._impl._errors.TimeoutError: + await context.close() + await browser.close() + pass + except Exception as e: + await context.close() + await browser.close() + raise PageUnloadable(url=url, status_code=None, message=str(e)) + + extra_wait = self.extra_delay + self.render_extract_delay + await self.page.wait_for_timeout(extra_wait * 1000) + + try: + self.status_code = response.status + except Exception as e: + await context.close() + await browser.close() + raise PageUnloadable(url=url, status_code=None, message=str(e)) + + if fetch_favicon: + try: + self.favicon_blob = await self.page.evaluate(FAVICON_FETCHER_JS) + await _safe_request_gc(self.page) + except Exception as e: + logger.error(f"Error fetching favicon: {e}") + + if self.status_code != 200 and not ignore_status_codes: + screenshot = await capture_full_page_async(self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements) + try: + page_html = await self.page.content() + except Exception as e: + logger.warning(f"Got non-200 status {self.status_code} but failed to fetch page content: {e}") + page_html = None + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot, page_html=page_html) + + if not empty_pages_are_a_change and len((await self.page.content()).strip()) == 0: + await context.close() + await browser.close() + raise EmptyReply(url=url, status_code=response.status) + + try: + if self.browser_steps: + try: + await self.iterate_browser_steps(start_url=url) + except BrowserStepsStepException: + raise + await self.page.wait_for_timeout(extra_wait * 1000) + + now = time.time() + if current_include_filters is not None: + await 
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) + else: + await self.page.evaluate("var include_filters=''") + await _safe_request_gc(self.page) + + MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT)) + self.xpath_data = await self.page.evaluate(XPATH_ELEMENT_JS, { + "visualselector_xpath_selectors": visualselector_xpath_selectors, + "max_height": MAX_TOTAL_HEIGHT + }) + await _safe_request_gc(self.page) + + self.instock_data = await self.page.evaluate(INSTOCK_DATA_JS) + await _safe_request_gc(self.page) + + self.content = await self.page.content() + await _safe_request_gc(self.page) + logger.debug(f"Scrape xPath element data done in {time.time() - now:.2f}s") + + self.screenshot = await capture_full_page_async( + page=self.page, + screenshot_format=self.screenshot_format, + watch_uuid=watch_uuid, + lock_viewport_elements=self.lock_viewport_elements, + ) + await _safe_request_gc(self.page) + gc.collect() + + except ScreenshotUnavailable: + raise ScreenshotUnavailable(url=url, status_code=self.status_code) + + finally: + for obj, name, close_coro in [ + (self.page if hasattr(self, 'page') and self.page else None, 'page', lambda: self.page.close() if self.page else asyncio.sleep(0)), + (context, 'context', lambda: context.close() if context else asyncio.sleep(0)), + (browser, 'browser', lambda: browser.close() if browser else asyncio.sleep(0)), + ]: + try: + await asyncio.wait_for(close_coro(), timeout=5.0) + except asyncio.TimeoutError: + logger.warning(f"Timed out closing {name} for {url}") + except Exception as e: + logger.warning(f"Error closing {name} for {url}: {e}") + + self.page = None + context = None + browser = None + gc.collect() diff --git a/changedetectionio/content_fetchers/playwright/chrome.py b/changedetectionio/content_fetchers/playwright/chrome.py new file mode 100644 index 00000000000..6a2956f61e7 --- /dev/null +++ 
b/changedetectionio/content_fetchers/playwright/chrome.py @@ -0,0 +1,27 @@ +""" +Playwright Chrome fetcher — launches a local Chromium browser directly. + +No external browser container is required. Playwright must be installed +with Chromium browsers: ``playwright install chromium``. +""" +from changedetectionio.pluggy_interface import hookimpl +from changedetectionio.content_fetchers.playwright import PlaywrightBaseFetcher + + +class fetcher(PlaywrightBaseFetcher): + fetcher_description = "Playwright Chrome (local)" + + async def _connect_browser(self, p): + launch_kwargs = {'headless': True} + if self.proxy: + launch_kwargs['proxy'] = self.proxy + return await p.chromium.launch(**launch_kwargs) + + +class PlaywrightChromePlugin: + @hookimpl + def register_content_fetcher(self): + return ('playwright_chrome', fetcher) + + +chrome_plugin = PlaywrightChromePlugin() diff --git a/changedetectionio/content_fetchers/playwright/firefox.py b/changedetectionio/content_fetchers/playwright/firefox.py new file mode 100644 index 00000000000..1dcb64e8df0 --- /dev/null +++ b/changedetectionio/content_fetchers/playwright/firefox.py @@ -0,0 +1,33 @@ +""" +Playwright Firefox fetcher — launches a local Firefox browser directly. + +No external browser container is required. Playwright must be installed +with Firefox browsers: ``playwright install firefox``. + +Note: ``page.request_gc()`` is Chromium-specific and is silently skipped +on Firefox — this is handled transparently by ``_safe_request_gc()`` in +the base package. 
+""" +from changedetectionio.pluggy_interface import hookimpl +from changedetectionio.content_fetchers.playwright import PlaywrightBaseFetcher + + +class fetcher(PlaywrightBaseFetcher): + fetcher_description = "Playwright Firefox (local)" + + status_icon = {'filename': 'firefox-icon.svg', 'alt': 'Using Firefox', 'title': 'Using Firefox'} + + async def _connect_browser(self, p): + launch_kwargs = {'headless': True} + if self.proxy: + launch_kwargs['proxy'] = self.proxy + return await p.firefox.launch(**launch_kwargs) + + +class PlaywrightFirefoxPlugin: + @hookimpl + def register_content_fetcher(self): + return ('playwright_firefox', fetcher) + + +firefox_plugin = PlaywrightFirefoxPlugin() diff --git a/changedetectionio/content_fetchers/playwright/webkit.py b/changedetectionio/content_fetchers/playwright/webkit.py new file mode 100644 index 00000000000..bd55f4911e2 --- /dev/null +++ b/changedetectionio/content_fetchers/playwright/webkit.py @@ -0,0 +1,30 @@ +""" +Playwright WebKit fetcher — launches a local WebKit (Safari-engine) browser. + +No external browser container is required. Playwright must be installed +with WebKit browsers: ``playwright install webkit``. + +Note: ``page.request_gc()`` is Chromium-specific and is silently skipped +on WebKit — handled transparently by ``_safe_request_gc()`` in the base package. 
+""" +from changedetectionio.pluggy_interface import hookimpl +from changedetectionio.content_fetchers.playwright import PlaywrightBaseFetcher + + +class fetcher(PlaywrightBaseFetcher): + fetcher_description = "Playwright WebKit/Safari (local)" + + async def _connect_browser(self, p): + launch_kwargs = {'headless': True} + if self.proxy: + launch_kwargs['proxy'] = self.proxy + return await p.webkit.launch(**launch_kwargs) + + +class PlaywrightWebKitPlugin: + @hookimpl + def register_content_fetcher(self): + return ('playwright_webkit', fetcher) + + +webkit_plugin = PlaywrightWebKitPlugin() diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index e11612dbcf6..1ca54601b64 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -7,6 +7,7 @@ from loguru import logger +from changedetectionio.pluggy_interface import hookimpl from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \ SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \ SCREENSHOT_MAX_TOTAL_HEIGHT, FAVICON_FETCHER_JS @@ -166,11 +167,8 @@ async def capture_full_page(page, screenshot_format='JPEG', watch_uuid=None, loc class fetcher(Fetcher): - fetcher_description = "Puppeteer/direct {}/Javascript".format( - os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() - ) - if os.getenv("PLAYWRIGHT_DRIVER_URL"): - fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) + fetcher_description = "Puppeteer Chromium" + requires_connection_url = True browser = None browser_type = '' @@ -182,14 +180,10 @@ class fetcher(Fetcher): supports_screenshots = True supports_xpath_element_data = True - @classmethod - def get_status_icon_data(cls): - """Return Chrome browser icon data for Puppeteer fetcher.""" - return { - 'filename': 'google-chrome-icon.png', - 'alt': 'Using a Chrome 
browser', - 'title': 'Using a Chrome browser' - } + status_icon = {'filename': 'google-chrome-icon.png', 'alt': 'Using a Chrome browser', 'title': 'Using a Chrome browser'} + + def disk_cleanup_after_fetch(self): + self.delete_browser_steps_screenshots() def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): super().__init__(**kwargs) @@ -198,9 +192,10 @@ def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kw self.browser_connection_is_custom = True self.browser_connection_url = custom_browser_connection_url else: - # Fallback to fetching from system - # .strip('"') is going to save someone a lot of time when they accidently wrap the env value - self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"') + from loguru import logger + logger.critical("Puppeteer fetcher has no browser_connection_url — browser profile was not configured. " + "Set PLAYWRIGHT_DRIVER_URL or configure a browser profile in Settings.") + self.browser_connection_url = None # allow per-watch proxy selection override # @todo check global too? 
@@ -270,7 +265,7 @@ async def fetch_page(self, import re self.delete_browser_steps_screenshots() - n = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 12)) + self.render_extract_delay + n = self.extra_delay + self.render_extract_delay extra_wait = min(n, 15) logger.debug(f"Extra wait set to {extra_wait}s, requested was {n}s.") @@ -447,8 +442,12 @@ async def setup_frame_handlers_on_first_response(event): if self.status_code != 200 and not ignore_status_codes: screenshot = await capture_full_page(page=self.page, screenshot_format=self.screenshot_format, watch_uuid=watch_uuid, lock_viewport_elements=self.lock_viewport_elements) - - raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) + try: + page_html = await self.page.content + except Exception as e: + logger.warning(f"Got non-200 status {self.status_code} but failed to fetch page content: {e}") + page_html = None + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot, page_html=page_html) content = await self.page.content @@ -548,9 +547,10 @@ async def run(self, class PuppeteerFetcherPlugin: """Plugin class that registers the Puppeteer fetcher as a built-in plugin.""" + @hookimpl def register_content_fetcher(self): """Register the Puppeteer fetcher""" - return ('html_webdriver', fetcher) + return ('puppeteer', fetcher) # Create module-level instance for plugin registration diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index 7f4721c3314..77706e05e54 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -8,6 +8,7 @@ from changedetectionio import strtobool from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived from changedetectionio.content_fetchers.base import Fetcher +from changedetectionio.pluggy_interface import hookimpl from 
changedetectionio.validate_url import is_private_hostname @@ -258,9 +259,10 @@ async def quit(self, watch=None): class RequestsFetcherPlugin: """Plugin class that registers the requests fetcher as a built-in plugin.""" + @hookimpl def register_content_fetcher(self): """Register the requests fetcher""" - return ('html_requests', fetcher) + return ('requests', fetcher) # Create module-level instance for plugin registration diff --git a/changedetectionio/content_fetchers/webdriver_selenium.py b/changedetectionio/content_fetchers/webdriver_selenium.py index 0ce08b234dc..354ea868142 100644 --- a/changedetectionio/content_fetchers/webdriver_selenium.py +++ b/changedetectionio/content_fetchers/webdriver_selenium.py @@ -3,13 +3,13 @@ from loguru import logger from changedetectionio.content_fetchers.base import Fetcher +from changedetectionio.content_fetchers.exceptions import Non200ErrorCodeReceived +from changedetectionio.pluggy_interface import hookimpl class fetcher(Fetcher): - if os.getenv("WEBDRIVER_URL"): - fetcher_description = f"WebDriver Chrome/Javascript via \"{os.getenv('WEBDRIVER_URL', '')}\"" - else: - fetcher_description = "WebDriver Chrome/Javascript" + fetcher_description = "Selenium WebDriver Chrome" + requires_connection_url = True proxy = None proxy_url = None @@ -19,26 +19,21 @@ class fetcher(Fetcher): supports_screenshots = True supports_xpath_element_data = True - @classmethod - def get_status_icon_data(cls): - """Return Chrome browser icon data for WebDriver fetcher.""" - return { - 'filename': 'google-chrome-icon.png', - 'alt': 'Using a Chrome browser', - 'title': 'Using a Chrome browser' - } + status_icon = {'filename': 'google-chrome-icon.png', 'alt': 'Using a Chrome browser', 'title': 'Using a Chrome browser'} def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs): super().__init__(**kwargs) from urllib.parse import urlparse from selenium.webdriver.common.proxy import Proxy - # .strip('"') is going to save someone a 
lot of time when they accidently wrap the env value - if not custom_browser_connection_url: - self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') - else: + if custom_browser_connection_url: self.browser_connection_is_custom = True self.browser_connection_url = custom_browser_connection_url + else: + from loguru import logger + logger.critical("Selenium WebDriver fetcher has no browser_connection_url — browser profile was not configured. " + "Set WEBDRIVER_URL or configure a browser profile in Settings.") + self.browser_connection_url = None ##### PROXY SETUP ##### @@ -130,22 +125,28 @@ def _run_sync(): if not "--window-size" in os.getenv("CHROME_OPTIONS", ""): driver.set_window_size(1280, 1024) - driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) + driver.implicitly_wait(self.extra_delay) if self.webdriver_js_execute_code is not None: driver.execute_script(self.webdriver_js_execute_code) # Selenium doesn't automatically wait for actions as good as Playwright, so wait again - driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) - - # @todo - how to check this? is it possible? - self.status_code = 200 - # @todo somehow we should try to get this working for WebDriver - # raise EmptyReply(url=url, status_code=r.status_code) + driver.implicitly_wait(self.extra_delay) # @todo - dom wait loaded? 
import time - time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) + time.sleep(self.extra_delay + self.render_extract_delay) self.content = driver.page_source + + # Use Navigation Timing API to get the real HTTP status code (Chrome 102+) + # Read after the sleep so the page is fully settled + try: + nav_status = driver.execute_script( + "return window.performance.getEntriesByType('navigation')[0]?.responseStatus" + ) + # Guard against 0 (file://, blocked requests) which should not raise Non200 + self.status_code = int(nav_status) if nav_status and int(nav_status) > 0 else 200 + except Exception: + self.status_code = 200 self.headers = {} # Selenium always captures as PNG, convert to JPEG if needed @@ -175,6 +176,10 @@ def _run_sync(): img.close() else: self.screenshot = screenshot_png + + if self.status_code != 200 and not ignore_status_codes: + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=self.screenshot, page_html=self.content) + except Exception as e: driver.quit() raise e @@ -190,9 +195,10 @@ def _run_sync(): class WebDriverSeleniumFetcherPlugin: """Plugin class that registers the WebDriver Selenium fetcher as a built-in plugin.""" + @hookimpl def register_content_fetcher(self): """Register the WebDriver Selenium fetcher""" - return ('html_webdriver', fetcher) + return ('selenium', fetcher) # Create module-level instance for plugin registration diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index 2809ed01c36..4864097378c 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -341,52 +341,36 @@ def _jinja2_filter_format_duration(seconds): @app.template_filter('fetcher_status_icons') def _jinja2_filter_fetcher_status_icons(fetcher_name): - """Get status icon HTML for a given fetcher. + """Return status icon HTML for a fetcher, or empty string if none. 
- This filter checks both built-in fetchers and plugin fetchers for status icons. - - Args: - fetcher_name: The fetcher name (e.g., 'html_webdriver', 'html_js_zyte') - - Returns: - str: HTML string containing status icon elements + Built-in fetchers declare their icon via the ``status_icon`` class attribute + on their ``Fetcher`` subclass. Plugin fetchers may still use the pluggy + ``collect_fetcher_status_icons`` hook as a fallback. """ from changedetectionio import content_fetchers - from changedetectionio.pluggy_interface import collect_fetcher_status_icons from markupsafe import Markup from flask import url_for icon_data = None - # First check if it's a plugin fetcher (plugins have priority) - plugin_icon_data = collect_fetcher_status_icons(fetcher_name) - if plugin_icon_data: - icon_data = plugin_icon_data - # Check if it's a built-in fetcher - elif hasattr(content_fetchers, fetcher_name): - fetcher_class = getattr(content_fetchers, fetcher_name) - if hasattr(fetcher_class, 'get_status_icon_data'): + fetcher_class = content_fetchers.get_fetcher(fetcher_name) + if fetcher_class is not None: + icon_data = getattr(fetcher_class, 'status_icon', None) + if not icon_data and callable(getattr(fetcher_class, 'get_status_icon_data', None)): icon_data = fetcher_class.get_status_icon_data() - # Build HTML from icon data - if icon_data and isinstance(icon_data, dict): - # Use 'group' from icon_data if specified, otherwise default to 'images' - group = icon_data.get('group', 'images') - - # Try to use url_for, but fall back to manual URL building if endpoint not registered yet - try: - icon_url = url_for('static_content', group=group, filename=icon_data['filename']) - except: - # Fallback: build URL manually respecting APPLICATION_ROOT - from flask import request - app_root = request.script_root if hasattr(request, 'script_root') else '' - icon_url = f"{app_root}/static/{group}/{icon_data['filename']}" + # Fallback: pluggy hook for plugins that implement 
fetcher_status_icon + if not icon_data: + from changedetectionio.pluggy_interface import collect_fetcher_status_icons + icon_data = collect_fetcher_status_icons(fetcher_name) - style_attr = f' style="{icon_data["style"]}"' if icon_data.get('style') else '' - html = f'{icon_data[' - return Markup(html) + if not icon_data: + return '' - return '' + group = icon_data.get('group', 'images') + icon_url = url_for('static_content', group=group, filename=icon_data['filename']) + style_attr = f' style="{icon_data["style"]}"' if icon_data.get('style') else '' + return Markup(f'{icon_data[') _RE_SANITIZE_TAG = re.compile(r'[^a-zA-Z0-9]') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index e9a72c37e8a..68b1d553420 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -742,7 +742,6 @@ def __init__(self, formdata=None, obj=None, prefix="", data=None, meta=None, **k self.notification_title.extra_notification_tokens = kwargs.get('extra_notification_tokens', {}) self.notification_urls.extra_notification_tokens = kwargs.get('extra_notification_tokens', {}) - fetch_backend = RadioField(_l('Fetch Method'), choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) notification_body = TextAreaField(_l('Notification Body'), default='{{ watch_url }} had a change.', validators=[validators.Optional(), ValidateJinja2Template()]) notification_format = SelectField(_l('Notification format'), choices=list(valid_notification_formats.items())) notification_title = StringField(_l('Notification Title'), default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()]) @@ -779,6 +778,7 @@ class SingleBrowserStep(Form): class processor_text_json_diff_form(commonSettingsForm): + browser_profile = RadioField(_l('Browser / Fetch method'), choices=[]) # populated at runtime in edit.py url = fields.URLField('Web Page URL', validators=[validateURL()]) tags = 
StringTagUUID('Group Tag', [validators.Optional()], default='') @@ -940,10 +940,66 @@ class SingleExtraBrowser(Form): ValidateSimpleURL() ], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50}) -class DefaultUAInputForm(Form): - html_requests = StringField(_l('Plaintext requests'), validators=[validators.Optional()], render_kw={"placeholder": ""}) - if os.getenv("PLAYWRIGHT_DRIVER_URL") or os.getenv("WEBDRIVER_URL"): - html_webdriver = StringField(_l('Chrome requests'), validators=[validators.Optional()], render_kw={"placeholder": ""}) + +class BrowserProfileForm(Form): + """Create or edit a named BrowserProfile stored in settings.application.browser_profiles.""" + + name = StringField( + _l('Profile name'), + [validators.DataRequired(), validators.Length(max=100)], + render_kw={"placeholder": _l("e.g. Mobile Chrome, Bright Data CDP"), "maxlength": "100"} + ) + fetch_backend = SelectField( + _l('Fetch method'), + choices=[], # populated at runtime from available_fetchers() + ) + browser_connection_url = StringField( + _l('Browser connection URL'), + [ + validators.Optional(), + ValidateStartsWithRegex( + regex=r'^(wss?|ws|http|https)://', + flags=re.IGNORECASE, + message=_l('Browser connection URL must start with ws://, wss://, http://, https://') + ), + ValidateSimpleURL(), + ], + render_kw={"placeholder": "ws://my-chrome:3000", "size": 50} + ) + viewport_width = IntegerField( + _l('Viewport width (px)'), + [validators.Optional(), validators.NumberRange(min=100, max=7680)], + default=1280, + render_kw={"style": "width:5em;"} + ) + viewport_height = IntegerField( + _l('Viewport height (px)'), + [validators.Optional(), validators.NumberRange(min=100, max=4320)], + default=1000, + render_kw={"style": "width:5em;"} + ) + block_images = BooleanField(_l('Block images (faster loads)'), default=False) + block_fonts = BooleanField(_l('Block web fonts'), default=False) + ignore_https_errors = BooleanField(_l('Ignore HTTPS/TLS errors'), 
default=False) + user_agent = StringField( + _l('User-Agent override'), + [validators.Optional(), validators.Length(max=500)], + render_kw={"placeholder": _l("Leave blank to use fetcher default"), "size": 60} + ) + locale = StringField( + _l('Locale'), + [validators.Optional(), validators.Length(max=20)], + render_kw={"placeholder": "en-US, de-DE, fr-FR …", "size": 15} + ) + custom_headers = TextAreaField( + _l('Custom headers'), + [validators.Optional()], + render_kw={ + "placeholder": "Header-Name: value\nAnother-Header: value", + "rows": 4, "cols": 60, + "style": "font-family:monospace;" + } + ) # datastore.data['settings']['requests'].. class globalSettingsRequestForm(Form): @@ -967,8 +1023,6 @@ class globalSettingsRequestForm(Form): extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5) extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5) - default_ua = FormField(DefaultUAInputForm, label=_l("Default User-Agent overrides")) - def validate_extra_proxies(self, extra_validators=None): for e in self.data['extra_proxies']: if e.get('proxy_name') or e.get('proxy_url'): @@ -991,7 +1045,6 @@ class globalSettingsApplicationForm(commonSettingsForm): render_kw={"placeholder": os.getenv('BASE_URL', 'Not set')} ) empty_pages_are_a_change = BooleanField(_l('Treat empty pages as a change?'), default=False) - fetch_backend = RadioField(_l('Fetch Method'), default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) global_ignore_text = StringListField(_l('Ignore Text'), [ValidateListRegex()]) global_subtractive_selectors = StringListField(_l('Remove elements'), [ValidateCSSJSONXPATHInput(allow_json=False)]) ignore_whitespace = BooleanField(_l('Ignore whitespace')) diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index 588b5e82358..ecaeb8c1a4c 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -12,7 +12,6 @@ # Equal to or 
greater than this number of FilterNotFoundInResponse exceptions will trigger a filter-not-found notification _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6 -DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' @@ -31,10 +30,6 @@ class model(dict): 'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None}, 'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds 'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "5")), # Number of threads, lower is better for slow connections - 'default_ua': { - 'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT), - 'html_webdriver': None, - } }, 'application': { # Custom notification content @@ -43,7 +38,9 @@ class model(dict): 'api_access_token_enabled': True, 'base_url' : None, 'empty_pages_are_a_change': False, - 'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"), + 'browser_profile': None, # machine-name of the system-default BrowserProfile + 'browser_profiles': {}, # user-defined profiles keyed by machine name + 'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "requests"), 'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT, 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_subtractive_selectors': [], diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 4b09415d70a..e0cdecf3e9a 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -354,57 +354,38 @@ def is_source_type_url(self): return self.get('url', '').startswith('source:') @property - def get_fetch_backend(self): - """ - Get the fetch backend for this watch with special case handling. 
- - CHAIN RESOLUTION OPPORTUNITY: - Currently returns watch.fetch_backend directly, but doesn't implement - Watch → Tag → Global resolution chain. With Pydantic: - - @computed_field - def resolved_fetch_backend(self) -> str: - # Special case: PDFs always use html_requests - if self.is_pdf: - return 'html_requests' + def effective_browser_profile(self): + """Resolve the effective BrowserProfile for this watch. - # Watch override - if self.fetch_backend and self.fetch_backend != 'system': - return self.fetch_backend + Walks the chain: watch → tag (overrides_watch=True) → global settings → built-in fallback. + Never raises. Returns a BrowserProfile instance. + """ + from changedetectionio.model.browser_profile import resolve_browser_profile, BUILTIN_REQUESTS + if not self._datastore: + return BUILTIN_REQUESTS + try: + return resolve_browser_profile(self, self._datastore) + except Exception: + return BUILTIN_REQUESTS - # Tag override (first tag with overrides_watch=True wins) - for tag_uuid in self.tags: - tag = self._datastore.get_tag(tag_uuid) - if tag.overrides_watch and tag.fetch_backend: - return tag.fetch_backend + @property + def get_fetch_backend(self): + """Legacy property — prefer effective_browser_profile.fetch_backend for new code. - # Global default - return self._datastore.settings.fetch_backend + Returns the raw fetch_backend stored on this watch (or 'requests' for PDFs). + Does NOT walk the tag/global resolution chain. """ - # Maybe also if is_image etc? - # This is because chrome/playwright wont render the PDF in the browser and we will just fetch it and use pdf2html to see the text. if self.is_pdf: - return 'html_requests' - + return 'requests' return self.get('fetch_backend') @property def fetcher_supports_screenshots(self): - """Return True if the fetcher configured for this watch supports screenshots. - - Resolves 'system' via self._datastore, then checks supports_screenshots on - the actual fetcher class. 
Works for built-in and plugin fetchers alike. - """ + """Return True if the resolved fetcher for this watch supports screenshots.""" from changedetectionio import content_fetchers - - fetcher_name = self.get_fetch_backend # already handles is_pdf → html_requests - if not fetcher_name or fetcher_name == 'system': - fetcher_name = self._datastore['settings']['application'].get('fetch_backend', 'html_requests') - - fetcher_class = getattr(content_fetchers, fetcher_name, None) + fetcher_class = content_fetchers.get_fetcher(self.effective_browser_profile.fetch_backend) if fetcher_class is None: return False - return bool(getattr(fetcher_class, 'supports_screenshots', False)) @property diff --git a/changedetectionio/model/__init__.py b/changedetectionio/model/__init__.py index a16a3552349..3c084d7c140 100644 --- a/changedetectionio/model/__init__.py +++ b/changedetectionio/model/__init__.py @@ -187,6 +187,7 @@ def __init__(self, *arg, **kw): 'content-type': None, 'date_created': None, 'extract_text': [], # Extract text by regex after filters + 'browser_profile': 'system', # machine-name key of a BrowserProfile; 'system' → resolve via chain 'fetch_backend': 'system', # plaintext, playwright etc 'fetch_time': 0.0, 'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')), @@ -589,7 +590,9 @@ def get_global_setting(self, *path): return None try: - value = self._datastore['settings'] + # _datastore is a ChangeDetectionStore (has .data) or a plain dict (unit tests) + store_data = self._datastore.data if hasattr(self._datastore, 'data') else self._datastore + value = store_data['settings'] for key in path: value = value[key] return value diff --git a/changedetectionio/model/browser_profile.py b/changedetectionio/model/browser_profile.py new file mode 100644 index 00000000000..66dcb021f6b --- /dev/null +++ b/changedetectionio/model/browser_profile.py @@ -0,0 +1,380 @@ +""" +BrowserProfile — named, reusable browser/fetcher 
configuration. + +Storage key +----------- +Profiles are stored in ``settings.application.browser_profiles`` as a plain dict +keyed by *machine name* — a lowercase, underscore-separated slug derived from the +human-readable ``name`` field: + + 'My Blocking Chrome' → 'my_blocking_chrome' + 'Custom CDP — Mobile (375px)' → 'custom_cdp_mobile_375px' + +Using the machine name as the key means that deleting a profile and recreating +it with the same name restores the original key, so all watches that referenced +it continue to work without any manual re-linking. + +Resolution chain +---------------- +``resolve_browser_profile(watch, datastore)`` walks: + + watch.browser_profile → first tag with overrides_watch=True → + settings.application.browser_profile → built-in fallback + +It never raises. Stale / missing machine-name references are logged and the +resolver falls through to the next level. + +Built-in profiles +----------------- +``BUILTIN_REQUESTS`` and ``BUILTIN_BROWSER`` are always available and cannot be +deleted from the UI (``is_builtin=True``). Their machine names are stored in +``RESERVED_MACHINE_NAMES`` to block user profiles from shadowing them. + +Migration +--------- +``store/updates.py::update_31`` converts the legacy ``fetch_backend`` field on +watches, tags and global settings into ``browser_profile`` machine-name +references. After that migration no legacy paths are needed here. +""" + +from __future__ import annotations + +import os +import re +from typing import Optional + +from loguru import logger +from pydantic import BaseModel, field_validator + +# Default User-Agent for the built-in plaintext requests profile. +# Overridable via environment variable for deployments that need a custom UA. 
+_DEFAULT_REQUESTS_UA = os.getenv( + "DEFAULT_SETTINGS_HEADERS_USERAGENT", + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' +) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +NAME_MAX_LEN = 100 + + +# --------------------------------------------------------------------------- +# Model +# --------------------------------------------------------------------------- + +class BrowserProfile(BaseModel): + """ + A named, reusable configuration for how a watch fetches its target URL. + + The *machine name* (see ``get_machine_name()``) is the stable storage key. + Updating ``name`` changes the machine name; any watch that referenced the + old machine name will then fall back through the resolution chain until it + is explicitly re-pointed. To replace a profile without breaking watches, + delete it and recreate it with the *same* name. + """ + + name: str + """Human-readable label shown in the UI. Max 100 characters.""" + + fetch_backend: str = 'requests' + """ + Which fetch engine to use. This is the *clean* fetcher name without the + ``html_`` module prefix (e.g. ``'requests'``, ``'webdriver'``, + ``'playwright'``, ``'puppeteer'``, ``'cloakbrowser'``). + + The module-level ``html_`` prefix (``html_requests``, ``html_webdriver``, + …) is an implementation detail of ``content_fetchers/``. Use + ``get_fetcher_class_name()`` to obtain the full module attribute name when + you need to look up the class. + + Must be non-empty and contain only ``[a-z0-9_]`` characters. 
+ """ + + is_builtin: bool = False + """Built-in profiles are always present and cannot be deleted from the UI.""" + + # ------------------------------------------------------------------ + # Browser-specific settings (silently ignored by html_requests) + # ------------------------------------------------------------------ + + browser_connection_url: Optional[str] = None + """ + Custom CDP / WebSocket endpoint, e.g. ``ws://my-chrome:3000``. + Overrides the system-wide ``PLAYWRIGHT_DRIVER_URL`` for this profile. + Only meaningful for ``html_webdriver`` profiles. + """ + + viewport_width: int = 1280 + """ + Browser viewport width in pixels. + Common presets: 375 (iPhone), 768 (tablet), 1280 (desktop). + """ + + viewport_height: int = 1000 + """ + Browser viewport height in pixels. + Common presets: 812 (iPhone), 1024 (tablet), 1000 (desktop). + """ + + block_images: bool = False + """ + Block all image requests. Typically cuts page-load time by 40-70 % on + image-heavy sites with no impact on text-based change detection. + """ + + block_fonts: bool = False + """Block web-font requests. Modest speed gain; rarely affects detection.""" + + user_agent: Optional[str] = None + """ + Override the browser User-Agent string. + ``None`` keeps the fetcher's built-in default, which already strips + obvious headless markers such as ``HeadlessChrome``. + """ + + ignore_https_errors: bool = False + """ + Proceed even when the server's TLS certificate is invalid or self-signed. + Useful for staging / development environments. + """ + + locale: Optional[str] = None + """ + Browser locale (e.g. ``en-US``, ``de-DE``). + Sets the ``Accept-Language`` header and ``navigator.language``. + Some sites serve different prices or copy based on locale. + """ + + custom_headers: str = '' + """ + Extra HTTP headers sent with every request using this profile, in ``Key: Value`` format + (one per line, ``#`` lines are ignored). 
Applied before per-watch headers so + individual watches can override them. + """ + + service_workers: str = 'allow' + """ + Whether to allow Service Workers in the browser context. + Playwright accepts ``'allow'`` or ``'block'``. + Block to avoid large Service Worker data transfers (e.g. YouTube). + """ + + extra_delay: int = 0 + """ + Extra seconds to wait after page load before extracting content + (on top of the per-watch ``render_extract_delay``). + Sourced from ``WEBDRIVER_DELAY_BEFORE_CONTENT_READY`` at startup. + """ + + model_config = {"frozen": False} + + # ------------------------------------------------------------------ + # Validators + # ------------------------------------------------------------------ + + @field_validator('fetch_backend') + @classmethod + def _validate_fetch_backend(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError('fetch_backend cannot be empty') + if not re.fullmatch(r'[a-z0-9_]+', v): + raise ValueError( + f"fetch_backend must contain only lowercase letters, digits and underscores, got {v!r}" + ) + if v.startswith('html_'): + raise ValueError( + f"fetch_backend should be the clean fetcher name without the 'html_' prefix " + f"(e.g. 'requests', 'webdriver', 'playwright'). Got {v!r}. " + f"Use get_fetcher_class_name() to obtain the full module attribute name." + ) + return v + + @field_validator('name') + @classmethod + def _validate_name(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError('Name cannot be empty') + if len(v) > NAME_MAX_LEN: + raise ValueError(f'Name must be {NAME_MAX_LEN} characters or less') + return v + + # ------------------------------------------------------------------ + # Machine-name helpers + # ------------------------------------------------------------------ + + @staticmethod + def machine_name_from_str(name: str) -> str: + """ + Convert a human name to a machine-safe storage key. + + Transformation rules (applied in order): + + 1. 
Strip surrounding whitespace; lower-case. + 2. Replace runs of whitespace or hyphens with a single ``_``. + 3. Drop every character that is not ``[a-z0-9_]``. + 4. Collapse consecutive underscores. + 5. Strip leading / trailing underscores. + 6. Truncate to ``NAME_MAX_LEN`` characters. + + Examples:: + + 'My Blocking Browser Chrome' → 'my_blocking_browser_chrome' + 'Custom CDP — Mobile (375px)' → 'custom_cdp_mobile_375px' + ' Weird --- Name ' → 'weird_name' + """ + s = name.strip().lower() + s = re.sub(r'[\s\-]+', '_', s) # whitespace / hyphens → underscore + s = re.sub(r'[^a-z0-9_]', '', s) # drop everything else + s = re.sub(r'_+', '_', s) # collapse repeated underscores + s = s.strip('_') # drop leading / trailing underscores + return s[:NAME_MAX_LEN] + + def get_machine_name(self) -> str: + """Return the machine-safe storage key derived from this profile's ``name``.""" + return self.machine_name_from_str(self.name) + + def get_fetcher_class_name(self) -> str: + """Return the clean fetcher name for this profile (same as ``fetch_backend``). 
+ + Use with ``content_fetchers.get_fetcher()``:: + + from changedetectionio import content_fetchers + fetcher_cls = content_fetchers.get_fetcher(profile.get_fetcher_class_name()) + """ + return self.fetch_backend + + +# --------------------------------------------------------------------------- +# Built-in profiles (always present, cannot be deleted) +# --------------------------------------------------------------------------- + +BUILTIN_REQUESTS = BrowserProfile( + name='Direct HTTP (requests)', + fetch_backend='requests', + is_builtin=True, + user_agent=_DEFAULT_REQUESTS_UA, +) + +BUILTIN_PLAYWRIGHT = BrowserProfile( + name='Browser (Chrome/Playwright)', + fetch_backend='playwright_cdp', + is_builtin=True, +) + +BUILTIN_SELENIUM = BrowserProfile( + name='Browser (Chrome/Selenium)', + fetch_backend='selenium', + is_builtin=True, +) + +BUILTIN_PUPPETEER = BrowserProfile( + name='Browser (Chrome/Puppeteer)', + fetch_backend='puppeteer', + is_builtin=True, +) + +# Backwards-compatible alias — code that imported BUILTIN_BROWSER keeps working. +BUILTIN_BROWSER = BUILTIN_PLAYWRIGHT + +# Keyed by machine name for O(1) lookup. +_BUILTINS: dict[str, BrowserProfile] = { + b.get_machine_name(): b + for b in (BUILTIN_REQUESTS, BUILTIN_PLAYWRIGHT, BUILTIN_SELENIUM, BUILTIN_PUPPETEER) +} + +# Machine names that cannot be used by user-created profiles. +RESERVED_MACHINE_NAMES: frozenset[str] = frozenset(_BUILTINS.keys()) + + +def get_default_browser_builtin() -> BrowserProfile: + """Final fallback when no profile can be resolved through the chain. + + ``preconfigure_browser_profiles_based_on_env()`` sets + ``settings.application.browser_profile`` explicitly at startup, so this + fallback is only reached for watches with stale / missing machine-name + references. Safe default is always direct HTTP requests. 
+ """ + return BUILTIN_REQUESTS + + +# --------------------------------------------------------------------------- +# Lookup helpers +# --------------------------------------------------------------------------- + +def get_builtin_profiles() -> dict[str, BrowserProfile]: + """Return a shallow copy of the built-in profiles dict (keyed by machine name).""" + return dict(_BUILTINS) + + +def get_profile(machine_name: str, store_profiles: dict) -> Optional[BrowserProfile]: + """ + Look up a ``BrowserProfile`` by machine name. + + Stored profiles are checked first so that env-configured built-ins (written + by ``preconfigure_browser_profiles_based_on_env``) take priority over the + bare module-level defaults. Falls back to ``_BUILTINS`` when no stored + version exists. + + Returns ``None`` when the machine name is unknown or the stored data is + corrupt (a warning is logged in the latter case). + """ + raw = store_profiles.get(machine_name) + if raw is not None: + if isinstance(raw, BrowserProfile): + return raw + try: + return BrowserProfile(**raw) + except Exception as exc: + logger.warning(f"BrowserProfile '{machine_name}': failed to deserialize — {exc}") + # Fall through to built-in + + if machine_name in _BUILTINS: + return _BUILTINS[machine_name] + + return None + + +# --------------------------------------------------------------------------- +# Resolution +# --------------------------------------------------------------------------- + +def resolve_browser_profile(watch, datastore) -> BrowserProfile: + """ + Resolve the effective ``BrowserProfile`` for *watch*. + + Resolution chain + ~~~~~~~~~~~~~~~~ + 1. ``watch['browser_profile']`` — explicit machine name set on the watch. + 2. First tag with ``overrides_watch=True`` that has ``browser_profile`` set. + 3. ``settings.application['browser_profile']`` — system-wide default. + 4. Built-in fallback: ``BUILTIN_REQUESTS`` (requests is always the safe default). + + Never raises. 
def resolve_setting(watch, datastore, field_name, *,
                    sentinel_values=None,
                    default=None,
                    require_tag_override=True):
    """
    Resolve a single setting value by walking the Watch → Tag → Global chain.

    Priority order:
      1. The watch's own value for *field_name*.
      2. The first tag attached to the watch that carries the field
         (restricted to tags with ``overrides_watch=True`` unless
         *require_tag_override* is False).
      3. The global ``settings.application`` value.
      4. *default*.

    Args:
        watch: Watch dict / model object.
        datastore: App datastore (must provide ``get_all_tags_for_watch()`` and
                   ``data['settings']['application']``).
        field_name: The setting key looked up at each level.
        sentinel_values: Values meaning "not configured here, keep looking" —
                         e.g. ``{'system'}`` for fetch_backend.
        default: Returned when no level yields a configured value.
        require_tag_override: When True (default), only tags with
                              ``overrides_watch=True`` participate in the cascade.

    Returns:
        The first configured (non-None, non-empty, non-sentinel) value, or *default*.
    """
    skip = set(sentinel_values or ())

    def _configured(value):
        # A value counts only when it is present, non-empty and not a sentinel.
        return value is not None and value != '' and value not in skip

    def _candidates():
        # Priority 1: the watch itself.
        yield watch.get(field_name)

        # Priority 2: tags attached to the watch, in datastore order.
        watch_tags = datastore.get_all_tags_for_watch(uuid=watch.get('uuid')) or {}
        for tag in watch_tags.values():
            if require_tag_override and not tag.get('overrides_watch'):
                continue
            yield tag.get(field_name)

        # Priority 3: global application settings.
        yield datastore.data['settings']['application'].get(field_name)

    for candidate in _candidates():
        if _configured(candidate):
            return candidate
    return default
import json
import os
from datetime import datetime, timezone

MAX_ENTRIES = 100
_LOG_DIR = 'notification-logs'


def _log_file(datastore_path: str, profile_uuid: str) -> str:
    """Path of the per-profile JSON-lines log file."""
    return os.path.join(datastore_path, _LOG_DIR, f'{profile_uuid}.log')


def write_profile_log(datastore_path: str, profile_uuid: str, *,
                      watch_url: str = '',
                      watch_uuid: str = '',
                      status: str,  # 'ok' | 'error' | 'test'
                      message: str = ''):
    """Append one log entry; keep only the newest MAX_ENTRIES lines.

    The log lives at ``{datastore_path}/notification-logs/{profile_uuid}.log``,
    one JSON object per line. ``watch_url`` and ``message`` are truncated so a
    single oversized value cannot bloat the file.
    """
    os.makedirs(os.path.join(datastore_path, _LOG_DIR), exist_ok=True)

    record = json.dumps({
        'ts': datetime.now(tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
        'watch_url': watch_url[:200],
        'watch_uuid': watch_uuid,
        'status': status,
        'message': message[:500],
    }, ensure_ascii=False)

    path = _log_file(datastore_path, profile_uuid)
    existing = []
    try:
        with open(path, 'r', encoding='utf-8') as fh:
            existing = [ln for ln in fh.read().splitlines() if ln.strip()]
    except FileNotFoundError:
        pass  # first entry for this profile

    existing.append(record)
    # Rewrite the whole file, pruned to the newest MAX_ENTRIES entries.
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write('\n'.join(existing[-MAX_ENTRIES:]) + '\n')


def read_profile_log(datastore_path: str, profile_uuid: str) -> list:
    """Return log entries as a list of dicts, newest first.

    Corrupt (non-JSON) lines are skipped rather than failing the whole read;
    a missing log file yields an empty list.
    """
    try:
        with open(_log_file(datastore_path, profile_uuid), 'r', encoding='utf-8') as fh:
            raw_lines = [ln.strip() for ln in fh if ln.strip()]
    except FileNotFoundError:
        return []

    parsed = []
    for raw in reversed(raw_lines):  # newest entries are at the end of the file
        try:
            parsed.append(json.loads(raw))
        except (json.JSONDecodeError, ValueError):
            continue
    return parsed


def has_log(datastore_path: str, profile_uuid: str) -> bool:
    """True when a log file exists for this profile."""
    return os.path.exists(_log_file(datastore_path, profile_uuid))
b/changedetectionio/notification_profiles/registry.py @@ -0,0 +1,111 @@ +""" +Notification Profile Type plugin registry. + +NotificationProfileType is the abstract base — the only contract is send(). +Plugins are free to use any delivery mechanism (Apprise, direct HTTP, SDK, etc.). + +Built-in: AppriseProfileType (raw Apprise URL list). + +Third-party plugins register additional types: + + from changedetectionio.notification_profiles.registry import registry, NotificationProfileType + + @registry.register + class MyProfileType(NotificationProfileType): + type_id = "mytype" + display_name = "My Service" + icon = "bell" + template = "my_plugin/notification_profiles/types/mytype.html" + + def send(self, config: dict, n_object: dict, datastore) -> bool: + requests.post(config['webhook_url'], json={"text": n_object['notification_body']}) + return True +""" + +from abc import ABC, abstractmethod + + +class NotificationProfileType(ABC): + type_id: str = NotImplemented + display_name: str = NotImplemented + icon: str = "bell" # feather icon name + template: str = NotImplemented # Jinja2 partial rendered in the profile edit form + + @abstractmethod + def send(self, config: dict, n_object: dict, datastore) -> bool: + """ + Deliver the notification. + + Args: + config: The profile's config dict (type-specific fields). + n_object: Fully-rendered NotificationContextData (title, body, format, etc.). + datastore: App datastore for any extra lookups. + + Returns True on success, False on failure (do not raise — log instead). 
+ """ + + def validate(self, config: dict) -> None: + """Raise ValueError with a user-readable message on invalid config.""" + pass + + def get_url_hint(self, config: dict) -> str: + """Short display string shown in the selector chip tooltip / dropdown row.""" + return '' + + +class AppriseProfileType(NotificationProfileType): + """Delivers notifications via Apprise using a raw URL list.""" + + type_id = "apprise" + display_name = "Apprise" + icon = "bell" + template = "notification_profiles/types/apprise.html" + + def get_apprise_urls(self, config: dict) -> list: + return config.get('notification_urls') or [] + + def send(self, config: dict, n_object, datastore) -> bool: + from changedetectionio.notification.handler import process_notification + from changedetectionio.notification_service import NotificationContextData + urls = self.get_apprise_urls(config) + if not urls: + return False + if not isinstance(n_object, NotificationContextData): + n_object = NotificationContextData(n_object) + n_object['notification_urls'] = urls + n_object['notification_title'] = config.get('notification_title') or n_object.get('notification_title') + n_object['notification_body'] = config.get('notification_body') or n_object.get('notification_body') + n_object['notification_format'] = config.get('notification_format') or n_object.get('notification_format') + process_notification(n_object, datastore) + return True + + def get_url_hint(self, config: dict) -> str: + urls = config.get('notification_urls') or [] + if urls: + u = urls[0] + return (u[:60] + '…') if len(u) > 60 else u + return '' + + +class _Registry: + def __init__(self): + self._types: dict = {} + + def register(self, cls): + """Register a NotificationProfileType subclass. 
Usable as a decorator.""" + instance = cls() + self._types[instance.type_id] = instance + return cls + + def get(self, type_id: str) -> NotificationProfileType: + return self._types.get(type_id, self._types.get('apprise')) + + def all(self) -> list: + return list(self._types.values()) + + def choices(self) -> list: + return [(t.type_id, t.display_name) for t in self._types.values()] + + +registry = _Registry() +registry.register(AppriseProfileType) diff --git a/changedetectionio/notification_profiles/resolver.py b/changedetectionio/notification_profiles/resolver.py new file mode 100644 index 00000000000..9fea7e41d3a --- /dev/null +++ b/changedetectionio/notification_profiles/resolver.py @@ -0,0 +1,49 @@ +""" +Resolve the full set of NotificationProfile objects that should fire for a given watch. + +Merges profile UUIDs from: Watch → Tags → System (union, deduplicated). +Mute cascade is checked separately via resolve_setting() before calling this. +""" + +from loguru import logger + + +def resolve_notification_profiles(watch, datastore) -> list: + """ + Return list of (profile_dict, NotificationProfileType) tuples to fire for *watch*. + + Profiles are deduplicated by UUID — if the same UUID appears at multiple levels + it fires once, not multiple times. + """ + from changedetectionio.notification_profiles.registry import registry + + all_profiles = datastore.data['settings']['application'].get('notification_profile_data', {}) + + seen = set() + result = [] + + def _add(uuids): + for uid in (uuids or []): + if uid in seen: + continue + profile = all_profiles.get(uid) + if not profile: + logger.warning(f"Notification profile UUID {uid!r} not found, skipping") + continue + seen.add(uid) + type_handler = registry.get(profile.get('type', 'apprise')) + result.append((profile, type_handler)) + + # 1. Watch-level + _add(watch.get('notification_profiles', [])) + + # 2. 
Tag/group level + tags = datastore.get_all_tags_for_watch(uuid=watch.get('uuid')) + if tags: + for tag in tags.values(): + _add(tag.get('notification_profiles', [])) + + # 3. System level + _add(datastore.data['settings']['application'].get('notification_profiles', [])) + + return result diff --git a/changedetectionio/pluggy_interface.py b/changedetectionio/pluggy_interface.py index 07cd46727ea..7309da268a0 100644 --- a/changedetectionio/pluggy_interface.py +++ b/changedetectionio/pluggy_interface.py @@ -237,14 +237,23 @@ def register_builtin_fetchers(): This is called from content_fetchers/__init__.py after all fetchers are imported to avoid circular import issues. """ - from changedetectionio.content_fetchers import requests, playwright, puppeteer, webdriver_selenium + from changedetectionio.content_fetchers import requests, puppeteer, webdriver_selenium + from changedetectionio.content_fetchers.playwright import CDP, chrome, firefox, webkit - # Register each built-in fetcher plugin if hasattr(requests, 'requests_plugin'): plugin_manager.register(requests.requests_plugin, 'builtin_requests') - if hasattr(playwright, 'playwright_plugin'): - plugin_manager.register(playwright.playwright_plugin, 'builtin_playwright') + if hasattr(CDP, 'cdp_plugin'): + plugin_manager.register(CDP.cdp_plugin, 'builtin_playwright_cdp') + + if hasattr(chrome, 'chrome_plugin'): + plugin_manager.register(chrome.chrome_plugin, 'builtin_playwright_chrome') + + if hasattr(firefox, 'firefox_plugin'): + plugin_manager.register(firefox.firefox_plugin, 'builtin_playwright_firefox') + + if hasattr(webkit, 'webkit_plugin'): + plugin_manager.register(webkit.webkit_plugin, 'builtin_playwright_webkit') if hasattr(puppeteer, 'puppeteer_plugin'): plugin_manager.register(puppeteer.puppeteer_plugin, 'builtin_puppeteer') @@ -360,57 +369,28 @@ def get_active_plugins(): def get_fetcher_capabilities(watch, datastore): - """Get capability flags for a watch's fetcher. 
+ """Get capability flags for a watch's resolved fetcher. - Args: - watch: The watch object/dict - datastore: The datastore to resolve 'system' fetcher + Uses the BrowserProfile resolution chain (watch → tag → global → built-in) + to determine the actual fetcher class, then reads its capability flags. Returns: - dict: Dictionary with capability flags: - { - 'supports_browser_steps': bool, - 'supports_screenshots': bool, - 'supports_xpath_element_data': bool - } + dict: {'supports_browser_steps': bool, 'supports_screenshots': bool, + 'supports_xpath_element_data': bool} """ - # Get the fetcher name from watch - fetcher_name = watch.get('fetch_backend', 'system') - - # Resolve 'system' to actual fetcher - if fetcher_name == 'system': - fetcher_name = datastore.data['settings']['application'].get('fetch_backend', 'html_requests') - - # Get the fetcher class + from changedetectionio.model.browser_profile import resolve_browser_profile from changedetectionio import content_fetchers - # Try to get from built-in fetchers first - if hasattr(content_fetchers, fetcher_name): - fetcher_class = getattr(content_fetchers, fetcher_name) - return { - 'supports_browser_steps': getattr(fetcher_class, 'supports_browser_steps', False), - 'supports_screenshots': getattr(fetcher_class, 'supports_screenshots', False), - 'supports_xpath_element_data': getattr(fetcher_class, 'supports_xpath_element_data', False) - } - - # Try to get from plugin-provided fetchers - # Query all plugins for registered fetchers - plugin_fetchers = plugin_manager.hook.register_content_fetcher() - for fetcher_registration in plugin_fetchers: - if fetcher_registration: - name, fetcher_class = fetcher_registration - if name == fetcher_name: - return { - 'supports_browser_steps': getattr(fetcher_class, 'supports_browser_steps', False), - 'supports_screenshots': getattr(fetcher_class, 'supports_screenshots', False), - 'supports_xpath_element_data': getattr(fetcher_class, 'supports_xpath_element_data', False) - } + 
profile = resolve_browser_profile(watch, datastore) + fetcher_class = content_fetchers.get_fetcher(profile.fetch_backend) + + if fetcher_class is None: + return {'supports_browser_steps': False, 'supports_screenshots': False, 'supports_xpath_element_data': False} - # Default: no capabilities return { - 'supports_browser_steps': False, - 'supports_screenshots': False, - 'supports_xpath_element_data': False + 'supports_browser_steps': getattr(fetcher_class, 'supports_browser_steps', False), + 'supports_screenshots': getattr(fetcher_class, 'supports_screenshots', False), + 'supports_xpath_element_data': getattr(fetcher_class, 'supports_xpath_element_data', False), } diff --git a/changedetectionio/processors/base.py b/changedetectionio/processors/base.py index 743914c6b4a..024a9405752 100644 --- a/changedetectionio/processors/base.py +++ b/changedetectionio/processors/base.py @@ -23,6 +23,7 @@ class difference_detection_processor(): watch = None xpath_data = None preferred_proxy = None + preferred_proxy_override = None # Set externally to force a specific proxy (e.g. proxy checker) screenshot_format = SCREENSHOT_FORMAT_JPEG last_raw_content_checksum = None @@ -36,6 +37,8 @@ def __init__(self, datastore, watch_uuid): # 2. Preserves Watch object with properties (.link, .is_pdf, etc.) - can't use dict() # 3. Safe now: Watch.__deepcopy__() shares datastore ref (no memory leak) but copies dict data self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) + if self.watch is None: + raise KeyError(f"Watch UUID {watch_uuid} not found in datastore (deleted before processing?)") # Generic fetcher that should be extended (requests, playwright etc) self.fetcher = Fetcher() @@ -115,82 +118,65 @@ async def validate_iana_url(self): f"Set ALLOW_IANA_RESTRICTED_ADDRESSES=true to allow." 
) - async def call_browser(self, preferred_proxy_id=None): + async def call_browser(self): from requests.structures import CaseInsensitiveDict + from changedetectionio.model.browser_profile import resolve_browser_profile, BUILTIN_REQUESTS url = self.watch.link - # Protect against file:, file:/, file:// access, check the real "link" without any meta "source:" etc prepended. + # Protect against file:, file:/, file:// access if re.search(r'^file:', url.strip(), re.IGNORECASE): if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): - raise Exception( - "file:// type access is denied for security reasons." - ) + raise Exception("file:// type access is denied for security reasons.") await self.validate_iana_url() - # Requests, playwright, other browser via wss:// etc, fetch_extra_something - prefer_fetch_backend = self.watch.get('fetch_backend', 'system') - - # Proxy ID "key" - preferred_proxy_id = preferred_proxy_id if preferred_proxy_id else self.datastore.get_preferred_proxy_for_watch( - uuid=self.watch.get('uuid')) - - # Pluggable content self.fetcher - if not prefer_fetch_backend or prefer_fetch_backend == 'system': - prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend') - - # In the case that the preferred fetcher was a browser config with custom connection URL.. - # @todo - on save watch, if its extra_browser_ then it should be obvious it will use playwright (like if its requests now..) 
- custom_browser_connection_url = None - if prefer_fetch_backend.startswith('extra_browser_'): - (t, key) = prefer_fetch_backend.split('extra_browser_') - connection = list( - filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', []))) - if connection: - prefer_fetch_backend = 'html_webdriver' - custom_browser_connection_url = connection[0].get('browser_connection_url') - - # PDF should be html_requests because playwright will serve it up (so far) in a embedded page + # Resolve the full browser profile for this watch (watch → tag → global → built-in) + profile = resolve_browser_profile(self.watch, self.datastore) + + # PDFs always use the requests fetcher — browsers render them in an embedded viewer # @todo https://github.com/dgtlmoon/changedetection.io/issues/2019 - # @todo needs test to or a fix if self.watch.is_pdf: - prefer_fetch_backend = "html_requests" + profile = BUILTIN_REQUESTS - # Grab the right kind of 'fetcher', (playwright, requests, etc) - from changedetectionio import content_fetchers - if hasattr(content_fetchers, prefer_fetch_backend): - # @todo TEMPORARY HACK - SWITCH BACK TO PLAYWRIGHT FOR BROWSERSTEPS - if prefer_fetch_backend == 'html_webdriver' and self.watch.has_browser_steps: - # This is never supported in selenium anyway - logger.warning( - "Using playwright fetcher override for possible puppeteer request in browsersteps, because puppetteer:browser steps is incomplete.") - from changedetectionio.content_fetchers.playwright import fetcher as playwright_fetcher - fetcher_obj = playwright_fetcher - else: - fetcher_obj = getattr(content_fetchers, prefer_fetch_backend) - else: - # What it referenced doesnt exist, Just use a default - fetcher_obj = getattr(content_fetchers, "html_requests") - - proxy_url = None - if preferred_proxy_id: - # Custom browser endpoints should NOT have a proxy added - if not prefer_fetch_backend.startswith('extra_browser_'): - proxy_url = 
self.datastore.proxy_list.get(preferred_proxy_id).get('url') - logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}") - else: - logger.debug("Skipping adding proxy data when custom Browser endpoint is specified. ") + # Resolve proxy for the target URL fetch. + # Note: browser_connection_url is the WebSocket endpoint to reach the remote browser, + # which is separate from the proxy used by the browser to fetch target pages. + proxy_url = self.datastore.get_proxy_url_for_watch(self.watch.get('uuid'), override_id=self.preferred_proxy_override) + if proxy_url: + logger.debug(f"Proxy '{proxy_url}' for {url}") - logger.debug(f"Using proxy '{proxy_url}' for {self.watch['uuid']}") + logger.debug(f"BrowserProfile '{profile.get_machine_name()}' (fetcher={profile.fetch_backend}) for watch {self.watch['uuid']}") - # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. - # When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc) - self.fetcher = fetcher_obj(proxy_override=proxy_url, - custom_browser_connection_url=custom_browser_connection_url, - screenshot_format=self.screenshot_format - ) + # Select the fetcher class + from changedetectionio import content_fetchers + fetcher_class_name = profile.get_fetcher_class_name() + + fetcher_obj = content_fetchers.get_fetcher(fetcher_class_name) + if fetcher_obj is None: + logger.warning(f"Fetcher '{fetcher_class_name}' not found, falling back to requests") + fetcher_obj = content_fetchers.get_fetcher('requests') + elif self.watch.has_browser_steps and not getattr(fetcher_obj, 'supports_browser_steps', False): + # Browser steps require Playwright — override if the resolved fetcher doesn't support them + logger.warning(f"Fetcher '{fetcher_class_name}' does not support browser steps, overriding to Playwright") + fetcher_obj = content_fetchers.get_fetcher('playwright') + + self.fetcher = 
fetcher_obj( + proxy_override=proxy_url, + custom_browser_connection_url=profile.browser_connection_url, + screenshot_format=self.screenshot_format, + # BrowserProfile fields — browser fetchers use these; html_requests ignores them + viewport_width=profile.viewport_width, + viewport_height=profile.viewport_height, + block_images=profile.block_images, + block_fonts=profile.block_fonts, + profile_user_agent=profile.user_agent, + ignore_https_errors=profile.ignore_https_errors, + locale=profile.locale, + service_workers=profile.service_workers, + extra_delay=profile.extra_delay, + ) if self.watch.has_browser_steps: self.fetcher.browser_steps = browser_steps_get_valid_steps(self.watch.get('browser_steps', [])) @@ -200,9 +186,17 @@ async def call_browser(self, preferred_proxy_id=None): from changedetectionio.jinja2_custom import render as jinja_render request_headers = CaseInsensitiveDict() - ua = self.datastore.data['settings']['requests'].get('default_ua') - if ua and ua.get(prefer_fetch_backend): - request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)}) + # Browser profile: UA override (lowest priority — watch headers override this) + if profile.user_agent: + request_headers['User-Agent'] = profile.user_agent + + # Browser profile: custom headers (override profile UA, but watch headers override these) + if profile.custom_headers: + for line in profile.custom_headers.splitlines(): + line = line.strip() + if not line.startswith('#') and ':' in line: + k, v = line.split(':', 1) + request_headers[k.strip()] = v.strip() request_headers.update(self.watch.get('headers', {})) request_headers.update(self.datastore.get_all_base_headers()) @@ -259,6 +253,7 @@ async def call_browser(self, preferred_proxy_id=None): # @todo .quit here could go on close object, so we can run JS if change-detected await self.fetcher.quit(watch=self.watch) + self.fetcher.disk_cleanup_after_fetch() # Sanitize lone surrogates - these can appear when servers return malformed/mixed-encoding 
# content that gets decoded into surrogate characters (e.g. \udcad). Without this, diff --git a/changedetectionio/processors/extract.py b/changedetectionio/processors/extract.py index 955b50addcc..a62f7764002 100644 --- a/changedetectionio/processors/extract.py +++ b/changedetectionio/processors/extract.py @@ -42,7 +42,7 @@ def render_form(watch, datastore, request, url_for, render_template, flash, redi # Get error information for the template screenshot_url = watch.get_screenshot() - is_html_webdriver = watch.fetcher_supports_screenshots + fetcher_supports_screenshots = watch.fetcher_supports_screenshots password_enabled_and_share_is_off = False if datastore.data['settings']['application'].get('password') or os.getenv("SALTED_PASS", False): @@ -59,7 +59,7 @@ def render_form(watch, datastore, request, url_for, render_template, flash, redi last_error_screenshot=watch.get_error_snapshot(), last_error_text=watch.get_error_text(), screenshot=screenshot_url, - is_html_webdriver=is_html_webdriver, + fetcher_supports_screenshots=fetcher_supports_screenshots, password_enabled_and_share_is_off=password_enabled_and_share_is_off, extra_title=f" - {watch.label} - Extract Data", extra_stylesheets=[url_for('static_content', group='styles', filename='diff.css')], diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 5ee6a430148..3e37176abce 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -489,19 +489,9 @@ def run_changedetection(self, watch, force_reprocess=False): # @TODO !!! 
some setting like "Use as fallback" or "always use", "t if not (has_price and has_availability) or True: from changedetectionio.pluggy_interface import get_itemprop_availability_from_plugin - fetcher_name = watch.get('fetch_backend', 'html_requests') - - # Resolve 'system' to the actual fetcher being used - # This allows plugins to work even when watch uses "system settings default" - if fetcher_name == 'system': - # Get the actual fetcher that was used (from self.fetcher) - # Fetcher class name gives us the actual backend (e.g., 'html_requests', 'html_webdriver') - actual_fetcher = type(self.fetcher).__name__ - if 'html_requests' in actual_fetcher.lower(): - fetcher_name = 'html_requests' - elif 'webdriver' in actual_fetcher.lower() or 'playwright' in actual_fetcher.lower(): - fetcher_name = 'html_webdriver' - logger.debug(f"Resolved 'system' fetcher to actual fetcher: {fetcher_name}") + # Use the actual resolved fetcher name from the fetcher instance + fetcher_name = self.watch.effective_browser_profile.fetch_backend + logger.debug(f"Resolved effective fetcher: {fetcher_name}") # Try plugin override - plugins can decide if they support this fetcher if fetcher_name: diff --git a/changedetectionio/processors/text_json_diff/difference.py b/changedetectionio/processors/text_json_diff/difference.py index 3175cf6b96e..026fc8c5081 100644 --- a/changedetectionio/processors/text_json_diff/difference.py +++ b/changedetectionio/processors/text_json_diff/difference.py @@ -154,7 +154,7 @@ def render(watch, datastore, request, url_for, render_template, flash, redirect, screenshot_url = watch.get_screenshot() - is_html_webdriver = watch.fetcher_supports_screenshots + fetcher_supports_screenshots = watch.fetcher_supports_screenshots password_enabled_and_share_is_off = False if datastore.data['settings']['application'].get('password') or os.getenv("SALTED_PASS", False): @@ -210,7 +210,7 @@ def render(watch, datastore, request, url_for, render_template, flash, redirect, 
extra_title=f" - {watch.label} - History", extract_form=extract_form, from_version=str(from_version), - is_html_webdriver=is_html_webdriver, + fetcher_supports_screenshots=fetcher_supports_screenshots, last_error=watch['last_error'], last_error_screenshot=watch.get_error_snapshot(), last_error_text=watch.get_error_text(), diff --git a/changedetectionio/static/images/firefox-icon.svg b/changedetectionio/static/images/firefox-icon.svg new file mode 100644 index 00000000000..c382310ecd0 --- /dev/null +++ b/changedetectionio/static/images/firefox-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/changedetectionio/static/js/stepper.js b/changedetectionio/static/js/stepper.js index 063f8488536..df37f2a7c64 100644 --- a/changedetectionio/static/js/stepper.js +++ b/changedetectionio/static/js/stepper.js @@ -4,7 +4,7 @@ $(document).ready(function(){ }); var checkUserVal = function(){ - if($('#fetch_backend input:checked').val()=='html_requests') { + if($('#fetch_backend input:checked').val()=='requests') { $('#request-override').show(); $('#webdriver-stepper').hide(); } else { diff --git a/changedetectionio/static/js/vis.js b/changedetectionio/static/js/vis.js index edcacfb6738..ee3bb99bbba 100644 --- a/changedetectionio/static/js/vis.js +++ b/changedetectionio/static/js/vis.js @@ -3,21 +3,40 @@ $(document).ready(function () { // Lazy Hide/Show elements mechanism $('[data-visible-for]').hide(); function show_related_elem(e) { - var n = $(e).attr('name') + "=" + $(e).val(); - if (n === 'fetch_backend=system') { + var name = $(e).attr('name'); + var val = $(e).val(); + var n = name + "=" + val; + + // Resolve browser_profile select → underlying fetch_backend class name + // browserProfileFetcherMap is injected by the page as {machine_name: 'playwright', ...} + if (name && name.endsWith('browser_profile') && typeof browserProfileFetcherMap !== 'undefined') { + var fetcherClass = val === 'system' + ? (typeof default_system_fetch_backend !== 'undefined' ? 
default_system_fetch_backend : null) + : browserProfileFetcherMap[val]; + if (fetcherClass) { + n = 'fetch_backend=' + fetcherClass; + } + } else if (n === 'fetch_backend=system') { n = "fetch_backend=" + default_system_fetch_backend; } $(`[data-visible-for~="${n}"]`).show(); } - $(':radio').on('keyup keypress blur change click', function (e) { + + $('select, :radio').on('change', function (e) { + $(`[data-visible-for]`).hide(); + $('.advanced-options').hide(); + show_related_elem(this); + }); + // Retain original click/keyup handling for radio buttons + $(':radio').on('keyup keypress blur click', function (e) { $(`[data-visible-for]`).hide(); $('.advanced-options').hide(); show_related_elem(this); }); - $(':radio:checked').each(function (e) { + $(':radio:checked, select').each(function (e) { show_related_elem(this); - }) + }); // Show advanced @@ -26,4 +45,4 @@ $(document).ready(function () { $(this).toggle(); }) }); -}); \ No newline at end of file +}); diff --git a/changedetectionio/store/__init__.py b/changedetectionio/store/__init__.py index 70561c54060..c015e09aab3 100644 --- a/changedetectionio/store/__init__.py +++ b/changedetectionio/store/__init__.py @@ -143,7 +143,7 @@ def _rehydrate_tags(self): self.__data['settings']['application']['tags'][uuid] = Tag.model( datastore_path=self.datastore_path, - __datastore=self.__data, + __datastore=self, default=tag ) logger.info(f"Tag: {uuid} {tag['title']}") @@ -207,7 +207,7 @@ def reload_state(self, datastore_path, include_default_watches, version_tag): self.json_store_path = os.path.join(self.datastore_path, "changedetection.json") # Base definition for all watchers (deepcopy part of #569) - self.generic_definition = deepcopy(Watch.model(datastore_path=datastore_path, __datastore=self.__data, default={})) + self.generic_definition = deepcopy(Watch.model(datastore_path=datastore_path, __datastore=self, default={})) # Load build SHA if available (Docker deployments) if path.isfile('changedetectionio/source.txt'): 
@@ -245,6 +245,10 @@ def reload_state(self, datastore_path, include_default_watches, version_tag): # Maybe they copied a bunch of watch subdirs across too self._load_state() + # Apply env-var browser config after state is fully loaded so we can safely + # read existing settings without risk of being overwritten. + self.preconfigure_browser_profiles_based_on_env() + def init_fresh_install(self, include_default_watches, version_tag): # Generate app_guid FIRST (required for all operations) if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ: @@ -268,13 +272,11 @@ def init_fresh_install(self, include_default_watches, version_tag): if include_default_watches: self.add_watch( url='https://news.ycombinator.com/', - tag='Tech news', - extras={'fetch_backend': 'html_requests'} + tag='Tech news' ) self.add_watch( url='https://changedetection.io/CHANGELOG.txt', - tag='changedetection.io', - extras={'fetch_backend': 'html_requests'} + tag='changedetection.io' ) # Create changedetection.json immediately @@ -331,9 +333,64 @@ def rehydrate_entity(self, uuid, entity, processor_override=None): if entity.get('processor') != 'text_json_diff': logger.trace(f"Loading Watch object '{watch_class.__module__}.{watch_class.__name__}' for UUID {uuid}") - entity = watch_class(datastore_path=self.datastore_path, __datastore=self.__data, default=entity) + entity = watch_class(datastore_path=self.datastore_path, __datastore=self, default=entity) return entity + def preconfigure_browser_profiles_based_on_env(self): + """Instantiate browser profiles from environment variables and store them. + + Always runs at the end of reload_state() — covers fresh installs, + existing datastores, and server restarts. Env vars always win so that + changing PLAYWRIGHT_DRIVER_URL and restarting is reflected immediately. 
+ + Creates BrowserProfile instances from env vars and stores them in + ``settings.application.browser_profiles`` under their machine names, + then sets ``settings.application.browser_profile`` to that profile as + the system-wide default. + """ + from changedetectionio.model import browser_profile as bp + from changedetectionio.strtobool import strtobool + + store_profiles = self.__data['settings']['application'].setdefault('browser_profiles', {}) + service_workers = os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow') + extra_delay = int(os.getenv('WEBDRIVER_DELAY_BEFORE_CONTENT_READY', 0)) + configured_profile = None + + playwright_url = os.getenv('PLAYWRIGHT_DRIVER_URL') + if playwright_url: + playwright_url = playwright_url.strip('"') + builtin = bp.BUILTIN_PUPPETEER if strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) else bp.BUILTIN_PLAYWRIGHT + profile = bp.BrowserProfile( + name=builtin.name, + fetch_backend=builtin.fetch_backend, + browser_connection_url=playwright_url, + service_workers=service_workers, + extra_delay=extra_delay, + is_builtin=True, + ) + logger.debug(f"Configuring browser profile '{profile.get_machine_name()}' from env") + store_profiles[profile.get_machine_name()] = profile.model_dump() + configured_profile = profile + + webdriver_url = os.getenv('WEBDRIVER_URL') + if webdriver_url: + profile = bp.BrowserProfile( + name=bp.BUILTIN_SELENIUM.name, + fetch_backend=bp.BUILTIN_SELENIUM.fetch_backend, + browser_connection_url=webdriver_url.strip('"'), + extra_delay=extra_delay, + is_builtin=True, + ) + logger.debug(f"Configuring browser profile '{profile.get_machine_name()}' from env") + store_profiles[profile.get_machine_name()] = profile.model_dump() + if not configured_profile: + configured_profile = profile + + if configured_profile: + logger.debug(f"Setting system default browser profile to '{configured_profile.get_machine_name()}'") + self.__data['settings']['application']['browser_profile'] = 
configured_profile.get_machine_name() + + # ============================================================================ # FileSavingDataStore Abstract Method Implementations # ============================================================================ @@ -365,6 +422,14 @@ def _build_settings_data(self): # Is saved as {uuid}/tag.json settings_copy['application']['tags'] = {} + # Serialize BrowserProfile Pydantic instances to plain dicts for JSON storage + raw_profiles = settings_copy['application'].get('browser_profiles', {}) + from changedetectionio.model.browser_profile import BrowserProfile + settings_copy['application']['browser_profiles'] = { + k: v.model_dump() if isinstance(v, BrowserProfile) else v + for k, v in raw_profiles.items() + } + return { 'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json', 'app_guid': self.__data.get('app_guid'), @@ -421,7 +486,7 @@ def rehydrate_tag(uuid, entity_dict): return Tag.model( datastore_path=self.datastore_path, - __datastore=self.__data, + __datastore=self, default=entity_dict ) @@ -767,7 +832,7 @@ def add_watch(self, url, tag='', extras=None, tag_uuids=None, save_immediately=T # If the processor also has its own Watch implementation watch_class = get_custom_watch_obj_for_processor(apply_extras.get('processor')) - new_watch = watch_class(datastore_path=self.datastore_path, __datastore=self.__data, url=url) + new_watch = watch_class(datastore_path=self.datastore_path, __datastore=self, url=url) new_uuid = new_watch.get('uuid') @@ -852,6 +917,16 @@ def proxy_list(self): return proxy_list if len(proxy_list) else None + def get_proxy_url_for_watch(self, uuid, override_id=None): + """ + Returns the resolved proxy URL string for a watch, or None. + override_id forces a specific proxy (e.g. proxy checker bypass). 
+ """ + proxy_id = override_id or self.get_preferred_proxy_for_watch(uuid) + if proxy_id: + return self.proxy_list.get(proxy_id, {}).get('url') + return None + def get_preferred_proxy_for_watch(self, uuid): """ Returns the preferred proxy by ID key @@ -885,6 +960,71 @@ def get_preferred_proxy_for_watch(self, uuid): return None + # ------------------------------------------------------------------ + # BrowserProfile helpers + # ------------------------------------------------------------------ + + def get_browser_profile(self, machine_name: str): + """Return a BrowserProfile by machine name, or None if not found. + + Built-in profiles (direct_http_requests, browser_chromeplaywright) are + always available and checked first. + """ + from changedetectionio.model.browser_profile import get_profile + store_profiles = self.data['settings']['application'].get('browser_profiles', {}) + return get_profile(machine_name, store_profiles) + + def delete_browser_profile(self, machine_name: str): + """Delete a user-defined BrowserProfile by machine name. + + Rules enforced: + - Built-in profiles cannot be deleted. + - The profile cannot be the current system default + (settings.application.browser_profile); caller must change the + default first. + - Any watch or tag that referenced this profile is reset to None + (falls back through the chain on next fetch). + + Returns the number of watches/tags that were reset. + """ + from changedetectionio.model.browser_profile import RESERVED_MACHINE_NAMES + + if machine_name in RESERVED_MACHINE_NAMES: + raise ValueError(f"Built-in profile '{machine_name}' cannot be deleted") + + system_default = self.data['settings']['application'].get('browser_profile') + if system_default == machine_name: + raise ValueError( + f"Profile '{machine_name}' is the system default. " + f"Change the system default before deleting it." 
+ ) + + store_profiles = self.data['settings']['application'].get('browser_profiles', {}) + if machine_name not in store_profiles: + return 0 + + del store_profiles[machine_name] + + reset_count = 0 + + # Reset watches that reference this profile + for uuid, watch in self.data['watching'].items(): + if watch.get('browser_profile') == machine_name: + watch['browser_profile'] = None + watch.commit() + reset_count += 1 + + # Reset tags that reference this profile + for tag_uuid, tag in self.data['settings']['application'].get('tags', {}).items(): + if tag.get('browser_profile') == machine_name: + tag['browser_profile'] = None + tag.commit() + reset_count += 1 + + self._save_settings() + logger.info(f"Deleted BrowserProfile '{machine_name}', reset {reset_count} watches/tags") + return reset_count + @property def has_extra_headers_file(self): filepath = os.path.join(self.datastore_path, 'headers.txt') @@ -962,7 +1102,7 @@ def add_tag(self, title): from ..model import Tag new_tag = Tag.model( datastore_path=self.datastore_path, - __datastore=self.__data, + __datastore=self, default={ 'title': title.strip(), 'date_created': int(time.time()) diff --git a/changedetectionio/store/updates.py b/changedetectionio/store/updates.py index 936378628fb..78b7173ca4d 100644 --- a/changedetectionio/store/updates.py +++ b/changedetectionio/store/updates.py @@ -15,6 +15,7 @@ import time from loguru import logger from copy import deepcopy +from typing import Optional # Try to import orjson for faster JSON serialization @@ -730,6 +731,144 @@ def update_29(self): # (left this out by accident in previous update, added tags={} in the changedetection.json save_to_disk) self._save_settings() + def update_31(self): + """ + Migrate legacy ``fetch_backend`` strings to the new ``browser_profile`` + machine-name system. + + What this migration does + ------------------------ + 1. 
``settings.requests.extra_browsers`` entries are converted into + ``BrowserProfile`` objects and stored in + ``settings.application.browser_profiles`` keyed by machine name. + + 2. ``settings.application.fetch_backend`` (the system-wide default) is + translated to a machine name and written to + ``settings.application.browser_profile``. + + 3. Every watch that has an explicit ``fetch_backend`` (not ``'system'``) + gets a corresponding ``browser_profile`` machine name set, then + ``fetch_backend`` is reset to ``'system'``. + + 4. The same translation is applied to tags with ``overrides_watch=True`` + that carry an explicit ``fetch_backend``. + + Legacy mapping + ~~~~~~~~~~~~~~ + * ``'html_requests'`` → built-in ``'direct_http_requests'`` + * ``'html_webdriver'`` → built-in ``'browser_chromeplaywright'`` + * ``'extra_browser_'`` → machine name of the migrated custom profile + * ``'system'`` / missing → ``None`` (continue to use chain resolution) + + Safe to re-run: skips watches / tags that already have ``browser_profile`` + set, and skips extra_browser entries that have already been migrated. + """ + from ..model.browser_profile import ( + BrowserProfile, + BUILTIN_REQUESTS, + BUILTIN_BROWSER, + ) + + app_settings = self.data['settings']['application'] + + # ------------------------------------------------------------------ + # 1. 
Migrate extra_browsers → browser_profiles + # ------------------------------------------------------------------ + extra_browsers = self.data['settings']['requests'].get('extra_browsers', []) + browser_profiles: dict = app_settings.setdefault('browser_profiles', {}) + + extra_browser_name_to_machine: dict[str, str] = {} + + for entry in extra_browsers: + browser_name = entry.get('browser_name', '').strip() + connection_url = entry.get('browser_connection_url', '').strip() + if not browser_name: + continue + + profile = BrowserProfile( + name=browser_name, + fetch_backend='playwright_cdp', + browser_connection_url=connection_url or None, + ) + machine_name = profile.get_machine_name() + + if machine_name not in browser_profiles: + browser_profiles[machine_name] = profile.model_dump() + logger.info(f"update_31: migrated extra_browser '{browser_name}' → profile '{machine_name}'") + + extra_browser_name_to_machine[browser_name] = machine_name + + # ------------------------------------------------------------------ + # Helper: translate a fetch_backend string to a machine name + # ------------------------------------------------------------------ + builtin_requests_name = BUILTIN_REQUESTS.get_machine_name() + builtin_browser_name = BUILTIN_BROWSER.get_machine_name() + + def _to_machine_name(fetch_backend: str) -> Optional[str]: + if not fetch_backend or fetch_backend in ('system', 'default', ''): + return None + if fetch_backend.startswith('extra_browser_'): + key = fetch_backend[len('extra_browser_'):] + return extra_browser_name_to_machine.get(key) + # Strip legacy html_ prefix then query the fetcher registry + from changedetectionio import content_fetchers as cf + clean = fetch_backend[5:] if fetch_backend.startswith('html_') else fetch_backend + fetcher_cls = cf.get_fetcher(clean) + if fetcher_cls is None: + logger.warning(f"update_31: unknown fetch_backend value {fetch_backend!r}, skipping") + return None + if fetcher_cls.supports_screenshots: + return 
builtin_browser_name + return builtin_requests_name + + # ------------------------------------------------------------------ + # 2. Migrate system-wide default + # ------------------------------------------------------------------ + system_fetch_backend = app_settings.get('fetch_backend', 'requests') + if not app_settings.get('browser_profile'): + machine = _to_machine_name(system_fetch_backend) + app_settings['browser_profile'] = machine + logger.info( + f"update_31: system fetch_backend '{system_fetch_backend}' → browser_profile '{machine}'" + ) + + # ------------------------------------------------------------------ + # 3. Migrate watches + # ------------------------------------------------------------------ + for uuid, watch in self.data['watching'].items(): + if watch.get('browser_profile'): + continue # already migrated + + fetch_backend = watch.get('fetch_backend', 'system') + machine = _to_machine_name(fetch_backend) + watch['browser_profile'] = machine + watch['fetch_backend'] = 'system' # clear legacy value + watch.commit() + if machine: + logger.info( + f"update_31: watch {uuid} fetch_backend '{fetch_backend}' → browser_profile '{machine}'" + ) + + # ------------------------------------------------------------------ + # 4. Migrate tags + # ------------------------------------------------------------------ + for tag_uuid, tag in app_settings.get('tags', {}).items(): + if tag.get('browser_profile'): + continue # already migrated + + fetch_backend = tag.get('fetch_backend', 'system') + machine = _to_machine_name(fetch_backend) + if machine: + tag['browser_profile'] = machine + tag['fetch_backend'] = 'system' + tag.commit() + logger.info( + f"update_31: tag {tag_uuid} fetch_backend '{fetch_backend}' → browser_profile '{machine}'" + ) + + self._save_settings() + logger.success("update_31: fetch_backend → browser_profile migration complete") + def update_30(self): """Migrate restock_settings out of watch.json into restock_diff.json processor config file. 
diff --git a/changedetectionio/templates/_notification_profiles_selector.html b/changedetectionio/templates/_notification_profiles_selector.html new file mode 100644 index 00000000000..0eac631bef2 --- /dev/null +++ b/changedetectionio/templates/_notification_profiles_selector.html @@ -0,0 +1,208 @@ +{# + Notification Profile Selector widget. + + Usage: + {% from '_notification_profiles_selector.html' import render_notification_profile_selector %} + {{ render_notification_profile_selector( + own_profiles=watch.get('notification_profiles', []), + inherited_profiles=inherited_notification_profiles, + all_profile_data=settings_application.get('notification_profile_data', {}), + registry=registry + ) }} + + own_profiles — list of UUIDs directly linked to this watch/group + inherited_profiles — list of (uuid, origin_label) tuples from parent groups/system + all_profile_data — dict of uuid→profile from settings.application.notification_profile_data + registry — notification_profiles.registry instance +#} + +{% macro render_notification_profile_selector(own_profiles, inherited_profiles, all_profile_data, registry) %} +
    + + {# Hidden inputs — one per selected UUID, submitted with the form #} +
    + {% for uid in own_profiles %} + + {% endfor %} +
    + +
    + + {# Own profiles — solid chips, removable #} + {% for uid in own_profiles %} + {% set profile = all_profile_data.get(uid) %} + {% if profile %} + {% set handler = registry.get(profile.get('type', 'apprise')) %} + + + {{ profile.get('name', uid) }} + × + + {% endif %} + {% endfor %} + + {# Inherited profiles — dimmed, read-only, show origin #} + {% for uid, origin_label in (inherited_profiles or []) %} + {% if uid not in own_profiles %} + {% set profile = all_profile_data.get(uid) %} + {% if profile %} + {% set handler = registry.get(profile.get('type', 'apprise')) %} + + + {{ profile.get('name', uid) }} + + + {% endif %} + {% endif %} + {% endfor %} + + {# Add button + dropdown #} +
    + + + +
    + +
    {# .np-chips #} + + {% if not own_profiles and not inherited_profiles %} +

    + {{ _('No notification profiles linked. Notifications will not be sent for this watch.') }} +

    + {% endif %} + +
    {# .notification-profile-selector #} + + +{% endmacro %} diff --git a/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py b/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py index 514a99658f7..2a84610d3ef 100644 --- a/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py +++ b/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py @@ -4,30 +4,54 @@ from flask import url_for from ..util import live_server_setup, wait_for_all_checks -def do_test(client, live_server, make_test_use_extra_browser=False): +CUSTOM_PROFILE_NAME = 'Custom Browser URL' +CUSTOM_PROFILE_MACHINE_NAME = 'custom_browser_url' +CUSTOM_BROWSER_WS = 'ws://sockpuppetbrowser-custom-url:3000' + + +def create_custom_browser_profile(client): + """Create a browser profile that uses the custom sockpuppet container.""" + res = client.post( + url_for("settings.settings_browsers.save"), + data={ + "name": CUSTOM_PROFILE_NAME, + "fetch_backend": "playwright_cdp", + "browser_connection_url": CUSTOM_BROWSER_WS, + "viewport_width": 1280, + "viewport_height": 1000, + "block_images": "", + "block_fonts": "", + "ignore_https_errors": "", + "user_agent": "", + "locale": "", + "original_machine_name": "", + }, + follow_redirects=True + ) + assert b"saved." in res.data, f"Expected profile save confirmation, got: {res.data[:500]}" - # Grep for this string in the logs? 
- test_url = "https://changedetection.io/ci-test.html?non-custom-default=true" - # "non-custom-default" should not appear in the custom browser connection - custom_browser_name = 'custom browser URL' + +def do_test(client, live_server, make_test_use_extra_browser=False): # needs to be set and something like 'ws://127.0.0.1:3000' assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" - ##################### + test_url = "https://changedetection.io/ci-test.html?non-custom-default=true" + + # preconfigure_browser_profiles_based_on_env() already set the correct system default res = client.post( url_for("settings.settings_page"), - data={"application-empty_pages_are_a_change": "", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_webdriver", - 'requests-extra_browsers-0-browser_connection_url': 'ws://sockpuppetbrowser-custom-url:3000', - 'requests-extra_browsers-0-browser_name': custom_browser_name - }, + data={ + "application-empty_pages_are_a_change": "", + "requests-time_between_check-minutes": 180, + }, follow_redirects=True ) - assert b"Settings updated." 
in res.data + # Create the custom browser profile + create_custom_browser_profile(client) + # Add our URL to the import page uuid = client.application.config.get('DATASTORE').add_watch(url=test_url) client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) @@ -35,23 +59,24 @@ def do_test(client, live_server, make_test_use_extra_browser=False): if make_test_use_extra_browser: - # So the name should appear in the edit page under "Request" > "Fetch Method" + # The custom profile name should appear in the edit page under "Request" tab res = client.get( url_for("ui.ui_edit.edit_page", uuid="first"), follow_redirects=True ) - assert b'custom browser URL' in res.data + assert CUSTOM_PROFILE_NAME.encode() in res.data, \ + f"Expected '{CUSTOM_PROFILE_NAME}' in edit page fetch method choices" res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), data={ - # 'run_customer_browser_url_tests.sh' will search for this string to know if we hit the right browser container or not - "url": "https://changedetection.io/ci-test.html?custom-browser-search-string=1", - "tags": "", - "headers": "", - 'fetch_backend': f"extra_browser_{custom_browser_name}", - 'webdriver_js_execute_code': '', - "time_between_check_use_default": "y" + # 'run_custom_browser_url_tests.sh' will grep for this string in the custom container logs + "url": "https://changedetection.io/ci-test.html?custom-browser-search-string=1", + "tags": "", + "headers": "", + "browser_profile": CUSTOM_PROFILE_MACHINE_NAME, + "webdriver_js_execute_code": "", + "time_between_check_use_default": "y" }, follow_redirects=True ) @@ -74,12 +99,10 @@ def do_test(client, live_server, make_test_use_extra_browser=False): # Requires playwright to be installed def test_request_via_custom_browser_url(client, live_server, measure_memory_usage, datastore_path): - # live_server_setup(live_server) # Setup on conftest per function # We do this so we can grep the logs of the custom container and see if the request actually went 
through that container do_test(client, live_server, make_test_use_extra_browser=True) def test_request_not_via_custom_browser_url(client, live_server, measure_memory_usage, datastore_path): - # live_server_setup(live_server) # Setup on conftest per function # We do this so we can grep the logs of the custom container and see if the request actually went through that container do_test(client, live_server, make_test_use_extra_browser=False) diff --git a/changedetectionio/tests/fetchers/test_content.py b/changedetectionio/tests/fetchers/test_content.py index 89623fdc520..0cb38156cb5 100644 --- a/changedetectionio/tests/fetchers/test_content.py +++ b/changedetectionio/tests/fetchers/test_content.py @@ -12,12 +12,13 @@ def test_fetch_webdriver_content(client, live_server, measure_memory_usage, data # live_server_setup(live_server) # Setup on conftest per function ##################### + # preconfigure_browser_profiles_based_on_env() already set the correct system default + # (playwright or puppeteer depending on FAST_PUPPETEER_CHROME_FETCHER) — no need to override it. 
res = client.post( url_for("settings.settings_page"), data={ "application-empty_pages_are_a_change": "", "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_webdriver", 'application-ui-favicons_enabled': "y", }, follow_redirects=True diff --git a/changedetectionio/tests/fetchers/test_custom_js_before_content.py b/changedetectionio/tests/fetchers/test_custom_js_before_content.py index 2ee8836dead..e4a4cbd5058 100644 --- a/changedetectionio/tests/fetchers/test_custom_js_before_content.py +++ b/changedetectionio/tests/fetchers/test_custom_js_before_content.py @@ -25,7 +25,6 @@ def test_execute_custom_js(client, live_server, measure_memory_usage, datastore_ data={ "url": test_url, "tags": "", - 'fetch_backend': "html_webdriver", 'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();', 'headers': "testheader: yes\buser-agent: MyCustomAgent", "time_between_check_use_default": "y", diff --git a/changedetectionio/tests/proxy_list/test_multiple_proxy.py b/changedetectionio/tests/proxy_list/test_multiple_proxy.py index b81cd8160f7..dcca34ba11e 100644 --- a/changedetectionio/tests/proxy_list/test_multiple_proxy.py +++ b/changedetectionio/tests/proxy_list/test_multiple_proxy.py @@ -22,7 +22,7 @@ def test_preferred_proxy(client, live_server, measure_memory_usage, datastore_pa url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1), data={ "include_filters": "", - "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', + "browser_profile": "system", "headers": "", "proxy": "proxy-two", "tags": "", diff --git a/changedetectionio/tests/proxy_list/test_noproxy.py b/changedetectionio/tests/proxy_list/test_noproxy.py index c3b4c3e40e8..db243a85a6e 100644 --- a/changedetectionio/tests/proxy_list/test_noproxy.py +++ b/changedetectionio/tests/proxy_list/test_noproxy.py @@ -22,7 +22,6 @@ def test_noproxy_option(client, live_server, measure_memory_usage, datastore_pat data={ 
"requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", - "application-fetch_backend": "html_requests", "requests-extra_proxies-0-proxy_name": "custom-one-proxy", "requests-extra_proxies-0-proxy_url": "http://test:awesome@squid-one:3128", "requests-extra_proxies-1-proxy_name": "custom-two-proxy", @@ -57,7 +56,6 @@ def test_noproxy_option(client, live_server, measure_memory_usage, datastore_pat url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1), data={ "include_filters": "", - "fetch_backend": "html_requests", "headers": "", "proxy": "no-proxy", "tags": "", diff --git a/changedetectionio/tests/proxy_list/test_proxy_noconnect.py b/changedetectionio/tests/proxy_list/test_proxy_noconnect.py index 93b7e8cb7a7..9ea2ef22869 100644 --- a/changedetectionio/tests/proxy_list/test_proxy_noconnect.py +++ b/changedetectionio/tests/proxy_list/test_proxy_noconnect.py @@ -21,7 +21,6 @@ def test_proxy_noconnect_custom(client, live_server, measure_memory_usage, datas data={ "requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", - "application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else 'html_requests', "requests-extra_proxies-0-proxy_name": "custom-test-proxy", # test:awesome is set in tests/proxy_list/squid-passwords.txt "requests-extra_proxies-0-proxy_url": "http://127.0.0.1:3128", @@ -42,7 +41,7 @@ def test_proxy_noconnect_custom(client, live_server, measure_memory_usage, datas options = { "url": test_url, - "fetch_backend": "html_webdriver" if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else "html_requests", + "browser_profile": "system", "proxy": "ui-0custom-test-proxy", "time_between_check_use_default": "y", } diff --git a/changedetectionio/tests/proxy_list/test_select_custom_proxy.py b/changedetectionio/tests/proxy_list/test_select_custom_proxy.py index c060224770c..c3e849547c6 100644 --- 
a/changedetectionio/tests/proxy_list/test_select_custom_proxy.py +++ b/changedetectionio/tests/proxy_list/test_select_custom_proxy.py @@ -15,7 +15,6 @@ def test_select_custom(client, live_server, measure_memory_usage, datastore_path data={ "requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", - "application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', "requests-extra_proxies-0-proxy_name": "custom-test-proxy", # test:awesome is set in tests/proxy_list/squid-passwords.txt "requests-extra_proxies-0-proxy_url": "http://test:awesome@squid-custom:3128", @@ -59,7 +58,6 @@ def test_custom_proxy_validation(client, live_server, measure_memory_usage, data data={ "requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", - "application-fetch_backend": 'html_requests', "requests-extra_proxies-0-proxy_name": "custom-test-proxy", "requests-extra_proxies-0-proxy_url": "xxxxhtt/333??p://test:awesome@squid-custom:3128", }, @@ -75,7 +73,6 @@ def test_custom_proxy_validation(client, live_server, measure_memory_usage, data data={ "requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", - "application-fetch_backend": 'html_requests', "requests-extra_proxies-0-proxy_name": "custom-test-proxy", "requests-extra_proxies-0-proxy_url": "https://", }, diff --git a/changedetectionio/tests/proxy_socks5/test_socks5_proxy.py b/changedetectionio/tests/proxy_socks5/test_socks5_proxy.py index 99b6d54d172..06a8931a4f2 100644 --- a/changedetectionio/tests/proxy_socks5/test_socks5_proxy.py +++ b/changedetectionio/tests/proxy_socks5/test_socks5_proxy.py @@ -29,7 +29,6 @@ def test_socks5(client, live_server, measure_memory_usage, datastore_path): data={ "requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", - "application-fetch_backend": "html_requests", # set in .github/workflows/test-only.yml "requests-extra_proxies-0-proxy_url": 
"socks5://proxy_user123:proxy_pass123@socks5proxy:1080", "requests-extra_proxies-0-proxy_name": "socks5proxy", @@ -61,7 +60,7 @@ def test_socks5(client, live_server, measure_memory_usage, datastore_path): url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1), data={ "include_filters": "", - "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', + "browser_profile": "system", "headers": "", "proxy": "ui-0socks5proxy", "tags": "", diff --git a/changedetectionio/tests/proxy_socks5/test_socks5_proxy_sources.py b/changedetectionio/tests/proxy_socks5/test_socks5_proxy_sources.py index f0bf81c63ae..8ef62ee9c51 100644 --- a/changedetectionio/tests/proxy_socks5/test_socks5_proxy_sources.py +++ b/changedetectionio/tests/proxy_socks5/test_socks5_proxy_sources.py @@ -48,7 +48,7 @@ def test_socks5_from_proxiesjson_file(client, live_server, measure_memory_usage, url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1), data={ "include_filters": "", - "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', + "browser_profile": "system", "headers": "", "proxy": "socks5proxy", "tags": "", diff --git a/changedetectionio/tests/restock/test_restock.py b/changedetectionio/tests/restock/test_restock.py index aaaf5605f74..d133f077866 100644 --- a/changedetectionio/tests/restock/test_restock.py +++ b/changedetectionio/tests/restock/test_restock.py @@ -60,15 +60,14 @@ def test_restock_detection(client, live_server, measure_memory_usage, datastore_ ##################### - # Set this up for when we remove the notification from the watch, it should fallback with these details + # preconfigure_browser_profiles_based_on_env() already set the correct system default res = client.post( url_for("settings.settings_page"), data={"application-notification_urls": notification_url, "application-notification_title": "fallback-title "+default_notification_title, "application-notification_body": "fallback-body 
"+default_notification_body, "application-notification_format": default_notification_format, - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_webdriver"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url diff --git a/changedetectionio/tests/smtp/test_notification_smtp.py b/changedetectionio/tests/smtp/test_notification_smtp.py index 01ee339027f..bd7c5027713 100644 --- a/changedetectionio/tests/smtp/test_notification_smtp.py +++ b/changedetectionio/tests/smtp/test_notification_smtp.py @@ -56,8 +56,7 @@ def test_check_notification_email_formats_default_HTML(client, live_server, meas "application-notification_title": "fallback-title " + default_notification_title, "application-notification_body": "some text\nfallback-body
    " + default_notification_body, "application-notification_format": 'html', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." in res.data @@ -126,8 +125,7 @@ def test_check_notification_plaintext_format(client, live_server, measure_memory "application-notification_title": "fallback-title {{watch_title}} {{ diff_added.splitlines()[0] if diff_added else 'diff added didnt split' }} " + default_notification_title, "application-notification_body": f"some text\n" + default_notification_body + f"\nMore output test\n{ALL_MARKUP_TOKENS}", "application-notification_format": 'text', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -188,8 +186,7 @@ def test_check_notification_html_color_format(client, live_server, measure_memor "application-notification_title": "fallback-title {{watch_title}} - diff_added_lines_test : '{{ diff_added.splitlines()[0] if diff_added else 'diff added didnt split' }}' " + default_notification_title, "application-notification_body": f"some text\n{default_notification_body}\nMore output test\n{ALL_MARKUP_TOKENS}", "application-notification_format": 'htmlcolor', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -273,8 +270,7 @@ def test_check_notification_markdown_format(client, live_server, measure_memory_ "application-notification_title": "fallback-title diff_added_lines_test : '{{ diff_added.splitlines()[0] if diff_added else 'diff added didnt split' }}' " + default_notification_title, "application-notification_body": "*header*\n\nsome text\n" + default_notification_body, "application-notification_format": 'markdown', - "requests-time_between_check-minutes": 180, - 
'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -369,8 +365,7 @@ def test_check_notification_email_formats_default_Text_override_HTML(client, liv "application-notification_title": "fallback-title " + default_notification_title, "application-notification_body": notification_body, "application-notification_format": 'text', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." in res.data @@ -420,7 +415,7 @@ def test_check_notification_email_formats_default_Text_override_HTML(client, liv data={ "url": test_url, "notification_format": 'html', - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -480,8 +475,7 @@ def test_check_plaintext_document_plaintext_notification_smtp(client, live_serve "application-notification_title": "fallback-title " + default_notification_title, "application-notification_body": f"{notification_body}\nMore output test\n{ALL_MARKUP_TOKENS}", "application-notification_format": 'text', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." in res.data @@ -533,8 +527,7 @@ def test_check_plaintext_document_html_notifications(client, live_server, measur "application-notification_title": "fallback-title " + default_notification_title, "application-notification_body": f"{notification_body}\nMore output test\n{ALL_MARKUP_TOKENS}", "application-notification_format": 'html', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." 
in res.data @@ -613,8 +606,7 @@ def test_check_plaintext_document_html_color_notifications(client, live_server, "application-notification_title": "fallback-title " + default_notification_title, "application-notification_body": f"{notification_body}\nMore output test\n{ALL_MARKUP_TOKENS}", "application-notification_format": 'htmlcolor', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -686,8 +678,7 @@ def test_check_html_document_plaintext_notification(client, live_server, measure "application-notification_title": "fallback-title " + default_notification_title, "application-notification_body": f"{notification_body}\nMore output test\n{ALL_MARKUP_TOKENS}", "application-notification_format": 'text', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -740,8 +731,7 @@ def test_check_html_notification_with_apprise_format_is_html(client, live_server "application-notification_title": "fallback-title " + default_notification_title, "application-notification_body": "some text\nfallback-body
    " + default_notification_body, "application-notification_format": 'html', - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." in res.data diff --git a/changedetectionio/tests/test_access_control.py b/changedetectionio/tests/test_access_control.py index fc37019516c..992f186b567 100644 --- a/changedetectionio/tests/test_access_control.py +++ b/changedetectionio/tests/test_access_control.py @@ -32,8 +32,7 @@ def test_check_access_control(app, client, live_server, measure_memory_usage, da url_for("settings.settings_page"), data={"application-password": "foobar", "application-shared_diff_access": "True", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -91,8 +90,7 @@ def test_check_access_control(app, client, live_server, measure_memory_usage, da res = c.post( url_for("settings.settings_page"), data={ - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -127,16 +125,16 @@ def test_check_access_control(app, client, live_server, measure_memory_usage, da assert b"IMPORT" in res.data assert b"LOG OUT" in res.data assert b"time_between_check-minutes" in res.data - assert b"fetch_backend" in res.data + ################################################## # Remove password button, and check that it worked ################################################## + # preconfigure_browser_profiles_based_on_env() already set the correct system default res = c.post( url_for("settings.settings_page"), data={ "requests-time_between_check-minutes": 180, - "application-fetch_backend": "html_webdriver", "application-removepassword_button": "Remove password" }, follow_redirects=True, @@ -150,8 +148,7 @@ def 
test_check_access_control(app, client, live_server, measure_memory_usage, da res = c.post( url_for("settings.settings_page"), data={"application-password": "", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -164,8 +161,7 @@ def test_check_access_control(app, client, live_server, measure_memory_usage, da data={"application-password": "foobar", # Should be disabled "application-shared_diff_access": "", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_add_replace_remove_filter.py b/changedetectionio/tests/test_add_replace_remove_filter.py index 5f715e33261..d66bb8687c5 100644 --- a/changedetectionio/tests/test_add_replace_remove_filter.py +++ b/changedetectionio/tests/test_add_replace_remove_filter.py @@ -60,7 +60,6 @@ def test_check_removed_line_contains_trigger(client, live_server, measure_memory url_for("ui.ui_edit.edit_page", uuid="first"), data={"trigger_text": 'The golden line', "url": test_url, - 'fetch_backend': "html_requests", 'filter_text_removed': 'y', "time_between_check_use_default": "y"}, follow_redirects=True @@ -127,8 +126,7 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa # https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation "application-notification_urls": test_notification_url, "application-notification_format": 'text', - "application-minutes_between_check": 180, - "application-fetch_backend": "html_requests" + "application-minutes_between_check": 180 }, follow_redirects=True ) @@ -149,7 +147,6 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa data={"trigger_text": 'Oh yes please', "url": test_url, 'processor': 'text_json_diff', - 'fetch_backend': "html_requests", 
'filter_text_removed': '', 'filter_text_added': 'y', "time_between_check_use_default": "y"}, diff --git a/changedetectionio/tests/test_api.py b/changedetectionio/tests/test_api.py index b40e47a1bce..29bbe819e7a 100644 --- a/changedetectionio/tests/test_api.py +++ b/changedetectionio/tests/test_api.py @@ -416,7 +416,6 @@ def test_access_denied(client, live_server, measure_memory_usage, datastore_path url_for("settings.settings_page"), data={ "requests-time_between_check-minutes": 180, - "application-fetch_backend": "html_requests", "application-api_access_token_enabled": "" }, follow_redirects=True @@ -436,7 +435,6 @@ def test_access_denied(client, live_server, measure_memory_usage, datastore_path url_for("settings.settings_page"), data={ "requests-time_between_check-minutes": 180, - "application-fetch_backend": "html_requests", "application-api_access_token_enabled": "y" }, follow_redirects=True @@ -907,8 +905,7 @@ def test_api_conflict_UI_password(client, live_server, measure_memory_usage, dat url_for("settings.settings_page"), data={"application-password": "foobar", # password is now set! API should still work! 
"application-api_access_token_enabled": "y", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_api_openapi.py b/changedetectionio/tests/test_api_openapi.py index 5628cc7f025..e3b47403867 100644 --- a/changedetectionio/tests/test_api_openapi.py +++ b/changedetectionio/tests/test_api_openapi.py @@ -177,7 +177,6 @@ def test_openapi_validation_get_requests_bypass_validation(client, live_server, url_for("settings.settings_page"), data={ "requests-time_between_check-minutes": 180, - "application-fetch_backend": "html_requests", "application-api_access_token_enabled": "" }, follow_redirects=True diff --git a/changedetectionio/tests/test_auth.py b/changedetectionio/tests/test_auth.py index 2e073e212a1..b133d8c4c05 100644 --- a/changedetectionio/tests/test_auth.py +++ b/changedetectionio/tests/test_auth.py @@ -19,7 +19,7 @@ def test_basic_auth(client, live_server, measure_memory_usage, datastore_path): # Check form validation res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": "", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": "", "url": test_url, "tags": "", "headers": "", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." 
in res.data diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py index d0713133c01..6bfc1f4e3c0 100644 --- a/changedetectionio/tests/test_backend.py +++ b/changedetectionio/tests/test_backend.py @@ -172,8 +172,7 @@ def test_title_scraper(client, live_server, measure_memory_usage, datastore_path res = client.post( url_for("settings.settings_page"), data={"application-ui-use_page_title_in_list": "", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -215,8 +214,7 @@ def test_requests_timeout(client, live_server, measure_memory_usage, datastore_p url_for("settings.settings_page"), data={"application-ui-use_page_title_in_list": "", "requests-time_between_check-minutes": 180, - "requests-timeout": delay - 1, - 'application-fetch_backend': "html_requests"}, + "requests-timeout": delay - 1}, follow_redirects=True ) @@ -234,8 +232,7 @@ def test_requests_timeout(client, live_server, measure_memory_usage, datastore_p url_for("settings.settings_page"), data={"application-ui-use_page_title_in_list": "", "requests-time_between_check-minutes": 180, - "requests-timeout": delay + 1, # timeout should be a second more than the reply time - 'application-fetch_backend': "html_requests"}, + "requests-timeout": delay + 1}, # timeout should be a second more than the reply time follow_redirects=True ) client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) diff --git a/changedetectionio/tests/test_basic_socketio.py b/changedetectionio/tests/test_basic_socketio.py index f1585dc8590..9147baab33e 100644 --- a/changedetectionio/tests/test_basic_socketio.py +++ b/changedetectionio/tests/test_basic_socketio.py @@ -118,8 +118,7 @@ def test_everything(live_server, client, measure_memory_usage, datastore_path): res = client.post( url_for("settings.settings_page"), data={"application-password": "foobar", - 
"requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_block_while_text_present.py b/changedetectionio/tests/test_block_while_text_present.py index e70db27bde3..acef60dce9a 100644 --- a/changedetectionio/tests/test_block_while_text_present.py +++ b/changedetectionio/tests/test_block_while_text_present.py @@ -83,7 +83,6 @@ def test_check_block_changedetection_text_NOT_present(client, live_server, measu url_for("ui.ui_edit.edit_page", uuid=uuid), data={"text_should_not_be_present": ignore_text, "url": test_url, - 'fetch_backend': "html_requests", "time_between_check_use_default": "y" }, follow_redirects=True diff --git a/changedetectionio/tests/test_browser_profile_status_icon.py b/changedetectionio/tests/test_browser_profile_status_icon.py new file mode 100644 index 00000000000..ebcbe188ea5 --- /dev/null +++ b/changedetectionio/tests/test_browser_profile_status_icon.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Tests that the watchlist shows/hides the browser status icon based on the +effective browser profile, covering the full inheritance chain: + + watch browser_profile → system default browser_profile → direct_http_requests +""" + +import pytest +from flask import url_for + + +def set_system_default_profile(client, profile_machine_name): + res = client.post( + url_for('settings.settings_browsers.set_default'), + data={'machine_name': profile_machine_name}, + follow_redirects=True, + ) + assert res.status_code == 200 + + +def create_custom_browser_profile(client, name='My Custom Chrome'): + """Create a custom browser profile using playwright_cdp and return its machine name.""" + res = client.post( + url_for('settings.settings_browsers.save'), + data={ + 'name': name, + 'fetch_backend': 'playwright_cdp', + 'browser_connection_url': 'ws://localhost:3000', + 'viewport_width': 1280, + 'viewport_height': 1000, + 
'block_images': '', + 'block_fonts': '', + 'ignore_https_errors': '', + 'user_agent': '', + 'locale': '', + 'custom_headers': '', + 'original_machine_name': '', + }, + follow_redirects=True, + ) + assert b'saved.' in res.data + from changedetectionio.model.browser_profile import BrowserProfile + return BrowserProfile(name=name, fetch_backend='playwright_cdp').get_machine_name() + + +def create_requests_browser_profile(client, name, user_agent='', custom_headers=''): + """Create a requests-type browser profile with optional UA and custom headers.""" + res = client.post( + url_for('settings.settings_browsers.save'), + data={ + 'name': name, + 'fetch_backend': 'requests', + 'browser_connection_url': '', + 'viewport_width': 1280, + 'viewport_height': 1000, + 'block_images': '', + 'block_fonts': '', + 'ignore_https_errors': '', + 'user_agent': user_agent, + 'locale': '', + 'custom_headers': custom_headers, + 'original_machine_name': '', + }, + follow_redirects=True, + ) + assert b'saved.' in res.data + from changedetectionio.model.browser_profile import BrowserProfile + return BrowserProfile(name=name, fetch_backend='requests').get_machine_name() + + +# --------------------------------------------------------------------------- +# Unit tests — status_icon attribute on fetcher classes +# --------------------------------------------------------------------------- + +def test_status_icon_on_browser_fetchers(): + """Browser fetcher classes must declare a status_icon dict.""" + from changedetectionio.content_fetchers.playwright.CDP import fetcher as playwright_fetcher + from changedetectionio.content_fetchers.puppeteer import fetcher as puppeteer_fetcher + from changedetectionio.content_fetchers.webdriver_selenium import fetcher as selenium_fetcher + + for cls in (playwright_fetcher, puppeteer_fetcher, selenium_fetcher): + assert cls.status_icon is not None, f"{cls} should have status_icon set" + assert 'filename' in cls.status_icon + assert 'alt' in cls.status_icon + assert 
'title' in cls.status_icon + + +def test_no_status_icon_on_requests_fetcher(): + """The plain requests fetcher must have status_icon = None.""" + from changedetectionio.content_fetchers.requests import fetcher as requests_fetcher + assert requests_fetcher.status_icon is None + + +def test_fetcher_status_icons_filter_uses_status_icon(monkeypatch): + """fetcher_status_icons filter returns icon HTML for a class with status_icon set.""" + from changedetectionio import content_fetchers + + class FakeBrowserFetcher: + status_icon = {'filename': 'test-icon.png', 'alt': 'Test browser', 'title': 'Test browser'} + supports_screenshots = True + + monkeypatch.setitem(content_fetchers.FETCHERS, 'fake_browser', FakeBrowserFetcher) + + from changedetectionio.flask_app import app + with app.test_request_context('/'): + from changedetectionio.flask_app import _jinja2_filter_fetcher_status_icons + result = _jinja2_filter_fetcher_status_icons('fake_browser') + assert 'test-icon.png' in result + assert 'Test browser' in result + + # Requests fetcher → empty string + with app.test_request_context('/'): + result = _jinja2_filter_fetcher_status_icons('requests') + assert result == '' + + +# --------------------------------------------------------------------------- +# Integration tests — inheritance chain +# --------------------------------------------------------------------------- + +def test_watch_explicit_browser_profile_shows_icon(client, live_server, measure_memory_usage, datastore_path): + """Watch explicitly assigned a browser profile shows the chrome icon, + even when the system default is requests.""" + datastore = client.application.config.get('DATASTORE') + set_system_default_profile(client, 'direct_http_requests') + + machine_name = create_custom_browser_profile(client) + uuid = datastore.add_watch(url='http://example.com', extras={'browser_profile': machine_name, 'paused': True}) + res = client.get(url_for('watchlist.index'), follow_redirects=True) + assert b'Using a Chrome 
browser' in res.data, \ + "Chrome icon should appear when watch is explicitly set to a browser profile" + + datastore.delete(uuid) + client.get(url_for('settings.settings_browsers.delete', machine_name=machine_name), follow_redirects=True) + + +def test_watch_explicit_requests_profile_no_icon(client, live_server, measure_memory_usage, datastore_path): + """Watch explicitly set to direct_http_requests never shows the chrome icon, + even when the system default is a browser.""" + datastore = client.application.config.get('DATASTORE') + + machine_name = create_custom_browser_profile(client) + set_system_default_profile(client, machine_name) + + uuid = datastore.add_watch(url='http://example.com', extras={'browser_profile': 'direct_http_requests', 'paused': True}) + res = client.get(url_for('watchlist.index'), follow_redirects=True) + assert b'Using a Chrome browser' not in res.data, \ + "Chrome icon should NOT appear when watch is explicitly set to direct_http_requests" + + datastore.delete(uuid) + set_system_default_profile(client, 'direct_http_requests') + client.get(url_for('settings.settings_browsers.delete', machine_name=machine_name), follow_redirects=True) + + +def test_system_default_requests_inherited_by_watch(client, live_server, measure_memory_usage, datastore_path): + """Watch using system default inherits requests → no icon.""" + datastore = client.application.config.get('DATASTORE') + set_system_default_profile(client, 'direct_http_requests') + + uuid = datastore.add_watch(url='http://example.com', extras={'paused': True}) + res = client.get(url_for('watchlist.index'), follow_redirects=True) + assert b'Using a Chrome browser' not in res.data, \ + "Chrome icon should NOT appear when system default is requests and watch uses system default" + + datastore.delete(uuid) + + +def test_system_default_browser_inherited_by_watch(client, live_server, measure_memory_usage, datastore_path): + """Watch using system default inherits a browser profile → icon shown.""" 
+ datastore = client.application.config.get('DATASTORE') + + machine_name = create_custom_browser_profile(client) + set_system_default_profile(client, machine_name) + + uuid = datastore.add_watch(url='http://example.com', extras={'paused': True}) + res = client.get(url_for('watchlist.index'), follow_redirects=True) + assert b'Using a Chrome browser' in res.data, \ + "Chrome icon should appear when system default is a browser profile and watch uses system default" + + datastore.delete(uuid) + set_system_default_profile(client, 'direct_http_requests') + client.get(url_for('settings.settings_browsers.delete', machine_name=machine_name), follow_redirects=True) + +# --------------------------------------------------------------------------- +# Integration tests — BrowserProfile UA and custom_headers applied to requests +# --------------------------------------------------------------------------- + +def test_browser_profile_user_agent_applied(client, live_server, measure_memory_usage, datastore_path): + """User-Agent set on a BrowserProfile appears in the fetched request; + a per-watch User-Agent header overrides it.""" + from changedetectionio.tests.util import wait_for_all_checks + + datastore = client.application.config.get('DATASTORE') + test_url = url_for('test_headers', _external=True) + + machine_name = create_requests_browser_profile( + client, name='UA Profile Test', user_agent='profile-ua/2.0' + ) + + uuid = datastore.add_watch(url=test_url, extras={'browser_profile': machine_name}) + client.get(url_for('ui.form_watch_checknow'), follow_redirects=True) + wait_for_all_checks(client) + + res = client.get(url_for('ui.ui_preview.preview_page', uuid='first'), follow_redirects=True) + assert b'profile-ua/2.0' in res.data, "Profile UA should appear in the echoed request headers" + + # Per-watch User-Agent header overrides the profile UA + client.post( + url_for('ui.ui_edit.edit_page', uuid='first'), + data={ + 'url': test_url, + 'tags': '', + 'browser_profile': 
machine_name, + 'headers': 'User-Agent: watch-ua/3.0', + 'time_between_check_use_default': 'y', + }, + follow_redirects=True, + ) + client.get(url_for('ui.form_watch_checknow'), follow_redirects=True) + wait_for_all_checks(client) + + res = client.get(url_for('ui.ui_preview.preview_page', uuid='first'), follow_redirects=True) + assert b'watch-ua/3.0' in res.data, "Watch-level UA should override profile UA" + assert b'profile-ua/2.0' not in res.data, "Profile UA should be superseded by watch-level header" + + datastore.delete(uuid) + client.get(url_for('settings.settings_browsers.delete', machine_name=machine_name), follow_redirects=True) + + +def test_browser_profile_custom_headers_applied(client, live_server, measure_memory_usage, datastore_path): + """Custom headers set on a BrowserProfile are sent with every request using that profile; + per-watch headers override them when the same header name is used.""" + from changedetectionio.tests.util import wait_for_all_checks + + datastore = client.application.config.get('DATASTORE') + test_url = url_for('test_headers', _external=True) + + machine_name = create_requests_browser_profile( + client, + name='Headers Profile Test', + custom_headers='X-Profile-Header: profile-value\nX-Shared-Header: from-profile', + ) + + uuid = datastore.add_watch(url=test_url, extras={'browser_profile': machine_name}) + client.get(url_for('ui.form_watch_checknow'), follow_redirects=True) + wait_for_all_checks(client) + + res = client.get(url_for('ui.ui_preview.preview_page', uuid='first'), follow_redirects=True) + assert b'X-Profile-Header:profile-value' in res.data, \ + "Profile custom header should appear in the echoed request" + assert b'X-Shared-Header:from-profile' in res.data, \ + "Second profile custom header should appear" + + # Per-watch header for the same key overrides the profile header + client.post( + url_for('ui.ui_edit.edit_page', uuid='first'), + data={ + 'url': test_url, + 'tags': '', + 'browser_profile': machine_name, + 
'headers': 'X-Shared-Header: from-watch\nX-Watch-Only: watch-value', + 'time_between_check_use_default': 'y', + }, + follow_redirects=True, + ) + client.get(url_for('ui.form_watch_checknow'), follow_redirects=True) + wait_for_all_checks(client) + + res = client.get(url_for('ui.ui_preview.preview_page', uuid='first'), follow_redirects=True) + assert b'X-Profile-Header:profile-value' in res.data, \ + "Unrelated profile header should still be present" + assert b'X-Shared-Header:from-watch' in res.data, \ + "Watch-level header should override the same-named profile header" + assert b'X-Shared-Header:from-profile' not in res.data, \ + "Profile value for overridden header should be gone" + assert b'X-Watch-Only:watch-value' in res.data, \ + "Watch-only header should appear" + + datastore.delete(uuid) + client.get(url_for('settings.settings_browsers.delete', machine_name=machine_name), follow_redirects=True) diff --git a/changedetectionio/tests/test_commit_persistence.py b/changedetectionio/tests/test_commit_persistence.py index cd5fb6b29bf..f23313748c8 100644 --- a/changedetectionio/tests/test_commit_persistence.py +++ b/changedetectionio/tests/test_commit_persistence.py @@ -464,7 +464,7 @@ def test_settings_persist_after_update(client, live_server): # Update settings directly (bypass form validation issues) datastore.data['settings']['application']['empty_pages_are_a_change'] = True - datastore.data['settings']['application']['fetch_backend'] = 'html_requests' + datastore.data['settings']['application']['browser_profile'] = 'direct_http_requests' datastore.data['settings']['requests']['time_between_check']['minutes'] = 120 datastore.commit() @@ -478,7 +478,7 @@ def test_settings_persist_after_update(client, live_server): # Verify settings survived assert datastore2.data['settings']['application']['empty_pages_are_a_change'] == True, "empty_pages_are_a_change should persist" - assert datastore2.data['settings']['application']['fetch_backend'] == 'html_requests', 
"fetch_backend should persist" + assert datastore2.data['settings']['application']['browser_profile'] == 'direct_http_requests', "browser_profile should persist" assert datastore2.data['settings']['requests']['time_between_check']['minutes'] == 120, "time_between_check should persist" @@ -634,7 +634,7 @@ def test_ui_watch_edit_persists_all_fields(client, live_server): 'time_between_check-hours': '2', 'time_between_check-minutes': '30', 'include_filters': '#content', - 'fetch_backend': 'html_requests', + 'browser_profile': 'direct_http_requests', 'method': 'POST', 'ignore_text': 'Advertisement\nTracking' }, @@ -657,5 +657,5 @@ def test_ui_watch_edit_persists_all_fields(client, live_server): assert watch['title'] == 'Updated Watch Title' assert watch['time_between_check']['hours'] == 2 assert watch['time_between_check']['minutes'] == 30 - assert watch['fetch_backend'] == 'html_requests' + assert watch['browser_profile'] == 'direct_http_requests' assert watch['method'] == 'POST' diff --git a/changedetectionio/tests/test_conditions.py b/changedetectionio/tests/test_conditions.py index 1b7bb01dca0..ce5890873ca 100644 --- a/changedetectionio/tests/test_conditions.py +++ b/changedetectionio/tests/test_conditions.py @@ -72,7 +72,6 @@ def test_conditions_with_text_and_number(client, live_server, measure_memory_usa url_for("ui.ui_edit.edit_page", uuid=uuid), data={ "url": test_url, - "fetch_backend": "html_requests", "include_filters": ".number-container", "title": "Number AND Text Condition Test", "conditions_match_logic": CONDITIONS_MATCH_LOGIC_DEFAULT, # ALL = AND logic @@ -258,7 +257,6 @@ def test_lev_conditions_plugin(client, live_server, measure_memory_usage, datast url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1), data={ "url": test_url, - "fetch_backend": "html_requests", "conditions_match_logic": CONDITIONS_MATCH_LOGIC_DEFAULT, # ALL = AND logic "conditions-0-field": "levenshtein_ratio", "conditions-0-operator": "<", diff --git 
a/changedetectionio/tests/test_css_selector.py b/changedetectionio/tests/test_css_selector.py index 5a4f364d1bb..122c31c952f 100644 --- a/changedetectionio/tests/test_css_selector.py +++ b/changedetectionio/tests/test_css_selector.py @@ -89,7 +89,7 @@ def test_check_markup_include_filters_restriction(client, live_server, measure_m # Add our URL to the import page res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": include_filters, "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": include_filters, "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -144,7 +144,7 @@ def test_check_multiple_filters(client, live_server, measure_memory_usage, datas "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -195,7 +195,7 @@ def test_filter_is_empty_help_suggestion(client, live_server, measure_memory_usa "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_element_removal.py b/changedetectionio/tests/test_element_removal.py index 5fde2206a71..8582777d714 100644 --- a/changedetectionio/tests/test_element_removal.py +++ b/changedetectionio/tests/test_element_removal.py @@ -171,7 +171,7 @@ def test_element_removal_full(client, live_server, measure_memory_usage, datasto "url": test_url, "tags": "", "headers": "", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y", }, follow_redirects=True, diff --git 
a/changedetectionio/tests/test_errorhandling.py b/changedetectionio/tests/test_errorhandling.py index d96c7a225e8..87a681700fe 100644 --- a/changedetectionio/tests/test_errorhandling.py +++ b/changedetectionio/tests/test_errorhandling.py @@ -10,6 +10,8 @@ def _runner_test_http_errors(client, live_server, http_code, expected_text, datastore_path): + from loguru import logger + logger.debug(f"_runner_test_http_errors - testing text '{expected_text}' for code {http_code}") with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f: f.write("Now you going to get a {} error code\n".format(http_code)) @@ -20,6 +22,11 @@ def _runner_test_http_errors(client, live_server, http_code, expected_text, data status_code=http_code, _external=True) + if os.getenv("PLAYWRIGHT_DRIVER_URL") or os.getenv('WEBDRIVER_URL'): + logger.warning("!!! Looks like we're running test with playwright or selenium, so FORCE a connection back to our container 'cdio'") + test_url = test_url.replace('localhost.localdomain', 'changedet') + test_url = test_url.replace('localhost', 'changedet') + uuid = client.application.config.get('DATASTORE').add_watch(url=test_url) client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) @@ -76,7 +83,8 @@ def test_DNS_errors(client, live_server, measure_memory_usage, datastore_path): b"nodename nor servname provided" in res.data or b"Temporary failure in name resolution" in res.data or b"Failed to establish a new connection" in res.data or - b"Connection error occurred" in res.data + b"Connection error occurred" in res.data or + b"net::ERR_NAME_NOT_RESOLVED" in res.data ) assert found_name_resolution_error # Should always record that we tried @@ -108,7 +116,8 @@ def test_low_level_errors_clear_correctly(client, live_server, measure_memory_us b"nodename nor servname provided" in res.data or b"Temporary failure in name resolution" in res.data or b"Failed to establish a new connection" in res.data or - b"Connection error occurred" in res.data + 
b"Connection error occurred" in res.data or + b"net::ERR_NAME_NOT_RESOLVED" in res.data ) assert found_name_resolution_error @@ -117,7 +126,7 @@ def test_low_level_errors_clear_correctly(client, live_server, measure_memory_us url_for("ui.ui_edit.edit_page", uuid="first"), data={ "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -131,7 +140,8 @@ def test_low_level_errors_clear_correctly(client, live_server, measure_memory_us b"nodename nor servname provided" in res.data or b"Temporary failure in name resolution" in res.data or b"Failed to establish a new connection" in res.data or - b"Connection error occurred" in res.data + b"Connection error occurred" in res.data or + b"net::ERR_NAME_NOT_RESOLVED" in res.data ) assert not found_name_resolution_error diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index c51ad42bb5b..f5ebc87981a 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -92,7 +92,7 @@ def test_check_filter_multiline(client, live_server, measure_memory_usage, datas "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True @@ -143,7 +143,7 @@ def test_check_filter_and_regex_extract(client, live_server, measure_memory_usag "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True @@ -212,7 +212,7 @@ def test_regex_error_handling(client, live_server, measure_memory_usage, datasto url_for("ui.ui_edit.edit_page", uuid=uuid), data={"extract_text": '/something bad\d{3/XYZ', "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", 
"time_between_check_use_default": "y"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_filter_exist_changes.py b/changedetectionio/tests/test_filter_exist_changes.py index d5dccd72fd5..6c0dc54a82e 100644 --- a/changedetectionio/tests/test_filter_exist_changes.py +++ b/changedetectionio/tests/test_filter_exist_changes.py @@ -96,7 +96,7 @@ def test_filter_doesnt_exist_then_exists_should_get_notification(client, live_se # preprended with extra filter that intentionally doesn't match any entry, # notification should still be sent even if first filter does not match (PR#3516) "include_filters": ".non-matching-selector\n.ticket-available", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}) res = client.post( diff --git a/changedetectionio/tests/test_filter_failure_notification.py b/changedetectionio/tests/test_filter_failure_notification.py index 1520e3bf0e3..40ef43f46f2 100644 --- a/changedetectionio/tests/test_filter_failure_notification.py +++ b/changedetectionio/tests/test_filter_failure_notification.py @@ -70,7 +70,7 @@ def run_filter_test(client, live_server, content_filter, app_notification_format "Diff as Patch: {{diff_patch}}\n" ":-)", "notification_format": 'text', - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "filter_failure_notification_send": 'y', "time_between_check_use_default": "y", "headers": "", diff --git a/changedetectionio/tests/test_group.py b/changedetectionio/tests/test_group.py index 8e694114a5b..149373ac67a 100644 --- a/changedetectionio/tests/test_group.py +++ b/changedetectionio/tests/test_group.py @@ -417,7 +417,7 @@ def test_order_of_filters_tag_filter_and_watch_filter(client, live_server, measu "url": test_url, "tags": "test-tag-keep-order", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git 
a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py index 35bc17ff310..74f33e0d078 100644 --- a/changedetectionio/tests/test_history_consistency.py +++ b/changedetectionio/tests/test_history_consistency.py @@ -50,8 +50,7 @@ def test_consistent_history(client, live_server, measure_memory_usage, datastore res = client.post( url_for("settings.settings_page"), data={"application-empty_pages_are_a_change": "", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." in res.data @@ -244,7 +243,7 @@ def test_history_trim_global_override_in_watch(client, live_server, measure_memo uuid = client.application.config.get('DATASTORE').add_watch(url=test_url) res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": "", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", + data={"include_filters": "", "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y", "history_snapshot_max_length": str(limit)}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_ignore.py b/changedetectionio/tests/test_ignore.py index 6e825a24f86..dea81986279 100644 --- a/changedetectionio/tests/test_ignore.py +++ b/changedetectionio/tests/test_ignore.py @@ -68,8 +68,7 @@ def test_strip_ignore_lines(client, live_server, measure_memory_usage, datastore "requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", "application-strip_ignored_lines": "y", - "application-global_ignore_text": "Which is across multiple", - 'application-fetch_backend': "html_requests" + "application-global_ignore_text": "Which is across multiple" }, follow_redirects=True ) diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index 
37868faca45..9bfb20b0de7 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -108,7 +108,7 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa # Add our URL to the import page res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"ignore_text": ignore_text, "url": test_url, 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"ignore_text": ignore_text, "url": test_url, 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -181,8 +181,7 @@ def _run_test_global_ignore(client, datastore_path, as_source=False, extra_ignor data={ "requests-time_between_check-minutes": 180, "application-ignore_whitespace": "y", - "application-global_ignore_text": ignore_text, - 'application-fetch_backend': "html_requests" + "application-global_ignore_text": ignore_text }, follow_redirects=True ) @@ -204,7 +203,7 @@ def _run_test_global_ignore(client, datastore_path, as_source=False, extra_ignor #Adding some ignore text should not trigger a change res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." 
in res.data diff --git a/changedetectionio/tests/test_ignorehyperlinks.py b/changedetectionio/tests/test_ignorehyperlinks.py index b1a3477c5a8..2f2e7af0f38 100644 --- a/changedetectionio/tests/test_ignorehyperlinks.py +++ b/changedetectionio/tests/test_ignorehyperlinks.py @@ -53,7 +53,6 @@ def test_render_anchor_tag_content_true(client, live_server, measure_memory_usag url_for("settings.settings_page"), data={ "requests-time_between_check-minutes": 180, - "application-fetch_backend": "html_requests", }, follow_redirects=True, ) @@ -90,7 +89,6 @@ def test_render_anchor_tag_content_true(client, live_server, measure_memory_usag data={ "requests-time_between_check-minutes": 180, "application-render_anchor_tag_content": "true", - "application-fetch_backend": "html_requests", }, follow_redirects=True, ) diff --git a/changedetectionio/tests/test_ignorestatuscode.py b/changedetectionio/tests/test_ignorestatuscode.py index f3b9fe47624..3cc01032b50 100644 --- a/changedetectionio/tests/test_ignorestatuscode.py +++ b/changedetectionio/tests/test_ignorestatuscode.py @@ -49,8 +49,7 @@ def test_normal_page_check_works_with_ignore_status_code(client, live_server, me url_for("settings.settings_page"), data={ "requests-time_between_check-minutes": 180, - "application-ignore_status_codes": "y", - 'application-fetch_backend': "html_requests" + "application-ignore_status_codes": "y" }, follow_redirects=True ) @@ -117,7 +116,7 @@ def test_403_page_check_works_with_ignore_status_code(client, live_server, measu # Add our URL to the import page res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"ignore_status_codes": "y", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"ignore_status_codes": "y", "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." 
in res.data diff --git a/changedetectionio/tests/test_ignorewhitespace.py b/changedetectionio/tests/test_ignorewhitespace.py index cf158da4747..cc44ca19d47 100644 --- a/changedetectionio/tests/test_ignorewhitespace.py +++ b/changedetectionio/tests/test_ignorewhitespace.py @@ -58,8 +58,7 @@ def test_check_ignore_whitespace(client, live_server, measure_memory_usage, data url_for("settings.settings_page"), data={ "requests-time_between_check-minutes": 180, - "application-ignore_whitespace": "y", - "application-fetch_backend": "html_requests" + "application-ignore_whitespace": "y" }, follow_redirects=True ) diff --git a/changedetectionio/tests/test_import.py b/changedetectionio/tests/test_import.py index df67a821da7..ff2e1021a86 100644 --- a/changedetectionio/tests/test_import.py +++ b/changedetectionio/tests/test_import.py @@ -205,12 +205,7 @@ def test_import_watchete_xlsx(client, live_server, measure_memory_usage, datasto filters = watch.get('include_filters') assert filters[0] == '/html[1]/body[1]/div[4]/div[1]/div[1]/div[1]||//*[@id=\'content\']/div[3]/div[1]/div[1]||//*[@id=\'content\']/div[1]' assert watch.get('time_between_check') == {'weeks': 0, 'days': 1, 'hours': 6, 'minutes': 24, 'seconds': 0} - assert watch.get('fetch_backend') == 'html_requests' # Has inactive 'dynamic wachet' + assert watch.get('browser_profile') == 'direct_http_requests' # Has inactive 'dynamic wachet' - if watch.get('title') == 'JS website': - assert watch.get('fetch_backend') == 'html_webdriver' # Has active 'dynamic wachet' - - if watch.get('title') == 'system default website': - assert watch.get('fetch_backend') == 'system' # uses default if blank delete_all_watches(client) diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py index 00e0bcd78cf..4acd56554d4 100644 --- a/changedetectionio/tests/test_jsonpath_jq_selector.py +++ b/changedetectionio/tests/test_jsonpath_jq_selector.py @@ -377,7 +377,7 @@ def 
check_json_ext_filter(json_filter, client, live_server, datastore_path): "url": test_url, "tags": "", "headers": "", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True diff --git a/changedetectionio/tests/test_live_preview.py b/changedetectionio/tests/test_live_preview.py index 89629912da3..7506d6725e9 100644 --- a/changedetectionio/tests/test_live_preview.py +++ b/changedetectionio/tests/test_live_preview.py @@ -35,7 +35,7 @@ def test_content_filter_live_preview(client, live_server, measure_memory_usage, url_for("ui.ui_edit.edit_page", uuid=uuid), data={ "include_filters": "", - "fetch_backend": 'html_requests', + "browser_profile": 'direct_http_requests', "ignore_text": "something to ignore", "trigger_text": "something to trigger", "url": test_url, @@ -64,7 +64,7 @@ def test_content_filter_live_preview(client, live_server, measure_memory_usage, url_for("ui.ui_edit.watch_get_preview_rendered", uuid=uuid), data={ "include_filters": "", - "fetch_backend": 'html_requests', + "browser_profile": 'direct_http_requests', "ignore_text": "sOckS", # Also be sure case insensitive works "trigger_text": "AweSOme", "url": test_url, diff --git a/changedetectionio/tests/test_nonrenderable_pages.py b/changedetectionio/tests/test_nonrenderable_pages.py index 299f54125e6..764f24d2a7c 100644 --- a/changedetectionio/tests/test_nonrenderable_pages.py +++ b/changedetectionio/tests/test_nonrenderable_pages.py @@ -50,8 +50,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure client.post( url_for("settings.settings_page"), data={"application-empty_pages_are_a_change": "", # default, OFF, they are NOT a change - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -81,8 +80,7 @@ def test_check_basic_change_detection_functionality(client, live_server, 
measure client.post( url_for("settings.settings_page"), data={"application-empty_pages_are_a_change": "y", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) set_modified_response(datastore_path=datastore_path) diff --git a/changedetectionio/tests/test_notification.py b/changedetectionio/tests/test_notification.py index a679485fa88..96b6a62adb3 100644 --- a/changedetectionio/tests/test_notification.py +++ b/changedetectionio/tests/test_notification.py @@ -41,8 +41,7 @@ def test_check_notification(client, live_server, measure_memory_usage, datastore "application-notification_title": "fallback-title "+default_notification_title, "application-notification_body": "fallback-body "+default_notification_body, "application-notification_format": default_notification_format, - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -122,7 +121,7 @@ def test_check_notification(client, live_server, measure_memory_usage, datastore "tags": "my tag, my second tag", "title": "my title", "headers": "", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}) res = client.post( @@ -251,7 +250,7 @@ def test_check_notification(client, live_server, measure_memory_usage, datastore "notification_title": '', "notification_body": '', "notification_format": default_notification_format, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -282,7 +281,6 @@ def test_notification_urls_jinja2_apprise_integration(client, live_server, measu res = client.post( url_for("settings.settings_page"), data={ - "application-fetch_backend": "html_requests", "application-minutes_between_check": 180, 
"application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444, "somebug": "网站监测 内容更新了", "another": "{{diff|truncate(1500)}}" }', "application-notification_format": default_notification_format, @@ -314,7 +312,6 @@ def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_me res = client.post( url_for("settings.settings_page"), data={ - "application-fetch_backend": "html_requests", "application-minutes_between_check": 180, "application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444, "somebug": "网站监测 内容更新了" }', "application-notification_format": default_notification_format, @@ -399,7 +396,6 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage res = client.post( url_for("settings.settings_page"), data={ - "application-fetch_backend": "html_requests", "application-minutes_between_check": 180, "application-notification_body": test_body, "application-notification_format": default_notification_format, @@ -478,7 +474,8 @@ def test_global_send_test_notification(client, live_server, measure_memory_usage b"nodename nor servname provided" in res.data or b"Temporary failure in name resolution" in res.data or b"Failed to establish a new connection" in res.data or - b"Connection error occurred" in res.data + b"Connection error occurred" in res.data or + b"net::ERR_NAME_NOT_RESOLVED" in res.data ) client.get( @@ -556,7 +553,6 @@ def _test_color_notifications(client, notification_body_token, datastore_path): res = client.post( url_for("settings.settings_page"), data={ - "application-fetch_backend": "html_requests", "application-minutes_between_check": 180, "application-notification_body": notification_body_token, "application-notification_format": "htmlcolor", diff --git a/changedetectionio/tests/test_notification_errors.py b/changedetectionio/tests/test_notification_errors.py index 4b235da9351..71274480eb3 100644 --- a/changedetectionio/tests/test_notification_errors.py +++ 
b/changedetectionio/tests/test_notification_errors.py @@ -36,7 +36,7 @@ def test_check_notification_error_handling(client, live_server, measure_memory_u "title": "", "headers": "", "time_between_check-minutes": "180", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -71,7 +71,8 @@ def test_check_notification_error_handling(client, live_server, measure_memory_u b"nodename nor servname provided" in res.data or b"Temporary failure in name resolution" in res.data or b"Failed to establish a new connection" in res.data or - b"Connection error occurred" in res.data + b"Connection error occurred" in res.data or + b"net::ERR_NAME_NOT_RESOLVED" in res.data ) assert found_name_resolution_error diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 1a690ba14f1..e499110d301 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -9,6 +9,10 @@ # Hard to just add more live server URLs when one test is already running (I think) # So we add our test here (was in a different file) def test_headers_in_request(client, live_server, measure_memory_usage, datastore_path): + if os.getenv('WEBDRIVER_URL'): + print("Selenium doesnt support custom HTTP headers!!") + return + #ve_server_setup(live_server) # Add our URL to the import page test_url = url_for('test_headers', _external=True) @@ -35,7 +39,7 @@ def test_headers_in_request(client, live_server, measure_memory_usage, datastore data={ "url": test_url, "tags": "", - "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', + "browser_profile": "system", "headers": "jinja2:{{ 1+1 }}\nxxx:ooo\ncool:yeah\r\ncookie:"+cookie_header, "time_between_check_use_default": "y"}, follow_redirects=True @@ -98,7 +102,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa "url": test_url, "tags": 
"", "method": "POST", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "body": "something something", "time_between_check_use_default": "y"}, follow_redirects=True @@ -116,7 +120,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa "url": test_url, "tags": "", "method": "POST", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "body": body_value, "time_between_check_use_default": "y"}, follow_redirects=True @@ -163,7 +167,7 @@ def test_body_in_request(client, live_server, measure_memory_usage, datastore_pa "url": test_url, "tags": "", "method": "GET", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "body": "invalid", "time_between_check_use_default": "y"}, follow_redirects=True @@ -195,7 +199,7 @@ def test_method_in_request(client, live_server, measure_memory_usage, datastore_ data={ "url": test_url, "tags": "", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "method": "invalid", "time_between_check_use_default": "y"}, follow_redirects=True @@ -208,7 +212,7 @@ def test_method_in_request(client, live_server, measure_memory_usage, datastore_ data={ "url": test_url, "tags": "", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "method": "PATCH", "time_between_check_use_default": "y"}, follow_redirects=True @@ -245,97 +249,74 @@ def test_method_in_request(client, live_server, measure_memory_usage, datastore_ delete_all_watches(client) -# Re #2408 - user-agent override test, also should handle case-insensitive header deduplication +# Re #2408 - user-agent override via BrowserProfile; per-watch headers override the profile UA def test_ua_global_override(client, live_server, measure_memory_usage, datastore_path): - ## live_server_setup(live_server) # Setup on conftest per function test_url = url_for('test_headers', _external=True) + datastore = 
client.application.config.get('DATASTORE') + # Create a requests-type browser profile with a custom UA res = client.post( - url_for("settings.settings_page"), + url_for('settings.settings_browsers.save'), data={ - "application-fetch_backend": "html_requests", - "application-minutes_between_check": 180, - "requests-default_ua-html_requests": "html-requests-user-agent" + 'name': 'UA Test Profile', + 'fetch_backend': 'requests', + 'browser_connection_url': '', + 'viewport_width': 1280, + 'viewport_height': 1000, + 'block_images': '', + 'block_fonts': '', + 'ignore_https_errors': '', + 'user_agent': 'profile-ua-test/1.0', + 'locale': '', + 'custom_headers': '', + 'original_machine_name': '', }, - follow_redirects=True + follow_redirects=True, ) - assert b'Settings updated' in res.data + assert b'saved.' in res.data - uuid = client.application.config.get('DATASTORE').add_watch(url=test_url) - client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + from changedetectionio.model.browser_profile import BrowserProfile + profile_machine_name = BrowserProfile(name='UA Test Profile', fetch_backend='requests').get_machine_name() + uuid = datastore.add_watch(url=test_url, extras={'browser_profile': profile_machine_name}) + client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) wait_for_all_checks(client) - res = client.get( - url_for("ui.ui_preview.preview_page", uuid="first"), - follow_redirects=True - ) - assert b"html-requests-user-agent" in res.data - # default user-agent should have shown by now - # now add a custom one in the headers + res = client.get(url_for("ui.ui_preview.preview_page", uuid="first"), follow_redirects=True) + assert b"profile-ua-test/1.0" in res.data - - # Add some headers to a request + # Per-watch User-Agent header should override the profile UA (case-insensitive) res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), data={ "url": test_url, - "tags": "testtag", - "fetch_backend": 'html_requests', - # Important - 
also test case-insensitive + "tags": "", + "browser_profile": profile_machine_name, "headers": "User-AGent: agent-from-watch", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." in res.data wait_for_all_checks(client) - res = client.get( - url_for("ui.ui_preview.preview_page", uuid="first"), - follow_redirects=True - ) + res = client.get(url_for("ui.ui_preview.preview_page", uuid="first"), follow_redirects=True) assert b"agent-from-watch" in res.data - assert b"html-requests-user-agent" not in res.data + assert b"profile-ua-test/1.0" not in res.data + + client.get(url_for('settings.settings_browsers.delete', machine_name=profile_machine_name), follow_redirects=True) delete_all_watches(client) def test_headers_textfile_in_request(client, live_server, measure_memory_usage, datastore_path): import os - # Add our URL to the import page - - webdriver_ua = "Hello fancy webdriver UA 1.0" - requests_ua = "Hello basic requests UA 1.1" + if os.getenv('WEBDRIVER_URL'): + print("Selenium doesnt support custom HTTP headers!!") + return test_url = url_for('test_headers', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): # Because its no longer calling back to localhost but from the browser container, set in test-only.yml test_url = test_url.replace('localhost', 'cdio') - form_data = { - "application-fetch_backend": "html_requests", - "application-minutes_between_check": 180, - "requests-default_ua-html_requests": requests_ua - } - - if os.getenv('PLAYWRIGHT_DRIVER_URL'): - form_data["requests-default_ua-html_webdriver"] = webdriver_ua - - res = client.post( - url_for("settings.settings_page"), - data=form_data, - follow_redirects=True - ) - assert b'Settings updated' in res.data - - res = client.get(url_for("settings.settings_page")) - - # Only when some kind of real browser is setup - if os.getenv('PLAYWRIGHT_DRIVER_URL'): - assert b'requests-default_ua-html_webdriver' in res.data - - # Field should always be there - assert 
b"requests-default_ua-html_requests" in res.data - - # Add the test URL twice, we will check uuid = client.application.config.get('DATASTORE').add_watch(url=test_url) client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) @@ -347,7 +328,7 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage, data={ "url": test_url, "tags": "testtag", - "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', + "browser_profile": "system", "headers": "xxx:ooo\ncool:yeah\r\n", "time_between_check_use_default": "y"}, follow_redirects=True @@ -398,12 +379,6 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage, assert b"Url-Header-Global:http://example.com/global" in res.data assert b"Url-Header-Watch:http://example.com/watch" in res.data - # Check the custom UA from system settings page made it through - if os.getenv('PLAYWRIGHT_DRIVER_URL'): - assert "User-Agent:".encode('utf-8') + webdriver_ua.encode('utf-8') in res.data - else: - assert "User-Agent:".encode('utf-8') + requests_ua.encode('utf-8') in res.data - # unlink headers.txt on start/stop delete_all_watches(client) @@ -418,7 +393,7 @@ def test_headers_validation(client, live_server, measure_memory_usage, datastore url_for("ui.ui_edit.edit_page", uuid="first"), data={ "url": test_url, - "fetch_backend": 'html_requests', + "browser_profile": 'direct_http_requests', "headers": "User-AGent agent-from-watch\r\nsadfsadfsadfsdaf\r\n:foobar", "time_between_check_use_default": "y"}, follow_redirects=True diff --git a/changedetectionio/tests/test_restock_itemprop.py b/changedetectionio/tests/test_restock_itemprop.py index 352825fd7e8..10537303a4e 100644 --- a/changedetectionio/tests/test_restock_itemprop.py +++ b/changedetectionio/tests/test_restock_itemprop.py @@ -109,7 +109,7 @@ def test_itemprop_price_change(client, live_server, measure_memory_usage, datast set_original_response(props_markup=instock_props[0], price='120.45', 
datastore_path=datastore_path) res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"processor_config_restock_diff-follow_price_changes": "", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"processor_config_restock_diff-follow_price_changes": "", "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -141,7 +141,7 @@ def _run_test_minmax_limit(client, extra_watch_edit_form, datastore_path): "url": test_url, "headers": "", "time_between_check-hours": 5, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" } data.update(extra_watch_edit_form) @@ -263,7 +263,7 @@ def test_itemprop_percent_threshold(client, live_server, measure_memory_usage, d "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True @@ -309,7 +309,7 @@ def test_itemprop_percent_threshold(client, live_server, measure_memory_usage, d "processor_config_restock_diff-price_change_threshold_percent": 5.05, "processor": "text_json_diff", "url": test_url, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True @@ -361,8 +361,7 @@ def test_change_with_notification_values(client, live_server, measure_memory_usa "application-notification_title": "title new price {{restock.price}}", "application-notification_body": "new price {{restock.price}} previous price {{restock.previous_price}} instock {{restock.in_stock}}", "application-notification_format": default_notification_format, - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + 
"requests-time_between_check-minutes": 180}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_rss.py b/changedetectionio/tests/test_rss.py index eb6606906d2..047effa3091 100644 --- a/changedetectionio/tests/test_rss.py +++ b/changedetectionio/tests/test_rss.py @@ -152,7 +152,7 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage, datastor url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1), data={ "include_filters": "//item/title", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "headers": "", "proxy": "no-proxy", "tags": "", diff --git a/changedetectionio/tests/test_rss_single_watch.py b/changedetectionio/tests/test_rss_single_watch.py index fc7128cb932..be5ed504692 100644 --- a/changedetectionio/tests/test_rss_single_watch.py +++ b/changedetectionio/tests/test_rss_single_watch.py @@ -273,7 +273,6 @@ def test_rss_single_watch_follow_notification_body(client, live_server, measure_ res = client.post( url_for("settings.settings_page"), data={ - "application-fetch_backend": "html_requests", "application-minutes_between_check": 180, "application-notification_body": 'Boo yeah hello from main settings notification body
    \nTitle: {{ watch_title }} changed', "application-notification_format": default_notification_format, @@ -335,7 +334,7 @@ def test_rss_single_watch_follow_notification_body(client, live_server, measure_ url_for("ui.ui_edit.edit_page", uuid=uuid), data={"notification_body": "RSS body description set from watch level at notification body - {{ watch_title }}", "url": test_url, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True diff --git a/changedetectionio/tests/test_scheduler.py b/changedetectionio/tests/test_scheduler.py index a8b0d0064ca..dbaa3613617 100644 --- a/changedetectionio/tests/test_scheduler.py +++ b/changedetectionio/tests/test_scheduler.py @@ -24,8 +24,7 @@ def test_check_basic_scheduler_functionality(client, live_server, measure_memory url_for("settings.settings_page"), data={"application-empty_pages_are_a_change": "", "requests-time_between_check-seconds": 1, - "application-scheduler_timezone_default": "Pacific/Kiritimati", # Most Forward Time Zone (UTC+14:00) - 'application-fetch_backend': "html_requests"}, + "application-scheduler_timezone_default": "Pacific/Kiritimati"}, # Most Forward Time Zone (UTC+14:00) follow_redirects=True ) @@ -59,7 +58,7 @@ def test_check_basic_scheduler_functionality(client, live_server, measure_memory data = { "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "" # no } data.update(scheduler_data) @@ -120,7 +119,6 @@ def test_check_basic_global_scheduler_functionality(client, live_server, measure data = { "application-empty_pages_are_a_change": "", "application-scheduler_timezone_default": "Pacific/Kiritimati", # Most Forward Time Zone (UTC+14:00) - 'application-fetch_backend': "html_requests", "requests-time_between_check-hours": 0, "requests-time_between_check-minutes": 0, "requests-time_between_check-seconds": 1, @@ -152,7 +150,7 @@ def 
test_check_basic_global_scheduler_functionality(client, live_server, measure url_for("ui.ui_edit.edit_page", uuid="first"), data={ "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -182,7 +180,7 @@ def test_validation_time_interval_field(client, live_server, measure_memory_usag url_for("ui.ui_edit.edit_page", uuid="first"), data={"trigger_text": 'The golden line', "url": test_url, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", 'filter_text_removed': 'y', "time_between_check_use_default": "" }, @@ -197,7 +195,7 @@ def test_validation_time_interval_field(client, live_server, measure_memory_usag url_for("ui.ui_edit.edit_page", uuid="first"), data={"trigger_text": 'The golden line', "url": test_url, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check-minutes": 1, "time_between_check_use_default": "" }, diff --git a/changedetectionio/tests/test_search.py b/changedetectionio/tests/test_search.py index e89b7a14aba..35192315dc2 100644 --- a/changedetectionio/tests/test_search.py +++ b/changedetectionio/tests/test_search.py @@ -27,7 +27,7 @@ def test_basic_search(client, live_server, measure_memory_usage, datastore_path) res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"title": "xxx-title", "url": urls[0], "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"title": "xxx-title", "url": urls[0], "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." 
in res.data @@ -62,7 +62,7 @@ def test_search_in_tag_limit(client, live_server, measure_memory_usage, datastor res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), data={"title": "xxx-title", "url": urls[0].split(' ')[0], "tags": urls[0].split(' ')[1], "headers": "", - 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." in res.data diff --git a/changedetectionio/tests/test_security.py b/changedetectionio/tests/test_security.py index 04eb1236f49..5463cfda7bf 100644 --- a/changedetectionio/tests/test_security.py +++ b/changedetectionio/tests/test_security.py @@ -68,7 +68,7 @@ def test_bad_access(client, live_server, measure_memory_usage, datastore_path): "url": 'javascript:alert(document.domain)', "tags": "", "method": "GET", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "body": "", "time_between_check_use_default": "y"}, follow_redirects=True @@ -159,8 +159,7 @@ def test_xss(client, live_server, measure_memory_usage, datastore_path): "application-notification_title": '">', "application-notification_body": '">', "application-notification_format": default_notification_format, - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) @@ -205,7 +204,7 @@ def test_xss_watch_last_error(client, live_server, measure_memory_usage, datasto data={ "include_filters": '', "url": url_for('test_endpoint', _external=True), - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True diff --git a/changedetectionio/tests/test_settings_tag_force_reprocess.py b/changedetectionio/tests/test_settings_tag_force_reprocess.py index 29fee83d7cc..50885af694b 100644 --- 
a/changedetectionio/tests/test_settings_tag_force_reprocess.py +++ b/changedetectionio/tests/test_settings_tag_force_reprocess.py @@ -54,8 +54,7 @@ def test_settings_change_forces_reprocess(client, live_server, measure_memory_us url_for("settings.settings_page"), data={ "application-empty_pages_are_a_change": "", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests" + "requests-time_between_check-minutes": 180 }, follow_redirects=True ) diff --git a/changedetectionio/tests/test_share_watch.py b/changedetectionio/tests/test_share_watch.py index 025b2f2c7cc..11a768b05c9 100644 --- a/changedetectionio/tests/test_share_watch.py +++ b/changedetectionio/tests/test_share_watch.py @@ -22,7 +22,7 @@ def test_share_watch(client, live_server, measure_memory_usage, datastore_path): # Add our URL to the import page res = client.post( url_for("ui.ui_edit.edit_page", uuid=uuid), - data={"include_filters": include_filters, "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": include_filters, "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." 
in res.data diff --git a/changedetectionio/tests/test_source.py b/changedetectionio/tests/test_source.py index e0bbf64979b..5033622c694 100644 --- a/changedetectionio/tests/test_source.py +++ b/changedetectionio/tests/test_source.py @@ -67,7 +67,7 @@ def test_check_ignore_elements(client, live_server, measure_memory_usage, datast client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": 'span,p', "url": test_url, "tags": "", "subtractive_selectors": ".foobar-detection", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": 'span,p', "url": test_url, "tags": "", "subtractive_selectors": ".foobar-detection", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_trigger.py b/changedetectionio/tests/test_trigger.py index 25714d97e6a..ee1d7deb31a 100644 --- a/changedetectionio/tests/test_trigger.py +++ b/changedetectionio/tests/test_trigger.py @@ -80,7 +80,7 @@ def test_trigger_functionality(client, live_server, measure_memory_usage, datast data={"trigger_text": trigger_text, "ignore_text": "and more", "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_trigger_regex.py b/changedetectionio/tests/test_trigger_regex.py index c2cfed4aa0a..27a834aaa9c 100644 --- a/changedetectionio/tests/test_trigger_regex.py +++ b/changedetectionio/tests/test_trigger_regex.py @@ -46,7 +46,7 @@ def test_trigger_regex_functionality(client, live_server, measure_memory_usage, url_for("ui.ui_edit.edit_page", uuid="first"), data={"trigger_text": '/something \d{3}/', "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git 
a/changedetectionio/tests/test_trigger_regex_with_filter.py b/changedetectionio/tests/test_trigger_regex_with_filter.py index 28db3a5c8e3..0fe71e3b304 100644 --- a/changedetectionio/tests/test_trigger_regex_with_filter.py +++ b/changedetectionio/tests/test_trigger_regex_with_filter.py @@ -44,7 +44,7 @@ def test_trigger_regex_functionality_with_filter(client, live_server, measure_me data={"trigger_text": "/cool.stuff/", "url": test_url, "include_filters": '#in-here', - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_ui.py b/changedetectionio/tests/test_ui.py index d6adbefd37d..5fa09ce856a 100644 --- a/changedetectionio/tests/test_ui.py +++ b/changedetectionio/tests/test_ui.py @@ -44,7 +44,7 @@ class globalSettingsRequestForm(Form): url_for("ui.ui_edit.edit_page", uuid="first"), data={ "url": test_url, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "", # OFF "time_between_check-weeks": '', "time_between_check-days": '', @@ -63,7 +63,7 @@ class globalSettingsRequestForm(Form): url_for("ui.ui_edit.edit_page", uuid="first"), data={ "url": test_url, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "", # OFF "time_between_check-weeks": '', "time_between_check-days": '', @@ -82,7 +82,7 @@ class globalSettingsRequestForm(Form): url_for("ui.ui_edit.edit_page", uuid="first"), data={ "url": test_url, - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y", # ON YES "time_between_check-weeks": '', "time_between_check-days": '', @@ -179,8 +179,7 @@ def test_page_title_listing_behaviour(client, live_server, measure_memory_usage, res = client.post( url_for("settings.settings_page"), data={"application-ui-use_page_title_in_list": "", - 
"requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." in res.data @@ -207,7 +206,7 @@ def test_page_title_listing_behaviour(client, live_server, measure_memory_usage, data={ "url": url_for('test_endpoint', _external=True), "title": "my title", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -219,8 +218,7 @@ def test_page_title_listing_behaviour(client, live_server, measure_memory_usage, res = client.post( url_for("settings.settings_page"), data={"application-ui-use_page_title_in_list": "y", - "requests-time_between_check-minutes": 180, - 'application-fetch_backend': "html_requests"}, + "requests-time_between_check-minutes": 180}, follow_redirects=True ) assert b"Settings updated." in res.data @@ -235,7 +233,7 @@ def test_page_title_listing_behaviour(client, live_server, measure_memory_usage, data={ "url": url_for('test_endpoint', _external=True), "title": "", - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_unique_lines.py b/changedetectionio/tests/test_unique_lines.py index 294578c874b..18e457e39fb 100644 --- a/changedetectionio/tests/test_unique_lines.py +++ b/changedetectionio/tests/test_unique_lines.py @@ -89,7 +89,7 @@ def test_unique_lines_functionality(client, live_server, measure_memory_usage, d url_for("ui.ui_edit.edit_page", uuid="first"), data={"check_unique_lines": "y", "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -133,7 +133,7 @@ def test_sort_lines_functionality(client, live_server, measure_memory_usage, dat url_for("ui.ui_edit.edit_page", uuid="first"), 
data={"sort_text_alphabetically": "n", "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -181,7 +181,7 @@ def test_extra_filters(client, live_server, measure_memory_usage, datastore_path "trim_text_whitespace": "y", "sort_text_alphabetically": "", # leave this OFF for testing "url": test_url, - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_watch_fields_storage.py b/changedetectionio/tests/test_watch_fields_storage.py index 9cb32ecf365..a593e5dcb22 100644 --- a/changedetectionio/tests/test_watch_fields_storage.py +++ b/changedetectionio/tests/test_watch_fields_storage.py @@ -25,7 +25,7 @@ def test_check_watch_field_storage(client, live_server, measure_memory_usage, da "url": test_url, "tags": "woohoo", "headers": "curl:foo", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y" }, follow_redirects=True diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py index 88ce48f0e0b..b495e798f48 100644 --- a/changedetectionio/tests/test_xpath_selector.py +++ b/changedetectionio/tests/test_xpath_selector.py @@ -119,7 +119,7 @@ def test_check_xpath_filter_utf8(client, live_server, measure_memory_usage, data wait_for_all_checks(client) res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": filter, "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": filter, "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." 
in res.data @@ -168,7 +168,7 @@ def test_check_xpath_text_function_utf8(client, live_server, measure_memory_usag wait_for_all_checks(client) res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": filter, "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": filter, "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -205,7 +205,7 @@ def test_check_markup_xpath_filter_restriction(client, live_server, measure_memo # Add our URL to the import page res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": xpath_filter, "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": xpath_filter, "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"Updated watch." 
in res.data @@ -238,7 +238,7 @@ def test_xpath_validation(client, live_server, measure_memory_usage, datastore_p res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": "/something horrible", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": "/something horrible", "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"is not a valid XPath expression" in res.data @@ -254,7 +254,7 @@ def test_xpath23_prefix_validation(client, live_server, measure_memory_usage, da res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": "xpath:/something horrible", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + data={"include_filters": "xpath:/something horrible", "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"is not a valid XPath expression" in res.data @@ -300,7 +300,7 @@ def test_xpath1_lxml(client, live_server, measure_memory_usage, datastore_path): res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), data={"include_filters": "xpath1://title/text()", "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -329,7 +329,7 @@ def test_xpath1_validation(client, live_server, measure_memory_usage, datastore_ res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), - data={"include_filters": "xpath1:/something horrible", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + 
data={"include_filters": "xpath1:/something horrible", "url": test_url, "tags": "", "headers": "", 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"is not a valid XPath expression" in res.data @@ -351,7 +351,7 @@ def test_check_with_prefix_include_filters(client, live_server, measure_memory_u res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), data={"include_filters": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -401,7 +401,7 @@ def test_various_rules(client, live_server, measure_memory_usage, datastore_path "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -428,7 +428,7 @@ def test_xpath_20(client, live_server, measure_memory_usage, datastore_path): "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -462,7 +462,7 @@ def test_xpath_20_function_count(client, live_server, measure_memory_usage, data "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -495,7 +495,7 @@ def test_xpath_20_function_count2(client, live_server, measure_memory_usage, dat "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -531,7 +531,7 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo "url": test_url, 
"tags": "", "headers": "", - 'fetch_backend': "html_requests", + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) @@ -567,7 +567,7 @@ def _subtest_xpath_rss(client, datastore_path, content_type='text/html'): "url": test_url, "include_filters": "xpath://item", "tags": '', - "fetch_backend": "html_requests", + "browser_profile": "direct_http_requests", "time_between_check_use_default": "y", }, follow_redirects=True @@ -661,7 +661,7 @@ def test_xpath_blocked_functions_form_validation(client, live_server, measure_me res = client.post( url_for("ui.ui_edit.edit_page", uuid="first"), data={"include_filters": expr, "url": test_url, "tags": "", "headers": "", - 'fetch_backend': "html_requests", "time_between_check_use_default": "y"}, + 'browser_profile': "direct_http_requests", "time_between_check_use_default": "y"}, follow_redirects=True ) assert b"is not a valid XPath expression" in res.data, \ diff --git a/changedetectionio/tests/visualselector/test_fetch_data.py b/changedetectionio/tests/visualselector/test_fetch_data.py index e217d5fde62..de4b0be267f 100644 --- a/changedetectionio/tests/visualselector/test_fetch_data.py +++ b/changedetectionio/tests/visualselector/test_fetch_data.py @@ -3,6 +3,8 @@ import os from flask import url_for from ..util import live_server_setup, wait_for_all_checks +from ... 
import strtobool + # def test_setup(client, live_server, measure_memory_usage, datastore_path): # live_server_setup(live_server) # Setup on conftest per function @@ -35,7 +37,6 @@ def test_visual_selector_content_ready(client, live_server, measure_memory_usage "tags": "", # For now, cookies doesnt work in headers because it must be a full cookiejar object 'headers': "testheader: yes\buser-agent: MyCustomAgent", - 'fetch_backend': "html_webdriver", "time_between_check_use_default": "y", }, follow_redirects=True @@ -88,6 +89,9 @@ def test_visual_selector_content_ready(client, live_server, measure_memory_usage def test_basic_browserstep(client, live_server, measure_memory_usage, datastore_path): + if os.getenv('PLAYWRIGHT_DRIVER_URL') and strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')): + print("Puppeteer chrome fetch for BrowserSteps not supported!! test_basic_browserstep will be skipped") + return test_url = url_for('test_interactive_html_endpoint', _external=True) test_url = test_url.replace('localhost.localdomain', 'cdio') @@ -106,7 +110,6 @@ def test_basic_browserstep(client, live_server, measure_memory_usage, datastore_ data={ "url": test_url, "tags": "", - 'fetch_backend': "html_webdriver", 'browser_steps-5-operation': 'Enter text in field', 'browser_steps-5-selector': '#test-input-text', # Should get set to the actual text (jinja2 rendered) @@ -173,7 +176,6 @@ def test_non_200_errors_report_browsersteps(client, live_server, measure_memory_ data={ "url": four_o_four_url, "tags": "", - 'fetch_backend': "html_webdriver", 'browser_steps-0-operation': 'Click element', 'browser_steps-0-selector': 'button[name=test-button]', 'browser_steps-0-optional_value': '', @@ -203,7 +205,7 @@ def test_browsersteps_edit_UI_startsession(client, live_server, measure_memory_u test_url = test_url.replace('localhost.localdomain', 'cdio') test_url = test_url.replace('localhost', 'cdio') - uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, 
extras={'fetch_backend': 'html_webdriver', 'paused': True}) + uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={'paused': True}) # Test starting a browsersteps session res = client.get( @@ -239,7 +241,6 @@ def test_browsersteps_edit_UI_startsession(client, live_server, measure_memory_u data={ "url": test_url, "tags": "", - 'fetch_backend': "html_webdriver", "time_between_check_use_default": "y", }, follow_redirects=True diff --git a/changedetectionio/worker.py b/changedetectionio/worker.py index 41bcd29cba1..7e02ef6f407 100644 --- a/changedetectionio/worker.py +++ b/changedetectionio/worker.py @@ -214,6 +214,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec process_changedetection_results = False except content_fetchers_exceptions.Non200ErrorCodeReceived as e: + logger.info(f"Watch UUID {uuid} Non200ErrorCodeReceived") if e.status_code == 403: err_text = "Error - 403 (Access denied) received" elif e.status_code == 404: @@ -378,6 +379,11 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec process_changedetection_results = False logger.error(f"Exception (BrowserStepsInUnsupportedFetcher) reached processing watch UUID: {uuid}") + except KeyError as e: + # Watch was deleted between being queued and processed — skip + logger.warning(f"Worker {worker_id} skipping UUID {uuid}: {e}") + process_changedetection_results = False + except Exception as e: import traceback logger.error(f"Worker {worker_id} exception processing watch UUID: {uuid}") diff --git a/docs/api-spec.yaml b/docs/api-spec.yaml index fa8fcc8ca2d..9c7998d2fee 100644 --- a/docs/api-spec.yaml +++ b/docs/api-spec.yaml @@ -316,17 +316,15 @@ components: type: string enum: [GET, POST, DELETE, PUT] description: HTTP method to use - fetch_backend: + browser_profile: type: string description: | - Backend to use for fetching content. 
Common values: - - `system` (default) - Use the system-wide default fetcher - - `html_requests` - Fast requests-based fetcher - - `html_webdriver` - Browser-based fetcher (Playwright/Puppeteer) - - `extra_browser_*` - Custom browser configurations (if configured) - - Plugin-provided fetchers (if installed) - pattern: '^(system|html_requests|html_webdriver|extra_browser_.+)$' - default: system + Browser profile (machine name) to use for fetching this watch. + - `null` or omitted — use the system default profile + - `direct_http_requests` — fast requests-based fetcher + - `browser_chromeplaywright` — Chrome/Playwright browser + - Any named profile configured in Settings → Browsers + - Plugin-provided profiles (e.g. CloakBrowser, if installed) headers: type: object additionalProperties: