diff --git a/changedetectionio/blueprint/watchlist/templates/watch-overview.html b/changedetectionio/blueprint/watchlist/templates/watch-overview.html index 1c6ab0ba85d..1cb772aae15 100644 --- a/changedetectionio/blueprint/watchlist/templates/watch-overview.html +++ b/changedetectionio/blueprint/watchlist/templates/watch-overview.html @@ -196,9 +196,9 @@ {%- if watch.get('restock') and watch['restock']['price'] != None -%} {%- if watch['restock']['price'] != None -%} - - {{ watch['restock']['price']|format_number_locale }} {{ watch['restock']['currency'] }} - + + {{ watch['restock']['currency'] }} {{ watch['restock']['price']|format_number_locale }} + {%- endif -%} {%- elif not watch.has_restock_info -%} No information diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 1fa81058caa..475076279ff 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -2,9 +2,11 @@ from ..exceptions import ProcessorException from . 
def _deduplicate_prices(data):
    """Return the distinct numeric prices found in *data*, in first-seen order.

    Some price data has multiple entries, OR it has a single entry holding a
    list such as ['$159', '159', 159, "$ 159", "R 3,299"], or just "159".
    Every value is normalised to a float and duplicates are dropped.

    Args:
        data: iterable of objects exposing a ``.value`` attribute holding a
            scalar or a list of scalars. ``None`` is treated as empty.

    Returns:
        list[float]: unique prices ordered by first appearance. Values that
        cannot be read as a number are skipped.
    """
    import re

    def normalize(value):
        """Convert one raw price value to float, or None if it has no number."""
        text = str(value).strip()
        if not text:
            return None

        # Drop punctuation that is not a numeric separator: a '.'/',' glued to
        # a letter ("Rs.500") or not followed by a digit ("Rs. 500", "159,-").
        text = re.sub(r'(?<=[^\W\d_])[.,]|[.,](?!\d)', '', text)

        # Remove currency symbols and spaces (keep digits, dots, commas)
        number = re.sub(r'[^\d.,]', '', text)
        if not number:
            return None

        dots = number.count('.')
        commas = number.count(',')
        if dots and commas:
            # Both kinds present: the right-most one is the decimal mark.
            if number.rfind('.') > number.rfind(','):
                number = number.replace(',', '')
            else:
                number = number.replace('.', '').replace(',', '.')
        elif commas:
            whole, _, fraction = number.rpartition(',')
            if commas == 1 and len(fraction) != 3:
                # "12,50" -> decimal comma
                number = f"{whole}.{fraction}"
            else:
                # "3,299" / "1,299,000" -> thousands separators
                number = number.replace(',', '')
        elif dots > 1 and re.fullmatch(r'\d{1,3}(?:\.\d{3})+', number):
            # "1.299.000" -> dots used as thousands separators
            number = number.replace('.', '')

        try:
            return float(number)
        except ValueError:
            return None

    # A dict keeps insertion order, so callers that take the first entry get
    # the first price seen rather than an arbitrary set element.
    unique_data = {}
    for datum in data or ():
        raw = datum.value
        items = raw if isinstance(raw, list) else [raw]
        for item in items:
            price = normalize(item)
            if price is not None:
                unique_data.setdefault(price, None)

    return list(unique_data)


def extract_price_from_html(html_content):
    """Fallback parser for when extruct does not return usable data.

    Only the FIRST element whose class contains "currency" is inspected
    (presumably the buy box - confirm per site). Both the price and the
    currency prefix are read from that element's text.

    Args:
        html_content: raw HTML markup of the page.

    Returns:
        tuple: ``(price, currency)``. ``price`` is a float or None;
        ``currency`` is the non-digit text preceding the number (e.g. "R",
        "$") or None. ``(None, None)`` when no matching element or text exists.
    """
    import re
    from types import SimpleNamespace

    from bs4 import BeautifulSoup

    if not html_content:
        return None, None

    soup = BeautifulSoup(html_content, "html.parser")

    # Find the first element with a class containing 'currency'
    el = soup.find(class_=re.compile(r'currency'))
    if el is None:
        return None, None

    text = el.get_text(strip=True)
    if not text:
        return None, None

    # Extract currency (any non-digit run at the start, e.g. "R", "$")
    match = re.match(r"([^\d]+)", text)
    currency = (match.group(1).strip() or None) if match else None

    # Reuse the shared normaliser so both code paths parse numbers identically
    prices = _deduplicate_prices([SimpleNamespace(value=text)])
    price = prices[0] if prices else None

    return price, currency
if len(price_result) > 1 and len(price_result) > 1: # See of all prices are different, in the case that one product has many embedded data types with the same price @@ -120,6 +171,27 @@ def get_itemprop_availability(html_content) -> Restock: value['availability'] = _search_prop_by_value([match.value], "product:availability") if not value.get('currency'): value['currency'] = _search_prop_by_value([match.value], "price:currency") + + # lastly, try utilize raw HTML search for the price and availability + if not value.get('price'): + # ---- Fallback to raw HTML parsing ---- + logger.debug("Falling back to BeautifulSoup parsing for price info...") + price, currency = extract_price_from_html(html_content) + if price: + value['price'] = price + if not value['currency']: + if currency: + value['currency'] = currency + + if not value.get('availability'): + # Availability from classes or text + soup = BeautifulSoup(html_content, "html.parser") + availability_texts = soup.find_all("div", {"data-ref": "in-stock-indicator"}) + if availability_texts: + value['availability'] = "instock" + else: + value['availability'] = "outofstock" + logger.trace(f"Processed with Extruct in {time.time()-now:.3f}s") return value