#!/usr/bin/env python3
import datetime
import difflib
import os
import sqlite3
import time

import requests
from bs4 import BeautifulSoup

from rss_types import RSSItem, RSSFeed


def extract_text(content) -> str:
    """Return the visible text of an HTML document, dropping scripts and styles."""
    soup = BeautifulSoup(content, features="html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    return soup.get_text(separator="\n", strip=True)


def compare_to_html(text_old: str, text_new: str) -> str:
    """Render a line-by-line diff of the two extracted texts as an HTML <pre> block."""
    if not text_old or not text_new:
        return "N/A"
    output = ["<pre>"]
    for line in difflib.Differ().compare(text_old.splitlines(), text_new.splitlines()):
        if line.startswith("+"):
            output.append(f"🟢 {line.strip()[2:]}")  # added line
        elif line.startswith("-"):
            output.append(f"🔴 {line.strip()[2:]}")  # removed line
        elif line.startswith("?"):
            output.append(f"🔵 {line.strip()[2:]}")  # Differ hint line
        else:
            output.append(f"🔵 {line.strip()}")  # unchanged line
    output.append("</pre>")
    return "\n".join(output)


def get_page_delta(url: str) -> RSSFeed:
    db_dir = os.path.join(os.path.dirname(__file__), "database")
    os.makedirs(db_dir, exist_ok=True)  # sqlite3 does not create missing directories
    conn = sqlite3.connect(os.path.join(db_dir, "website_data.db"))
    cursor = conn.cursor()

    # Initialize database, if needed
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS websites (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            date_added INTEGER,
            last_fetched INTEGER
        );
        """
    )
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS deltas (
            website_id INTEGER,
            headers TEXT,
            content TEXT,
            fetch_date INTEGER
        );
        """
    )
    conn.commit()

    # Migrate older databases: add the extracted-text columns if they are missing.
    cursor.execute("PRAGMA table_info(deltas)")
    existing_cols = [row[1] for row in cursor.fetchall()]
    if "extracted_old" not in existing_cols:
        cursor.execute("ALTER TABLE deltas ADD COLUMN extracted_old TEXT")
        cursor.execute("ALTER TABLE deltas ADD COLUMN extracted_new TEXT")
        conn.commit()

    # Check if the current website is known; get its latest state if so.
    cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))
    website_id = last_fetched = last_content = None
    data = cursor.fetchone()
    if data:
        website_id, last_fetched = data
        cursor.execute(
            "SELECT content FROM deltas WHERE website_id = ? ORDER BY fetch_date DESC LIMIT 1",
            (website_id,),
        )
        row = cursor.fetchone()
        if row:
            last_content = row[0]
    else:
        cursor.execute(
            "INSERT INTO websites (url, date_added, last_fetched) VALUES (?, ?, ?)",
            (url, int(time.time()), int(time.time())),
        )
        conn.commit()
        website_id = cursor.lastrowid

    # Re-fetch at most once per hour; store a delta only when the extracted text changed.
    if not last_fetched or int(time.time()) - last_fetched > 3600:
        response = requests.get(url, timeout=20)
        response.raise_for_status()  # don't record error pages as content changes
        cursor.execute(
            "UPDATE websites SET last_fetched = ? WHERE id = ?",
            (int(time.time()), website_id),
        )
        extracted_new = extract_text(response.content)
        # On the first fetch there is no previous content to extract.
        extracted_old = extract_text(last_content) if last_content else ""
        if extracted_new != extracted_old:
            cursor.execute(
                "INSERT INTO deltas (website_id, headers, content, fetch_date, extracted_old, extracted_new)"
                " VALUES (?, ?, ?, ?, ?, ?)",
                (website_id, str(response.headers), response.content, int(time.time()), extracted_old, extracted_new),
            )
        conn.commit()

    cursor.execute(
        "SELECT extracted_old, extracted_new, fetch_date FROM deltas WHERE website_id = ? ORDER BY fetch_date DESC",
        (website_id,),
    )
    updates = []
    for extracted_old, extracted_new, fetch_date in cursor.fetchall():
        updates.append(
            RSSItem(
                title=f"Change on {url}",
                url=url,
                content=compare_to_html(extracted_old, extracted_new),
                date=datetime.datetime.fromtimestamp(fetch_date, tz=datetime.UTC),
                enclosures=[],
                guid=fetch_date,  # the fetch timestamp doubles as a per-feed GUID
            )
        )
    conn.close()

    return RSSFeed(
        title=f"Updates for {url}",
        url=url,
        description=f"Detected changes on page {url}",
        content=updates,
    )