#!/usr/bin/env python3
"""Poll web pages, record content changes ("deltas") in SQLite, and render
the recorded change history as an RSS feed."""

from datetime import datetime, timezone
import sqlite3
import time

import requests

from rss_types import RSSItem, RSSFeed

# Local SQLite database holding the watched sites and their snapshots.
DB_PATH = "website_data.db"


def get_page_delta(url, min_fetch_interval=3600):
    """Return an RSSFeed describing every recorded change of *url*.

    The page is re-fetched at most once per *min_fetch_interval* seconds.
    A new delta row is stored whenever the freshly fetched body differs
    from the most recently stored snapshot.

    Args:
        url: Page to watch.
        min_fetch_interval: Minimum seconds between live fetches
            (default 3600, i.e. the original hard-coded one hour).

    Returns:
        RSSFeed with one RSSItem per recorded change, oldest first.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Initialize database, if needed.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS websites (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT,
                date_added INTEGER,
                last_fetched INTEGER
            );
            """
        )
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS deltas (
                website_id INTEGER,
                headers TEXT,
                content TEXT,
                fetch_date INTEGER
            );
            """
        )
        conn.commit()

        # Check if the site is already known; if so, load its latest snapshot.
        cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))

        site_id = last_fetched = last_content = None
        row = cursor.fetchone()
        if row:
            site_id, last_fetched = row
            # BUG FIX: compare against the *newest* snapshot, not the first
            # row ever inserted — otherwise, after the first change, every
            # fetch differs from the original snapshot and a spurious delta
            # is recorded on each refresh.
            cursor.execute(
                "SELECT content FROM deltas WHERE website_id = ?"
                " ORDER BY fetch_date DESC LIMIT 1",
                (site_id,),
            )
            latest = cursor.fetchone()
            if latest:
                last_content = latest[0]
        else:
            now = int(time.time())
            cursor.execute(
                "INSERT INTO websites (url, date_added, last_fetched) VALUES (?, ?, ?)",
                (url, now, now),
            )
            conn.commit()
            site_id = cursor.lastrowid

        # Re-fetch only when the stored copy is stale (or never fetched).
        if not last_fetched or int(time.time()) - last_fetched > min_fetch_interval:
            response = requests.get(url, timeout=20)

            cursor.execute(
                "UPDATE websites SET last_fetched = ? WHERE id = ?",
                (int(time.time()), site_id),
            )
            if response.content != last_content:
                cursor.execute(
                    "INSERT INTO deltas (website_id, headers, content, fetch_date)"
                    " VALUES (?, ?, ?, ?)",
                    (site_id, str(response.headers), response.content, int(time.time())),
                )
            conn.commit()

        # Render every stored delta; explicit ORDER BY makes the feed order
        # deterministic instead of relying on SQLite's row order.
        cursor.execute(
            "SELECT headers, content, fetch_date FROM deltas"
            " WHERE website_id = ? ORDER BY fetch_date",
            (site_id,),
        )

        updates = [
            RSSItem(
                title=f"Change on {url}",
                url=url,
                content=f"Headers: {headers}\n\nContent: {content}",
                # datetime.utcfromtimestamp() is deprecated since 3.12;
                # emit an aware UTC datetime instead.
                date=datetime.fromtimestamp(fetch_date, tz=timezone.utc),
                enclosures=[],
                guid=fetch_date,
            )
            for headers, content, fetch_date in cursor.fetchall()
        ]

        return RSSFeed(
            title=f"Updates for {url}",
            url=url,
            description=f"Detected changes on page {url}",
            content=updates,
        )
    finally:
        # Always release the connection, even when requests.get raises.
        conn.close()