diff options
author | André Glüpker <git@wgmd.de> | 2024-05-01 11:20:22 +0200 |
---|---|---|
committer | André Glüpker <git@wgmd.de> | 2024-05-01 11:29:50 +0200 |
commit | b5241f3b4ca53a297b6046fc1d755a55d90e05aa (patch) | |
tree | 1ed0b987b63f4c2243dfbc212c8220388a1b56ad /urldelta.py | |
parent | 389f40f305a6ab41b9636cd9fe29420600bc5c73 (diff) | |
download | rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.gz rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.bz2 rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.zip |
Add /delta to crawl for webpage changes
Diffstat (limited to 'urldelta.py')
-rwxr-xr-x | urldelta.py | 89 |
1 files changed, 89 insertions, 0 deletions
#!/usr/bin/env python3

from contextlib import closing
from datetime import datetime
import sqlite3
import time

import requests

from rss_types import RSSItem, RSSFeed

# SQLite file (relative to the working directory) holding known websites and
# the snapshots recorded for them.
_DB_PATH = "website_data.db"
# Minimum number of seconds between two fetches of the same URL.
_REFETCH_INTERVAL_SECONDS = 3600
# Timeout, in seconds, passed to requests.get().
_REQUEST_TIMEOUT_SECONDS = 20


def _ensure_schema(conn):
    """Create the `websites` and `deltas` tables if they do not exist yet."""
    cursor = conn.cursor()
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS websites (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            date_added INTEGER,
            last_fetched INTEGER
        );
        """
    )
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS deltas (
            website_id INTEGER,
            headers TEXT,
            content TEXT,
            fetch_date INTEGER
        );
        """
    )
    conn.commit()


def get_page_delta(url):
    """Return a feed listing every recorded content change of *url*.

    The page is fetched at most once per ``_REFETCH_INTERVAL_SECONDS``. When
    the fetched body differs from the most recently stored snapshot (or no
    snapshot exists yet), a new row holding the response headers and body is
    added to the ``deltas`` table. The returned ``RSSFeed`` contains one
    ``RSSItem`` per stored snapshot, oldest first.

    Args:
        url: Address of the page to watch.

    Returns:
        An ``RSSFeed`` whose ``content`` is the list of ``RSSItem`` objects
        built from the stored snapshots of *url*.

    Raises:
        requests.RequestException: If fetching the page fails.
        sqlite3.Error: If the database cannot be read or written.
    """
    # closing() guarantees the connection is released even when the HTTP
    # request or a query raises; sqlite3's own context manager only manages
    # transactions and does not close the connection.
    with closing(sqlite3.connect(_DB_PATH)) as conn:
        _ensure_schema(conn)
        cursor = conn.cursor()

        # Check, if current website is known. Get latest state, if known.
        cursor.execute(
            "SELECT id, last_fetched FROM websites WHERE url = ?", (url,)
        )
        known = cursor.fetchone()

        last_fetched = None
        last_content = None
        if known:
            website_id, last_fetched = known
            # Compare against the NEWEST snapshot. Without the ordering an
            # arbitrary (in practice the oldest) row is returned, so after
            # the first change every later fetch would be stored again as a
            # "new" delta.
            cursor.execute(
                "SELECT content FROM deltas WHERE website_id = ? "
                "ORDER BY fetch_date DESC, rowid DESC LIMIT 1",
                (website_id,),
            )
            latest = cursor.fetchone()
            if latest:
                last_content = latest[0]
        else:
            added_at = int(time.time())
            cursor.execute(
                "INSERT INTO websites (url, date_added, last_fetched) "
                "VALUES (?, ?, ?)",
                (url, added_at, added_at),
            )
            conn.commit()
            website_id = cursor.lastrowid
            # last_fetched stays None here so that a newly added website is
            # fetched immediately below.

        if (
            not last_fetched
            or int(time.time()) - last_fetched > _REFETCH_INTERVAL_SECONDS
        ):
            response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
            fetched_at = int(time.time())

            cursor.execute(
                "UPDATE websites SET last_fetched = ? WHERE id = ?",
                (fetched_at, website_id),
            )
            if response.content != last_content:
                cursor.execute(
                    "INSERT INTO deltas (website_id, headers, content, fetch_date) "
                    "VALUES (?, ?, ?, ?)",
                    (
                        website_id,
                        str(response.headers),
                        response.content,
                        fetched_at,
                    ),
                )
            # Commit even when the content is unchanged, otherwise the
            # last_fetched update is rolled back when the connection closes.
            conn.commit()

        cursor.execute(
            "SELECT headers, content, fetch_date FROM deltas "
            "WHERE website_id = ? ORDER BY fetch_date, rowid",
            (website_id,),
        )
        # NOTE(review): `content` is stored from response.content (bytes), so
        # it is interpolated below in its bytes repr form -- confirm whether
        # the feed consumer expects decoded text instead.
        # NOTE(review): datetime.utcfromtimestamp() is deprecated since
        # Python 3.12; it is kept because it yields a naive datetime and the
        # consumers of RSSItem.date are not visible here.
        updates = [
            RSSItem(
                title=f"Change on {url}",
                url=url,
                content=f"Headers: {headers}\n\nContent: {content}",
                date=datetime.utcfromtimestamp(fetch_date),
                enclosures=[],
                guid=fetch_date,
            )
            for headers, content, fetch_date in cursor.fetchall()
        ]

    return RSSFeed(
        title=f"Updates for {url}",
        url=url,
        description=f"Detected changes on page {url}",
        content=updates,
    )