summaryrefslogtreecommitdiff
path: root/urldelta.py
diff options
context:
space:
mode:
authorAndré Glüpker <git@wgmd.de>2024-05-01 11:20:22 +0200
committerAndré Glüpker <git@wgmd.de>2024-05-01 11:29:50 +0200
commitb5241f3b4ca53a297b6046fc1d755a55d90e05aa (patch)
tree1ed0b987b63f4c2243dfbc212c8220388a1b56ad /urldelta.py
parent389f40f305a6ab41b9636cd9fe29420600bc5c73 (diff)
downloadrss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.gz
rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.bz2
rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.zip
Add /delta to crawl for webpage changes
Diffstat (limited to 'urldelta.py')
-rwxr-xr-xurldelta.py89
1 files changed, 89 insertions, 0 deletions
diff --git a/urldelta.py b/urldelta.py
new file mode 100755
index 0000000..7d487bd
--- /dev/null
+++ b/urldelta.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+from datetime import datetime
+import sqlite3
+import time
+import requests
+
+from rss_types import RSSItem, RSSFeed
+
+
def get_page_delta(url):
    """Fetch *url*, record a snapshot when its content changed, and return
    an RSSFeed with one item per recorded change.

    State lives in the local SQLite file ``website_data.db``:
      - ``websites``: one row per tracked URL (id, url, date_added, last_fetched)
      - ``deltas``:   one row per observed content change

    The page is re-fetched at most once per hour; between fetches the
    previously recorded deltas are served from the database.

    Args:
        url: The page to watch for changes.

    Returns:
        RSSFeed whose ``content`` holds one RSSItem per recorded delta,
        in chronological order.
    """
    conn = sqlite3.connect("website_data.db")
    try:
        cursor = conn.cursor()

        # Initialize database, if needed.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS websites (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT,
                date_added INTEGER,
                last_fetched INTEGER
            );
            """
        )
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS deltas (
                website_id INTEGER,
                headers TEXT,
                content TEXT,
                fetch_date INTEGER
            );
            """
        )
        conn.commit()

        # Check, if current website is known. Get latest state, if known.
        cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))

        site_id = last_fetched = last_content = None
        row = cursor.fetchone()
        if row:
            site_id, last_fetched = row
            # BUG FIX: compare against the *latest* snapshot. The previous
            # query had no ORDER BY, so fetchone() returned an arbitrary
            # (in practice the oldest) row — after the first change, every
            # fetch was recorded as a new delta.
            cursor.execute(
                "SELECT content FROM deltas WHERE website_id = ? "
                "ORDER BY fetch_date DESC LIMIT 1",
                (site_id,),
            )
            latest = cursor.fetchone()
            if latest:
                last_content = latest[0]
        else:
            now = int(time.time())
            cursor.execute(
                "INSERT INTO websites (url, date_added, last_fetched) VALUES (?, ?, ?)",
                (url, now, now),
            )
            conn.commit()
            site_id = cursor.lastrowid

        # Re-fetch at most once per hour.
        if not last_fetched or int(time.time()) - last_fetched > 3600:
            response = requests.get(url, timeout=20)

            cursor.execute(
                "UPDATE websites SET last_fetched = ? WHERE id = ?",
                (int(time.time()), site_id),
            )
            if response.content != last_content:
                cursor.execute(
                    "INSERT INTO deltas (website_id, headers, content, fetch_date) "
                    "VALUES (?, ?, ?, ?)",
                    (site_id, str(response.headers), response.content, int(time.time())),
                )
            # BUG FIX: commit unconditionally. Previously the commit sat
            # inside the "content changed" branch, so for an unchanged page
            # the last_fetched update was lost and the page was re-fetched
            # on every single call, defeating the throttle above.
            conn.commit()

        # Oldest first, so feed readers see changes in chronological order.
        cursor.execute(
            "SELECT headers, content, fetch_date FROM deltas "
            "WHERE website_id = ? ORDER BY fetch_date",
            (site_id,),
        )

        updates = [
            RSSItem(
                title=f"Change on {url}",
                url=url,
                # NOTE: content was stored as raw bytes, so this renders the
                # b'...' repr — matches the original behavior.
                content=f"Headers: {headers}\n\nContent: {content}",
                date=datetime.utcfromtimestamp(fetch_date),
                enclosures=[],
                # Unix timestamp doubles as the item GUID; unique per site
                # as long as changes are at least a second apart.
                guid=fetch_date,
            )
            for headers, content, fetch_date in cursor.fetchall()
        ]

        return RSSFeed(
            title=f"Updates for {url}",
            url=url,
            description=f"Detected changes on page {url}",
            content=updates,
        )
    finally:
        # Always release the SQLite handle; the original leaked it.
        conn.close()