summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndré Glüpker <git@wgmd.de>2024-05-01 11:20:22 +0200
committerAndré Glüpker <git@wgmd.de>2024-05-01 11:29:50 +0200
commitb5241f3b4ca53a297b6046fc1d755a55d90e05aa (patch)
tree1ed0b987b63f4c2243dfbc212c8220388a1b56ad
parent389f40f305a6ab41b9636cd9fe29420600bc5c73 (diff)
downloadrss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.gz
rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.bz2
rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.zip
Add /delta to crawl for webpage changes
-rwxr-xr-xurldelta.py89
-rwxr-xr-xwebapp.py17
2 files changed, 103 insertions, 3 deletions
diff --git a/urldelta.py b/urldelta.py
new file mode 100755
index 0000000..7d487bd
--- /dev/null
+++ b/urldelta.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+
+from datetime import datetime
+import sqlite3
+import time
+import requests
+
+from rss_types import RSSItem, RSSFeed
+
+
def get_page_delta(url):
    """Track content changes of a webpage and return them as an RSS feed.

    Fetches ``url`` at most once per hour, stores a snapshot in SQLite
    whenever the body differs from the most recent stored snapshot, and
    returns the full change history.

    Args:
        url: The webpage URL to monitor.

    Returns:
        RSSFeed with one RSSItem per recorded change.
    """
    conn = sqlite3.connect("website_data.db")
    try:
        cursor = conn.cursor()

        # Initialize database, if needed.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS websites (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT,
                date_added INTEGER,
                last_fetched INTEGER
            );
            """
        )
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS deltas (
                website_id INTEGER,
                headers TEXT,
                content TEXT,
                fetch_date INTEGER
            );
            """
        )
        conn.commit()

        # Check if the website is already known; load its latest snapshot if so.
        cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))

        site_id = last_fetched = last_content = None  # renamed from `id` (shadowed builtin)
        row = cursor.fetchone()
        if row:
            site_id, last_fetched = row
            # BUG FIX: compare against the *latest* snapshot. The original query
            # had no ORDER BY, so SQLite returned an arbitrary (in practice the
            # oldest) row — a page that changed twice was then recorded as
            # "changed" on every subsequent fetch.
            cursor.execute(
                "SELECT content FROM deltas WHERE website_id = ? "
                "ORDER BY fetch_date DESC LIMIT 1",
                (site_id,),
            )
            latest = cursor.fetchone()
            if latest:
                last_content = latest[0]
        else:
            now = int(time.time())
            cursor.execute(
                "INSERT INTO websites (url, date_added, last_fetched) VALUES (?, ?, ?)",
                (url, now, now),
            )
            conn.commit()
            site_id = cursor.lastrowid

        # Re-fetch at most once per hour (3600 s).
        if not last_fetched or int(time.time()) - last_fetched > 3600:
            response = requests.get(url, timeout=20)

            now = int(time.time())
            cursor.execute(
                "UPDATE websites SET last_fetched = ? WHERE id = ?", (now, site_id)
            )
            if response.content != last_content:
                cursor.execute(
                    "INSERT INTO deltas (website_id, headers, content, fetch_date) VALUES (?, ?, ?, ?)",
                    (site_id, str(response.headers), response.content, now),
                )
            conn.commit()

        # Build one RSS item per stored snapshot. The explicit ORDER BY makes
        # the feed order deterministic instead of relying on implicit rowid order.
        cursor.execute(
            "SELECT headers, content, fetch_date FROM deltas "
            "WHERE website_id = ? ORDER BY fetch_date",
            (site_id,),
        )

        updates = [
            RSSItem(
                title=f"Change on {url}",
                url=url,
                content=f"Headers: {update[0]}\n\nContent: {update[1]}",
                # NOTE(review): naive UTC datetime; utcfromtimestamp is deprecated
                # since Python 3.12 — consider fromtimestamp(ts, tz=timezone.utc)
                # once downstream buildRSS is confirmed to handle aware datetimes.
                date=datetime.utcfromtimestamp(update[2]),
                enclosures=[],
                guid=update[2],
            )
            for update in cursor.fetchall()
        ]

        return RSSFeed(
            title=f"Updates for {url}",
            url=url,
            description=f"Detected changes on page {url}",
            content=updates,
        )
    finally:
        # The original leaked the connection on every call.
        conn.close()
diff --git a/webapp.py b/webapp.py
index 0fe540f..ea95805 100755
--- a/webapp.py
+++ b/webapp.py
@@ -2,16 +2,18 @@
import logging
-from flask import Flask, Response
+from flask import Flask, Response, request
+from rss_types import RSSFeed
from rss import buildRSS
from zdf import zdf
+from urldelta import get_page_delta
app = Flask(__name__)
app.secret_key = "NMcgoB.0wd+$.KVKj!F{3>U{%BBUVhL=7=5$:46rQH$Q{enCuU"
-def rssResponse(data):
+def rssResponse(data: RSSFeed):
rss = buildRSS(data)
response = Response(rss, mimetype="text/xml")
response.headers["Access-Control-Allow-Origin"] = "*"
@@ -30,7 +32,16 @@ def not_found(e):
@app.route("/zdf/<path:show_url>")
def fetch_zdf_show(show_url):
    """Serve the RSS feed for a ZDF show, or "Failed" when lookup yields nothing."""
    rss_feed = zdf(show_url)
    if rss_feed:
        return rssResponse(rss_feed)
    return "Failed"
+
+
@app.route("/delta")
def website_delta():
    """Serve an RSS feed of detected page changes for the URL in ?url=..."""
    url = request.args.get("url")
    if url:
        return rssResponse(get_page_delta(url))
    return "Failed"
if __name__ == "__main__":