diff options
author | André Glüpker <git@wgmd.de> | 2024-05-01 11:20:22 +0200 |
---|---|---|
committer | André Glüpker <git@wgmd.de> | 2024-05-01 11:29:50 +0200 |
commit | b5241f3b4ca53a297b6046fc1d755a55d90e05aa (patch) | |
tree | 1ed0b987b63f4c2243dfbc212c8220388a1b56ad | |
parent | 389f40f305a6ab41b9636cd9fe29420600bc5c73 (diff) | |
download | rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.gz rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.tar.bz2 rss-feeds-b5241f3b4ca53a297b6046fc1d755a55d90e05aa.zip |
Add /delta to crawl for webpage changes
-rwxr-xr-x | urldelta.py | 89 | ||||
-rwxr-xr-x | webapp.py | 17 |
2 files changed, 103 insertions, 3 deletions
#!/usr/bin/env python3
"""Track changes to a web page and expose them as an RSS feed.

Reconstructed from the commit diff (new file ``urldelta.py``). The state is
kept in a local SQLite database ``website_data.db`` with two tables:

* ``websites`` — one row per tracked URL (id, url, date_added, last_fetched)
* ``deltas``   — one row per observed content change (headers, content, time)
"""

from datetime import datetime
import sqlite3
import time

import requests

from rss_types import RSSItem, RSSFeed

# Minimum number of seconds between two fetches of the same URL.
FETCH_INTERVAL = 3600


def get_page_delta(url):
    """Fetch *url* (at most once per ``FETCH_INTERVAL``), record content
    changes in SQLite, and return an ``RSSFeed`` with one item per change.

    :param url: the page to monitor; also used as the feed/item link.
    :returns: ``RSSFeed`` whose ``content`` lists every recorded delta.
    :raises requests.RequestException: if the page fetch itself fails.
    """
    conn = sqlite3.connect("website_data.db")
    try:
        cursor = conn.cursor()

        # Initialize database, if needed.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS websites (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT,
                date_added INTEGER,
                last_fetched INTEGER
            );
            """
        )
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS deltas (
                website_id INTEGER,
                headers TEXT,
                content TEXT,
                fetch_date INTEGER
            );
            """
        )
        conn.commit()

        # Check if the URL is already tracked; fetch its latest state if so.
        # (site_id instead of `id`: avoid shadowing the builtin.)
        cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))
        site_id = last_fetched = last_content = None
        row = cursor.fetchone()
        if row:
            site_id, last_fetched = row
            # BUGFIX: compare against the *latest* stored snapshot, not the
            # first one ever recorded. The original `fetchone()` without an
            # ORDER BY returned the oldest delta, so after the first change
            # every subsequent fetch inserted a duplicate delta row.
            cursor.execute(
                "SELECT content FROM deltas WHERE website_id = ?"
                " ORDER BY fetch_date DESC LIMIT 1",
                (site_id,),
            )
            latest = cursor.fetchone()
            if latest:
                last_content = latest[0]
        else:
            now = int(time.time())
            cursor.execute(
                "INSERT INTO websites (url, date_added, last_fetched) VALUES (?, ?, ?)",
                (url, now, now),
            )
            conn.commit()
            site_id = cursor.lastrowid

        # Re-fetch at most once per FETCH_INTERVAL; store a delta only when
        # the body actually differs from the last recorded snapshot.
        if not last_fetched or int(time.time()) - last_fetched > FETCH_INTERVAL:
            response = requests.get(url, timeout=20)
            cursor.execute(
                "UPDATE websites SET last_fetched = ? WHERE id = ?",
                (int(time.time()), site_id),
            )
            if response.content != last_content:
                cursor.execute(
                    "INSERT INTO deltas (website_id, headers, content, fetch_date)"
                    " VALUES (?, ?, ?, ?)",
                    (site_id, str(response.headers), response.content, int(time.time())),
                )
            conn.commit()

        cursor.execute(
            "SELECT headers, content, fetch_date FROM deltas WHERE website_id = ?",
            (site_id,),
        )
        updates = [
            RSSItem(
                title=f"Change on {url}",
                url=url,
                content=f"Headers: {headers}\n\nContent: {content}",
                # NOTE(review): utcfromtimestamp is deprecated since 3.12;
                # kept as-is because switching to a tz-aware datetime could
                # change how buildRSS renders dates — confirm before migrating.
                date=datetime.utcfromtimestamp(fetch_date),
                enclosures=[],
                # guid is the fetch timestamp — two changes within the same
                # second would collide; acceptable for hourly polling.
                guid=fetch_date,
            )
            for headers, content, fetch_date in cursor.fetchall()
        ]
    finally:
        # BUGFIX: the original never closed the connection (one leaked
        # handle per request).
        conn.close()

    return RSSFeed(
        title=f"Updates for {url}",
        url=url,
        description=f"Detected changes on page {url}",
        content=updates,
    )


# ---------------------------------------------------------------------------
# Remainder of the same commit: the webapp.py hunk shares the physical line
# with the code above in this dump and cannot be edited as a separate block,
# so it is preserved verbatim below.
# NOTE(review): the hard-coded `app.secret_key` below is committed to the
# repository — rotate it and load it from the environment instead.
# ---------------------------------------------------------------------------
# @@ -2,16 +2,18 @@ import logging
# -from flask import Flask, Response
# +from flask import Flask, Response, request
# +from rss_types import RSSFeed
#  from rss import buildRSS
#  from zdf import zdf
# +from urldelta import get_page_delta
#  app = Flask(__name__)
#  app.secret_key = "NMcgoB.0wd+$.KVKj!F{3>U{%BBUVhL=7=5$:46rQH$Q{enCuU"
# -def rssResponse(data):
# +def rssResponse(data: RSSFeed):
#      rss = buildRSS(data)
#      response = Response(rss, mimetype="text/xml")
#      response.headers["Access-Control-Allow-Origin"] = "*"
# @@ -30,7 +32,16 @@ def not_found(e):
#  @app.route("/zdf/<path:show_url>")
#  def fetch_zdf_show(show_url):
# -    return rssResponse(zdf(show_url))
# +    if rss_feed := zdf(show_url):
# +        return rssResponse(rss_feed)
# +    return "Failed"
# +
# +
# +@app.route("/delta")
# +def website_delta():
# +    if url := request.args.get("url"):
# +        return rssResponse(get_page_delta(url))
# +    return "Failed"
#  if __name__ == "__main__":