summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author André Glüpker <git@wgmd.de> 2024-05-25 10:50:31 +0200
committer André Glüpker <git@wgmd.de> 2024-05-25 10:50:31 +0200
commit 24eb5ad088b365c4189003282014403b74ad2249 (patch)
tree 907eb8cd1ab34ff1ca8fd4a8aede1ed7168b0f87
parent 58f9adfb2fcbe74604044cb2ebc3af63ff6d5b84 (diff)
download rss-feeds-24eb5ad088b365c4189003282014403b74ad2249.tar.gz
rss-feeds-24eb5ad088b365c4189003282014403b74ad2249.tar.bz2
rss-feeds-24eb5ad088b365c4189003282014403b74ad2249.zip
Extract text and save debug information
-rwxr-xr-x urldelta.py 28
1 files changed, 24 insertions, 4 deletions
diff --git a/urldelta.py b/urldelta.py
index b9e2358..d45a442 100755
--- a/urldelta.py
+++ b/urldelta.py
@@ -6,9 +6,19 @@ import time
import requests
import os
+from bs4 import BeautifulSoup
from rss_types import RSSItem, RSSFeed
+def extract_text(content) -> str:  # Return the text of HTML `content` with <script>/<style> elements removed.
+ soup = BeautifulSoup(content, features="html.parser")  # parse with the "html.parser" backend
+
+ for script in soup(["script", "style"]):  # despite the name, `script` is bound to both <script> and <style> tags
+ script.extract()  # detach the tag from the tree so its contents are left out of the text below
+
+ return soup.get_text(separator="\n", strip=True)  # remaining text pieces, each stripped and joined by newlines
+
+
def get_page_delta(url):
conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), "database", "website_data.db"))
cursor = conn.cursor()
@@ -37,6 +47,14 @@ def get_page_delta(url):
)
conn.commit()
+ # Add debug info
+ cursor.execute("PRAGMA table_info(deltas)")
+ existing_cols = [row[1] for row in cursor.fetchall()]
+ if "extracted_old" not in existing_cols:
+ cursor.execute("ALTER TABLE deltas ADD COLUMN {} text".format("extracted_old"))
+ cursor.execute("ALTER TABLE deltas ADD COLUMN {} text".format("extracted_new"))
+ conn.commit()
+
# Check, if current website is known. Get latest state, if known.
cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))
@@ -60,10 +78,12 @@ def get_page_delta(url):
response = requests.get(url, timeout=20)
cursor.execute("UPDATE websites SET last_fetched = ? WHERE id = ?", (int(time.time()), id))
- if response.content != last_content:
+ extracted_new = extract_text(response.content)
+ extracted_old = extract_text(last_content)
+ if extracted_new != extracted_old:
cursor.execute(
- "INSERT INTO deltas (website_id, headers, content, fetch_date) VALUES (?, ?, ?, ?)",
- (id, str(response.headers), response.content, int(time.time())),
+ "INSERT INTO deltas (website_id, headers, content, fetch_date, extracted_old, extracted_new) VALUES (?, ?, ?, ?, ?, ?)",
+ (id, str(response.headers), response.content, int(time.time()), extracted_old, extracted_new),
)
conn.commit()
@@ -75,7 +95,7 @@ def get_page_delta(url):
RSSItem(
title=f"Change on {url}",
url=url,
- content=f"Headers: {update[0]}\n\nContent: {update[1]}",
+ content=f"Headers: {update[0]}\n\nContent: {extract_text(update[1])}", "<br>"),
date=datetime.datetime.fromtimestamp(update[2], tz=datetime.UTC),
enclosures=[],
guid=update[2],