diff options
-rwxr-xr-x | urldelta.py | 28 |
1 file changed, 24 insertions(+), 4 deletions(-)
diff --git a/urldelta.py b/urldelta.py
index b9e2358..d45a442 100755
--- a/urldelta.py
+++ b/urldelta.py
@@ -6,9 +6,19 @@
 import time
 import requests
 import os
+from bs4 import BeautifulSoup
 
 from rss_types import RSSItem, RSSFeed
 
+def extract_text(content) -> str:
+    soup = BeautifulSoup(content, features="html.parser")
+
+    for script in soup(["script", "style"]):
+        script.extract()
+
+    return soup.get_text(separator="\n", strip=True)
+
+
 def get_page_delta(url):
     conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), "database", "website_data.db"))
     cursor = conn.cursor()
@@ -37,6 +47,14 @@ def get_page_delta(url):
     )
     conn.commit()
 
+    # Add debug info
+    cursor.execute("PRAGMA table_info(deltas)")
+    existing_cols = [row[1] for row in cursor.fetchall()]
+    if "extracted_old" not in existing_cols:
+        cursor.execute("ALTER TABLE deltas ADD COLUMN {} text".format("extracted_old"))
+        cursor.execute("ALTER TABLE deltas ADD COLUMN {} text".format("extracted_new"))
+    conn.commit()
+
     # Check, if current website is known. Get latest state, if known.
     cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))
 
@@ -60,10 +78,12 @@ def get_page_delta(url):
         response = requests.get(url, timeout=20)
         cursor.execute("UPDATE websites SET last_fetched = ? WHERE id = ?", (int(time.time()), id))
 
-        if response.content != last_content:
+        extracted_new = extract_text(response.content)
+        extracted_old = extract_text(last_content)
+        if extracted_new != extracted_old:
             cursor.execute(
-                "INSERT INTO deltas (website_id, headers, content, fetch_date) VALUES (?, ?, ?, ?)",
-                (id, str(response.headers), response.content, int(time.time())),
+                "INSERT INTO deltas (website_id, headers, content, fetch_date, extracted_old, extracted_new) VALUES (?, ?, ?, ?, ?, ?)",
+                (id, str(response.headers), response.content, int(time.time()), extracted_old, extracted_new),
             )
             conn.commit()
 
@@ -75,7 +95,7 @@ def get_page_delta(url):
             RSSItem(
                 title=f"Change on {url}",
                 url=url,
-                content=f"Headers: {update[0]}\n\nContent: {update[1]}",
+                content=f"Headers: {update[0]}\n\nContent: {extract_text(update[1])}".replace("\n", "<br>"),
                 date=datetime.datetime.fromtimestamp(update[2], tz=datetime.UTC),
                 enclosures=[],
                 guid=update[2],