1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
#!/usr/bin/env python3
from datetime import datetime
import sqlite3
import time
import requests
from rss_types import RSSItem, RSSFeed
def get_page_delta(url):
conn = sqlite3.connect("website_data.db")
cursor = conn.cursor()
# Initialize database, if needed
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS websites (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT,
date_added INTEGER,
last_fetched INTEGER
);
"""
)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS deltas (
website_id INTEGER,
headers TEXT,
content TEXT,
fetch_date INTEGER
);
"""
)
conn.commit()
# Check, if current website is known. Get latest state, if known.
cursor.execute("SELECT id, last_fetched FROM websites WHERE url = ?", (url,))
id = last_fetched = last_content = None
data = cursor.fetchone()
if data:
id, last_fetched = data
cursor.execute("SELECT content FROM deltas WHERE website_id = ?", (id,))
last_content = cursor.fetchone()
if last_content:
last_content = last_content[0]
else:
cursor.execute(
"INSERT INTO websites (url, date_added, last_fetched) VALUES (?, ?, ?)",
(url, int(time.time()), int(time.time())),
)
conn.commit()
id = cursor.lastrowid
if not last_fetched or int(time.time()) - last_fetched > 3600:
response = requests.get(url, timeout=20)
cursor.execute("UPDATE websites SET last_fetched = ? WHERE id = ?", (int(time.time()), id))
if response.content != last_content:
cursor.execute(
"INSERT INTO deltas (website_id, headers, content, fetch_date) VALUES (?, ?, ?, ?)",
(id, str(response.headers), response.content, int(time.time())),
)
conn.commit()
cursor.execute("SELECT headers, content, fetch_date FROM deltas WHERE website_id = ?", (id,))
updates = []
for update in cursor.fetchall():
updates.append(
RSSItem(
title=f"Change on {url}",
url=url,
content=f"Headers: {update[0]}\n\nContent: {update[1]}",
date=datetime.utcfromtimestamp(update[2]),
enclosures=[],
guid=update[2],
)
)
return RSSFeed(
title=f"Updates for {url}",
url=url,
description=f"Detected changes on page {url}",
content=updates,
)
|