author    André Glüpker <git@wgmd.de>  2021-05-05 20:09:30 +0200
committer André Glüpker <git@wgmd.de>  2021-05-05 20:09:30 +0200
commit    5774dbfb2caa42cb55bafab98a40e47f395e44d9 (patch)
tree      8294b7b6fefebc1befeed4104f3b5604683999a8
Initial commit of RSS converter application
-rwxr-xr-x  netto.py         69
-rw-r--r--  pyproject.toml   16
-rwxr-xr-x  rss.py           56
-rwxr-xr-x  telegram.py      72
-rwxr-xr-x  twitter.py      202
-rwxr-xr-x  webapp.py        97
-rwxr-xr-x  wsgi.py           6
-rwxr-xr-x  zdf.py           87
8 files changed, 605 insertions, 0 deletions
diff --git a/netto.py b/netto.py
new file mode 100755
index 0000000..fda6409
--- /dev/null
+++ b/netto.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+from urllib.request import urlopen, Request
+from datetime import datetime
+from bs4 import BeautifulSoup
+import sys
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+ dt.year, dt.hour, dt.minute, dt.second)
+
+def netto(store_id):
+ url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)
+ res = urlopen(Request(url))
+ soup = BeautifulSoup(res, features="html.parser")
+
+ # messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_wrap'})
+ message = soup.find('a', attrs={'class': 'flipbook_pdf_flipbook'})
+
+ url = message['href'].split('?')[0]
+ year = str(datetime.now().year)
+ title = url[ url.find(year) : url.find(year) + 7 ]
+
+ return title, url
+
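+# Usage sketch (9110 is just the default store id used by main() below):
+#   title, url = netto(9110)
+# "title" is the 7-character slice of the PDF link that starts at the current
+# year, and "url" is the direct link to the leaflet PDF.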
+def main(store_id = 9110):
+ url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)
+
+ print("""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Netto Angebote """ + str(store_id) + """</title>
+ <link>""" + url + """</link>
+ <description>PDF der neuen Netto Angebote für den Laden um die Ecke.</description>
+ <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>""")
+
+ title, link = netto(store_id)
+ print(' <item>')
+ print(' <title><![CDATA[Angebote für ' + title + ']]></title>')
+ print(' <link>' + link + '</link>')
+ # print(' <description><![CDATA[' + description + ']]></description>')
+ # print(' <pubDate>' + date + '</pubDate>')
+ # print(' <media:content url="' + thumbnail + '" type="image/jpeg" />')
+ print(' </item>')
+
+ print(' </channel>')
+ print('</rss>')
+
+if __name__ == "__main__":
+ # if len(sys.argv) != 2:
+ # print('Usage:', sys.argv[0], '<foobar>')
+ # sys.exit(1)
+ # main(sys.argv[1])
+ main()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..04843a2
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,16 @@
+[tool.poetry]
+name = "rss-feeds"
+version = "0.1.0"
+description = "Build RSS feeds for various web services."
+authors = ["André Glüpker <git@wgmd.de>"]
+
+[tool.poetry.dependencies]
+python = ">=3.6,<4"
+Flask = "^1.1.2"
+beautifulsoup4 = "^4.9.3"
+
+[tool.poetry.dev-dependencies]
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/rss.py b/rss.py
new file mode 100755
index 0000000..66ffb35
--- /dev/null
+++ b/rss.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+
+from datetime import datetime
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
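+ # For example (illustrative check, not from the original source):
+ #   _format_date(datetime(2002, 9, 7, 0, 0, 1)) == "Sat, 07 Sep 2002 00:00:01 GMT"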
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+ dt.year, dt.hour, dt.minute, dt.second)
+
+def buildRSS(title, url, description, content):
+ """
+ Feed basic info: title, url, descriptions
+ Content: List[Dict{title, url, content, date, enclosures, guid}]
+ """
+
+ feed = """<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
+ <channel>
+ <title>""" + title + """</title>
+ <link>""" + url + """</link>
+ <description>""" + description + """</description>
+ <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>"""
+
+ for item in content:
+ feed += ' <item>'
+ feed += ' <title><![CDATA[' + item.get('title', 'N/A') + ']]></title>'
+ feed += ' <link>' + item.get('url', 'N/A') + '</link>'
+ feed += ' <description><![CDATA[' + item.get('content', 'N/A') + ']]></description>'
+ if 'date' in item:
+ if type(item['date']) is str:
+ feed += ' <pubDate>' + item['date'] + '</pubDate>'
+ else:
+ feed += ' <pubDate>' + _format_date(item['date']) + '</pubDate>'
+ for enclosure in item.get('enclosures', []):
+ feed += ' <media:content url="' + enclosure + '" />'
+ if 'guid' in item:
+ feed += ' <guid>' + item['guid'] + '</guid>'
+ feed += ' </item>'
+
+ feed += ' </channel>'
+ feed += '</rss>'
+ return feed
diff --git a/telegram.py b/telegram.py
new file mode 100755
index 0000000..3058339
--- /dev/null
+++ b/telegram.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+from urllib.request import urlopen, Request
+from datetime import datetime
+from bs4 import BeautifulSoup
+import sys
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+ dt.year, dt.hour, dt.minute, dt.second)
+
+def telegram(channel):
+ url = 'https://t.me/s/' + channel
+ res = urlopen(Request(url))
+ soup = BeautifulSoup(res, features="html.parser")
+
+ # messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_wrap'})
+ messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_bubble'})
+
+ for message in messages:
+ date = message.find('time', attrs={'class': 'time'})['datetime']
+ html = message.find('div', attrs={'class': 'tgme_widget_message_text'})
+ # preview = message.find('div', attrs={'class': 'tgme_widget_message_bubble'})
+ link = message.find('a', attrs={'class': 'tgme_widget_message_date'})
+ title = html.text if html else 'No text'
+ description = str(message) # if preview else '?'
+ link = link['href']
+ yield title, description, link, date
+
+def main(channel):
+ url = 'https://t.me/s/' + channel
+
+ print("""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Telegram: """ + channel + """</title>
+ <link>""" + url + """</link>
+ <description>The latest entries of the telegram channel of """ + channel + """</description>
+ <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>""")
+
+ for title, description, link, date in telegram(channel):
+ print(' <item>')
+ print(' <title><![CDATA[' + title + ']]></title>')
+ print(' <link>' + link + '</link>')
+ print(' <description><![CDATA[' + description + ']]></description>')
+ print(' <pubDate>' + date + '</pubDate>')
+ # print(' <media:content url="' + thumbnail + '" type="image/jpeg" />')
+ print(' </item>')
+
+ print(' </channel>')
+ print('</rss>')
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print('Usage:', sys.argv[0], '<telegram channel>')
+ sys.exit(1)
+ main(sys.argv[1])
diff --git a/twitter.py b/twitter.py
new file mode 100755
index 0000000..5ddf8ad
--- /dev/null
+++ b/twitter.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+
+from urllib.error import HTTPError
+from urllib.request import urlopen, Request
+import logging
+
+# from requests_oauthlib import OAuth1Session
+from datetime import datetime
+import sys
+import json
+
+bearer = None
+
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ [
+ "Jan",
+ "Feb",
+ "Mar",
+ "Apr",
+ "May",
+ "Jun",
+ "Jul",
+ "Aug",
+ "Sep",
+ "Oct",
+ "Nov",
+ "Dec",
+ ][dt.month - 1],
+ dt.year,
+ dt.hour,
+ dt.minute,
+ dt.second,
+ )
+
+
+def getBearer():
+ global bearer
+ if bearer:
+ return bearer
+ headers = {
+ "Authorization": "Basic Zzl1MXI2SFpYTXg0SXU5UGs5VlNvTzFUdzpmeTIyQjN2QVRRNUI2eGthb1BFdFFRUmtuUGQ1WGZBbnBKVG5hc0ZRa3NyUm5qaVNsaw==",
+ "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
+ }
+ data = b"grant_type=client_credentials"
+ url = "https://api.twitter.com/oauth2/token"
+
+ res = urlopen(Request(url, headers=headers, data=data, method="POST"))
+ response = json.loads(res.read().decode("UTF-8"))
+ bearer = response["access_token"]
+
+ return bearer
+
+
+def unshorten_urls(title, description, urls):
+ for url in urls:
+ shorted_url = url["url"]
+ long_url = url["expanded_url"]
+
+ if "images" in url:
+ img = url["images"][0]["url"]
+ long_url_html = '<a href="' + long_url + '"><img src="' + img + '"/></a>'
+ else:
+ long_url_html = '<a href="' + long_url + '">' + long_url + "</a>"
+
+ description = description.replace(shorted_url, long_url_html)
+ title = title.replace(shorted_url, long_url)
+ return title, description
+
+
+def twitter(user):
+ # 500.000 Tweets per month
+ # API KEY = g9u1r6HZXMx4Iu9Pk9VSoO1Tw
+ # API SECRET KEY = fy22B3vATQ5B6xkaoPEtQQRknPd5XfAnpJTnasFQksrRnjiSlk
+
+ headers = {"authorization": "Bearer " + getBearer()}
+
+ # Recent = last 7 days
+ url = (
+ "https://api.twitter.com/2/tweets/search/recent?query=from:"
+ + user
+ + "&tweet.fields=created_at,author_id,lang,source,public_metrics,entities&expansions=referenced_tweets.id,attachments.media_keys&media.fields=url"
+ )
+
+ try:
+ res = urlopen(Request(url, headers=headers))
+ response = json.loads(res.read().decode("UTF-8"))
+ except Exception as exc:
+ logging.error('Request to twitter failed.', exc_info=exc)
+ return None
+
+ if not response["meta"]["result_count"]:
+ return []
+
+ for tweet in response["data"]:
+ title = tweet["text"]
+ description = tweet["text"]
+ link = "https://twitter.com/" + user + "/status/" + str(tweet["id"])
+
+ # Check included tweets
+ if (
+ "referenced_tweets" in tweet
+ and len(tweet["referenced_tweets"]) == 1
+ and tweet["referenced_tweets"][0]["type"] == "retweeted"
+ ):
+ rt_info = title[: title.index(":") + 2]
+ ref_id = tweet["referenced_tweets"][0]["id"]
+ ref_tweet = next(
+ t for t in response["includes"]["tweets"] if t["id"] == ref_id
+ )
+ title = rt_info + ref_tweet["text"]
+ description = rt_info + ref_tweet["text"]
+ title, description = unshorten_urls(
+ title, description, ref_tweet.get("entities", {}).get("urls", [])
+ )
+
+ title, description = unshorten_urls(
+ title, description, tweet.get("entities", {}).get("urls", [])
+ )
+
+ # Attach media
+ enclosures = []
+ medias = tweet.get('attachments', {}).get('media_keys', [])
+ for media in medias:
+ ref_media = next(
+ t for t in response["includes"]["media"] if t["media_key"] == media
+ )
+ if 'url' not in ref_media: continue
+ if ref_media.get('type', '') == 'photo':
+ description += "<br/><img src=\"" + ref_media['url'] + "\" />"
+ else:
+ enclosures.append(ref_media['url'])
+
+ # Append Retweets etc
+ description += "<br/><br/>"
+ description += str(tweet["public_metrics"]["retweet_count"]) + " Retweets, "
+ description += str(tweet["public_metrics"]["like_count"]) + " Likes, "
+ description += str(tweet["public_metrics"]["reply_count"]) + " Replies, "
+ description += str(tweet["public_metrics"]["quote_count"]) + " Quotes"
+ description += "<br/>"
+ description += "Source: " + tweet["source"]
+
+ date = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+ yield title, description, link, date, enclosures
+
+
+def main(channel):
+ print(
+ """<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Twitter: """
+ + channel
+ + """</title>
+ <link>https://twitter.com/"""
+ + channel
+ + """</link>
+ <description>The latest entries of the twitter account of """
+ + channel
+ + """</description>
+ <lastBuildDate>"""
+ + _format_date(datetime.now())
+ + """</lastBuildDate>"""
+ )
+
+ for title, description, link, date, enclosures in twitter(channel):
+ print(" <item>")
+ print(" <title><![CDATA[" + title + "]]></title>")
+ print(" <link>" + link + "</link>")
+ print(" <description><![CDATA[" + description + "]]></description>")
+ print(" <pubDate>" + _format_date(date) + "</pubDate>")
+ for enclosure in enclosures:
+ print(' <media:content url="' + enclosure + '" />')
+ print(" </item>")
+
+ print(" </channel>")
+ print("</rss>")
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print("Usage:", sys.argv[0], "<twitter channel>")
+ sys.exit(1)
+ main(sys.argv[1])
+ # twitter('rheinbahn_intim')
+ # twitter('realDonaldTrump')
diff --git a/webapp.py b/webapp.py
new file mode 100755
index 0000000..b992150
--- /dev/null
+++ b/webapp.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+# from flask import Flask, redirect, render_template, request, session, url_for
+from flask import Flask, Response
+
+# from wsgiref.util import setup_testing_defaults
+from collections import OrderedDict
+from concurrent.futures import ThreadPoolExecutor
+
+import json
+import os
+import re
+import sys
+import time, datetime
+# import traceback
+import logging
+
+from twitter import twitter
+from telegram import telegram
+from netto import netto
+from rss import buildRSS
+from zdf import zdf
+
+app = Flask(__name__)
+app.secret_key = "NMcgoB.0wd+$.KVKj!F{3>U{%BBUVhL=7=5$:46rQH$Q{enCuU"
+
+
+@app.route("/")
+def main():
+ return 'this is sparta'
+
+@app.errorhandler(404)
+def not_found(e):
+ return 'Die angeforderte Seite konnte nicht gefunden werden.'
+
+@app.route("/twitter/<account>")
+def feedTwitter(account):
+ content = [{'title': t, 'url': u, 'content': c, 'date': d, 'enclosures': e}
+ for t,c,u,d,e in twitter(account)]
+ xml = buildRSS(
+ title = 'Twitter: ' + account,
+ url = 'https://twitter.com/' + account,
+ description = 'The latest entries of the twitter account of ' + account,
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+@app.route("/telegram/<account>")
+def feedTelegram(account):
+ content = [{'title': t, 'url': u, 'content': c, 'date': d}
+ for t,c,u,d in telegram(account)]
+ xml = buildRSS(
+ title = 'Telegram: ' + account,
+ url = 'https://t.me/s/' + account,
+ description = 'The latest entries of the telegram channel of ' + account,
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+@app.route("/netto/<market>")
+def feedNetto(market):
+ title, url = netto(market)
+ content = [{
+ 'title': 'Angebote für ' + title,
+ 'url': url,
+ 'content': 'Angebote für ' + title + ' finden sich unter ' + url,
+ }]
+ xml = buildRSS(
+ title = 'Netto Angebote für ' + market,
+ url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + market,
+ description = 'PDF der neuen Netto Angebote für den Laden um die Ecke.',
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+@app.route("/zdf/<path:feed>")
+def filterZDFFeed(feed):
+ title, url, description, content = zdf(feed)
+ xml = buildRSS(
+ title = title,
+ url = url,
+ description = description,
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+
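+# Example requests against the development server started below (port 5000 is
+# Flask's default; the path parameters are placeholders):
+#   curl http://localhost:5000/twitter/<account>
+#   curl http://localhost:5000/telegram/<channel>
+#   curl http://localhost:5000/netto/<store-id>
+#   curl http://localhost:5000/zdf/comedy/heute-show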
+if __name__ == '__main__':
+ logging.basicConfig(filename='./main.log', level=logging.INFO)
+
+ app.config['TEMPLATES_AUTO_RELOAD'] = True
+ app.run(threaded=True)
+
diff --git a/wsgi.py b/wsgi.py
new file mode 100755
index 0000000..2371f9c
--- /dev/null
+++ b/wsgi.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.append('./')
+
+from webapp import app as application
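+# The WSGI callable is exposed as "application", so a WSGI server can load it
+# directly, e.g. (assuming gunicorn is installed): gunicorn wsgi:application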
diff --git a/zdf.py b/zdf.py
new file mode 100755
index 0000000..37749ac
--- /dev/null
+++ b/zdf.py
@@ -0,0 +1,87 @@
+import logging
+from urllib.request import urlopen, Request
+from datetime import datetime
+from xml.dom.minidom import parse, parseString
+import locale
+
+def getText(dom, element):
+ textNode = dom.getElementsByTagName(element)[0].firstChild
+ if textNode:
+ return textNode.data
+ return ""
+
+def zdf(feed):
+ url = f"https://www.zdf.de/rss/zdf/{feed}"
+
+ try:
+ res = urlopen(Request(url))
+ except Exception as exc:
+ logging.error('Request to zdf failed.', exc_info=exc)
+ return None
+
+ try:
+ rss = res.read()
+ xml = parseString(rss)
+ except Exception as exc:
+ logging.error('Parsing the zdf feed failed.', exc_info=exc)
+ return None
+
+ try:
+ title = getText(xml, 'title')
+ description = getText(xml, 'description')
+
+ content = []
+ for show in xml.getElementsByTagName('item'):
+ s_url = getText(show, 'link')
+ if not s_url:
+ continue
+ # Full episodes have the ID 100
+ if not s_url.endswith('-100.html'):
+ continue
+
+ s_title = getText(show, 'title')
+ if not s_title.startswith(title):
+ continue
+
+ s_date = getText(show, 'pubDate')
+ s_date_parsed = datetime.strptime(s_date, "%a, %d %b %Y %H:%M:%S %z")
+
+ if s_date_parsed.timestamp() > datetime.now().timestamp():
+ continue
+
+ # s_tmp = s_title = getText(show, 'title')
+ # if s_tmp.startswith(f'{title} vom '):
+ # s_tmp = s_tmp[len(f'{title} vom '):]
+ # saved = locale.setlocale(locale.LC_TIME)
+ # locale.setlocale(locale.LC_TIME, "de_DE.utf8")
+ # tmp = datetime.strptime(s_tmp, "%d. %B %Y")
+ # locale.setlocale(locale.LC_TIME, saved)
+
+ s_desc = getText(show, 'description')
+ s_guid = getText(show, 'guid')
+ print("Adding", s_url, s_desc)
+ content.append({
+ 'title': s_title,
+ 'url': s_url,
+ 'content': s_desc,
+ 'date': s_date,
+ 'guid': s_guid,
+ })
+
+ return title, url, description, content
+ except Exception as exc:
+ logging.error('Processing the zdf feed failed.', exc_info=exc)
+ return None
+
+
+def main():
+ # print(zdf("comedy/heute-show"))
+ # print(zdf("comedy/die-anstalt"))
+ print(zdf("comedy/zdf-magazin-royale"))
+
+if __name__ == "__main__":
+ # if len(sys.argv) != 2:
+ # print('Usage:', sys.argv[0], '<foobar>')
+ # sys.exit(1)
+ # main(sys.argv[1])
+ main()