diff options
author | André Glüpker <git@wgmd.de> | 2021-05-05 20:09:30 +0200 |
---|---|---|
committer | André Glüpker <git@wgmd.de> | 2021-05-05 20:09:30 +0200 |
commit | 5774dbfb2caa42cb55bafab98a40e47f395e44d9 (patch) | |
tree | 8294b7b6fefebc1befeed4104f3b5604683999a8 | |
download | rss-feeds-5774dbfb2caa42cb55bafab98a40e47f395e44d9.tar.gz rss-feeds-5774dbfb2caa42cb55bafab98a40e47f395e44d9.tar.bz2 rss-feeds-5774dbfb2caa42cb55bafab98a40e47f395e44d9.zip |
Initial commit of RSS converter application
-rwxr-xr-x | netto.py | 69 | ||||
-rw-r--r-- | pyproject.toml | 16 | ||||
-rwxr-xr-x | rss.py | 56 | ||||
-rwxr-xr-x | telegram.py | 72 | ||||
-rwxr-xr-x | twitter.py | 202 | ||||
-rwxr-xr-x | webapp.py | 97 | ||||
-rwxr-xr-x | wsgi.py | 6 | ||||
-rwxr-xr-x | zdf.py | 87 |
8 files changed, 605 insertions, 0 deletions
# ---------------------------------------------------------------------------
# Reconstruction of commit 5774dbfb2caa42cb55bafab98a40e47f395e44d9
# ("Initial commit of RSS converter application").
#
# The scraped cgit diff page collapsed eight new files onto a few physical
# lines; they are re-formatted below, one section per file.  The one
# non-Python file (pyproject.toml) is kept as a commented listing so this
# reconstruction stays syntactically well-formed.
# ---------------------------------------------------------------------------


# ======================= netto.py (new file, mode 100755) ==================
#!/usr/bin/env python3

from urllib.request import urlopen, Request
from datetime import datetime
from bs4 import BeautifulSoup
import sys


def _format_date(dt):
    """convert a datetime into an RFC 822 formatted date
    Input date must be in GMT.
    Stolen from PyRSS2Gen.
    """
    # Looks like:
    # Sat, 07 Sep 2002 00:00:01 GMT
    # Can't use strftime because that's locale dependent
    #
    # Isn't there a standard way to do this for Python? The
    # rfc822 and email.Utils modules assume a timestamp. The
    # following is based on the rfc822 module.
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
        dt.day,
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month - 1],
        dt.year, dt.hour, dt.minute, dt.second)


def netto(store_id):
    """Scrape the Netto 'Online-Prospekte' page of one store.

    Returns (title, url): the link to the current offers PDF and a short
    title derived from the date token embedded in the PDF's filename.
    """
    url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)
    res = urlopen(Request(url))
    soup = BeautifulSoup(res, features="html.parser")

    message = soup.find('a', attrs={'class': 'flipbook_pdf_flipbook'})

    # Drop any query string; the PDF filename itself carries the date token.
    url = message['href'].split('?')[0]
    year = str(datetime.now().year)
    # Take the current year plus the following three characters from the
    # filename (presumably e.g. "2021KW18").
    # NOTE(review): url.find(year) returns -1 when the filename still carries
    # last year's date (around New Year) -- confirm intended behaviour.
    title = url[url.find(year):url.find(year) + 7]

    return title, url


def main(store_id=9110):
    """Print an RSS 2.0 feed for the given store's offers PDF to stdout."""
    url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)

    print("""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
 <channel>
 <title>Netto Angebote """ + str(store_id) + """</title>
 <link>""" + url + """</link>
 <description>PDF der neuen Netto Angebote für den Laden um die Ecke.</description>
 <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>""")

    # BUG FIX: the original called netto(url).  netto() expects a store id
    # and concatenates its argument onto the base URL again, so passing the
    # full page URL requested a nonsense address.  Pass the store id.
    title, link = netto(store_id)
    print(' <item>')
    print(' <title><![CDATA[Angebote für ' + title + ']]></title>')
    print(' <link>' + link + '</link>')
    # print(' <description><![CDATA[' + description + ']]></description>')
    # print(' <pubDate>' + date + '</pubDate>')
    print(' </item>')

    print(' </channel>')
    print('</rss>')


if __name__ == "__main__":
    main()


# ======================= pyproject.toml (new file, mode 100644) ============
# [tool.poetry]
# name = "rss-feeds"
# version = "0.1.0"
# description = "Build RSS feeds for various web services."
# authors = ["Your Name <you@example.com>"]
#
# [tool.poetry.dependencies]
# python = ">=3.5.0,<4"
# Flask = "^1.1.2"
# beautifulsoup4 = "^4.9.3"
#
# [tool.poetry.dev-dependencies]
#
# [build-system]
# requires = ["poetry-core>=1.0.0"]
# build-backend = "poetry.core.masonry.api"


# ======================= rss.py (new file, mode 100755) ====================
# (The scraped diff lost this file's header line; per the commit table this
# is rss.py, 56 lines.)
#!/usr/bin/env python3

from datetime import datetime


def _format_date(dt):
    """convert a datetime into an RFC 822 formatted date
    Input date must be in GMT.
    Stolen from PyRSS2Gen.
    """
    # Looks like:
    # Sat, 07 Sep 2002 00:00:01 GMT
    # Can't use strftime because that's locale dependent
    #
    # Isn't there a standard way to do this for Python? The
    # rfc822 and email.Utils modules assume a timestamp. The
    # following is based on the rfc822 module.
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
        dt.day,
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month - 1],
        dt.year, dt.hour, dt.minute, dt.second)


def buildRSS(title, url, description, content):
    """
    Feed basic info: title, url, descriptions
    Content: List[Dict{title, url, content, date, enclosures, guid}]
    """
    feed = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
 <channel>
 <title>""" + title + """</title>
 <link>""" + url + """</link>
 <description>""" + description + """</description>
 <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>"""

    for item in content:
        feed += ' <item>'
        feed += ' <title><![CDATA[' + item.get('title', 'N/A') + ']]></title>'
        feed += ' <link>' + item.get('url', 'N/A') + '</link>'
        feed += ' <description><![CDATA[' + item.get('content', 'N/A') + ']]></description>'
        if 'date' in item:
            # Accept either a preformatted RFC-822 string or a datetime.
            if type(item['date']) is str:
                feed += ' <pubDate>' + item['date'] + '</pubDate>'
            else:
                feed += ' <pubDate>' + _format_date(item['date']) + '</pubDate>'
        for enclosure in item.get('enclosures', []):
            feed += ' <media:content url="' + enclosure + '" />'
        if 'guid' in item:
            feed += ' <guid>' + item['guid'] + '</guid>'
        feed += ' </item>'

    feed += ' </channel>'
    feed += '</rss>'
    return feed


# ======================= telegram.py (new file, mode 100755) ===============
#!/usr/bin/env python3

from urllib.request import urlopen, Request
from datetime import datetime
from bs4 import BeautifulSoup
import sys


def _format_date(dt):
    """convert a datetime into an RFC 822 formatted date
    Input date must be in GMT.
    Stolen from PyRSS2Gen.
    """
    # Looks like:
    # Sat, 07 Sep 2002 00:00:01 GMT
    # Can't use strftime because that's locale dependent
    #
    # Isn't there a standard way to do this for Python? The
    # rfc822 and email.Utils modules assume a timestamp. The
    # following is based on the rfc822 module.
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
        dt.day,
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month - 1],
        dt.year, dt.hour, dt.minute, dt.second)


def telegram(channel):
    """Yield (title, description, link, date) per message of a public
    Telegram channel, scraped from its t.me/s/<channel> preview page."""
    url = 'https://t.me/s/' + channel
    res = urlopen(Request(url))
    soup = BeautifulSoup(res, features="html.parser")

    messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_bubble'})

    for message in messages:
        date = message.find('time', attrs={'class': 'time'})['datetime']
        html = message.find('div', attrs={'class': 'tgme_widget_message_text'})
        link = message.find('a', attrs={'class': 'tgme_widget_message_date'})
        title = html.text if html else 'No text'
        # Ship the whole bubble markup as the item body (keeps previews).
        description = str(message)
        link = link['href']
        yield title, description, link, date


def main(channel):
    """Print an RSS 2.0 feed of the channel's preview page to stdout."""
    url = 'https://t.me/s/' + channel

    print("""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
 <channel>
 <title>Telegram: """ + channel + """</title>
 <link>""" + url + """</link>
 <description>The latest entries of the telegram channel of """ + channel + """</description>
 <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>""")

    for title, description, link, date in telegram(channel):
        print(' <item>')
        print(' <title><![CDATA[' + title + ']]></title>')
        print(' <link>' + link + '</link>')
        print(' <description><![CDATA[' + description + ']]></description>')
        print(' <pubDate>' + date + '</pubDate>')
        print(' </item>')

    print(' </channel>')
    print('</rss>')


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print('Usage:', sys.argv[0], '<telegram channel>')
        sys.exit(1)
    main(sys.argv[1])


# ======================= twitter.py (new file, mode 100755) ================
#!/usr/bin/env python3

from urllib.error import HTTPError
from urllib.request import urlopen, Request
import logging

from datetime import datetime
import sys
import json

# Cached app-only OAuth2 bearer token; filled lazily by getBearer().
bearer = None


def _format_date(dt):
    """convert a datetime into an RFC 822 formatted date
    Input date must be in GMT.
    Stolen from PyRSS2Gen.
    """
    # Looks like:
    # Sat, 07 Sep 2002 00:00:01 GMT
    # Can't use strftime because that's locale dependent
    #
    # Isn't there a standard way to do this for Python? The
    # rfc822 and email.Utils modules assume a timestamp. The
    # following is based on the rfc822 module.
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
        dt.day,
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month - 1],
        dt.year, dt.hour, dt.minute, dt.second)


def getBearer():
    """Fetch (once) and cache an app-only bearer token from Twitter.

    SECURITY NOTE(review): the Basic credentials are hard-coded here and the
    API key/secret are repeated in plain text in twitter() -- they are leaked
    by this commit and should be rotated and moved into configuration.
    """
    global bearer
    if bearer:
        return bearer
    headers = {
        "Authorization": "Basic Zzl1MXI2SFpYTXg0SXU5UGs5VlNvTzFUdzpmeTIyQjN2QVRRNUI2eGthb1BFdFFRUmtuUGQ1WGZBbnBKVG5hc0ZRa3NyUm5qaVNsaw==",
        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
    }
    data = b"grant_type=client_credentials"
    url = "https://api.twitter.com/oauth2/token"

    res = urlopen(Request(url, headers=headers, data=data, method="POST"))
    response = json.loads(res.read().decode("UTF-8"))
    bearer = response["access_token"]

    return bearer


def unshorten_urls(title, description, urls):
    """Replace t.co short links with their expanded targets.

    In the description each link becomes an HTML anchor (with an inline image
    when Twitter supplies one); in the title it becomes the bare URL.
    """
    for url in urls:
        shorted_url = url["url"]
        long_url = url["expanded_url"]

        if "images" in url:
            img = url["images"][0]["url"]
            long_url_html = '<a href="' + long_url + '"><img src="' + img + '"/></a>'
        else:
            long_url_html = '<a href="' + long_url + '">' + long_url + "</a>"

        description = description.replace(shorted_url, long_url_html)
        title = title.replace(shorted_url, long_url)
    return title, description


def twitter(user):
    """Yield (title, description, link, date, enclosures) per recent tweet.

    Uses the v2 recent-search endpoint (last 7 days).  This is a generator:
    on a failed request it logs and simply stops yielding.
    """
    # 500.000 Tweets per month
    # API KEY = g9u1r6HZXMx4Iu9Pk9VSoO1Tw
    # API SECRET KEY = fy22B3vATQ5B6xkaoPEtQQRknPd5XfAnpJTnasFQksrRnjiSlk

    headers = {"authorization": "Bearer " + getBearer()}

    # Recent = last 7 days
    url = (
        "https://api.twitter.com/2/tweets/search/recent?query=from:"
        + user
        + "&tweet.fields=created_at,author_id,lang,source,public_metrics,entities&expansions=referenced_tweets.id,attachments.media_keys&media.fields=url"
    )

    try:
        res = urlopen(Request(url, headers=headers))
        response = json.loads(res.read().decode("UTF-8"))
    except Exception as exc:
        logging.error('Request to twitter failed.', exc_info=exc)
        # Inside a generator a bare return only ends the iteration.
        return None

    if not response["meta"]["result_count"]:
        return []

    for tweet in response["data"]:
        title = tweet["text"]
        description = tweet["text"]
        link = "https://twitter.com/" + user + "/status/" + str(tweet["id"])

        # Retweets: the search result truncates "RT @x: ..." -- splice the
        # full referenced tweet text back in behind the "RT @x: " prefix.
        if (
            "referenced_tweets" in tweet
            and len(tweet["referenced_tweets"]) == 1
            and tweet["referenced_tweets"][0]["type"] == "retweeted"
        ):
            rt_info = title[: title.index(":") + 2]
            ref_id = tweet["referenced_tweets"][0]["id"]
            ref_tweet = next(
                t for t in response["includes"]["tweets"] if t["id"] == ref_id
            )
            title = rt_info + ref_tweet["text"]
            description = rt_info + ref_tweet["text"]
            title, description = unshorten_urls(
                title, description, ref_tweet.get("entities", {}).get("urls", [])
            )

        title, description = unshorten_urls(
            title, description, tweet.get("entities", {}).get("urls", [])
        )

        # Attach media: photos go inline, everything else as an enclosure.
        enclosures = []
        medias = tweet.get('attachments', {}).get('media_keys', [])
        for media in medias:
            ref_media = next(
                t for t in response["includes"]["media"] if t["media_key"] == media
            )
            if 'url' not in ref_media:
                continue
            if ref_media.get('type', '') == 'photo':
                description += "<br/><img src=\"" + ref_media['url'] + "\" />"
            else:
                enclosures.append(ref_media['url'])

        # Append engagement metrics and tweet source.
        description += "<br/><br/>"
        description += str(tweet["public_metrics"]["retweet_count"]) + " Retweets, "
        description += str(tweet["public_metrics"]["like_count"]) + " Likes, "
        description += str(tweet["public_metrics"]["reply_count"]) + " Replies, "
        description += str(tweet["public_metrics"]["quote_count"]) + " Quotes"
        description += "<br/>"
        description += "Source: " + tweet["source"]

        date = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")

        yield title, description, link, date, enclosures


def main(channel):
    """Print an RSS 2.0 feed of the account's recent tweets to stdout."""
    print(
        """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
 <channel>
 <title>Twitter: """
        + channel
        + """</title>
 <link>https://twitter.com/"""
        + channel
        + """</link>
 <description>The latest entries of the twitter account of """
        + channel
        + """</description>
 <lastBuildDate>"""
        + _format_date(datetime.now())
        + """</lastBuildDate>"""
    )

    for title, description, link, date, enclosures in twitter(channel):
        print(" <item>")
        print(" <title><![CDATA[" + title + "]]></title>")
        print(" <link>" + link + "</link>")
        print(" <description><![CDATA[" + description + "]]></description>")
        print(" <pubDate>" + _format_date(date) + "</pubDate>")
        for enclosure in enclosures:
            print(' <media:content url="' + enclosure + '" />')
        print(" </item>")

    print(" </channel>")
    print("</rss>")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage:", sys.argv[0], "<twitter channel>")
        sys.exit(1)
    main(sys.argv[1])
    # twitter('rheinbahn_intim')
    # twitter('realDonaldTrump')


# ======================= webapp.py (new file, mode 100755) =================
#!/usr/bin/env python3

from flask import Flask, Response

from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor

import json
import os
import re
import sys
import time, datetime
import logging

from twitter import twitter
from telegram import telegram
from netto import netto
from rss import buildRSS
from zdf import zdf

app = Flask(__name__)
app.secret_key = "NMcgoB.0wd+$.KVKj!F{3>U{%BBUVhL=7=5$:46rQH$Q{enCuU"


@app.route("/")
def main():
    return 'this is sparta'


@app.errorhandler(404)
def not_found(e):
    return 'Die angeforderte Seite konnte nicht gefunden werden.'


@app.route("/twitter/<account>")
def feedTwitter(account):
    # twitter() yields (title, content, url, date, enclosures).
    content = [{'title': t, 'url': u, 'content': c, 'date': d, 'enclosures': e}
               for t, c, u, d, e in twitter(account)]
    xml = buildRSS(
        title='Twitter: ' + account,
        url='https://twitter.com/' + account,
        description='The latest entries of the twitter account of ' + account,
        content=content)
    response = Response(xml, mimetype='text/xml')
    response.headers['Access-Control-Allow-Origin'] = '*'
    return response


@app.route("/telegram/<account>")
def feedTelegram(account):
    # telegram() yields (title, content, url, date).
    content = [{'title': t, 'url': u, 'content': c, 'date': d}
               for t, c, u, d in telegram(account)]
    xml = buildRSS(
        title='Telegram: ' + account,
        url='https://t.me/s/' + account,
        description='The latest entries of the telegram channel of ' + account,
        content=content)
    response = Response(xml, mimetype='text/xml')
    response.headers['Access-Control-Allow-Origin'] = '*'
    return response


@app.route("/netto/<market>")
def feedNetto(market):
    # Single-item feed: the current offers PDF of one Netto market.
    title, url = netto(market)
    content = [{
        'title': 'Angebote für ' + title,
        'url': url,
        'content': 'Angebote für ' + title + ' finden sich unter ' + url,
    }]
    xml = buildRSS(
        title='Netto Angebote für ' + market,
        url='https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + market,
        description='PDF der neuen Netto Angebote für den Laden um die Ecke.',
        content=content)
    response = Response(xml, mimetype='text/xml')
    response.headers['Access-Control-Allow-Origin'] = '*'
    return response


@app.route("/zdf/<path:feed>")
def filterZDFFeed(feed):
    # NOTE(review): zdf() returns None on failure, which makes this unpack
    # raise TypeError (-> HTTP 500).  Confirm whether that is acceptable.
    title, url, description, content = zdf(feed)
    xml = buildRSS(
        title=title,
        url=url,
        description=description,
        content=content)
    response = Response(xml, mimetype='text/xml')
    response.headers['Access-Control-Allow-Origin'] = '*'
    return response


if __name__ == '__main__':
    logging.basicConfig(filename='./main.log', level=logging.INFO)

    app.config['TEMPLATES_AUTO_RELOAD'] = True
    app.run(threaded=True)


# ======================= wsgi.py (new file, mode 100755) ===================
# (Header line lost in the scrape; per the commit table this is wsgi.py.)
#!/usr/bin/env python3

import sys
sys.path.append('./')

from webapp import app as application


# ======================= zdf.py (new file, mode 100755) ====================
import logging
from urllib.request import urlopen, Request
from datetime import datetime
from xml.dom.minidom import parse, parseString
import locale


def getText(dom, element):
    """Text of the first <element> child below dom, or '' when empty."""
    textNode = dom.getElementsByTagName(element)[0].firstChild
    if textNode:
        return textNode.data
    return ""


def zdf(feed):
    """Filter a ZDF RSS feed down to already-aired full episodes.

    Returns (title, url, description, content) suitable for rss.buildRSS,
    or None when the request, the parse, or the filtering fails.
    """
    url = f"https://www.zdf.de/rss/zdf/{feed}"

    try:
        res = urlopen(Request(url))
    except Exception as exc:
        logging.error('Request to zdf failed.', exc_info=exc)
        return None

    try:
        rss = res.read()
        xml = parseString(rss)
    except Exception as exc:
        logging.error('Parsing to zdf failed.', exc_info=exc)
        return None

    try:
        title = getText(xml, 'title')
        description = getText(xml, 'description')

        content = []
        for show in xml.getElementsByTagName('item'):
            s_url = getText(show, 'link')
            if not s_url:
                continue
            # Full episodes have the ID 100
            if not s_url.endswith('-100.html'):
                continue

            s_title = getText(show, 'title')
            if not s_title.startswith(title):
                continue

            s_date = getText(show, 'pubDate')
            s_date_parsed = datetime.strptime(s_date, "%a, %d %b %Y %H:%M:%S %z")

            # Skip episodes whose publish date lies in the future.
            if s_date_parsed.timestamp() > datetime.now().timestamp():
                continue

            s_desc = getText(show, 'description')
            s_guid = getText(show, 'guid')
            # NOTE(review): debug output; pollutes stdout when run under WSGI.
            print("Adding", s_url, s_desc)
            content.append({
                'title': s_title,
                'url': s_url,
                'content': s_desc,
                'date': s_date,
                'guid': s_guid,
            })

        return title, url, description, content
    except Exception as exc:
        logging.error('Working with zdf failed.', exc_info=exc)
        return None


def main():
    # print(zdf("comedy/heute-show"))
    # print(zdf("comedy/die-anstalt"))
    print(zdf("comedy/zdf-magazin-royale"))


if __name__ == "__main__":
    main()