author    André Glüpker <git@wgmd.de>  2021-05-05 20:09:30 +0200
committer André Glüpker <git@wgmd.de>  2021-05-05 20:09:30 +0200
commit    5774dbfb2caa42cb55bafab98a40e47f395e44d9 (patch)
tree      8294b7b6fefebc1befeed4104f3b5604683999a8
Initial commit of RSS converter application
-rwxr-xr-x  netto.py         69
-rw-r--r--  pyproject.toml   16
-rwxr-xr-x  rss.py           56
-rwxr-xr-x  telegram.py      72
-rwxr-xr-x  twitter.py      202
-rwxr-xr-x  webapp.py        97
-rwxr-xr-x  wsgi.py           6
-rwxr-xr-x  zdf.py           87
8 files changed, 605 insertions, 0 deletions
diff --git a/netto.py b/netto.py
new file mode 100755
index 0000000..fda6409
--- /dev/null
+++ b/netto.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+from urllib.request import urlopen, Request
+from datetime import datetime
+from bs4 import BeautifulSoup
+import sys
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+ dt.year, dt.hour, dt.minute, dt.second)
+
+def netto(store_id):
+ url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)
+ res = urlopen(Request(url))
+ soup = BeautifulSoup(res, features="html.parser")
+
+ # messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_wrap'})
+ message = soup.find('a', attrs={'class': 'flipbook_pdf_flipbook'})
+
+ url = message['href'].split('?')[0]
+ year = str(datetime.now().year)
+ title = url[ url.find(year) : url.find(year) + 7 ]
+
+ return title, url
+
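+# Usage sketch (9110 is just the default store id used by main() below):
+#   title, url = netto(9110)
+# "title" is the 7-character slice of the PDF link that starts at the current
+# year, and "url" is the direct link to the leaflet PDF.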
+def main(store_id = 9110):
+ url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)
+
+ print("""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Netto Angebote """ + str(store_id) + """</title>
+ <link>""" + url + """</link>
+ <description>PDF der neuen Netto Angebote für den Laden um die Ecke.</description>
+ <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>""")
+
+ title, link = netto(store_id)
+ print(' <item>')
+ print(' <title><![CDATA[Angebote für ' + title + ']]></title>')
+ print(' <link>' + link + '</link>')
+ # print(' <description><![CDATA[' + description + ']]></description>')
+ # print(' <pubDate>' + date + '</pubDate>')
+ # print(' <media:content url="' + thumbnail + '" type="image/jpeg" />')
+ print(' </item>')
+
+ print(' </channel>')
+ print('</rss>')
+
+if __name__ == "__main__":
+ # if len(sys.argv) != 2:
+ # print('Usage:', sys.argv[0], '<foobar>')
+ # sys.exit(1)
+ # main(sys.argv[1])
+ main()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..04843a2
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,16 @@
+[tool.poetry]
+name = "rss-feeds"
+version = "0.1.0"
+description = "Build RSS feeds for various web services."
+authors = ["André Glüpker <git@wgmd.de>"]
+
+[tool.poetry.dependencies]
+python = ">=3.6,<4"
+Flask = "^1.1.2"
+beautifulsoup4 = "^4.9.3"
+
+[tool.poetry.dev-dependencies]
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/rss.py b/rss.py
new file mode 100755
index 0000000..66ffb35
--- /dev/null
+++ b/rss.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+
+from datetime import datetime
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
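+ # For example (illustrative check, not from the original source):
+ #   _format_date(datetime(2002, 9, 7, 0, 0, 1)) == "Sat, 07 Sep 2002 00:00:01 GMT"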
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+ dt.year, dt.hour, dt.minute, dt.second)
+
+def buildRSS(title, url, description, content):
+ """
+ Feed basic info: title, url, descriptions
+ Content: List[Dict{title, url, content, date, enclosures, guid}]
+ """
+
+ feed = """<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
+ <channel>
+ <title>""" + title + """</title>
+ <link>""" + url + """</link>
+ <description>""" + description + """</description>
+ <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>"""
+
+ for item in content:
+ feed += ' <item>'
+ feed += ' <title><![CDATA[' + item.get('title', 'N/A') + ']]></title>'
+ feed += ' <link>' + item.get('url', 'N/A') + '</link>'
+ feed += ' <description><![CDATA[' + item.get('content', 'N/A') + ']]></description>'
+ if 'date' in item:
+ if type(item['date']) is str:
+ feed += ' <pubDate>' + item['date'] + '</pubDate>'
+ else:
+ feed += ' <pubDate>' + _format_date(item['date']) + '</pubDate>'
+ for enclosure in item.get('enclosures', []):
+ feed += ' <media:content url="' + enclosure + '" />'
+ if 'guid' in item:
+ feed += ' <guid>' + item['guid'] + '</guid>'
+ feed += ' </item>'
+
+ feed += ' </channel>'
+ feed += '</rss>'
+ return feed
diff --git a/telegram.py b/telegram.py
new file mode 100755
index 0000000..3058339
--- /dev/null
+++ b/telegram.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+from urllib.request import urlopen, Request
+from datetime import datetime
+from bs4 import BeautifulSoup
+import sys
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+ dt.year, dt.hour, dt.minute, dt.second)
+
+def telegram(channel):
+ url = 'https://t.me/s/' + channel
+ res = urlopen(Request(url))
+ soup = BeautifulSoup(res, features="html.parser")
+
+ # messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_wrap'})
+ messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_bubble'})
+
+ for message in messages:
+ date = message.find('time', attrs={'class': 'time'})['datetime']
+ html = message.find('div', attrs={'class': 'tgme_widget_message_text'})
+ # preview = message.find('div', attrs={'class': 'tgme_widget_message_bubble'})
+ link = message.find('a', attrs={'class': 'tgme_widget_message_date'})
+ title = html.text if html else 'No text'
+ description = str(message) # if preview else '?'
+ link = link['href']
+ yield title, description, link, date
+
+def main(channel):
+ url = 'https://t.me/s/' + channel
+
+ print("""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Telegram: """ + channel + """</title>
+ <link>""" + url + """</link>
+ <description>The latest entries of the telegram channel of """ + channel + """</description>
+ <lastBuildDate>""" + _format_date(datetime.now()) + """</lastBuildDate>""")
+
+ for title, description, link, date in telegram(channel):
+ print(' <item>')
+ print(' <title><![CDATA[' + title + ']]></title>')
+ print(' <link>' + link + '</link>')
+ print(' <description><![CDATA[' + description + ']]></description>')
+ print(' <pubDate>' + date + '</pubDate>')
+ # print(' <media:content url="' + thumbnail + '" type="image/jpeg" />')
+ print(' </item>')
+
+ print(' </channel>')
+ print('</rss>')
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print('Usage:', sys.argv[0], '<telegram channel>')
+ sys.exit(1)
+ main(sys.argv[1])
diff --git a/twitter.py b/twitter.py
new file mode 100755
index 0000000..5ddf8ad
--- /dev/null
+++ b/twitter.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+
+from urllib.error import HTTPError
+from urllib.request import urlopen, Request
+import logging
+
+# from requests_oauthlib import OAuth1Session
+from datetime import datetime
+import sys
+import json
+
+bearer = None
+
+
+def _format_date(dt):
+ """convert a datetime into an RFC 822 formatted date
+ Input date must be in GMT.
+ Stolen from PyRSS2Gen.
+ """
+ # Looks like:
+ # Sat, 07 Sep 2002 00:00:01 GMT
+ # Can't use strftime because that's locale dependent
+ #
+ # Isn't there a standard way to do this for Python? The
+ # rfc822 and email.Utils modules assume a timestamp. The
+ # following is based on the rfc822 module.
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+ ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+ dt.day,
+ [
+ "Jan",
+ "Feb",
+ "Mar",
+ "Apr",
+ "May",
+ "Jun",
+ "Jul",
+ "Aug",
+ "Sep",
+ "Oct",
+ "Nov",
+ "Dec",
+ ][dt.month - 1],
+ dt.year,
+ dt.hour,
+ dt.minute,
+ dt.second,
+ )
+
+
+def getBearer():
+ global bearer
+ if bearer:
+ return bearer
+ headers = {
+ "Authorization": "Basic Zzl1MXI2SFpYTXg0SXU5UGs5VlNvTzFUdzpmeTIyQjN2QVRRNUI2eGthb1BFdFFRUmtuUGQ1WGZBbnBKVG5hc0ZRa3NyUm5qaVNsaw==",
+ "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
+ }
+ data = b"grant_type=client_credentials"
+ url = "https://api.twitter.com/oauth2/token"
+
+ res = urlopen(Request(url, headers=headers, data=data, method="POST"))
+ response = json.loads(res.read().decode("UTF-8"))
+ bearer = response["access_token"]
+
+ return bearer
+
+
+def unshorten_urls(title, description, urls):
+ for url in urls:
+ shorted_url = url["url"]
+ long_url = url["expanded_url"]
+
+ if "images" in url:
+ img = url["images"][0]["url"]
+ long_url_html = '<a href="' + long_url + '"><img src="' + img + '"/></a>'
+ else:
+ long_url_html = '<a href="' + long_url + '">' + long_url + "</a>"
+
+ description = description.replace(shorted_url, long_url_html)
+ title = title.replace(shorted_url, long_url)
+ return title, description
+
+
+def twitter(user):
+ # 500.000 Tweets per month
+ # API KEY = g9u1r6HZXMx4Iu9Pk9VSoO1Tw
+ # API SECRET KEY = fy22B3vATQ5B6xkaoPEtQQRknPd5XfAnpJTnasFQksrRnjiSlk
+
+ headers = {"authorization": "Bearer " + getBearer()}
+
+ # Recent = last 7 days
+ url = (
+ "https://api.twitter.com/2/tweets/search/recent?query=from:"
+ + user
+ + "&tweet.fields=created_at,author_id,lang,source,public_metrics,entities&expansions=referenced_tweets.id,attachments.media_keys&media.fields=url"
+ )
+
+ try:
+ res = urlopen(Request(url, headers=headers))
+ response = json.loads(res.read().decode("UTF-8"))
+ except Exception as exc:
+ logging.error('Request to twitter failed.', exc_info=exc)
+ return None
+
+ if not response["meta"]["result_count"]:
+ return []
+
+ for tweet in response["data"]:
+ title = tweet["text"]
+ description = tweet["text"]
+ link = "https://twitter.com/" + user + "/status/" + str(tweet["id"])
+
+ # Check included tweets
+ if (
+ "referenced_tweets" in tweet
+ and len(tweet["referenced_tweets"]) == 1
+ and tweet["referenced_tweets"][0]["type"] == "retweeted"
+ ):
+ rt_info = title[: title.index(":") + 2]
+ ref_id = tweet["referenced_tweets"][0]["id"]
+ ref_tweet = next(
+ t for t in response["includes"]["tweets"] if t["id"] == ref_id
+ )
+ title = rt_info + ref_tweet["text"]
+ description = rt_info + ref_tweet["text"]
+ title, description = unshorten_urls(
+ title, description, ref_tweet.get("entities", {}).get("urls", [])
+ )
+
+ title, description = unshorten_urls(
+ title, description, tweet.get("entities", {}).get("urls", [])
+ )
+
+ # Attach media
+ enclosures = []
+ medias = tweet.get('attachments', {}).get('media_keys', [])
+ for media in medias:
+ ref_media = next(
+ t for t in response["includes"]["media"] if t["media_key"] == media
+ )
+ if 'url' not in ref_media: continue
+ if ref_media.get('type', '') == 'photo':
+ description += "<br/><img src=\"" + ref_media['url'] + "\" />"
+ else:
+ enclosures.append(ref_media['url'])
+
+ # Append Retweets etc
+ description += "<br/><br/>"
+ description += str(tweet["public_metrics"]["retweet_count"]) + " Retweets, "
+ description += str(tweet["public_metrics"]["like_count"]) + " Likes, "
+ description += str(tweet["public_metrics"]["reply_count"]) + " Replies, "
+ description += str(tweet["public_metrics"]["quote_count"]) + " Quotes"
+ description += "<br/>"
+ description += "Source: " + tweet["source"]
+
+ date = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+ yield title, description, link, date, enclosures
+
+
+def main(channel):
+ print(
+ """<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+ <channel>
+ <title>Twitter: """
+ + channel
+ + """</title>
+ <link>https://twitter.com/"""
+ + channel
+ + """</link>
+ <description>The latest entries of the twitter account of """
+ + channel
+ + """</description>
+ <lastBuildDate>"""
+ + _format_date(datetime.now())
+ + """</lastBuildDate>"""
+ )
+
+ for title, description, link, date, enclosures in twitter(channel):
+ print(" <item>")
+ print(" <title><![CDATA[" + title + "]]></title>")
+ print(" <link>" + link + "</link>")
+ print(" <description><![CDATA[" + description + "]]></description>")
+ print(" <pubDate>" + _format_date(date) + "</pubDate>")
+ for enclosure in enclosures:
+ print(' <media:content url="' + enclosure + '" />')
+ print(" </item>")
+
+ print(" </channel>")
+ print("</rss>")
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print("Usage:", sys.argv[0], "<twitter channel>")
+ sys.exit(1)
+ main(sys.argv[1])
+ # twitter('rheinbahn_intim')
+ # twitter('realDonaldTrump')
diff --git a/webapp.py b/webapp.py
new file mode 100755
index 0000000..b992150
--- /dev/null
+++ b/webapp.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+# from flask import Flask, redirect, render_template, request, session, url_for
+from flask import Flask, Response
+
+# from wsgiref.util import setup_testing_defaults
+from collections import OrderedDict
+from concurrent.futures import ThreadPoolExecutor
+
+import json
+import os
+import re
+import sys
+import time, datetime
+# import traceback
+import logging
+
+from twitter import twitter
+from telegram import telegram
+from netto import netto
+from rss import buildRSS
+from zdf import zdf
+
+app = Flask(__name__)
+app.secret_key = "NMcgoB.0wd+$.KVKj!F{3>U{%BBUVhL=7=5$:46rQH$Q{enCuU"
+
+
+@app.route("/")
+def main():
+ return 'this is sparta'
+
+@app.errorhandler(404)
+def not_found(e):
+ return 'Die angeforderte Seite konnte nicht gefunden werden.'
+
+@app.route("/twitter/<account>")
+def feedTwitter(account):
+ content = [{'title': t, 'url': u, 'content': c, 'date': d, 'enclosures': e}
+ for t,c,u,d,e in twitter(account)]
+ xml = buildRSS(
+ title = 'Twitter: ' + account,
+ url = 'https://twitter.com/' + account,
+ description = 'The latest entries of the twitter account of ' + account,
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+@app.route("/telegram/<account>")
+def feedTelegram(account):
+ content = [{'title': t, 'url': u, 'content': c, 'date': d}
+ for t,c,u,d in telegram(account)]
+ xml = buildRSS(
+ title = 'Telegram: ' + account,
+ url = 'https://t.me/s/' + account,
+ description = 'The latest entries of the telegram channel of ' + account,
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+@app.route("/netto/<market>")
+def feedNetto(market):
+ title, url = netto(market)
+ content = [{
+ 'title': 'Angebote für ' + title,
+ 'url': url,
+ 'content': 'Angebote für ' + title + ' finden sich unter ' + url,
+ }]
+ xml = buildRSS(
+ title = 'Netto Angebote für ' + market,
+ url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + market,
+ description = 'PDF der neuen Netto Angebote für den Laden um die Ecke.',
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+@app.route("/zdf/<path:feed>")
+def filterZDFFeed(feed):
+ title, url, description, content = zdf(feed)
+ xml = buildRSS(
+ title = title,
+ url = url,
+ description = description,
+ content = content)
+ response = Response(xml, mimetype='text/xml')
+ response.headers['Access-Control-Allow-Origin'] = '*'
+ return response
+
+
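+# Example requests against the development server started below (port 5000 is
+# Flask's default; the path parameters are placeholders):
+#   curl http://localhost:5000/twitter/<account>
+#   curl http://localhost:5000/telegram/<channel>
+#   curl http://localhost:5000/netto/<store-id>
+#   curl http://localhost:5000/zdf/comedy/heute-show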
+if __name__ == '__main__':
+ logging.basicConfig(filename='./main.log', level=logging.INFO)
+
+ app.config['TEMPLATES_AUTO_RELOAD'] = True
+ app.run(threaded=True)
+
diff --git a/wsgi.py b/wsgi.py
new file mode 100755
index 0000000..2371f9c
--- /dev/null
+++ b/wsgi.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.append('./')
+
+from webapp import app as application
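+# The WSGI callable is exposed as "application", so a WSGI server can load it
+# directly, e.g. (assuming gunicorn is installed): gunicorn wsgi:application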
diff --git a/zdf.py b/zdf.py
new file mode 100755
index 0000000..37749ac
--- /dev/null
+++ b/zdf.py
@@ -0,0 +1,87 @@
+import logging
+from urllib.request import urlopen, Request
+from datetime import datetime
+from xml.dom.minidom import parse, parseString
+import locale
+
+def getText(dom, element):
+ textNode = dom.getElementsByTagName(element)[0].firstChild
+ if textNode:
+ return textNode.data
+ return ""
+
+def zdf(feed):
+ url = f"https://www.zdf.de/rss/zdf/{feed}"
+
+ try:
+ res = urlopen(Request(url))
+ except Exception as exc:
+ logging.error('Request to zdf failed.', exc_info=exc)
+ return None
+
+ try:
+ rss = res.read()
+ xml = parseString(rss)
+ except Exception as exc:
+ logging.error('Parsing the zdf feed failed.', exc_info=exc)
+ return None
+
+ try:
+ title = getText(xml, 'title')
+ description = getText(xml, 'description')
+
+ content = []
+ for show in xml.getElementsByTagName('item'):
+ s_url = getText(show, 'link')
+ if not s_url:
+ continue
+ # Full episodes have the ID 100
+ if not s_url.endswith('-100.html'):
+ continue
+
+ s_title = getText(show, 'title')
+ if not s_title.startswith(title):
+ continue
+
+ s_date = getText(show, 'pubDate')
+ s_date_parsed = datetime.strptime(s_date, "%a, %d %b %Y %H:%M:%S %z")
+
+ if s_date_parsed.timestamp() > datetime.now().timestamp():
+ continue
+
+ # s_tmp = s_title = getText(show, 'title')
+ # if s_tmp.startswith(f'{title} vom '):
+ # s_tmp = s_tmp[len(f'{title} vom '):]
+ # saved = locale.setlocale(locale.LC_TIME)
+ # locale.setlocale(locale.LC_TIME, "de_DE.utf8")
+ # tmp = datetime.strptime(s_tmp, "%d. %B %Y")
+ # locale.setlocale(locale.LC_TIME, saved)
+
+ s_desc = getText(show, 'description')
+ s_guid = getText(show, 'guid')
+ print("Adding", s_url, s_desc)
+ content.append({
+ 'title': s_title,
+ 'url': s_url,
+ 'content': s_desc,
+ 'date': s_date,
+ 'guid': s_guid,
+ })
+
+ return title, url, description, content
+ except Exception as exc:
+ logging.error('Processing the zdf feed failed.', exc_info=exc)
+ return None
+
+
+def main():
+ # print(zdf("comedy/heute-show"))
+ # print(zdf("comedy/die-anstalt"))
+ print(zdf("comedy/zdf-magazin-royale"))
+
+if __name__ == "__main__":
+ # if len(sys.argv) != 2:
+ # print('Usage:', sys.argv[0], '<foobar>')
+ # sys.exit(1)
+ # main(sys.argv[1])
+ main()