From 5774dbfb2caa42cb55bafab98a40e47f395e44d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Gl=C3=BCpker?=
Date: Wed, 5 May 2021 20:09:30 +0200
Subject: Initial commit of RSS converter application

---
 netto.py       |  69 ++++++++++++++++++++
 pyproject.toml |  16 +++++
 rss.py         |  56 ++++++++++++++++
 telegram.py    |  72 ++++++++++++++++++++
 twitter.py     | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 webapp.py      |  97 +++++++++++++++++++++++++++
 wsgi.py        |   6 ++
 zdf.py         |  87 +++++++++++++++++++++++++
 8 files changed, 605 insertions(+)
 create mode 100755 netto.py
 create mode 100644 pyproject.toml
 create mode 100755 rss.py
 create mode 100755 telegram.py
 create mode 100755 twitter.py
 create mode 100755 webapp.py
 create mode 100755 wsgi.py
 create mode 100755 zdf.py

diff --git a/netto.py b/netto.py
new file mode 100755
index 0000000..fda6409
--- /dev/null
+++ b/netto.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+from urllib.request import urlopen, Request
+from datetime import datetime
+from bs4 import BeautifulSoup
+import sys
+
+def _format_date(dt):
+    """convert a datetime into an RFC 822 formatted date
+    Input date must be in GMT.
+    Stolen from PyRSS2Gen.
+    """
+    # Looks like:
+    #   Sat, 07 Sep 2002 00:00:01 GMT
+    # Can't use strftime because that's locale dependent
+    #
+    # Isn't there a standard way to do this for Python?  The
+    # rfc822 and email.Utils modules assume a timestamp.  The
+    # following is based on the rfc822 module.
+    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+            ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+            dt.day,
+            ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+            dt.year, dt.hour, dt.minute, dt.second)
+
+def netto(store_id):
+    url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)
+    res = urlopen(Request(url))
+    soup = BeautifulSoup(res, features="html.parser")
+
+    # messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_wrap'})
+    message = soup.find('a', attrs={'class': 'flipbook_pdf_flipbook'})
+
+    url = message['href'].split('?')[0]
+    year = str(datetime.now().year)
+    title = url[ url.find(year) : url.find(year) + 7 ]
+
+    return title, url
+
+def main(store_id = 9110):
+    url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + str(store_id)
+
+    print("""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Netto Angebote """ + str(store_id) + """</title>
+    <link>""" + url + """</link>
+    <description>PDF der neuen Netto Angebote für den Laden um die Ecke.</description>
+    <pubDate>""" + _format_date(datetime.now()) + """</pubDate>""")
+
+    title, link = netto(store_id)
+    print('    <item>')
+    print('      <title><![CDATA[Angebote für ' + title + ']]></title>')
+    print('      <link>' + link + '</link>')
+    # print('      <description><![CDATA[' + description + ']]></description>')
+    # print('      <pubDate>' + date + '</pubDate>')
+    # print('      <guid><![CDATA[' + link + ']]></guid>')
+    print('    </item>')
+
+    print('  </channel>')
+    print('</rss>')
+
+if __name__ == "__main__":
+    # if len(sys.argv) != 2:
+    #     print('Usage:', sys.argv[0], '<store_id>')
+    #     sys.exit(1)
+    # main(sys.argv[1])
+    main()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..04843a2
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,16 @@
+[tool.poetry]
+name = "rss-feeds"
+version = "0.1.0"
+description = "Build RSS feeds for various web services."
+authors = ["Your Name "] + +[tool.poetry.dependencies] +python = ">=3.5.0,<4" +Flask = "^1.1.2" +beautifulsoup4 = "^4.9.3" + +[tool.poetry.dev-dependencies] + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/rss.py b/rss.py new file mode 100755 index 0000000..66ffb35 --- /dev/null +++ b/rss.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +from datetime import datetime + +def _format_date(dt): + """convert a datetime into an RFC 822 formatted date + Input date must be in GMT. + Stolen from PyRSS2Gen. + """ + # Looks like: + # Sat, 07 Sep 2002 00:00:01 GMT + # Can't use strftime because that's locale dependent + # + # Isn't there a standard way to do this for Python? The + # rfc822 and email.Utils modules assume a timestamp. The + # following is based on the rfc822 module. + return "%s, %02d %s %04d %02d:%02d:%02d GMT" % ( + ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()], + dt.day, + ["Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1], + dt.year, dt.hour, dt.minute, dt.second) + +def buildRSS(title, url, description, content): + """ + Feed basic info: title, url, descriptions + Content: List[Dict{title, url, content, date, enclosures, guid}] + """ + + feed = """ + + + """ + title + """ + """ + url + """ + """ + description + """ + """ + _format_date(datetime.now()) + """""" + + for item in content: + feed += ' ' + feed += ' <![CDATA[' + item.get('title', 'N/A') + ']]>' + feed += ' ' + item.get('url', 'N/A') + '' + feed += ' ' + if 'date' in item: + if type(item['date']) is str: + feed += ' ' + item['date'] + '' + else: + feed += ' ' + _format_date(item['date']) + '' + for enclosure in item.get('enclosures', []): + feed += ' ' + if 'guid' in item: + feed += ' ' + item['guid'] + '' + feed += ' ' + + feed += ' ' + feed += '' + return feed diff --git a/telegram.py b/telegram.py new file mode 100755 index 0000000..3058339 --- /dev/null +++ b/telegram.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +from urllib.request import urlopen, Request +from datetime import datetime +from bs4 import BeautifulSoup +import sys + +def _format_date(dt): + """convert a datetime into an RFC 822 formatted date + Input date must be in GMT. + Stolen from PyRSS2Gen. + """ + # Looks like: + # Sat, 07 Sep 2002 00:00:01 GMT + # Can't use strftime because that's locale dependent + # + # Isn't there a standard way to do this for Python? The + # rfc822 and email.Utils modules assume a timestamp. The + # following is based on the rfc822 module. 
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % ( + ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()], + dt.day, + ["Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1], + dt.year, dt.hour, dt.minute, dt.second) + +def telegram(channel): + url = 'https://t.me/s/' + channel + res = urlopen(Request(url)) + soup = BeautifulSoup(res, features="html.parser") + + # messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_wrap'}) + messages = soup.find_all('div', attrs={'class': 'tgme_widget_message_bubble'}) + + for message in messages: + date = message.find('time', attrs={'class': 'time'})['datetime'] + html = message.find('div', attrs={'class': 'tgme_widget_message_text'}) + # preview = message.find('div', attrs={'class': 'tgme_widget_message_bubble'}) + link = message.find('a', attrs={'class': 'tgme_widget_message_date'}) + title = html.text if html else 'No text' + description = str(message) # if preview else '?' + link = link['href'] + yield title, description, link, date + +def main(channel): + url = 'https://t.me/s/' + channel + + print(""" + + + Telegram: """ + channel + """ + """ + url + """ + The latest entries of the telegram channel of """ +channel + """ + """ + _format_date(datetime.now()) + """""") + + for title, description, link, date in telegram(channel): + print(' ') + print(' <![CDATA[' + title + ']]>') + print(' ' + link + '') + print(' ') + print(' ' + date + '') + # print(' ') + print(' ') + + print(' ') + print('') + +if __name__ == "__main__": + if len(sys.argv) != 2: + print('Usage:', sys.argv[0], '') + sys.exit(1) + main(sys.argv[1]) diff --git a/twitter.py b/twitter.py new file mode 100755 index 0000000..5ddf8ad --- /dev/null +++ b/twitter.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 + +from urllib.error import HTTPError +from urllib.request import urlopen, Request +import logging + +# from requests_oauthlib import OAuth1Session +from datetime import datetime +import sys +import json + +bearer = None + + +def _format_date(dt): + """convert a datetime into an RFC 822 formatted date + Input date must be in GMT. + Stolen from PyRSS2Gen. + """ + # Looks like: + # Sat, 07 Sep 2002 00:00:01 GMT + # Can't use strftime because that's locale dependent + # + # Isn't there a standard way to do this for Python? The + # rfc822 and email.Utils modules assume a timestamp. The + # following is based on the rfc822 module. 
+ return "%s, %02d %s %04d %02d:%02d:%02d GMT" % ( + ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()], + dt.day, + [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ][dt.month - 1], + dt.year, + dt.hour, + dt.minute, + dt.second, + ) + + +def getBearer(): + global bearer + if bearer: + return bearer + headers = { + "Authorization": "Basic Zzl1MXI2SFpYTXg0SXU5UGs5VlNvTzFUdzpmeTIyQjN2QVRRNUI2eGthb1BFdFFRUmtuUGQ1WGZBbnBKVG5hc0ZRa3NyUm5qaVNsaw==", + "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8", + } + data = b"grant_type=client_credentials" + url = "https://api.twitter.com/oauth2/token" + + res = urlopen(Request(url, headers=headers, data=data, method="POST")) + response = json.loads(res.read().decode("UTF-8")) + bearer = response["access_token"] + + return bearer + + +def unshorten_urls(title, description, urls): + for url in urls: + shorted_url = url["url"] + long_url = url["expanded_url"] + + if "images" in url: + img = url["images"][0]["url"] + long_url_html = '' + else: + long_url_html = '' + long_url + "" + + description = description.replace(shorted_url, long_url_html) + title = title.replace(shorted_url, long_url) + return title, description + + +def twitter(user): + # 500.000 Tweets per month + # API KEY = g9u1r6HZXMx4Iu9Pk9VSoO1Tw + # API SECRET KEY = fy22B3vATQ5B6xkaoPEtQQRknPd5XfAnpJTnasFQksrRnjiSlk + + headers = {"authorization": "Bearer " + getBearer()} + + # Recent = last 7 days + url = ( + "https://api.twitter.com/2/tweets/search/recent?query=from:" + + user + + "&tweet.fields=created_at,author_id,lang,source,public_metrics,entities&expansions=referenced_tweets.id,attachments.media_keys&media.fields=url" + ) + + try: + res = urlopen(Request(url, headers=headers)) + response = json.loads(res.read().decode("UTF-8")) + except Exception as exc: + logging.error('Request to twitter failed.', exc_info=exc) + return None + + if not response["meta"]["result_count"]: + return [] + + for tweet in response["data"]: + title = tweet["text"] + description = tweet["text"] + link = "https://twitter.com/" + user + "/status/" + str(tweet["id"]) + + # Check included tweets + if ( + "referenced_tweets" in tweet + and len(tweet["referenced_tweets"]) == 1 + and tweet["referenced_tweets"][0]["type"] == "retweeted" + ): + rt_info = title[: title.index(":") + 2] + ref_id = tweet["referenced_tweets"][0]["id"] + ref_tweet = next( + t for t in response["includes"]["tweets"] if t["id"] == ref_id + ) + title = rt_info + ref_tweet["text"] + description = rt_info + ref_tweet["text"] + title, description = unshorten_urls( + title, description, ref_tweet.get("entities", {}).get("urls", []) + ) + + title, description = unshorten_urls( + title, description, tweet.get("entities", {}).get("urls", []) + ) + + # Attach media + enclosures = [] + medias = tweet.get('attachments', {}).get('media_keys', []) + for media in medias: + ref_media = next( + t for t in response["includes"]["media"] if t["media_key"] == media + ) + if 'url' not in ref_media: continue + if ref_media.get('type', '') == 'photo': + description += "
" + else: + enclosures.append(ref_media['url']) + + # Append Retweets etc + description += "

" + description += str(tweet["public_metrics"]["retweet_count"]) + " Retweets, " + description += str(tweet["public_metrics"]["like_count"]) + " Likes, " + description += str(tweet["public_metrics"]["reply_count"]) + " Replies, " + description += str(tweet["public_metrics"]["quote_count"]) + " Quotes" + description += "
" + description += "Source: " + tweet["source"] + + date = datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") + + + yield title, description, link, date, enclosures + + +def main(channel): + print( + """ + + + Twitter: """ + + channel + + """ + https://twitter.com/""" + + channel + + """ + The latest entries of the twitter account of """ + + channel + + """ + """ + + _format_date(datetime.now()) + + """""" + ) + + for title, description, link, date, enclosures in twitter(channel): + print(" ") + print(" <![CDATA[" + title + "]]>") + print(" " + link + "") + print(" ") + print(" " + _format_date(date) + "") + for enclosure in enclosures: + print(' ') + print(" ") + + print(" ") + print("") + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage:", sys.argv[0], "") + sys.exit(1) + main(sys.argv[1]) + # twitter('rheinbahn_intim') + # twitter('realDonaldTrump') diff --git a/webapp.py b/webapp.py new file mode 100755 index 0000000..b992150 --- /dev/null +++ b/webapp.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +# from flask import Flask, redirect, render_template, request, session, url_for +from flask import Flask, Response + +# from wsgiref.util import setup_testing_defaults +from collections import OrderedDict +from concurrent.futures import ThreadPoolExecutor + +import json +import os +import re +import sys +import time, datetime +# import traceback +import logging + +from twitter import twitter +from telegram import telegram +from netto import netto +from rss import buildRSS +from zdf import zdf + +app = Flask(__name__) +app.secret_key = "NMcgoB.0wd+$.KVKj!F{3>U{%BBUVhL=7=5$:46rQH$Q{enCuU" + + +@app.route("/") +def main(): + return 'this is sparta' + +@app.errorhandler(404) +def not_found(e): + return 'Die angeforderte Seite konnte nicht gefunden werden.' 
+ +@app.route("/twitter/") +def feedTwitter(account): + content = [{'title': t, 'url': u, 'content': c, 'date': d, 'enclosures': e} + for t,c,u,d,e in twitter(account)] + xml = buildRSS( + title = 'Twitter: ' + account, + url = 'https://twitter.com/' + account, + description = 'The latest entries of the twitter account of ' + account, + content = content) + response = Response(xml, mimetype='text/xml') + response.headers['Access-Control-Allow-Origin'] = '*' + return response + +@app.route("/telegram/") +def feedTelegram(account): + content = [{'title': t, 'url': u, 'content': c, 'date': d} + for t,c,u,d in telegram(account)] + xml = buildRSS( + title = 'Telegram: ' + account, + url = 'https://t.me/s/' + account, + description = 'The latest entries of the telegram channel of ' + account, + content = content) + response = Response(xml, mimetype='text/xml') + response.headers['Access-Control-Allow-Origin'] = '*' + return response + +@app.route("/netto/") +def feedNetto(market): + title, url = netto(market) + content = [{ + 'title': 'Angebote für ' + title, + 'url': url, + 'content': 'Angebote für ' + title + ' finden sich unter ' + url, + }] + xml = buildRSS( + title = 'Netto Angebote für ' + market, + url = 'https://www.netto-online.de/ueber-netto/Online-Prospekte.chtm/' + market, + description = 'PDF der neuen Netto Angebote für den Laden um die Ecke.', + content = content) + response = Response(xml, mimetype='text/xml') + response.headers['Access-Control-Allow-Origin'] = '*' + return response + +@app.route("/zdf/") +def filterZDFFeed(feed): + title, url, description, content = zdf(feed) + xml = buildRSS( + title = title, + url = url, + description = description, + content = content) + response = Response(xml, mimetype='text/xml') + response.headers['Access-Control-Allow-Origin'] = '*' + return response + + +if __name__ == '__main__': + logging.basicConfig(filename='./main.log', level=logging.INFO) + + app.config['TEMPLATES_AUTO_RELOAD'] = True + app.run(threaded=True) + diff --git a/wsgi.py b/wsgi.py new file mode 100755 index 0000000..2371f9c --- /dev/null +++ b/wsgi.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +import sys +sys.path.append('./') + +from webapp import app as application diff --git a/zdf.py b/zdf.py new file mode 100755 index 0000000..37749ac --- /dev/null +++ b/zdf.py @@ -0,0 +1,87 @@ +import logging +from urllib.request import urlopen, Request +from datetime import datetime +from xml.dom.minidom import parse, parseString +import locale + +def getText(dom, element): + textNode = dom.getElementsByTagName(element)[0].firstChild + if textNode: + return textNode.data + return "" + +def zdf(feed): + url = f"https://www.zdf.de/rss/zdf/{feed}" + + try: + res = urlopen(Request(url)) + except Exception as exc: + logging.error('Request to zdf failed.', exc_info=exc) + return None + + try: + rss = res.read() + xml = parseString(rss) + except Exception as exc: + logging.error('Parsing to zdf failed.', exc_info=exc) + return None + + try: + title = getText(xml, 'title') + description = getText(xml, 'description') + + content = [] + for show in xml.getElementsByTagName('item'): + s_url = getText(show, 'link') + if not s_url: + continue + # Full episodes have the ID 100 + if not s_url.endswith('-100.html'): + continue + + s_title = getText(show, 'title') + if not s_title.startswith(title): + continue + + s_date = getText(show, 'pubDate') + s_date_parsed = datetime.strptime(s_date, "%a, %d %b %Y %H:%M:%S %z") + + if s_date_parsed.timestamp() > datetime.now().timestamp(): + continue + + # 
+            # s_tmp = s_title = getText(show, 'title')
+            # if s_tmp.startswith(f'{title} vom '):
+            #     s_tmp = s_tmp[len(f'{title} vom '):]
+            # saved = locale.setlocale(locale.LC_TIME)
+            # locale.setlocale(locale.LC_TIME, "de_DE.utf8")
+            # tmp = datetime.strptime(s_tmp, "%d. %B %Y")
+            # locale.setlocale(locale.LC_TIME, saved)
+
+            s_desc = getText(show, 'description')
+            s_guid = getText(show, 'guid')
+            print("Adding", s_url, s_desc)
+            content.append({
+                'title': s_title,
+                'url': s_url,
+                'content': s_desc,
+                'date': s_date,
+                'guid': s_guid,
+            })
+
+        return title, url, description, content
+    except Exception as exc:
+        logging.error('Working with zdf failed.', exc_info=exc)
+        return None
+
+
+def main():
+    # print(zdf("comedy/heute-show"))
+    # print(zdf("comedy/die-anstalt"))
+    print(zdf("comedy/zdf-magazin-royale"))
+
+if __name__ == "__main__":
+    # if len(sys.argv) != 2:
+    #     print('Usage:', sys.argv[0], '<feed>')
+    #     sys.exit(1)
+    # main(sys.argv[1])
+    main()
--
cgit v1.2.3