Vendor drastikbot/drastikbot+modules v2.1
src/irc/modules/url.py (new file, 355 lines added)
@@ -0,0 +1,355 @@
# coding=utf-8

# URL Module for Drastikbot
#
# Depends:
#   - requests :: $ pip3 install requests
#   - beautifulsoup :: $ pip3 install beautifulsoup4

'''
Copyright (C) 2017-2020 drastik.org

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, version 3 only.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''

import re
import math
import json
import urllib.parse
import requests
import bs4


class Module:
    def __init__(self):
        self.auto = True


# ----- Constants ----- #
parser = 'html.parser'
user_agent = "w3m/0.52"
accept_lang = "en-US"
nsfw_tag = "\x0304[NSFW]\x0F"
data_limit = 69120
# --------------------- #


def remove_formatting(msg):
    '''Remove IRC string formatting codes.'''
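    # Note: main() passes in str(bytes), so the formatting codes appear as
    # literal "\x03"-style escape text; the patterns below match that form.
    # Illustrative (hypothetical) example:
    #   remove_formatting("\\x0304red\\x02bold\\x0f plain") -> "redbold plain"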
    # - Regex -
    # Capture "\x03N,M" color pairs. Must be applied first:
    #   (\\x03[0-9]{0,2},{1}[0-9]{1,2})
    # Capture "\x03N". Catches the remaining color codes:
    #   (\\x03[0-9]{1,2})
    # The remaining formatting codes are removed with plain replace() calls.
    line = re.sub(r'(\\x03[0-9]{0,2},{1}[0-9]{1,2})', '', msg)
    line = re.sub(r'(\\x03[0-9]{1,2})', '', line)
    line = line.replace("\\x03", "")
    line = line.replace("\\x02", "")
    line = line.replace("\\x1d", "")
    line = line.replace("\\x1D", "")
    line = line.replace("\\x1f", "")
    line = line.replace("\\x1F", "")
    line = line.replace("\\x16", "")
    line = line.replace("\\x0f", "")
    line = line.replace("\\x0F", "")
    return line


def convert_size(size_bytes):
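    '''Convert a size in bytes to a human-readable string.

    Illustrative example: convert_size(2048) -> "2.0 KB"
    '''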
    # https://stackoverflow.com/questions/5194057/better-way-to-convert-file-sizes-in-python
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])


def get_url(msg):
    '''Search a string for urls and return a list of them.'''
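    # Illustrative example:
    #   get_url("see http://a.example and https://b.example/x")
    #     -> ["http://a.example", "https://b.example/x"]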
    str_l = msg.split()
    req_l = ["http://", "https://"]  # add "." to also parse urls without a scheme
    urls = [u for u in str_l if any(r in u for r in req_l)]
    # Avoid parsing IPv4s that are not complete (IPs like: 1.1):
    # Useful when a scheme is not required to parse a URL.
    # urls = [u for u in urls if u.count('.') == 3 or u.upper().isupper()]
    return urls


def default_parser(u):
    '''
    Visit a url and check whether HTML content is served. If it is, try
    to get the <title></title> tag. If it is not, try to read the HTTP
    headers to find 'content-type' and 'content-length'.
    '''
    data = ""
    output = ""
    try:
        r = requests.get(u, stream=True,
                         headers={"user-agent": user_agent,
                                  "Accept-Language": accept_lang},
                         timeout=5)
    except Exception:
        # Return a tuple so callers can always unpack (title, data).
        return False, False
    for i in r.iter_content(chunk_size=512, decode_unicode=False):
        data += i.decode('utf-8', errors='ignore')
        if len(data) > data_limit or '</head>' in data.lower():
            break
    r.close()
    soup = bs4.BeautifulSoup(data, parser)
    try:
        output += soup.head.title.text.strip()
    except Exception:
        try:
            output += r.headers['content-type']
        except KeyError:
            pass
        try:
            h_length = convert_size(float(r.headers['content-length']))
            if output:
                output += f", Size: {h_length}"
            else:
                output += h_length
        except KeyError:
            pass
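    # "RTA-5042-1996-1400-1577-RTA" is the Restricted To Adults (RTA) label;
    # pages carrying it in their markup or "Rating" header get the NSFW tag.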
    try:
        if "RTA-5042-1996-1400-1577-RTA" in data:
            output = f"{nsfw_tag} {output}"
        elif r.headers["Rating"] == "RTA-5042-1996-1400-1577-RTA":
            output = f"{nsfw_tag} {output}"
    except KeyError:
        pass
    return output, data


#                                            #
# BEGIN: Website Handling Functions (by url) #
#                                            #
def youtube(url):
    '''Visit a video and get its information.'''
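    # The oEmbed endpoint returns JSON metadata (e.g. "title", "author_name")
    # for a video URL without requiring an API key.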
    logo = "\x0300,04 ► \x0F"
    u = f"https://www.youtube.com/oembed?url={url}"
    r = requests.get(u, timeout=10)
    if r:
        j = r.json()
        return (f"{logo}: {j['title']}"
                f" | \x02Channel:\x0F {j['author_name']}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"


def lainchan(url):
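    '''Summarise a lainchan thread. Thread pages expose a vichan-style JSON
    mirror (".html" -> ".json") that is used to read posts and files.'''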
    logo = "\x0309lainchan\x0F"
    if "/res/" in url:
        board = url.split("lainchan.org/")[1].split("/", 1)[0]
        board = urllib.parse.unquote(board)
        u = url.replace(".html", ".json")
        post_no = False
        if ".html#" in url:
            post_no = url.split("#")[1][1:]
        r = requests.get(u, timeout=10).json()
        try:
            title = r["posts"][0]["sub"]
        except KeyError:
            title = f'{r["posts"][0]["com"][:80]}...'
        replies = len(r["posts"]) - 1
        files = 0
        for i in r["posts"]:
            if "filename" in i:
                files += 1
            if "extra_files" in i:
                files += len(i["extra_files"])
        if post_no:
            for i in r["posts"]:
                if int(post_no) != i["no"]:
                    continue
                post_text = bs4.BeautifulSoup(i["com"], parser).get_text()[:50]
                return (f"{logo} \x0306/{board}/\x0F {title} "
                        f"\x02->\x0F \x0302{post_text}...\x0F | "
                        f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")

        return (f"{logo} \x0306/{board}/\x0F {title} | "
                f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"


def imgur(url):
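    '''Report title, type, size and dimensions of an imgur image by reading
    the page's embedded "widgetFactory.mergeConfig" gallery blob.'''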
    try:
        up = urllib.parse.urlparse(url)
        host = up.hostname
        path = up.path
        if host[:2] == "i.":
            host = host[2:]
            path = path.rsplit(".", 1)[0]
            u = f"https://{host}{path}"
        else:
            u = url

        r = requests.get(u, timeout=10)
        s = "widgetFactory.mergeConfig('gallery', "
        b = r.text.index(s) + len(s)
        e = r.text.index(");", b)
        t = r.text[b:e]

        s = "image :"
        b = t.index(s) + len(s)
        e = t.index("},", b)
        t = t[b:e] + "}"

        j = json.loads(t)
        title = j["title"]
        mimetype = j["mimetype"]
        size = j["size"]
        width = j["width"]
        height = j["height"]
        nsfw = j["nsfw"]

        output = ""
        if nsfw:
            output += f"{nsfw_tag} "
        output += f"{title} - Imgur"
        output += f" | {mimetype}, Size: {convert_size(size)}"
        output += f", {width}x{height}"
        return output
    except Exception:
        return default_parser(url)[0]


def nitter(url):
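    '''Show the author and text of a nitter post using its
    og:title / og:description meta tags.'''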
    logo = "\x02Nitter\x0f"
    output, data = default_parser(url)
    try:
        soup = bs4.BeautifulSoup(data, parser)
        user = soup.find(attrs={"property": "og:title"})['content']
        post = soup.find(attrs={"property": "og:description"})['content']
        if post:
            return f"{logo}: \x0305{user}\x0f {post}"
        return output
    except Exception:
        return output


def twitter(url):
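    '''Fetch a tweet's text through the publish.twitter.com oEmbed endpoint
    and strip the returned HTML down to plain text.'''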
    logo = "\x0311twitter\x0F"
    u = f"https://publish.twitter.com/oembed?url={url}"
    r = requests.get(u, timeout=10,
                     headers={"user-agent": user_agent,
                              "Accept-Language": accept_lang})
    if r:
        j = r.json()
        html = j["html"]
        soup = bs4.BeautifulSoup(html, parser)
        tweet = soup.get_text(separator=" ")
        return f"{logo}: {tweet}"
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"
#                                          #
# END: Website Handling Functions (by url) #
#                                          #


hosts_d = {
    "youtube.com": youtube,
    "youtu.be": youtube,
    "lainchan.org": lainchan,
    "i.imgur.com": imgur,
    "imgur.com": imgur,
    "nitter.net": nitter,
    "twitter.com": twitter
}
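# Hostname -> handler dispatch table. _get_title_from_host() strips a leading
# "www." before the lookup, e.g. "www.youtube.com" is handled by youtube().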


def _get_title_from_host(u):
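    '''Return a (title, data) tuple. data is the fetched page text for
    generic URLs and False when a site-specific handler built the title.'''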
    host = urllib.parse.urlparse(u).hostname
    if host[:4] == "www.":
        host = host[4:]
    if host not in hosts_d:
        return default_parser(u)  # It's a tuple
    else:
        return hosts_d[host](u), False


#                                              #
# BEGIN: Website Handling Functions (by title) #
#                                              #
def pleroma(data):
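    '''Extract poster and post text from a Pleroma status page's
    og:description meta tag ("poster: text").'''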
    logo = "\x0308Pleroma\x0F"
    soup = bs4.BeautifulSoup(data, parser)
    t = soup.find(attrs={"property": "og:description"})['content']
    t = t.split(": ", 1)
    poster = t[0]
    post = t[1]
    return f"{logo}: \x0305{poster}\x0F {post}"
#                                            #
# END: Website Handling Functions (by title) #
#                                            #


titles_d = {
    "Pleroma": pleroma
}


def _get_title_from_title(title, data):
    '''
    Used to get data from the <head> when the <title> isn't very helpful.
    '''
    if title in titles_d:
        try:
            return titles_d[title](data)
        except Exception:
            return title
    else:
        return title


def get_title(u):
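    '''Return an IRC-ready title/summary line for the given URL, or a falsy
    value if nothing useful could be retrieved.'''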
    title, data = _get_title_from_host(u)
    if data:
        title = _get_title_from_title(title, data)
    return title


def main(i, irc):
    # - Raw undecoded message clean up.
    # Remove \r\n and surrounding whitespace.
    msg = i.msg_raw.strip()
    # Convert the bytes to a string,
    # split the irc commands from the text message,
    # and remove the trailing ' character left over from the bytes repr.
    msg = str(msg).split(' :', 1)[1][:-1]
    # Remove all IRC formatting codes
    msg = remove_formatting(msg)
    # msg = info[2]

    urls = get_url(msg)
    prev_u = set()  # Already visited URLs, used to avoid spamming.
    for u in urls:
        if not (u.startswith('http://') or u.startswith('https://')):
            u = f'http://{u}'
        if u in prev_u:
            continue  # Skip duplicates instead of aborting the whole message.
        title = get_title(u)
        if not title:
            continue

        irc.privmsg(i.channel, title)
        prev_u.add(u)