# coding=utf-8

# URL Module for Drastikbot
#
# Depends:
# - requests :: $ pip3 install requests
# - beautifulsoup :: $ pip3 install beautifulsoup4
'''
Copyright (C) 2017-2020 drastik.org

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, version 3 only.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
'''
import re
import math
import json
import urllib.parse

import requests
import bs4
class Module:
    def __init__(self):
        self.auto = True
# ----- Constants ----- #
parser = 'html.parser'
user_agent = "w3m/0.52"
accept_lang = "en-US"
nsfw_tag = "\x0304[NSFW]\x0F"
data_limit = 69120
# --------------------- #
def remove_formatting(msg):
    '''Remove IRC string formatting codes.'''
    # The message is the repr() of the raw bytes (see main()), so the codes
    # appear as literal "\x03", "\x02", ... text rather than control bytes.
    # - Regex -
    # Capture "x03N,M". Should be called first:
    #   (\\x03[0-9]{0,2},{1}[0-9]{1,2})
    # Capture "x03N", i.e. the remaining color codes:
    #   (\\x03[0-9]{1,2})
    # The other formatting codes are removed with plain replacements below.
    line = re.sub(r'(\\x03[0-9]{0,2},{1}[0-9]{1,2})', '', msg)
    line = re.sub(r'(\\x03[0-9]{1,2})', '', line)
    line = line.replace("\\x03", "")
    line = line.replace("\\x02", "")
    line = line.replace("\\x1d", "")
    line = line.replace("\\x1D", "")
    line = line.replace("\\x1f", "")
    line = line.replace("\\x1F", "")
    line = line.replace("\\x16", "")
    line = line.replace("\\x0f", "")
    line = line.replace("\\x0F", "")
    return line
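# Illustrative example (not part of the original module): a colored message
# arrives here as repr() text with literal escapes, e.g.
#   remove_formatting(r"\x0304look\x0F at https://example.org")
# is expected to return "look at https://example.org".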
def convert_size(size_bytes):
    # https://stackoverflow.com/
    # questions/5194057/better-way-to-convert-file-sizes-in-python
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])
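# Illustrative values (not part of the original module):
#   convert_size(0)       -> "0B"
#   convert_size(69120)   -> "67.5 KB"   (the data_limit defined above)
#   convert_size(3145728) -> "3.0 MB"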
def get_url(msg):
    '''Search a string for URLs and return a list of them.'''
    str_l = msg.split()
    req_l = ["http://", "https://"]  # add "." to parse URLs without a scheme
    urls = [u for u in str_l if any(r in u for r in req_l)]
    # Avoid parsing IPv4s that are not complete (IPs like: 1.1):
    # Useful when a scheme is not required to parse a URL.
    # urls = [u for u in urls if u.count('.') == 3 or u.upper().isupper()]
    return urls
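# Illustrative example (not part of the original module):
#   get_url("see https://example.org/a and http://example.com/b for details")
# is expected to return ["https://example.org/a", "http://example.com/b"].
# Words without an explicit http(s) scheme are ignored unless "." is added
# to req_l above.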
def default_parser(u):
    '''
    Visit each url and check if there is html content served. If there is,
    try to get the <title></title> tag. If there is not, try to read the
    http headers to find 'content-type' and 'content-length'.
    '''
    data = ""
    output = ""
    try:
        r = requests.get(u, stream=True,
                         headers={"user-agent": user_agent,
                                  "Accept-Language": accept_lang},
                         timeout=5)
    except Exception:
        # Return an empty (title, data) pair so callers that unpack or index
        # the result do not crash on network errors.
        return "", ""
    for i in r.iter_content(chunk_size=512, decode_unicode=False):
        data += i.decode('utf-8', errors='ignore')
        if len(data) > data_limit or '</head>' in data.lower():
            break
    r.close()
    soup = bs4.BeautifulSoup(data, parser)
    try:
        output += soup.head.title.text.strip()
    except Exception:
        try:
            output += r.headers['content-type']
        except KeyError:
            pass
        try:
            h_length = convert_size(float(r.headers['content-length']))
            if output:
                output += f", Size: {h_length}"
            else:
                output += h_length
        except KeyError:
            pass
    try:
        if "RTA-5042-1996-1400-1577-RTA" in data:
            output = f"{nsfw_tag} {output}"
        elif r.headers["Rating"] == "RTA-5042-1996-1400-1577-RTA":
            output = f"{nsfw_tag} {output}"
    except KeyError:
        pass

    return output, data
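# Rough sketch of the return values (illustrative, assuming a reachable URL):
# an HTML page yields something like
#   ("Example Domain", "<!doctype html> ... </head> ...")
# while a direct file link without a <title> falls back to the headers for
# the title part, e.g. "application/pdf, Size: 1.2 MB", and a network error
# yields the empty pair ("", "").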
#                                                #
# BEGIN: Website Handling Functions (by url)     #
#                                                #
def youtube(url):
    '''Visit a video and get its information.'''
    logo = "\x0300,04 ► \x0F"
    u = f"https://www.youtube.com/oembed?url={url}"
    r = requests.get(u, timeout=10)
    if r:
        j = r.json()
        return (f"{logo}: {j['title']}"
                f" | \x02Channel:\x0F {j['author_name']}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"
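# The oEmbed endpoint above returns JSON; only two of its fields are used
# here, e.g. (illustrative, trimmed):
#   {"title": "Some video", "author_name": "Some channel", ...}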
def lainchan(url):
    logo = "\x0309lainchan\x0F"
    if "/res/" in url:
        board = url.split("lainchan.org/")[1].split("/", 1)[0]
        board = urllib.parse.unquote(board)
        u = url.replace(".html", ".json")
        post_no = False
        if ".html#" in url:
            post_no = url.split("#")[1][1:]
        r = requests.get(u, timeout=10).json()
        try:
            title = r["posts"][0]["sub"]
        except KeyError:
            title = f'{r["posts"][0]["com"][:80]}...'
        replies = len(r["posts"]) - 1
        files = 0
        for i in r["posts"]:
            if "filename" in i:
                files += 1
            if "extra_files" in i:
                files += len(i["extra_files"])
        if post_no:
            for i in r["posts"]:
                if int(post_no) != i["no"]:
                    continue
                post_text = bs4.BeautifulSoup(i["com"], parser).get_text()[:50]
                return (f"{logo} \x0306/{board}/\x0F {title} "
                        f"\x02->\x0F \x0302{post_text}...\x0F | "
                        f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")

        return (f"{logo} \x0306/{board}/\x0F {title} | "
                f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"
def imgur(url):
    try:
        up = urllib.parse.urlparse(url)
        host = up.hostname
        path = up.path
        if host[:2] == "i.":
            host = host[2:]
            path = path.rsplit(".", 1)[0]
            u = f"https://{host}{path}"
        else:
            u = url

        r = requests.get(u, timeout=10)
        s = "widgetFactory.mergeConfig('gallery', "
        b = r.text.index(s) + len(s)
        e = r.text.index(");", b)
        t = r.text[b:e]

        s = "image :"
        b = t.index(s) + len(s)
        e = t.index("},", b)
        t = t[b:e] + "}"

        j = json.loads(t)
        title = j["title"]
        mimetype = j["mimetype"]
        size = j["size"]
        width = j["width"]
        height = j["height"]
        nsfw = j["nsfw"]

        output = ""
        if nsfw:
            output += f"{nsfw_tag} "
        output += f"{title} - Imgur"
        output += f" | {mimetype}, Size: {convert_size(size)}"
        output += f", {width}x{height}"
        return output
    except Exception:
        return default_parser(url)[0]
def nitter(url):
    logo = "\x02Nitter\x0f"
    output, data = default_parser(url)
    try:
        soup = bs4.BeautifulSoup(data, parser)
        user = soup.find(attrs={"property": "og:title"})['content']
        post = soup.find(attrs={"property": "og:description"})['content']
        if post:
            return f"{logo}: \x0305{user}\x0f {post}"
        return output
    except Exception:
        return output
def twitter(url):
    logo = "\x0311twitter\x0F"
    u = f"https://publish.twitter.com/oembed?url={url}"
    r = requests.get(u, timeout=10,
                     headers={"user-agent": user_agent,
                              "Accept-Language": accept_lang})
    if r:
        j = r.json()
        html = j["html"]
        soup = bs4.BeautifulSoup(html, parser)
        tweet = soup.get_text(separator=" ")
        return f"{logo}: {tweet}"
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"


#                                                #
# END: Website Handling Functions (by url)       #
#                                                #
hosts_d = {
    "youtube.com": youtube,
    "youtu.be": youtube,
    "m.youtube.com": youtube,
    "lainchan.org": lainchan,
    "i.imgur.com": imgur,
    "imgur.com": imgur,
    "nitter.net": nitter,
    "twitter.com": twitter
}
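# Dispatch table used by _get_title_from_host() below: the URL's hostname
# (with a leading "www." stripped) selects the handler. A new site could be
# wired in like this (hypothetical example, not part of the module):
#
#   def example_site(url):
#       return f"example: {default_parser(url)[0]}"
#
#   hosts_d["example.org"] = example_site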
def _get_title_from_host(u):
    host = urllib.parse.urlparse(u).hostname
    if host[:4] == "www.":
        host = host[4:]
    if host not in hosts_d:
        return default_parser(u)  # It's a tuple
    else:
        return hosts_d[host](u), False
#                                                #
# BEGIN: Website Handling Functions (by title)   #
#                                                #
def pleroma(data):
    logo = "\x0308Pleroma\x0F"
    soup = bs4.BeautifulSoup(data, parser)
    t = soup.find(attrs={"property": "og:description"})['content']
    t = t.split(": ", 1)
    poster = t[0]
    post = t[1]
    return f"{logo}: \x0305{poster}\x0F {post}"


#                                                #
# END: Website Handling Functions (by title)     #
#                                                #
titles_d = {
    "Pleroma": pleroma
}
def _get_title_from_title(title, data):
    '''
    Used to get data from the <head> when the <title> isn't very helpful.
    '''
    if title in titles_d:
        try:
            return titles_d[title](data)
        except Exception:
            return title
    else:
        return title
def get_title(u):
    title, data = _get_title_from_host(u)
    if data:
        title = _get_title_from_title(title, data)
    return title
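# Usage sketch (illustrative): main() below does roughly the following for
# every URL found in a channel message:
#
#   title = get_title("https://example.org/article")
#   if title:
#       irc.privmsg(channel, title)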
def main(i, irc):
    # - Raw undecoded message clean up.
    # Remove \r\n and whitespace.
    msg = i.msg_raw.strip()
    # Convert the bytes to a string,
    # split the irc commands from the text message,
    # remove the ' character from the end of the string.
    msg = str(msg).split(' :', 1)[1][:-1]
    # Remove all IRC formatting codes.
    msg = remove_formatting(msg)

    urls = get_url(msg)
    prev_u = set()  # Already visited URLs, used to avoid spamming.
    for u in urls:
        if not (u.startswith('http://') or u.startswith('https://')):
            u = f'http://{u}'
        if u in prev_u:
            return
        title = get_title(u)
        if not title:
            continue

        irc.privmsg(i.channel, title)
        prev_u.add(u)