# coding=utf-8

# URL Module for Drastikbot
#
# Depends:
# - requests :: $ pip3 install requests
# - beautifulsoup :: $ pip3 install beautifulsoup4
'''
Copyright (C) 2017-2020 drastik.org

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, version 3 only.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
'''
import re
import math
import json
import urllib.parse

import requests
import bs4
class Module:
    def __init__(self):
        self.auto = True
# ----- Constants ----- #
parser = 'html.parser'
user_agent = "w3m/0.52"
accept_lang = "en-US"
nsfw_tag = "\x0304[NSFW]\x0F"
data_limit = 69120
# --------------------- #
def remove_formatting(msg):
    '''Remove IRC string formatting codes.'''
    # The message is the repr() of the raw bytes (see main()), so the codes
    # appear as literal "\x03", "\x02", ... text rather than control bytes.
    # - Regex -
    # Capture "x03N,M". Should be called first:
    #   (\\x03[0-9]{0,2},{1}[0-9]{1,2})
    # Capture "x03N", i.e. the remaining color codes:
    #   (\\x03[0-9]{1,2})
    # The other formatting codes are removed with plain replacements below.
    line = re.sub(r'(\\x03[0-9]{0,2},{1}[0-9]{1,2})', '', msg)
    line = re.sub(r'(\\x03[0-9]{1,2})', '', line)
    line = line.replace("\\x03", "")
    line = line.replace("\\x02", "")
    line = line.replace("\\x1d", "")
    line = line.replace("\\x1D", "")
    line = line.replace("\\x1f", "")
    line = line.replace("\\x1F", "")
    line = line.replace("\\x16", "")
    line = line.replace("\\x0f", "")
    line = line.replace("\\x0F", "")
    return line
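# Illustrative example (not part of the original module): a colored message
# arrives here as repr() text with literal escapes, e.g.
#   remove_formatting(r"\x0304look\x0F at https://example.org")
# is expected to return "look at https://example.org".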
def convert_size(size_bytes):
    # https://stackoverflow.com/
    # questions/5194057/better-way-to-convert-file-sizes-in-python
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])
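# Illustrative values (not part of the original module):
#   convert_size(0)       -> "0B"
#   convert_size(69120)   -> "67.5 KB"   (the data_limit defined above)
#   convert_size(3145728) -> "3.0 MB"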
def get_url(msg):
    '''Search a string for URLs and return a list of them.'''
    str_l = msg.split()
    req_l = ["http://", "https://"]  # add "." to parse URLs without a scheme
    urls = [u for u in str_l if any(r in u for r in req_l)]
    # Avoid parsing IPv4s that are not complete (IPs like: 1.1):
    # Useful when a scheme is not required to parse a URL.
    # urls = [u for u in urls if u.count('.') == 3 or u.upper().isupper()]
    return urls
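# Illustrative example (not part of the original module):
#   get_url("see https://example.org/a and http://example.com/b for details")
# is expected to return ["https://example.org/a", "http://example.com/b"].
# Words without an explicit http(s) scheme are ignored unless "." is added
# to req_l above.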
def default_parser(u):
    '''
    Visit each url and check if there is html content served. If there is,
    try to get the <title></title> tag. If there is not, try to read the
    http headers to find 'content-type' and 'content-length'.
    '''
    data = ""
    output = ""
    try:
        r = requests.get(u, stream=True,
                         headers={"user-agent": user_agent,
                                  "Accept-Language": accept_lang},
                         timeout=5)
    except Exception:
        # Return an empty (title, data) pair so callers that unpack or index
        # the result do not crash on network errors.
        return "", ""
    for i in r.iter_content(chunk_size=512, decode_unicode=False):
        data += i.decode('utf-8', errors='ignore')
        if len(data) > data_limit or '</head>' in data.lower():
            break
    r.close()
    soup = bs4.BeautifulSoup(data, parser)
    try:
        output += soup.head.title.text.strip()
    except Exception:
        try:
            output += r.headers['content-type']
        except KeyError:
            pass
        try:
            h_length = convert_size(float(r.headers['content-length']))
            if output:
                output += f", Size: {h_length}"
            else:
                output += h_length
        except KeyError:
            pass
    try:
        if "RTA-5042-1996-1400-1577-RTA" in data:
            output = f"{nsfw_tag} {output}"
        elif r.headers["Rating"] == "RTA-5042-1996-1400-1577-RTA":
            output = f"{nsfw_tag} {output}"
    except KeyError:
        pass

    return output, data
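# Rough sketch of the return values (illustrative, assuming a reachable URL):
# an HTML page yields something like
#   ("Example Domain", "<!doctype html> ... </head> ...")
# while a direct file link without a <title> falls back to the headers for
# the title part, e.g. "application/pdf, Size: 1.2 MB", and a network error
# yields the empty pair ("", "").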
#                                                #
# BEGIN: Website Handling Functions (by url)     #
#                                                #
def youtube(url):
    '''Visit a video and get its information.'''
    logo = "\x0300,04 ► \x0F"
    u = f"https://www.youtube.com/oembed?url={url}"
    r = requests.get(u, timeout=10)
    if r:
        j = r.json()
        return (f"{logo}: {j['title']}"
                f" | \x02Channel:\x0F {j['author_name']}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"
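# The oEmbed endpoint above returns JSON; only two of its fields are used
# here, e.g. (illustrative, trimmed):
#   {"title": "Some video", "author_name": "Some channel", ...}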
def lainchan(url):
    logo = "\x0309lainchan\x0F"
    if "/res/" in url:
        board = url.split("lainchan.org/")[1].split("/", 1)[0]
        board = urllib.parse.unquote(board)
        u = url.replace(".html", ".json")
        post_no = False
        if ".html#" in url:
            post_no = url.split("#")[1][1:]
        r = requests.get(u, timeout=10).json()
        try:
            title = r["posts"][0]["sub"]
        except KeyError:
            title = f'{r["posts"][0]["com"][:80]}...'
        replies = len(r["posts"]) - 1
        files = 0
        for i in r["posts"]:
            if "filename" in i:
                files += 1
            if "extra_files" in i:
                files += len(i["extra_files"])
        if post_no:
            for i in r["posts"]:
                if int(post_no) != i["no"]:
                    continue
                post_text = bs4.BeautifulSoup(i["com"], parser).get_text()[:50]
                return (f"{logo} \x0306/{board}/\x0F {title} "
                        f"\x02->\x0F \x0302{post_text}...\x0F | "
                        f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")

        return (f"{logo} \x0306/{board}/\x0F {title} | "
                f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"
def imgur(url):
    try:
        up = urllib.parse.urlparse(url)
        host = up.hostname
        path = up.path
        if host[:2] == "i.":
            host = host[2:]
            path = path.rsplit(".", 1)[0]
            u = f"https://{host}{path}"
        else:
            u = url

        r = requests.get(u, timeout=10)
        s = "widgetFactory.mergeConfig('gallery', "
        b = r.text.index(s) + len(s)
        e = r.text.index(");", b)
        t = r.text[b:e]

        s = "image :"
        b = t.index(s) + len(s)
        e = t.index("},", b)
        t = t[b:e] + "}"

        j = json.loads(t)
        title = j["title"]
        mimetype = j["mimetype"]
        size = j["size"]
        width = j["width"]
        height = j["height"]
        nsfw = j["nsfw"]

        output = ""
        if nsfw:
            output += f"{nsfw_tag} "
        output += f"{title} - Imgur"
        output += f" | {mimetype}, Size: {convert_size(size)}"
        output += f", {width}x{height}"
        return output
    except Exception:
        return default_parser(url)[0]
def nitter(url):
    logo = "\x02Nitter\x0f"
    output, data = default_parser(url)
    try:
        soup = bs4.BeautifulSoup(data, parser)
        user = soup.find(attrs={"property": "og:title"})['content']
        post = soup.find(attrs={"property": "og:description"})['content']
        if post:
            return f"{logo}: \x0305{user}\x0f {post}"
        return output
    except Exception:
        return output
def twitter(url):
    logo = "\x0311twitter\x0F"
    u = f"https://publish.twitter.com/oembed?url={url}"
    r = requests.get(u, timeout=10,
                     headers={"user-agent": user_agent,
                              "Accept-Language": accept_lang})
    if r:
        j = r.json()
        html = j["html"]
        soup = bs4.BeautifulSoup(html, parser)
        tweet = soup.get_text(separator=" ")
        return f"{logo}: {tweet}"
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"


#                                                #
# END: Website Handling Functions (by url)       #
#                                                #
hosts_d = {
    "youtube.com": youtube,
    "youtu.be": youtube,
    "m.youtube.com": youtube,
    "lainchan.org": lainchan,
    "i.imgur.com": imgur,
    "imgur.com": imgur,
    "nitter.net": nitter,
    "twitter.com": twitter
}
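# Dispatch table used by _get_title_from_host() below: the URL's hostname
# (with a leading "www." stripped) selects the handler. A new site could be
# wired in like this (hypothetical example, not part of the module):
#
#   def example_site(url):
#       return f"example: {default_parser(url)[0]}"
#
#   hosts_d["example.org"] = example_site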
def _get_title_from_host(u):
    host = urllib.parse.urlparse(u).hostname
    if host[:4] == "www.":
        host = host[4:]
    if host not in hosts_d:
        return default_parser(u)  # It's a tuple
    else:
        return hosts_d[host](u), False
#                                                #
# BEGIN: Website Handling Functions (by title)   #
#                                                #
def pleroma(data):
    logo = "\x0308Pleroma\x0F"
    soup = bs4.BeautifulSoup(data, parser)
    t = soup.find(attrs={"property": "og:description"})['content']
    t = t.split(": ", 1)
    poster = t[0]
    post = t[1]
    return f"{logo}: \x0305{poster}\x0F {post}"


#                                                #
# END: Website Handling Functions (by title)     #
#                                                #
titles_d = {
    "Pleroma": pleroma
}
def _get_title_from_title(title, data):
    '''
    Used to get data from the <head> when the <title> isn't very helpful.
    '''
    if title in titles_d:
        try:
            return titles_d[title](data)
        except Exception:
            return title
    else:
        return title
def get_title(u):
    title, data = _get_title_from_host(u)
    if data:
        title = _get_title_from_title(title, data)
    return title
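# Usage sketch (illustrative): main() below does roughly the following for
# every URL found in a channel message:
#
#   title = get_title("https://example.org/article")
#   if title:
#       irc.privmsg(channel, title)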
def main(i, irc):
    # - Raw undecoded message clean up.
    # Remove \r\n and whitespace.
    msg = i.msg_raw.strip()
    # Convert the bytes to a string,
    # split the irc commands from the text message,
    # remove the ' character from the end of the string.
    msg = str(msg).split(' :', 1)[1][:-1]
    # Remove all IRC formatting codes.
    msg = remove_formatting(msg)

    urls = get_url(msg)
    prev_u = set()  # Already visited URLs, used to avoid spamming.
    for u in urls:
        if not (u.startswith('http://') or u.startswith('https://')):
            u = f'http://{u}'
        if u in prev_u:
            return
        title = get_title(u)
        if not title:
            continue

        irc.privmsg(i.channel, title)
        prev_u.add(u)