Vendor drastikbot/drastikbot+modules v2.1
src/irc/modules/url.py (new file, 355 lines added)
@@ -0,0 +1,355 @@
# coding=utf-8

# URL Module for Drastikbot
#
# Depends:
#   - requests :: $ pip3 install requests
#   - beautifulsoup :: $ pip3 install beautifulsoup4

'''
Copyright (C) 2017-2020 drastik.org

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, version 3 only.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''

import re
import math
import json
import urllib.parse
import requests
import bs4


class Module:
    def __init__(self):
        self.auto = True


# ----- Constants ----- #
parser = 'html.parser'
user_agent = "w3m/0.52"
accept_lang = "en-US"
nsfw_tag = "\x0304[NSFW]\x0F"
data_limit = 69120
# --------------------- #


def remove_formatting(msg):
    '''Remove IRC string formatting codes.'''
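    # Note: main() passes in str(bytes), so the formatting codes appear as
    # literal "\x03"-style escape text; the patterns below match that form.
    # Illustrative (hypothetical) example:
    #   remove_formatting("\\x0304red\\x02bold\\x0f plain") -> "redbold plain"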
    # - Regex -
    # Capture "\x03N,M" color pairs. Must be applied first:
    #   (\\x03[0-9]{0,2},{1}[0-9]{1,2})
    # Capture "\x03N". Catches the remaining color codes:
    #   (\\x03[0-9]{1,2})
    # The remaining formatting codes are removed with plain replace() calls.
    line = re.sub(r'(\\x03[0-9]{0,2},{1}[0-9]{1,2})', '', msg)
    line = re.sub(r'(\\x03[0-9]{1,2})', '', line)
    line = line.replace("\\x03", "")
    line = line.replace("\\x02", "")
    line = line.replace("\\x1d", "")
    line = line.replace("\\x1D", "")
    line = line.replace("\\x1f", "")
    line = line.replace("\\x1F", "")
    line = line.replace("\\x16", "")
    line = line.replace("\\x0f", "")
    line = line.replace("\\x0F", "")
    return line


def convert_size(size_bytes):
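    '''Convert a size in bytes to a human-readable string.

    Illustrative example: convert_size(2048) -> "2.0 KB"
    '''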
    # https://stackoverflow.com/questions/5194057/better-way-to-convert-file-sizes-in-python
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])


def get_url(msg):
    '''Search a string for urls and return a list of them.'''
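    # Illustrative example:
    #   get_url("see http://a.example and https://b.example/x")
    #     -> ["http://a.example", "https://b.example/x"]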
    str_l = msg.split()
    req_l = ["http://", "https://"]  # add "." to also parse urls without a scheme
    urls = [u for u in str_l if any(r in u for r in req_l)]
    # Avoid parsing IPv4s that are not complete (IPs like: 1.1):
    # Useful when a scheme is not required to parse a URL.
    # urls = [u for u in urls if u.count('.') == 3 or u.upper().isupper()]
    return urls


def default_parser(u):
    '''
    Visit a url and check whether HTML content is served. If it is, try
    to get the <title></title> tag. If it is not, try to read the HTTP
    headers to find 'content-type' and 'content-length'.
    '''
    data = ""
    output = ""
    try:
        r = requests.get(u, stream=True,
                         headers={"user-agent": user_agent,
                                  "Accept-Language": accept_lang},
                         timeout=5)
    except Exception:
        # Return a tuple so callers can always unpack (title, data).
        return False, False
    for i in r.iter_content(chunk_size=512, decode_unicode=False):
        data += i.decode('utf-8', errors='ignore')
        if len(data) > data_limit or '</head>' in data.lower():
            break
    r.close()
    soup = bs4.BeautifulSoup(data, parser)
    try:
        output += soup.head.title.text.strip()
    except Exception:
        try:
            output += r.headers['content-type']
        except KeyError:
            pass
        try:
            h_length = convert_size(float(r.headers['content-length']))
            if output:
                output += f", Size: {h_length}"
            else:
                output += h_length
        except KeyError:
            pass
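    # "RTA-5042-1996-1400-1577-RTA" is the Restricted To Adults (RTA) label;
    # pages carrying it in their markup or "Rating" header get the NSFW tag.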
    try:
        if "RTA-5042-1996-1400-1577-RTA" in data:
            output = f"{nsfw_tag} {output}"
        elif r.headers["Rating"] == "RTA-5042-1996-1400-1577-RTA":
            output = f"{nsfw_tag} {output}"
    except KeyError:
        pass
    return output, data


#                                            #
# BEGIN: Website Handling Functions (by url) #
#                                            #
def youtube(url):
    '''Visit a video and get its information.'''
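    # The oEmbed endpoint returns JSON metadata (e.g. "title", "author_name")
    # for a video URL without requiring an API key.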
    logo = "\x0300,04 ► \x0F"
    u = f"https://www.youtube.com/oembed?url={url}"
    r = requests.get(u, timeout=10)
    if r:
        j = r.json()
        return (f"{logo}: {j['title']}"
                f" | \x02Channel:\x0F {j['author_name']}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"


def lainchan(url):
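    '''Summarise a lainchan thread. Thread pages expose a vichan-style JSON
    mirror (".html" -> ".json") that is used to read posts and files.'''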
    logo = "\x0309lainchan\x0F"
    if "/res/" in url:
        board = url.split("lainchan.org/")[1].split("/", 1)[0]
        board = urllib.parse.unquote(board)
        u = url.replace(".html", ".json")
        post_no = False
        if ".html#" in url:
            post_no = url.split("#")[1][1:]
        r = requests.get(u, timeout=10).json()
        try:
            title = r["posts"][0]["sub"]
        except KeyError:
            title = f'{r["posts"][0]["com"][:80]}...'
        replies = len(r["posts"]) - 1
        files = 0
        for i in r["posts"]:
            if "filename" in i:
                files += 1
            if "extra_files" in i:
                files += len(i["extra_files"])
        if post_no:
            for i in r["posts"]:
                if int(post_no) != i["no"]:
                    continue
                post_text = bs4.BeautifulSoup(i["com"], parser).get_text()[:50]
                return (f"{logo} \x0306/{board}/\x0F {title} "
                        f"\x02->\x0F \x0302{post_text}...\x0F | "
                        f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")

        return (f"{logo} \x0306/{board}/\x0F {title} | "
                f"\x02Replies:\x0F {replies} - \x02Files:\x0F {files}")
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"


def imgur(url):
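    '''Report title, type, size and dimensions of an imgur image by reading
    the page's embedded "widgetFactory.mergeConfig" gallery blob.'''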
    try:
        up = urllib.parse.urlparse(url)
        host = up.hostname
        path = up.path
        if host[:2] == "i.":
            host = host[2:]
            path = path.rsplit(".", 1)[0]
            u = f"https://{host}{path}"
        else:
            u = url

        r = requests.get(u, timeout=10)
        s = "widgetFactory.mergeConfig('gallery', "
        b = r.text.index(s) + len(s)
        e = r.text.index(");", b)
        t = r.text[b:e]

        s = "image :"
        b = t.index(s) + len(s)
        e = t.index("},", b)
        t = t[b:e] + "}"

        j = json.loads(t)
        title = j["title"]
        mimetype = j["mimetype"]
        size = j["size"]
        width = j["width"]
        height = j["height"]
        nsfw = j["nsfw"]

        output = ""
        if nsfw:
            output += f"{nsfw_tag} "
        output += f"{title} - Imgur"
        output += f" | {mimetype}, Size: {convert_size(size)}"
        output += f", {width}x{height}"
        return output
    except Exception:
        return default_parser(url)[0]


def nitter(url):
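    '''Show the author and text of a nitter post using its
    og:title / og:description meta tags.'''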
    logo = "\x02Nitter\x0f"
    output, data = default_parser(url)
    try:
        soup = bs4.BeautifulSoup(data, parser)
        user = soup.find(attrs={"property": "og:title"})['content']
        post = soup.find(attrs={"property": "og:description"})['content']
        if post:
            return f"{logo}: \x0305{user}\x0f {post}"
        return output
    except Exception:
        return output


def twitter(url):
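    '''Fetch a tweet's text through the publish.twitter.com oEmbed endpoint
    and strip the returned HTML down to plain text.'''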
    logo = "\x0311twitter\x0F"
    u = f"https://publish.twitter.com/oembed?url={url}"
    r = requests.get(u, timeout=10,
                     headers={"user-agent": user_agent,
                              "Accept-Language": accept_lang})
    if r:
        j = r.json()
        html = j["html"]
        soup = bs4.BeautifulSoup(html, parser)
        tweet = soup.get_text(separator=" ")
        return f"{logo}: {tweet}"
    else:
        out = default_parser(url)[0]
        return f"{logo}: {out}"
#                                          #
# END: Website Handling Functions (by url) #
#                                          #


hosts_d = {
    "youtube.com": youtube,
    "youtu.be": youtube,
    "lainchan.org": lainchan,
    "i.imgur.com": imgur,
    "imgur.com": imgur,
    "nitter.net": nitter,
    "twitter.com": twitter
}
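# Hostname -> handler dispatch table. _get_title_from_host() strips a leading
# "www." before the lookup, e.g. "www.youtube.com" is handled by youtube().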


def _get_title_from_host(u):
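    '''Return a (title, data) tuple. data is the fetched page text for
    generic URLs and False when a site-specific handler built the title.'''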
    host = urllib.parse.urlparse(u).hostname
    if host[:4] == "www.":
        host = host[4:]
    if host not in hosts_d:
        return default_parser(u)  # It's a tuple
    else:
        return hosts_d[host](u), False


#                                              #
# BEGIN: Website Handling Functions (by title) #
#                                              #
def pleroma(data):
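    '''Extract poster and post text from a Pleroma status page's
    og:description meta tag ("poster: text").'''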
    logo = "\x0308Pleroma\x0F"
    soup = bs4.BeautifulSoup(data, parser)
    t = soup.find(attrs={"property": "og:description"})['content']
    t = t.split(": ", 1)
    poster = t[0]
    post = t[1]
    return f"{logo}: \x0305{poster}\x0F {post}"
#                                            #
# END: Website Handling Functions (by title) #
#                                            #


titles_d = {
    "Pleroma": pleroma
}


def _get_title_from_title(title, data):
    '''
    Used to get data from the <head> when the <title> isn't very helpful.
    '''
    if title in titles_d:
        try:
            return titles_d[title](data)
        except Exception:
            return title
    else:
        return title


def get_title(u):
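    '''Return an IRC-ready title/summary line for the given URL, or a falsy
    value if nothing useful could be retrieved.'''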
    title, data = _get_title_from_host(u)
    if data:
        title = _get_title_from_title(title, data)
    return title


def main(i, irc):
    # - Raw undecoded message clean up.
    # Remove \r\n and surrounding whitespace.
    msg = i.msg_raw.strip()
    # Convert the bytes to a string,
    # split the irc commands from the text message,
    # and remove the trailing ' character left over from the bytes repr.
    msg = str(msg).split(' :', 1)[1][:-1]
    # Remove all IRC formatting codes
    msg = remove_formatting(msg)
    # msg = info[2]

    urls = get_url(msg)
    prev_u = set()  # Already visited URLs, used to avoid spamming.
    for u in urls:
        if not (u.startswith('http://') or u.startswith('https://')):
            u = f'http://{u}'
        if u in prev_u:
            continue  # Skip duplicates instead of aborting the whole message.
        title = get_title(u)
        if not title:
            continue

        irc.privmsg(i.channel, title)
        prev_u.add(u)