#!/usr/bin/env python3
# coding=utf-8
# Wikipedia Module for Drastikbot
#
# NOTE: This module uses the MediaWiki API, so it should
#       also work with other MediaWiki based websites.
#
# Depends:
# - requests :: $ pip3 install requests
# - beautifulsoup4 :: $ pip3 install beautifulsoup4
'''
Copyright (C) 2017 drastik.org
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
import requests
import bs4
import urllib.parse
from dbot_tools import Config, p_truncate
class Module:
def __init__(self):
self.commands = ['wikipedia', 'wiki', 'w']
        usage = lambda x, y: (f"{x}{y} <article> [--full] [--search]"
                              " [--sections] [-l <lang>] [--result <num>]")
        info = ("--full: Get the full text in a query."
                " / --search: Search and get the results in a query."
                " / --sections: Get all the sections of an article"
                " in a query."
                " / -l: Post an article from a specific language."
                " / --result: Select a specific result."
                " <num> is the index of the result returned by --search"
                " / Use #section after the article's name to get a"
                " specific section. Example: .w irc#Technical information")
self.manual = {
"desc": ("Search wikipedia and post a snippet from the resulting"
" article."),
"bot_commands": {
"wikipedia": {"usage": lambda x: usage(x, "wikipedia"),
"info": info,
"alias": ["w", "wiki"]},
"wiki": {"usage": lambda x: usage(x, "wiki"),
"info": info,
"alias": ["w", "wikipedia"]},
"w": {"usage": lambda x: usage(x, "w"),
"info": info,
"alias": ["wikipedia", "wiki"]}
}
}
# ----- Global Constants ----- #
r_timeout = 10  # Timeout in seconds for HTTP requests.
bs4_parser = 'html.parser'  # Parser used by BeautifulSoup.
# ---------------------------- #
def language(args, config, channel):
'''Set the language used to search for wikipedia articles'''
    if '-l' in args:
        # The -l flag overrides the configuration; use the value
        # given right after it, if there is one.
        index = args.index('-l')
        try:
            return args[index + 1]
        except IndexError:
            pass  # No value after -l; fall back to the configuration.
    # Try loading from the configuration
    try:
        # Check the configuration file for per channel language settings.
        return config['irc']['modules']['wikipedia']['channels'][channel]
    except KeyError:
        try:
            # Check the configuration file for global language settings.
            return config['irc']['modules']['wikipedia']['lang']
        except KeyError:
            # Fall back to English if all of the above fail.
            return 'en'
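# Illustrative calls (hypothetical args and config):
#   language(['irc', '-l', 'de'], {}, '#chan')  -> 'de'
#   language(['irc'], {}, '#chan')              -> 'en' (no config entry)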
def mw_opensearch(query, url, max_results=1):
    '''
    Uses the MediaWiki API:Opensearch
    https://en.wikipedia.org/w/api.php?action=help&modules=opensearch
    Search MediaWiki for articles relevant to the search 'query'.
    Returns a list of the form [query, [titles], [descriptions], [urls]]
    with results relevant to the query.
    'query' is the string to search for
    'url' is the url of the MediaWiki website
    'max_results' is the maximum amount of results to get
    '''
    # Percent-encode the query so characters like '&' cannot break the URL.
    u = (f'{url}/w/api.php?action=opensearch&format=json'
         f'&limit={max_results}&search={str2url(query)}')
    r = requests.get(u, timeout=r_timeout)
    return r.json()
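# Illustrative mw_opensearch response (hypothetical values):
#   mw_opensearch('irc', 'https://en.wikipedia.org')
#   -> ['irc', ['IRC'], ['Internet Relay Chat (IRC) is ...'],
#       ['https://en.wikipedia.org/wiki/IRC']]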
def mw_list_sections(page, url):
'''
Uses the MediaWiki API:Parsing_wikitext#parse
https://www.mediawiki.org/wiki/Special:MyLanguage/API:Parsing_wikitext#parse
Get a list of all the available sections for a given article.
Returns a tuple with the title of the article and a
list [[sections],[indexes]]
'page' should be the name of the MediaWiki article as returned
by mw_opensearch()
'url' is the url of the MediaWiki website
'''
    u = (f'{url}/w/api.php?action=parse&format=json'
         f'&prop=sections&page={str2url(page)}')
    r = requests.get(u, timeout=r_timeout)
parse = r.json()
title = parse['parse']['title']
sections_ = parse['parse']['sections']
section_list = [[], []]
for i in sections_:
section_list[0].append(i['line'])
section_list[1].append(i['index'])
return (title, section_list)
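# Illustrative mw_list_sections return value (hypothetical article):
#   ('IRC', [['History', 'Technical information'], ['1', '2']])
# The first inner list holds the section titles, the second the
# matching section indexes used by the parse API.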
def text_cleanup(soup):
    # Remove elements that clutter the text: <sup> (reference
    # markers such as [1]) and <small> (fine print).
    for sup in soup('sup'):
        sup.decompose()
    for small in soup('small'):
        small.decompose()
    return soup
def mw_parse_intro(url, page, limit):
'''
Uses the MediaWiki API:Parsing_wikitext#parse
https://www.mediawiki.org/wiki/Special:MyLanguage/API:Parsing_wikitext#parse
This function calls the MediaWiki API, which returns a JSON
document containing the html of the introduction section
of an article, which is parsed by beautifulsoup4, limited
to the specified amount of characters and returned.
'url' is the url of the MediaWiki website
'page' should be the name of the MediaWiki article as
returned by mw_opensearch()
'limit' if True truncate the text
'''
    u = (f'{url}/w/api.php?action=parse&format=json'
         f'&prop=text&section=0&page={str2url(page)}')
    r = requests.get(u, timeout=r_timeout)
    html = r.json()['parse']['text']['*']
soup = bs4.BeautifulSoup(html, bs4_parser)
soup = text_cleanup(soup)
    # Skip any empty <p> elements MediaWiki may emit before the intro.
    text = next((p.text.strip() for p in soup.find_all('p')
                 if p.text.strip()), '')
    if text == 'Redirect to:':
        n_title = soup.find('a').text
        n_text = mw_parse_intro(url, n_title, limit)
        text = f'\x0302[Redirect to: {n_title}]\x0F {n_text}'
if limit:
text = p_truncate(text, msg_len, 85, True)
return text
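# Illustrative results (made-up snippets): a normal article yields its
# introduction text; a redirect page yields something like
# '\x0302[Redirect to: Internet Relay Chat]\x0F Internet Relay Chat ...'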
def mw_parse_section(url, section_list, page, sect, limit):
'''
Uses the MediaWiki API:Parsing_wikitext#parse
https://www.mediawiki.org/wiki/Special:MyLanguage/API:Parsing_wikitext#parse
This function finds the position of the section ('sect')
requested in 'section_list' and calls the MediaWiki API,
which returns a JSON document containing the html of the
requested section which is parsed by beautifulsoup4,
limited to the specified amount of characters and returned.
'url' is the url of the MediaWiki website
'section_list' is the second item returned by mw_list_sections()
'page' should be the name of the MediaWiki article as
returned by mw_opensearch()
'sect' is the section requested to be viewed
'limit' if True truncate the text
'''
    id_index = section_list[0].index(sect)
    u = (f'{url}/w/api.php'
         '?action=parse&format=json&prop=text'
         f'&section={section_list[1][id_index]}&page={str2url(page)}')
    r = requests.get(u, timeout=r_timeout)
    html = r.json()['parse']['text']['*']
    soup = bs4.BeautifulSoup(html, bs4_parser)
    soup = text_cleanup(soup)
    # Heading anchor ids replace spaces with underscores, e.g.
    # "Technical information" -> id="Technical_information".
    heading = soup.find('span', id=sect.replace(' ', '_'))
    text = heading.find_next('p').text
if limit:
text = p_truncate(text, msg_len, 85, True)
return text
def str2url(url):
return urllib.parse.quote_plus(url)
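# Example: str2url('foo bar&baz') -> 'foo+bar%26baz'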
def query(args):
    # Remove the command flags (and their values) from the args
    # list and join what remains into the search query string.
    _args = args[:]
    cmds = ['--search', '--sections', '--full']
    cmds_args = ['--result', '-r', '-l']
    for i in cmds_args:
        try:
            idx = _args.index(i)
            del _args[idx]
            del _args[idx]  # Also remove the flag's value.
        except (ValueError, IndexError):
            pass
for i in cmds:
try:
idx = _args.index(i)
del _args[idx]
except ValueError:
pass
return ' '.join(_args)
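# Example: query(['irc', '--full', '-l', 'de']) -> 'irc'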
def main(i, irc):
if not i.msg_nocmd:
        msg = (f'Usage: {i.cmd_prefix}{i.cmd} <article> '
               '[--full] [--search] [--sections] [-l <lang>] '
               '[--result <num>]')
return irc.privmsg(i.channel, msg)
channel = i.channel
args = i.msg_nocmd.split()
config = Config(irc.cd).read()
lang = language(args, config, i.channel)
# Do not put a "/" slash at the end of the url
mediawiki_url = f'https://{lang}.wikipedia.org'
logo = '\x0301,00Wikipedia\x0F'
limit = True
search_q = query(args)
global msg_len
msg_len = irc.var.msg_len - 9 - 22
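    # The values 9 and 22 subtracted above are assumed to reserve room
    # for the logo string, its IRC formatting codes and the separators.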
if '--search' in args:
opensearch = mw_opensearch(search_q, mediawiki_url, 10)
rs_string = ''
        for idx, n in enumerate(opensearch[1], start=1):
            rs_string += f'[{idx}:{n}] '
msg = (f'{logo}: \x0302[search results for: '
f'{search_q}]\x0F: {rs_string}')
return irc.privmsg(i.nickname, msg)
if '--full' in args:
limit = False
channel = i.nickname
if '--result' in args or '-r' in args:
try:
r_index = args.index('--result')
except ValueError:
r_index = args.index('-r')
        try:
            os_limit = int(args[r_index + 1])
        except (IndexError, ValueError):
            msg = f'{logo}: --result requires a number'
            return irc.privmsg(channel, msg)
opensearch = mw_opensearch(search_q, mediawiki_url, os_limit)
try:
title = opensearch[1][os_limit - 1]
except IndexError:
msg = f'{logo}: No article was found for \x02{search_q}\x0F'
return irc.privmsg(channel, msg)
else:
opensearch = mw_opensearch(search_q, mediawiki_url)
try:
title = opensearch[1][0]
except IndexError:
msg = f'{logo}: No article was found for \x02{search_q}\x0F'
return irc.privmsg(channel, msg)
wikiurl = f'{mediawiki_url}/wiki/{title.replace(" ", "_")}'
if '--sections' in args:
sections_out = mw_list_sections(title, mediawiki_url)
sec_out_str = ' | '.join(sections_out[1][0])
        msg = (f'{logo}: \x0302[sections for {sections_out[0]}]\x0F: '
               f'{sec_out_str} [ {wikiurl} ]')
irc.privmsg(i.nickname, msg)
elif '#' in search_q:
ts_list = search_q.split('#')
sections_out = mw_list_sections(title, mediawiki_url)
snippet = mw_parse_section(mediawiki_url, sections_out[1],
title, ts_list[1], limit)
msg = f'{logo}: \x02{title}#{ts_list[1]}\x0F | {snippet} | {wikiurl}'
irc.privmsg(channel, msg)
else:
snippet = mw_parse_intro(mediawiki_url, title, limit)
msg = f'{logo}: \x02{title}\x0F | {snippet} | {wikiurl}'
irc.privmsg(channel, msg)
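# Illustrative channel output (made-up snippet):
# Wikipedia: IRC | Internet Relay Chat (IRC) is a text-based chat
# protocol ... | https://en.wikipedia.org/wiki/IRC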