arch-wiki-docs/ArchWiki/ArchWiki.py

#! /usr/bin/env python3

""" Module extending generic MediaWiki interface with stuff specific to ArchWiki
    and some convenient generic methods.
"""

import os.path
import re
import hashlib

from simplemediawiki import MediaWiki

# Only the ArchWiki class is exported by ``from ... import *``.
__all__ = ["ArchWiki"]

# API endpoint handed to the MediaWiki base class by ArchWiki.__init__().
url = "https://wiki.archlinux.org/api.php"
# Language that detect_language() reports for titles carrying no recognized
# language marker.
local_language = "English"
# Maps the localized language name (the form matched against page titles in
# detect_language()) to its metadata:
#   "subtag"  -- short language code; get_local_filename() uses it as a
#                directory component of the output path
#   "english" -- English name of the language
# ArchWiki.__init__() matches its ``langs`` argument against both values.
language_names = {
    "العربية": {"subtag": "ar", "english": "Arabic"},
    "Bosanski": {"subtag": "bs", "english": "Bosnian"},
    "Български": {"subtag": "bg", "english": "Bulgarian"},
    "Català": {"subtag": "ca", "english": "Catalan"},
    "Čeština": {"subtag": "cs", "english": "Czech"},
    "Dansk": {"subtag": "da", "english": "Danish"},
    "Deutsch": {"subtag": "de", "english": "German"},
    "Ελληνικά": {"subtag": "el", "english": "Greek"},
    "English": {"subtag": "en", "english": "English"},
    "Esperanto": {"subtag": "eo", "english": "Esperanto"},
    "Español": {"subtag": "es", "english": "Spanish"},
    "فارسی": {"subtag": "fa", "english": "Persian"},
    "Suomi": {"subtag": "fi", "english": "Finnish"},
    "Français": {"subtag": "fr", "english": "French"},
    "עברית": {"subtag": "he", "english": "Hebrew"},
    "Hrvatski": {"subtag": "hr", "english": "Croatian"},
    "Magyar": {"subtag": "hu", "english": "Hungarian"},
    "Bahasa Indonesia": {"subtag": "id", "english": "Indonesian"},
    "Italiano": {"subtag": "it", "english": "Italian"},
    "日本語": {"subtag": "ja", "english": "Japanese"},
    "한국어": {"subtag": "ko", "english": "Korean"},
    "Lietuvių": {"subtag": "lt", "english": "Lithuanian"},
    "Norsk Bokmål": {"subtag": "nb", "english": "Norwegian (Bokmål)"},
    "Nederlands": {"subtag": "nl", "english": "Dutch"},
    "Polski": {"subtag": "pl", "english": "Polish"},
    "Português": {"subtag": "pt", "english": "Portuguese"},
    "Română": {"subtag": "ro", "english": "Romanian"},
    "Русский": {"subtag": "ru", "english": "Russian"},
    "Slovenčina": {"subtag": "sk", "english": "Slovak"},
    "Српски": {"subtag": "sr", "english": "Serbian"},
    "Svenska": {"subtag": "sv", "english": "Swedish"},
    "ไทย": {"subtag": "th", "english": "Thai"},
    "Türkçe": {"subtag": "tr", "english": "Turkish"},
    "Українська": {"subtag": "uk", "english": "Ukrainian"},
    "Tiếng Việt": {"subtag": "vi", "english": "Vietnamese"},
    "粵語": {"subtag": "yue", "english": "Cantonese"},
    "简体中文": {"subtag": "zh-hans", "english": "Chinese (Simplified)"},
    "正體中文": {"subtag": "zh-hant", "english": "Chinese (Traditional)"}
}
# NOTE(review): the two lists below are not referenced anywhere in this part of
# the file. Judging by the names they presumably hold the subtags of languages
# whose interlanguage links point to external wikis vs. to pages on this wiki --
# confirm against the callers before relying on that.
interlanguage_external = ["de", "fa", "ja", "sv"]
interlanguage_internal = ["ar", "bs", "bg", "cs", "da", "el", "en", "es", "fi", "fr",
                          "he", "hr", "hu", "id", "it", "ko", "lt", "nl", "pl", "pt",
                          "ru", "sk", "sr", "th", "tr", "uk", "zh-hans", "zh-hant"]

def is_ascii(text):
    """ Tell whether a string consists of ASCII characters only.

    :param text: string to check
    :returns: ``True`` if ``text`` can be encoded as ASCII, ``False`` if the
        encoding attempt raises :exc:`UnicodeEncodeError`
    """
    try:
        text.encode("ascii")
    except UnicodeEncodeError:
        # Only an encoding failure means "not ASCII". Anything else (e.g.
        # KeyboardInterrupt, or a programming error in the caller) must
        # propagate instead of being silently reported as False.
        return False
    return True

class ArchWiki(MediaWiki):

    def __init__(self, safe_filenames=False, langs=None, **kwargs):
        """ Set up the API client and the per-instance language selection.

            Parameters:
            @safe_filenames: force self.get_local_filename() to return ASCII string
            @langs: optional collection of language identifiers; an entry of the
                    module-level language table is kept when any of its metadata
                    values (subtag or English name) is contained in it.
                    None (default) keeps the whole table.
            + all keyword arguments of simplemediawiki.MediaWiki
        """
        super().__init__(url, **kwargs)

        self._safe_filenames = safe_filenames
        # caches filled on first use by namespaces() and redirects()
        self._namespaces = None
        self._redirects = None

        if langs is None:
            # no restriction requested -- share the module-level table as is
            self._language_names = language_names
        else:
            # keep only the languages whose subtag or English name was requested
            self._language_names = {
                name: info
                for name, info in language_names.items()
                if not set(info.values()).isdisjoint(langs)
            }

    def query_continue(self, query):
        """ Generator driving MediaWiki's query-continue protocol.
            ref: https://www.mediawiki.org/wiki/API:Query#Continuing_queries

            Repeats the API call with the continuation parameters of the previous
            response until the response carries no "continue" entry. Yields the
            "query" part of every response that has one, prints any "warnings"
            and raises Exception with the payload of an "error" response.
            The passed ``query`` dict is never modified.
        """
        continuation = {"continue": ""}

        while True:
            # start from a fresh copy each round so that continuation parameters
            # of the previous round (there may be several) do not leak into this one
            params = query.copy()
            params.update(continuation)

            response = self.call(params)
            if "error" in response:
                raise Exception(response["error"])
            if "warnings" in response:
                print(response["warnings"])
            if "query" in response:
                yield response["query"]

            # no continuation data means the result set is exhausted
            if "continue" not in response:
                return
            continuation = response["continue"]
    def namespaces(self):
        """ Force the Main namespace to have name instead of empty string.
        """
        if self._namespaces is None:
            self._namespaces = super().namespaces()
            self._namespaces[0] = "Main"
        return self._namespaces

    def print_namespaces(self):
        nsmap = self.namespaces()
        print("Available namespaces:")
        for ns in sorted(nsmap.keys()):
            print("  %2d -- %s" % (ns, nsmap[ns]))

    def detect_namespace(self, title, safe=True):
        """ Detect namespace of a given title.
        """
        pure_title = title
        detected_namespace = self.namespaces()[0]
        match = re.match("^((.+):)?(.+)$", title)
        ns = match.group(2)
        if ns:
            ns = ns.replace("_", " ")
            if ns in self.namespaces().values():
                detected_namespace = ns
                pure_title = match.group(3)
        return pure_title, detected_namespace

    def detect_language(self, title, *, strip_all_subpage_parts=True):
        """
        Detect language of a given title. The matching is case-sensitive and spaces are
        treated the same way as underscores.

        :param title: page title to work with
        :returns: a ``(pure, lang)`` tuple, where ``pure`` is the pure page title without
            the language suffix and ``lang`` is the detected language in long, localized form
        """
        title_regex = r"(?P<pure>.*?)[ _]\((?P<lang>[^\(\)]+)\)"
        pure_suffix = ""
        # matches "Page name/Subpage (Language)"
        match = re.fullmatch(title_regex, title)
        # matches "Page name (Language)/Subpage"
        if not match and "/" in title:
            base, pure_suffix = title.split("/", maxsplit=1)
            pure_suffix = "/" + pure_suffix
            match = re.fullmatch(title_regex, base)
        # matches "Category:Language"
        if not match:
            match = re.fullmatch(r"(?P<pure>[Cc]ategory[ _]?\:[ _]?(?P<lang>[^\(\)]+))", title)
        if match:
            pure = match.group("pure")
            lang = match.group("lang")
            if lang in language_names:
                # strip "(Language)" from all subpage components to handle cases like
                # "Page name (Language)/Subpage (Language)"
                if strip_all_subpage_parts is True and "/" in pure:
                    parts = pure.split("/")
                    new_parts = []
                    for p in parts:
                        match = re.fullmatch(title_regex, p)
                        if match:
                            part_lang = match.group("lang")
                            if part_lang == lang:
                                new_parts.append(match.group("pure"))
                            else:
                                new_parts.append(p)
                        else:
                            new_parts.append(p)
                    pure = "/".join(new_parts)
                return pure + pure_suffix, lang
        return title, local_language

    def get_local_filename(self, title, basepath):
        """ Return file name where the given page should be stored, relative to 'basepath'.
        """
        title, lang = self.detect_language(title)

        if lang not in self._language_names:
            return None

        title, namespace = self.detect_namespace(title)

        # be safe and use '_' instead of ' ' in filenames (MediaWiki style)
        title = title.replace(" ", "_")
        namespace = namespace.replace(" ", "_")

        # force ASCII filename
        if self._safe_filenames and not is_ascii(title):
            h = hashlib.md5()
            h.update(title.encode("utf-8"))
            title = h.hexdigest()

        # select pattern per namespace
        if namespace == "Main":
            pattern = "{base}/{langsubtag}/{title}.{ext}"
        elif namespace in ["Talk", "ArchWiki", "ArchWiki_talk", "Template", "Template_talk", "Help", "Help_talk", "Category", "Category_talk"]:
            pattern = "{base}/{langsubtag}/{namespace}:{title}.{ext}"
        elif namespace == "File":
            pattern = "{base}/{namespace}:{title}"
        else:
            pattern = "{base}/{namespace}:{title}.{ext}"

        path = pattern.format(
            base=basepath,
            langsubtag=self._language_names[lang]["subtag"],
            namespace=namespace,
            title=title,
            ext="html"
        )
        return os.path.normpath(path)

    def _fetch_redirects(self):
        """ Fetch dictionary of redirect pages and their targets
        """
        query_allredirects = {
            "action": "query",
            "generator": "allpages",
            "gaplimit": "max",
            "gapfilterredir": "nonredirects",
            "prop": "redirects",
            "rdprop": "title|fragment",
            "rdlimit": "max",
        }
        namespaces = ["0", "4", "12", "14"]

        self._redirects = {}

        for ns in namespaces:
            query_allredirects["gapnamespace"] = ns

            for pages_snippet in self.query_continue(query_allredirects):
                pages_snippet = sorted(pages_snippet["pages"].values(), key=lambda d: d["title"])
                for page in pages_snippet:
                    # construct the mapping, the query result is somewhat reversed...
                    target_title = page["title"]
                    for redirect in page.get("redirects", []):
                        source_title = redirect["title"]
                        target_fragment = redirect.get("fragment")
                        if target_fragment:
                            self._redirects[source_title] = "{}#{}".format(target_title, target_fragment)
                        else:
                            self._redirects[source_title] = target_title

    def redirects(self):
        if self._redirects is None:
            self._fetch_redirects()
        return self._redirects

    def resolve_redirect(self, title):
        """ Returns redirect target title, or given title if it is not redirect.
            The returned title will always contain spaces instead of underscores.
        """
        # the given title must match the format of titles used in self._redirects
        title = title.replace("_", " ")

        return self.redirects().get(title, title)