#! /usr/bin/env python3 """ Module extending generic MediaWiki interface with stuff specific to ArchWiki and some convenient generic methods. """ import os.path import re import hashlib from simplemediawiki import MediaWiki __all__ = ["ArchWiki"] url = "https://wiki.archlinux.org/api.php" local_language = "English" language_names = { "العربية": {"subtag": "ar", "english": "Arabic"}, "Bosanski": {"subtag": "bs", "english": "Bosnian"}, "Български": {"subtag": "bg", "english": "Bulgarian"}, "Català": {"subtag": "ca", "english": "Catalan"}, "Čeština": {"subtag": "cs", "english": "Czech"}, "Dansk": {"subtag": "da", "english": "Danish"}, "Deutsch": {"subtag": "de", "english": "German"}, "Ελληνικά": {"subtag": "el", "english": "Greek"}, "English": {"subtag": "en", "english": "English"}, "Esperanto": {"subtag": "eo", "english": "Esperanto"}, "Español": {"subtag": "es", "english": "Spanish"}, "فارسی": {"subtag": "fa", "english": "Persian"}, "Suomi": {"subtag": "fi", "english": "Finnish"}, "Français": {"subtag": "fr", "english": "French"}, "עברית": {"subtag": "he", "english": "Hebrew"}, "Hrvatski": {"subtag": "hr", "english": "Croatian"}, "Magyar": {"subtag": "hu", "english": "Hungarian"}, "Bahasa Indonesia": {"subtag": "id", "english": "Indonesian"}, "Italiano": {"subtag": "it", "english": "Italian"}, "日本語": {"subtag": "ja", "english": "Japanese"}, "한국어": {"subtag": "ko", "english": "Korean"}, "Lietuvių": {"subtag": "lt", "english": "Lithuanian"}, "Norsk Bokmål": {"subtag": "nb", "english": "Norwegian (Bokmål)"}, "Nederlands": {"subtag": "nl", "english": "Dutch"}, "Polski": {"subtag": "pl", "english": "Polish"}, "Português": {"subtag": "pt", "english": "Portuguese"}, "Română": {"subtag": "ro", "english": "Romanian"}, "Русский": {"subtag": "ru", "english": "Russian"}, "Slovenčina": {"subtag": "sk", "english": "Slovak"}, "Српски": {"subtag": "sr", "english": "Serbian"}, "Svenska": {"subtag": "sv", "english": "Swedish"}, "ไทย": {"subtag": "th", "english": "Thai"}, "Türkçe": {"subtag": "tr", "english": "Turkish"}, "Українська": {"subtag": "uk", "english": "Ukrainian"}, "Tiếng Việt": {"subtag": "vi", "english": "Vietnamese"}, "粵語": {"subtag": "yue", "english": "Cantonese"}, "简体中文": {"subtag": "zh-hans", "english": "Chinese (Simplified)"}, "正體中文": {"subtag": "zh-hant", "english": "Chinese (Traditional)"} } interlanguage_external = ["de", "fa", "ja", "sv"] interlanguage_internal = ["ar", "bs", "bg", "cs", "da", "el", "en", "es", "fi", "fr", "he", "hr", "hu", "id", "it", "ko", "lt", "nl", "pl", "pt", "ru", "sk", "sr", "th", "tr", "uk", "zh-hans", "zh-hant"] def is_ascii(text): try: text.encode("ascii") return True except: return False class ArchWiki(MediaWiki): def __init__(self, safe_filenames=False, langs=None, **kwargs): """ Parameters: @safe_filenames: force self.get_local_filename() to return ASCII string + all keyword arguments of simplemediawiki.MediaWiki """ super().__init__(url, **kwargs) self._safe_filenames = safe_filenames self._namespaces = None self._redirects = None if langs is not None: self._language_names = {} for lang, metadata in language_names.items(): if not set(metadata.values()).isdisjoint(langs): self._language_names[lang] = metadata else: self._language_names = language_names def query_continue(self, query): """ Generator for MediaWiki's query-continue feature. ref: https://www.mediawiki.org/wiki/API:Query#Continuing_queries """ last_continue = {"continue": ""} while True: # clone the original params to clean up old continue params query_copy = query.copy() # and update with the last continue -- it may involve multiple params, # hence the clean up with params.copy() query_copy.update(last_continue) # call the API and handle the result result = self.call(query_copy) if "error" in result: raise Exception(result["error"]) if "warnings" in result: print(result["warnings"]) if "query" in result: yield result["query"] if "continue" not in result: break last_continue = result["continue"] def namespaces(self): """ Force the Main namespace to have name instead of empty string. """ if self._namespaces is None: self._namespaces = super().namespaces() self._namespaces[0] = "Main" return self._namespaces def print_namespaces(self): nsmap = self.namespaces() print("Available namespaces:") for ns in sorted(nsmap.keys()): print(" %2d -- %s" % (ns, nsmap[ns])) def detect_namespace(self, title, safe=True): """ Detect namespace of a given title. """ pure_title = title detected_namespace = self.namespaces()[0] match = re.match("^((.+):)?(.+)$", title) ns = match.group(2) if ns: ns = ns.replace("_", " ") if ns in self.namespaces().values(): detected_namespace = ns pure_title = match.group(3) return pure_title, detected_namespace def detect_language(self, title, *, strip_all_subpage_parts=True): """ Detect language of a given title. The matching is case-sensitive and spaces are treated the same way as underscores. :param title: page title to work with :returns: a ``(pure, lang)`` tuple, where ``pure`` is the pure page title without the language suffix and ``lang`` is the detected language in long, localized form """ title_regex = r"(?P.*?)[ _]\((?P[^\(\)]+)\)" pure_suffix = "" # matches "Page name/Subpage (Language)" match = re.fullmatch(title_regex, title) # matches "Page name (Language)/Subpage" if not match and "/" in title: base, pure_suffix = title.split("/", maxsplit=1) pure_suffix = "/" + pure_suffix match = re.fullmatch(title_regex, base) # matches "Category:Language" if not match: match = re.fullmatch(r"(?P[Cc]ategory[ _]?\:[ _]?(?P[^\(\)]+))", title) if match: pure = match.group("pure") lang = match.group("lang") if lang in language_names: # strip "(Language)" from all subpage components to handle cases like # "Page name (Language)/Subpage (Language)" if strip_all_subpage_parts is True and "/" in pure: parts = pure.split("/") new_parts = [] for p in parts: match = re.fullmatch(title_regex, p) if match: part_lang = match.group("lang") if part_lang == lang: new_parts.append(match.group("pure")) else: new_parts.append(p) else: new_parts.append(p) pure = "/".join(new_parts) return pure + pure_suffix, lang return title, local_language def get_local_filename(self, title, basepath): """ Return file name where the given page should be stored, relative to 'basepath'. """ title, lang = self.detect_language(title) if lang not in self._language_names: return None title, namespace = self.detect_namespace(title) # be safe and use '_' instead of ' ' in filenames (MediaWiki style) title = title.replace(" ", "_") namespace = namespace.replace(" ", "_") # force ASCII filename if self._safe_filenames and not is_ascii(title): h = hashlib.md5() h.update(title.encode("utf-8")) title = h.hexdigest() # select pattern per namespace if namespace == "Main": pattern = "{base}/{langsubtag}/{title}.{ext}" elif namespace in ["Talk", "ArchWiki", "ArchWiki_talk", "Template", "Template_talk", "Help", "Help_talk", "Category", "Category_talk"]: pattern = "{base}/{langsubtag}/{namespace}:{title}.{ext}" elif namespace == "File": pattern = "{base}/{namespace}:{title}" else: pattern = "{base}/{namespace}:{title}.{ext}" path = pattern.format( base=basepath, langsubtag=self._language_names[lang]["subtag"], namespace=namespace, title=title, ext="html" ) return os.path.normpath(path) def _fetch_redirects(self): """ Fetch dictionary of redirect pages and their targets """ query_allredirects = { "action": "query", "generator": "allpages", "gaplimit": "max", "gapfilterredir": "nonredirects", "prop": "redirects", "rdprop": "title|fragment", "rdlimit": "max", } namespaces = ["0", "4", "12", "14"] self._redirects = {} for ns in namespaces: query_allredirects["gapnamespace"] = ns for pages_snippet in self.query_continue(query_allredirects): pages_snippet = sorted(pages_snippet["pages"].values(), key=lambda d: d["title"]) for page in pages_snippet: # construct the mapping, the query result is somewhat reversed... target_title = page["title"] for redirect in page.get("redirects", []): source_title = redirect["title"] target_fragment = redirect.get("fragment") if target_fragment: self._redirects[source_title] = "{}#{}".format(target_title, target_fragment) else: self._redirects[source_title] = target_title def redirects(self): if self._redirects is None: self._fetch_redirects() return self._redirects def resolve_redirect(self, title): """ Returns redirect target title, or given title if it is not redirect. The returned title will always contain spaces instead of underscores. """ # the given title must match the format of titles used in self._redirects title = title.replace("_", " ") return self.redirects().get(title, title)