270 lines
11 KiB
Python
270 lines
11 KiB
Python
|
#! /usr/bin/env python3
|
||
|
|
||
|
""" Module extending generic MediaWiki interface with stuff specific to ArchWiki
|
||
|
and some convenient generic methods.
|
||
|
"""
|
||
|
|
||
|
import os.path
|
||
|
import re
|
||
|
import hashlib
|
||
|
|
||
|
from simplemediawiki import MediaWiki
|
||
|
|
||
|
__all__ = ["ArchWiki"]
|
||
|
|
||
|
url = "https://wiki.archlinux.org/api.php"
|
||
|
local_language = "English"
|
||
|
language_names = {
|
||
|
"العربية": {"subtag": "ar", "english": "Arabic"},
|
||
|
"Bosanski": {"subtag": "bs", "english": "Bosnian"},
|
||
|
"Български": {"subtag": "bg", "english": "Bulgarian"},
|
||
|
"Català": {"subtag": "ca", "english": "Catalan"},
|
||
|
"Čeština": {"subtag": "cs", "english": "Czech"},
|
||
|
"Dansk": {"subtag": "da", "english": "Danish"},
|
||
|
"Deutsch": {"subtag": "de", "english": "German"},
|
||
|
"Ελληνικά": {"subtag": "el", "english": "Greek"},
|
||
|
"English": {"subtag": "en", "english": "English"},
|
||
|
"Esperanto": {"subtag": "eo", "english": "Esperanto"},
|
||
|
"Español": {"subtag": "es", "english": "Spanish"},
|
||
|
"فارسی": {"subtag": "fa", "english": "Persian"},
|
||
|
"Suomi": {"subtag": "fi", "english": "Finnish"},
|
||
|
"Français": {"subtag": "fr", "english": "French"},
|
||
|
"עברית": {"subtag": "he", "english": "Hebrew"},
|
||
|
"Hrvatski": {"subtag": "hr", "english": "Croatian"},
|
||
|
"Magyar": {"subtag": "hu", "english": "Hungarian"},
|
||
|
"Bahasa Indonesia": {"subtag": "id", "english": "Indonesian"},
|
||
|
"Italiano": {"subtag": "it", "english": "Italian"},
|
||
|
"日本語": {"subtag": "ja", "english": "Japanese"},
|
||
|
"한국어": {"subtag": "ko", "english": "Korean"},
|
||
|
"Lietuvių": {"subtag": "lt", "english": "Lithuanian"},
|
||
|
"Norsk Bokmål": {"subtag": "nb", "english": "Norwegian (Bokmål)"},
|
||
|
"Nederlands": {"subtag": "nl", "english": "Dutch"},
|
||
|
"Polski": {"subtag": "pl", "english": "Polish"},
|
||
|
"Português": {"subtag": "pt", "english": "Portuguese"},
|
||
|
"Română": {"subtag": "ro", "english": "Romanian"},
|
||
|
"Русский": {"subtag": "ru", "english": "Russian"},
|
||
|
"Slovenčina": {"subtag": "sk", "english": "Slovak"},
|
||
|
"Српски": {"subtag": "sr", "english": "Serbian"},
|
||
|
"Svenska": {"subtag": "sv", "english": "Swedish"},
|
||
|
"ไทย": {"subtag": "th", "english": "Thai"},
|
||
|
"Türkçe": {"subtag": "tr", "english": "Turkish"},
|
||
|
"Українська": {"subtag": "uk", "english": "Ukrainian"},
|
||
|
"Tiếng Việt": {"subtag": "vi", "english": "Vietnamese"},
|
||
|
"粵語": {"subtag": "yue", "english": "Cantonese"},
|
||
|
"简体中文": {"subtag": "zh-hans", "english": "Chinese (Simplified)"},
|
||
|
"正體中文": {"subtag": "zh-hant", "english": "Chinese (Traditional)"}
|
||
|
}
|
||
|
interlanguage_external = ["de", "fa", "ja", "sv"]
|
||
|
interlanguage_internal = ["ar", "bs", "bg", "cs", "da", "el", "en", "es", "fi", "fr",
|
||
|
"he", "hr", "hu", "id", "it", "ko", "lt", "nl", "pl", "pt",
|
||
|
"ru", "sk", "sr", "th", "tr", "uk", "zh-hans", "zh-hant"]
|
||
|
|
||
|
def is_ascii(text):
    """ Tell whether the string 'text' consists solely of ASCII characters.

    :param text: the string to check
    :returns: ``True`` if ``text`` can be encoded as ASCII, ``False`` otherwise
    """
    try:
        text.encode("ascii")
    # catch only the encoding failure; a bare "except:" would also swallow
    # KeyboardInterrupt, SystemExit and unrelated programming errors
    except UnicodeEncodeError:
        return False
    return True
|
||
|
|
||
|
class ArchWiki(MediaWiki):
    """ MediaWiki API client bound to the ArchWiki endpoint, extended with helpers
        for namespaces, title languages, local file names and redirects.
    """

    def __init__(self, safe_filenames=False, langs=None, **kwargs):
        """ Parameters:
            @safe_filenames: force self.get_local_filename() to return ASCII string
            @langs: optional iterable of language subtags and/or English language
                    names; when given, only the matching languages are kept and
                    self.get_local_filename() returns None for pages in any other
                    language
            + all keyword arguments of simplemediawiki.MediaWiki
        """
        super().__init__(url, **kwargs)

        self._safe_filenames = safe_filenames
        # both are fetched lazily on first use, see namespaces() and redirects()
        self._namespaces = None
        self._redirects = None

        if langs is not None:
            # keep a language when either its subtag or its English name was requested
            self._language_names = {}
            for lang, metadata in language_names.items():
                if not set(metadata.values()).isdisjoint(langs):
                    self._language_names[lang] = metadata
        else:
            self._language_names = language_names

    def query_continue(self, query):
        """ Generator for MediaWiki's query-continue feature.
            ref: https://www.mediawiki.org/wiki/API:Query#Continuing_queries

            Yields the "query" part of every partial result. Raises Exception with
            the API error payload when the result contains "error"; API warnings
            are printed to standard output.
        """
        last_continue = {"continue": ""}

        while True:
            # clone the original params to clean up old continue params
            query_copy = query.copy()
            # and update with the last continue -- it may involve multiple params,
            # hence the clean up with query.copy()
            query_copy.update(last_continue)
            # call the API and handle the result
            result = self.call(query_copy)
            if "error" in result:
                raise Exception(result["error"])
            if "warnings" in result:
                print(result["warnings"])
            if "query" in result:
                yield result["query"]
            if "continue" not in result:
                break
            last_continue = result["continue"]

    def namespaces(self):
        """ Force the Main namespace to have name instead of empty string.

            The mapping is fetched from the parent class once and then cached.
        """
        if self._namespaces is None:
            self._namespaces = super().namespaces()
            self._namespaces[0] = "Main"
        return self._namespaces

    def print_namespaces(self):
        """ Print the available namespaces, sorted by namespace number.
        """
        nsmap = self.namespaces()
        print("Available namespaces:")
        for ns in sorted(nsmap.keys()):
            print("  %2d -- %s" % (ns, nsmap[ns]))

    def detect_namespace(self, title, safe=True):
        """ Detect namespace of a given title.

            The namespace is the part of the title before the *first* colon, so
            titles which contain further colons in the page name (for example
            "Talk:Foo: bar") are still assigned to their namespace. Underscores in
            the prefix are treated as spaces.

            :param title: page title to work with
            :param safe: unused, kept only for backward compatibility
            :returns: a ``(pure_title, namespace)`` tuple; when the prefix is not
                a known namespace, the unchanged title and the name of the Main
                namespace are returned
        """
        namespaces = self.namespaces()
        prefix, colon, rest = title.partition(":")
        # a namespace prefix needs a non-empty name and a non-empty page name
        if colon and prefix and rest:
            ns = prefix.replace("_", " ")
            if ns in namespaces.values():
                return rest, ns
        return title, namespaces[0]

    def detect_language(self, title, *, strip_all_subpage_parts=True):
        """
        Detect language of a given title. The matching is case-sensitive and spaces are
        treated the same way as underscores.

        :param title: page title to work with
        :param strip_all_subpage_parts: if ``True``, the "(Language)" suffix is removed
            from every subpage component carrying the detected language
        :returns: a ``(pure, lang)`` tuple, where ``pure`` is the pure page title without
            the language suffix and ``lang`` is the detected language in long, localized form
        """
        title_regex = r"(?P<pure>.*?)[ _]\((?P<lang>[^\(\)]+)\)"
        pure_suffix = ""
        # matches "Page name/Subpage (Language)"
        match = re.fullmatch(title_regex, title)
        # matches "Page name (Language)/Subpage"
        if not match and "/" in title:
            base, pure_suffix = title.split("/", maxsplit=1)
            pure_suffix = "/" + pure_suffix
            match = re.fullmatch(title_regex, base)
        # matches "Category:Language"
        if not match:
            match = re.fullmatch(r"(?P<pure>[Cc]ategory[ _]?\:[ _]?(?P<lang>[^\(\)]+))", title)
        if match:
            pure = match.group("pure")
            lang = match.group("lang")
            if lang in language_names:
                # strip "(Language)" from all subpage components to handle cases like
                # "Page name (Language)/Subpage (Language)"
                if strip_all_subpage_parts is True and "/" in pure:
                    new_parts = []
                    for part in pure.split("/"):
                        part_match = re.fullmatch(title_regex, part)
                        # only a suffix naming the same language is removed
                        if part_match and part_match.group("lang") == lang:
                            new_parts.append(part_match.group("pure"))
                        else:
                            new_parts.append(part)
                    pure = "/".join(new_parts)
                return pure + pure_suffix, lang
        # no recognized language marker
        return title, local_language

    def get_local_filename(self, title, basepath):
        """ Return file name where the given page should be stored, relative to 'basepath'.

            Returns None when the language of the page is not among the languages
            selected for this instance.
        """
        title, lang = self.detect_language(title)

        if lang not in self._language_names:
            return None

        title, namespace = self.detect_namespace(title)

        # be safe and use '_' instead of ' ' in filenames (MediaWiki style)
        title = title.replace(" ", "_")
        namespace = namespace.replace(" ", "_")

        # force ASCII filename
        if self._safe_filenames and not is_ascii(title):
            # the MD5 hex digest only serves as an ASCII stand-in for the title
            h = hashlib.md5()
            h.update(title.encode("utf-8"))
            title = h.hexdigest()

        # select pattern per namespace
        if namespace == "Main":
            pattern = "{base}/{langsubtag}/{title}.{ext}"
        elif namespace in ["Talk", "ArchWiki", "ArchWiki_talk", "Template", "Template_talk", "Help", "Help_talk", "Category", "Category_talk"]:
            pattern = "{base}/{langsubtag}/{namespace}:{title}.{ext}"
        elif namespace == "File":
            # files keep their own name, no extension is appended
            pattern = "{base}/{namespace}:{title}"
        else:
            pattern = "{base}/{namespace}:{title}.{ext}"

        path = pattern.format(
            base=basepath,
            langsubtag=self._language_names[lang]["subtag"],
            namespace=namespace,
            title=title,
            ext="html"
        )
        return os.path.normpath(path)

    def _fetch_redirects(self):
        """ Fetch dictionary of redirect pages and their targets

            The mapping is stored in self._redirects only after all namespaces
            were fetched, so that a failed API call never leaves a partial result
            behind which self.redirects() would then serve as complete.
        """
        query_allredirects = {
            "action": "query",
            "generator": "allpages",
            "gaplimit": "max",
            "gapfilterredir": "nonredirects",
            "prop": "redirects",
            "rdprop": "title|fragment",
            "rdlimit": "max",
        }
        # namespace numbers to scan (Main, Project, Help and Category in a
        # default MediaWiki setup)
        namespaces = ["0", "4", "12", "14"]

        redirects = {}

        for ns in namespaces:
            query = dict(query_allredirects, gapnamespace=ns)

            for pages_snippet in self.query_continue(query):
                pages = sorted(pages_snippet["pages"].values(), key=lambda d: d["title"])
                for page in pages:
                    # construct the mapping, the query result is somewhat reversed...
                    target_title = page["title"]
                    for redirect in page.get("redirects", []):
                        source_title = redirect["title"]
                        target_fragment = redirect.get("fragment")
                        if target_fragment:
                            redirects[source_title] = "{}#{}".format(target_title, target_fragment)
                        else:
                            redirects[source_title] = target_title

        self._redirects = redirects

    def redirects(self):
        """ Return the dictionary mapping redirect titles to their targets,
            fetching it on first use.
        """
        if self._redirects is None:
            self._fetch_redirects()
        return self._redirects

    def resolve_redirect(self, title):
        """ Returns redirect target title, or given title if it is not redirect.
            The returned title will always contain spaces instead of underscores.
        """
        # the given title must match the format of titles used in self._redirects
        title = title.replace("_", " ")

        return self.redirects().get(title, title)
|