commit eeea6fd44fea47ee90a8af9a63cd55f95eea54eb
Author: Sam
Date:   Wed Nov 27 12:51:56 2024 +0000

    add poetry for dependency management and shell.nix for use on nix systems

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8cdd5a6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__
+wiki
+*.lock
diff --git a/ArchWiki/ArchWiki.py b/ArchWiki/ArchWiki.py
new file mode 100644
index 0000000..b03487c
--- /dev/null
+++ b/ArchWiki/ArchWiki.py
@@ -0,0 +1,269 @@
+#! /usr/bin/env python3
+
+""" Module extending generic MediaWiki interface with stuff specific to ArchWiki
+    and some convenient generic methods.
+"""
+
+import os.path
+import re
+import hashlib
+
+from simplemediawiki import MediaWiki
+
+__all__ = ["ArchWiki"]
+
+url = "https://wiki.archlinux.org/api.php"
+local_language = "English"
+language_names = {
+    "العربية": {"subtag": "ar", "english": "Arabic"},
+    "Bosanski": {"subtag": "bs", "english": "Bosnian"},
+    "Български": {"subtag": "bg", "english": "Bulgarian"},
+    "Català": {"subtag": "ca", "english": "Catalan"},
+    "Čeština": {"subtag": "cs", "english": "Czech"},
+    "Dansk": {"subtag": "da", "english": "Danish"},
+    "Deutsch": {"subtag": "de", "english": "German"},
+    "Ελληνικά": {"subtag": "el", "english": "Greek"},
+    "English": {"subtag": "en", "english": "English"},
+    "Esperanto": {"subtag": "eo", "english": "Esperanto"},
+    "Español": {"subtag": "es", "english": "Spanish"},
+    "فارسی": {"subtag": "fa", "english": "Persian"},
+    "Suomi": {"subtag": "fi", "english": "Finnish"},
+    "Français": {"subtag": "fr", "english": "French"},
+    "עברית": {"subtag": "he", "english": "Hebrew"},
+    "Hrvatski": {"subtag": "hr", "english": "Croatian"},
+    "Magyar": {"subtag": "hu", "english": "Hungarian"},
+    "Bahasa Indonesia": {"subtag": "id", "english": "Indonesian"},
+    "Italiano": {"subtag": "it", "english": "Italian"},
+    "日本語": {"subtag": "ja", "english": "Japanese"},
+    "한국어": {"subtag": "ko", "english": "Korean"},
+    "Lietuvių": {"subtag": "lt", "english": "Lithuanian"},
+    "Norsk Bokmål": {"subtag": "nb", "english": "Norwegian (Bokmål)"},
+    "Nederlands": {"subtag": "nl", "english": "Dutch"},
+    "Polski": {"subtag": "pl", "english": "Polish"},
+    "Português": {"subtag": "pt", "english": "Portuguese"},
+    "Română": {"subtag": "ro", "english": "Romanian"},
+    "Русский": {"subtag": "ru", "english": "Russian"},
+    "Slovenčina": {"subtag": "sk", "english": "Slovak"},
+    "Српски": {"subtag": "sr", "english": "Serbian"},
+    "Svenska": {"subtag": "sv", "english": "Swedish"},
+    "ไทย": {"subtag": "th", "english": "Thai"},
+    "Türkçe": {"subtag": "tr", "english": "Turkish"},
+    "Українська": {"subtag": "uk", "english": "Ukrainian"},
+    "Tiếng Việt": {"subtag": "vi", "english": "Vietnamese"},
+    "粵語": {"subtag": "yue", "english": "Cantonese"},
+    "简体中文": {"subtag": "zh-hans", "english": "Chinese (Simplified)"},
+    "正體中文": {"subtag": "zh-hant", "english": "Chinese (Traditional)"}
+}
+interlanguage_external = ["de", "fa", "ja", "sv"]
+interlanguage_internal = ["ar", "bs", "bg", "cs", "da", "el", "en", "es", "fi", "fr",
+                          "he", "hr", "hu", "id", "it", "ko", "lt", "nl", "pl", "pt",
+                          "ru", "sk", "sr", "th", "tr", "uk", "zh-hans", "zh-hant"]
+
+def is_ascii(text):
+    try:
+        text.encode("ascii")
+        return True
+    except UnicodeEncodeError:
+        return False
+
+class ArchWiki(MediaWiki):
+
+    def __init__(self, safe_filenames=False, langs=None, **kwargs):
+        """ Parameters:
+            @safe_filenames: force self.get_local_filename() to return an ASCII string
+            @langs: optional list of language subtags or English language names; when
+                    given, get_local_filename() recognizes only these languages
+            + all keyword arguments of simplemediawiki.MediaWiki
+        """
+        super().__init__(url, **kwargs)
+
+        self._safe_filenames = safe_filenames
+        self._namespaces = None
+        self._redirects = None
+
+        if langs is not None:
+            self._language_names = {}
+            for lang, metadata in language_names.items():
+                if not set(metadata.values()).isdisjoint(langs):
+                    self._language_names[lang] = metadata
+        else:
+            self._language_names = language_names
+
+    def query_continue(self, query):
+        """ Generator for MediaWiki's query-continue feature.
+            ref: https://www.mediawiki.org/wiki/API:Query#Continuing_queries
+        """
+        last_continue = {"continue": ""}
+
+        while True:
+            # clone the original params to clean up old continue params
+            query_copy = query.copy()
+            # and update with the last continue -- it may involve multiple params,
+            # hence the clean-up with query.copy()
+            query_copy.update(last_continue)
+            # call the API and handle the result
+            result = self.call(query_copy)
+            if "error" in result:
+                raise Exception(result["error"])
+            if "warnings" in result:
+                print(result["warnings"])
+            if "query" in result:
+                yield result["query"]
+            if "continue" not in result:
+                break
+            last_continue = result["continue"]
+
+    def namespaces(self):
+        """ Force the Main namespace to have a name instead of an empty string.
+        """
+        if self._namespaces is None:
+            self._namespaces = super().namespaces()
+            self._namespaces[0] = "Main"
+        return self._namespaces
+
+    def print_namespaces(self):
+        nsmap = self.namespaces()
+        print("Available namespaces:")
+        for ns in sorted(nsmap.keys()):
+            print(" %2d -- %s" % (ns, nsmap[ns]))
+
+    def detect_namespace(self, title, safe=True):
+        """ Detect namespace of a given title.
+        """
+        pure_title = title
+        detected_namespace = self.namespaces()[0]
+        match = re.match("^((.+):)?(.+)$", title)
+        ns = match.group(2)
+        if ns:
+            ns = ns.replace("_", " ")
+            if ns in self.namespaces().values():
+                detected_namespace = ns
+                pure_title = match.group(3)
+        return pure_title, detected_namespace
+
+    def detect_language(self, title, *, strip_all_subpage_parts=True):
+        """
+        Detect language of a given title. The matching is case-sensitive and spaces are
+        treated the same way as underscores.
+
+        :param title: page title to work with
+        :returns: a ``(pure, lang)`` tuple, where ``pure`` is the pure page title without
+                  the language suffix and ``lang`` is the detected language in long,
+                  localized form
+        """
+        title_regex = r"(?P<pure>.*?)[ _]\((?P<lang>[^\(\)]+)\)"
+        pure_suffix = ""
+        # matches "Page name/Subpage (Language)"
+        match = re.fullmatch(title_regex, title)
+        # matches "Page name (Language)/Subpage"
+        if not match and "/" in title:
+            base, pure_suffix = title.split("/", maxsplit=1)
+            pure_suffix = "/" + pure_suffix
+            match = re.fullmatch(title_regex, base)
+        # matches "Category:Language"
+        if not match:
+            match = re.fullmatch(r"(?P<pure>[Cc]ategory[ _]?\:[ _]?(?P<lang>[^\(\)]+))", title)
+        if match:
+            pure = match.group("pure")
+            lang = match.group("lang")
+            if lang in language_names:
+                # strip "(Language)" from all subpage components to handle cases like
+                # "Page name (Language)/Subpage (Language)"
+                if strip_all_subpage_parts is True and "/" in pure:
+                    parts = pure.split("/")
+                    new_parts = []
+                    for p in parts:
+                        match = re.fullmatch(title_regex, p)
+                        if match:
+                            part_lang = match.group("lang")
+                            if part_lang == lang:
+                                new_parts.append(match.group("pure"))
+                            else:
+                                new_parts.append(p)
+                        else:
+                            new_parts.append(p)
+                    pure = "/".join(new_parts)
+                return pure + pure_suffix, lang
+        return title, local_language
+
+    def get_local_filename(self, title, basepath):
+        """ Return file name where the given page should be stored, relative to 'basepath'.
+        """
+        title, lang = self.detect_language(title)
+
+        if lang not in self._language_names:
+            return None
+
+        title, namespace = self.detect_namespace(title)
+
+        # be safe and use '_' instead of ' ' in filenames (MediaWiki style)
+        title = title.replace(" ", "_")
+        namespace = namespace.replace(" ", "_")
+
+        # force ASCII filename
+        if self._safe_filenames and not is_ascii(title):
+            h = hashlib.md5()
+            h.update(title.encode("utf-8"))
+            title = h.hexdigest()
+
+        # select pattern per namespace
+        if namespace == "Main":
+            pattern = "{base}/{langsubtag}/{title}.{ext}"
+        elif namespace in ["Talk", "ArchWiki", "ArchWiki_talk", "Template", "Template_talk", "Help", "Help_talk", "Category", "Category_talk"]:
+            pattern = "{base}/{langsubtag}/{namespace}:{title}.{ext}"
+        elif namespace == "File":
+            pattern = "{base}/{namespace}:{title}"
+        else:
+            pattern = "{base}/{namespace}:{title}.{ext}"
+
+        path = pattern.format(
+            base=basepath,
+            langsubtag=self._language_names[lang]["subtag"],
+            namespace=namespace,
+            title=title,
+            ext="html"
+        )
+        return os.path.normpath(path)
+
+    def _fetch_redirects(self):
+        """ Fetch dictionary of redirect pages and their targets.
+        """
+        query_allredirects = {
+            "action": "query",
+            "generator": "allpages",
+            "gaplimit": "max",
+            "gapfilterredir": "nonredirects",
+            "prop": "redirects",
+            "rdprop": "title|fragment",
+            "rdlimit": "max",
+        }
+        namespaces = ["0", "4", "12", "14"]
+
+        self._redirects = {}
+
+        for ns in namespaces:
+            query_allredirects["gapnamespace"] = ns
+
+            for pages_snippet in self.query_continue(query_allredirects):
+                pages_snippet = sorted(pages_snippet["pages"].values(), key=lambda d: d["title"])
+                for page in pages_snippet:
+                    # construct the mapping -- the query result is somewhat reversed:
+                    # each page lists the pages that redirect *to* it
+                    target_title = page["title"]
+                    for redirect in page.get("redirects", []):
+                        source_title = redirect["title"]
+                        target_fragment = redirect.get("fragment")
+                        if target_fragment:
+                            self._redirects[source_title] = "{}#{}".format(target_title, target_fragment)
+                        else:
+                            self._redirects[source_title] = target_title
+
+    def redirects(self):
+        if self._redirects is None:
+            self._fetch_redirects()
+        return self._redirects
+
+    def resolve_redirect(self, title):
+        """ Return the redirect target title, or the given title if it is not a redirect.
+            The returned title will always contain spaces instead of underscores.
+        """
+        # the given title must match the format of titles used in self._redirects
+        title = title.replace("_", " ")
+
+        return self.redirects().get(title, title)
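The ArchWiki class above is the entry point for the rest of the package: it wraps the MediaWiki API endpoint, maps wiki titles to local file names, and resolves redirects. A minimal usage sketch (illustrative only, not part of this commit; the page titles are arbitrary examples):

    # illustrative sketch only, not part of this commit
    from ArchWiki.ArchWiki import ArchWiki

    wiki = ArchWiki(safe_filenames=True)

    # split the language suffix off a title and map it to a local file
    pure, lang = wiki.detect_language("Installation guide (Čeština)")
    print(pure, lang)   # -> Installation guide / Čeština
    print(wiki.get_local_filename("Installation guide (Čeština)", "wiki"))
    # -> wiki/cs/Installation_guide.html

    # iterate over all pages in the Main namespace via the query-continue protocol
    query = {
        "action": "query",
        "generator": "allpages",
        "gaplimit": "max",
        "gapnamespace": "0",
        "prop": "info",
        "continue": "",
    }
    for snippet in wiki.query_continue(query):
        for page in sorted(snippet["pages"].values(), key=lambda d: d["title"]):
            print(page["title"])

    # redirect resolution fetches the full redirect map lazily on first use
    print(wiki.resolve_redirect("Beginners' guide"))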
diff --git a/ArchWiki/__init__.py b/ArchWiki/__init__.py
new file mode 100644
index 0000000..4871bef
--- /dev/null
+++ b/ArchWiki/__init__.py
@@ -0,0 +1,9 @@
+#! /usr/bin/env python3
+
+__author__ = "Jakub Klinkovský"
+__url__ = "https://github.com/lahwaacz/arch-wiki-docs"
+__version__ = "0.2"
+
+from .ArchWiki import *
+from .downloader import *
+from .optimizer import *
diff --git a/ArchWiki/converter.py b/ArchWiki/converter.py
new file mode 100644
index 0000000..d36b775
--- /dev/null
+++ b/ArchWiki/converter.py
@@ -0,0 +1,157 @@
+#! /usr/bin/env python3
+
+import os
+import subprocess
+
+# for filter_pre
+import lxml.etree
+import lxml.html
+
+# for filter_in
+import json
+import pandocfilters
+
+class PandocError(Exception):
+    def __init__(self, retcode, errs):
+        Exception.__init__(self, "pandoc failed with return code %s\nstderr:\n%s" % (retcode, errs))
+
+class ManFilter:
+    format = "man"
+
+    def filter_pre(self, instring):
+        root = lxml.html.fromstring(instring)
+
+        # force headers to start from level 1
+        content = root.cssselect("#bodyContent")[0]
+        headers = content.cssselect("h1, h2, h3, h4, h5, h6")
+        if len(headers) > 0:
+            top_level = int(headers[0].tag[-1])
+            for h in headers:
+                level = int(h.tag[-1]) - top_level + 1
+                h.tag = "h%d" % max(level, 1)
+
+        # add some headers to distinguish divs in output formats like man
+        for catlinks in root.cssselect("#catlinks"):
+            h = lxml.etree.Element("h1")
+            h.text = "Categories"
+            catlinks.insert(0, h)
+        for footer in root.cssselect("#footer"):
+            h = lxml.etree.Element("h1")
+            h.text = "Notes"
+            footer.insert(0, h)
+
+        return lxml.etree.tostring(root, encoding="unicode", method="html", doctype="<!DOCTYPE html>")
+
+    def filter_in(self, instring):
+        def _filter(key, value, format, meta):
+            # remove HTML specific stuff
+            if key == "Link":
+                # remove relative path prefix and .html suffix
+                internal, [href, text] = value
+                if href.endswith(".html"):
+                    href = href[:-5]
+# FIXME: this stupid detection will not work
+# or just leave the full path?
+#                if href.startswith("./"):
+#                    href = href[2:]
+#                elif href.startswith("../"):
+#                    href = href[3:]
+                return pandocfilters.Link(internal, [href, text])
+
+# TODO: it's implemented in filter_pre, but could be useful anyway since html may not be
+#       the only input format; the most generic way should be implemented
+#            if key == "Header":
+#                level, classes, internal = value
+#
+#                # record top level
+#                if self.heading_top_level == 0:
+#                    self.heading_top_level = level
+#
+#                # ensure we start from h1 in output
+#                if level > self.heading_top_level:
+#                    level -= self.heading_top_level
+#
+#                return pandocfilters.Header(level, classes, internal)
+
+        doc = json.loads(instring)
+        altered = pandocfilters.walk(doc, _filter, self.format, doc[0]["unMeta"])
+        return json.dumps(altered)
+
+    def filter_post(self, instring):
+        return instring
+
+class Converter:
+    def __init__(self, filter_inst, input_dir, output_dir, output_format):
+        self.filter = filter_inst
+        self.input_dir = os.path.abspath(input_dir)
+        self.output_dir = os.path.abspath(output_dir)
+        self.output_format = output_format
+
+        # ensure output directory always exists
+        if not os.path.isdir(self.output_dir):
+            os.mkdir(self.output_dir)
+
+    def convert(self):
+        failed = []
+
+        for path, dirs, files in os.walk(self.input_dir):
+            for f in files:
+                infile = os.path.join(path, f)
+                outdir = os.path.join(self.output_dir, os.path.relpath(path, self.input_dir))
+                outfile = os.path.join(os.path.normpath(outdir), f)
+                outfile = os.path.splitext(outfile)[0] + "." + self.output_format
+                if infile.endswith(".html"):
+                    try:
+                        self.convert_file(infile, outfile)
+                    except PandocError as e:
+                        failed.append(infile)
+                        print(e)
+                        print(" [conv failed] %s" % infile)
+                else:
+                    print(" [skip conv] %s" % infile)
+
+        if len(failed) > 0:
+            print("failed to convert %d pages:" % len(failed))
+            for f in failed:
+                print(" %s" % f)
+
+    def convert_file(self, infile, outfile):
+        print(" [converting] %s" % infile)
+
+        # ensure that target directory exists (necessary for subpages)
+        try:
+            os.makedirs(os.path.split(outfile)[0])
+        except FileExistsError:
+            pass
+
+        content = open(infile, "r").read()
+        content = self.filter.filter_pre(content)
+        content = self.pandoc_first(content)
+        content = self.filter.filter_in(content)
+        content = self.pandoc_last(content)
+        content = self.filter.filter_post(content)
+
+        f = open(outfile, "w")
+        f.write(content)
+        f.close()
+
+    def run_pandoc(self, cmd, instring):
+        popen = subprocess.Popen(cmd, shell=True, universal_newlines=True,
+                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+        outs, errs = popen.communicate(instring)
+
+        if popen.returncode != 0:
+            raise PandocError(popen.returncode, errs)
+
+        return outs
+
+    def pandoc_first(self, instring):
+        return self.run_pandoc("pandoc -s -f html -t json", instring)
+
+    def pandoc_last(self, instring):
+        return self.run_pandoc("pandoc -s -f json -t %s" % self.output_format, instring)
+
+if __name__ == "__main__":
+    f = ManFilter()
+    c = Converter(f, "./wiki/", "./output/", "man")
+    c.convert()
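Converter accepts any filter object that provides the filter_pre/filter_in/filter_post hooks plus a format attribute, so pandoc targets other than man can be plugged in. A minimal pass-through filter for pandoc's plain-text writer, as an illustrative sketch that is not part of this commit:

    # illustrative sketch only, not part of this commit
    from ArchWiki.converter import Converter

    class PlainFilter:
        # format name passed to the pandoc filter machinery; matches the
        # output format given to Converter below
        format = "plain"

        def filter_pre(self, instring):
            # raw HTML in, raw HTML out -- no preprocessing
            return instring

        def filter_in(self, instring):
            # pandoc JSON AST (as a string) in and out -- no rewriting
            return instring

        def filter_post(self, instring):
            return instring

    Converter(PlainFilter(), "./wiki/", "./output/", "plain").convert()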
/usr/bin/env python3
+
+import os
+import datetime
+
+import requests
+from requests.packages.urllib3.util.retry import Retry
+
+class Downloader:
+    query_allpages = {
+        "action": "query",
+        "generator": "allpages",
+        "gaplimit": "max",
+        "gapfilterredir": "nonredirects",
+        "gapnamespace": "0",
+        "prop": "info",
+        "inprop": "url",
+        "continue": "",
+    }
+
+    query_allimages = {
+        "action": "query",
+        "list": "allimages",
+        "ailimit": "max",
+        "aiprop": "url|timestamp",
+        "continue": "",
+    }
+
+    css_links = {
+        "https://wiki.archlinux.org/load.php?lang=en&modules=site.styles|skins.vector.icons,styles|zzz.ext.archLinux.styles&only=styles&skin=vector-2022": "ArchWikiOffline.css",
+    }
+
+    def __init__(self, wiki, output_directory, epoch, *, optimizer=None):
+        """ Parameters:
+            @wiki: ArchWiki instance to work with
+            @output_directory: where to store the downloaded files
+            @epoch: force update of every file older than this date (must be an instance
+                    of 'datetime')
+            @optimizer: object with an optimize(filename, html) method for HTML
+                        post-processing (e.g. Optimizer)
+        """
+
+        self.wiki = wiki
+        self.output_directory = output_directory
+        self.epoch = epoch
+        self.optimizer = optimizer
+
+        # ensure output directory always exists
+        if not os.path.isdir(self.output_directory):
+            os.mkdir(self.output_directory)
+
+        # list of valid files
+        self.files = []
+
+        self.session = requests.Session()
+        # granular control over requests' retries: https://stackoverflow.com/a/35504626
+        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
+        self.session.mount("https://", adapter)
+        self.session.mount("http://", adapter)
+
+    def needs_update(self, fname, timestamp):
+        """ Determine if it is necessary to download a page.
+        """
+
+        if not os.path.exists(fname):
+            return True
+        local = datetime.datetime.utcfromtimestamp(os.path.getmtime(fname))
+        if local < timestamp or local < self.epoch:
+            return True
+        return False
+
+    def process_namespace(self, namespace):
+        """ Walk all pages in the given namespace and download them if necessary.
+        """
+
+        print(f"Processing namespace {namespace}...")
+
+        query = self.query_allpages.copy()
+        query["gapnamespace"] = namespace
+        for pages_snippet in self.wiki.query_continue(query):
+            for page in sorted(pages_snippet["pages"].values(), key=lambda d: d["title"]):
+                title = page["title"]
+                fname = self.wiki.get_local_filename(title, self.output_directory)
+                if not fname:
+                    print(f" [skipping] {title}")
+                    continue
+                self.files.append(fname)
+                timestamp = self.wiki.parse_date(page["touched"])
+                if self.needs_update(fname, timestamp):
+                    print(f" [downloading] {title}")
+                    fullurl = page["fullurl"]
+
+                    r = self.session.get(fullurl)
+                    if self.optimizer is not None:
+                        text = self.optimizer.optimize(fname, r.text)
+                    else:
+                        text = r.text
+
+                    # ensure that target directory exists (necessary for subpages)
+                    os.makedirs(os.path.dirname(fname), exist_ok=True)
+
+                    with open(fname, "w") as fd:
+                        fd.write(text)
+                else:
+                    print(f" [up-to-date] {title}")
+
+    def download_css(self):
+        print("Downloading CSS...")
+        for link, dest in self.css_links.items():
+            print(" ", dest)
+            fname = os.path.join(self.output_directory, dest)
+            if fname:
+                self.files.append(fname)
+            r = self.session.get(link)
+            with open(fname, "w") as fd:
+                fd.write(r.text)
+
+    def download_images(self):
+        print("Downloading images...")
+        query = self.query_allimages.copy()
+        for images_snippet in self.wiki.query_continue(query):
+            for image in images_snippet["allimages"]:
+                title = image["title"]
+                fname = self.wiki.get_local_filename(title, self.output_directory)
+                if not fname:
+                    print(f" [skipping] {title}")
+                    continue
+                self.files.append(fname)
+                timestamp = self.wiki.parse_date(image["timestamp"])
+                if self.needs_update(fname, timestamp):
+                    print(f" [downloading] {title}")
+                    r = self.session.get(image["url"])
+                    with open(fname, "wb") as fd:
+                        fd.write(r.content)
+                else:
+                    print(f" [up-to-date] {title}")
+
+    def clean_output_directory(self):
+        """ Walk output_directory and delete all files not found on the wiki.
+            Should be run _after_ downloading, otherwise all files will be deleted!
+        """
+
+        print("Deleting unwanted files (deleted/moved on the wiki)...")
+        valid_files = self.files.copy()
+
+        for path, dirs, files in os.walk(self.output_directory, topdown=False):
+            # handle files
+            for f in files:
+                fpath = os.path.join(path, f)
+                if fpath not in valid_files:
+                    print(f" [deleting] {fpath}")
+                    os.unlink(fpath)
+
+            # remove empty directories
+            if len(os.listdir(path)) == 0:
+                print(f" [deleting] {path}/")
+                os.rmdir(path)
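Downloader ties the other classes together: ArchWiki supplies the API queries and local file names, and an optional Optimizer post-processes each page before it is written to disk. An end-to-end sketch (illustrative only, not part of this commit; the output path and epoch are arbitrary examples):

    # illustrative sketch only, not part of this commit
    import datetime
    import os

    from ArchWiki.ArchWiki import ArchWiki
    from ArchWiki.downloader import Downloader
    from ArchWiki.optimizer import Optimizer

    output_dir = "./wiki/"                 # arbitrary example path
    epoch = datetime.datetime(2024, 1, 1)  # force re-download of files older than this

    wiki = ArchWiki(safe_filenames=True)
    optimizer = Optimizer(wiki, os.path.abspath(output_dir))
    downloader = Downloader(wiki, output_dir, epoch, optimizer=optimizer)

    # namespaces 0, 4, 12, 14 = Main, ArchWiki, Help, Category
    for ns in ["0", "4", "12", "14"]:
        downloader.process_namespace(ns)
    downloader.download_css()
    downloader.download_images()
    downloader.clean_output_directory()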
diff --git a/ArchWiki/optimizer.py b/ArchWiki/optimizer.py
new file mode 100644
index 0000000..915a175
--- /dev/null
+++ b/ArchWiki/optimizer.py
@@ -0,0 +1,124 @@
+#! /usr/bin/env python3
+
+import os
+import re
+import lxml.etree
+import lxml.html
+import urllib.parse
+
+class Optimizer:
+    def __init__(self, wiki, base_directory):
+        """ @wiki: ArchWiki instance to work with
+            @base_directory: absolute path to base output directory, used for
+                             computation of relative links
+        """
+        self.wiki = wiki
+        self.base_directory = base_directory
+
+    def optimize(self, title, html_content):
+        # path relative from the HTML file to base output directory
+        relbase = os.path.relpath(self.base_directory, os.path.dirname(title))
+
+        css_path = os.path.join(relbase, "ArchWikiOffline.css")
+
+        # parse the HTML
+        root = lxml.html.document_fromstring(html_content)
+
+        # optimize
+        self.strip_page(root)
+        self.fix_layout(root)
+        self.replace_css_links(root, css_path)
+        self.update_links(root, relbase)
+        self.fix_footer(root)
+
+        # return output
+        return lxml.etree.tostring(root,
+                                   pretty_print=True,
+                                   encoding="unicode",
+                                   method="html",
+                                   doctype="<!DOCTYPE html>")
+
+    def strip_page(self, root):
+        """ remove elements useless in offline browsing
+        """
+
+        for e in root.cssselect("#archnavbar, #mw-navigation, header.mw-header, .vector-sitenotice-container, .vector-page-toolbar"):
+            e.getparent().remove(e)
+
+        # strip comments (including IE 6/7 fixes, which are useless for an Arch package)
+        lxml.etree.strip_elements(root, lxml.etree.Comment)
+
+        # strip