#! /usr/bin/env python3

import os
import re
import urllib.parse

import lxml.etree
import lxml.html


class Optimizer:
    def __init__(self, wiki, base_directory):
        """
        @wiki: ArchWiki instance to work with
        @base_directory: absolute path to base output directory, used for
                         computation of relative links
        """
        self.wiki = wiki
        self.base_directory = base_directory

    def optimize(self, title, html_content):
        # path relative from the HTML file to base output directory
        relbase = os.path.relpath(self.base_directory, os.path.dirname(title))
        css_path = os.path.join(relbase, "ArchWikiOffline.css")

        # parse the HTML
        root = lxml.html.document_fromstring(html_content)

        # optimize
        self.strip_page(root)
        self.fix_layout(root)
        self.replace_css_links(root, css_path)
        self.update_links(root, relbase)
        self.fix_footer(root)

        # return output
        return lxml.etree.tostring(root,
                                   pretty_print=True,
                                   encoding="unicode",
                                   method="html",
                                   doctype="<!DOCTYPE html>")

    def strip_page(self, root):
        """ remove elements useless in offline browsing """
        for e in root.cssselect("#archnavbar, #mw-navigation, header.mw-header, "
                                ".vector-sitenotice-container, .vector-page-toolbar"):
            e.getparent().remove(e)

        # strip comments (including IE 6/7 fixes, which are useless for an Arch package)
        lxml.etree.strip_elements(root, lxml.etree.Comment)

        # strip <script> tags
        lxml.etree.strip_elements(root, "script")

    def fix_layout(self, root):
        """ fix page layout after removing some elements """
        # in case of select-by-id a list with max one element is returned
        for c in root.cssselect("#content"):
            c.set("style", "margin: 0")
        for f in root.cssselect("#footer"):
            f.set("style", "margin: 0")

    def replace_css_links(self, root, css_path):
        """ force using local CSS """
        links = root.xpath("//head/link[@rel=\"stylesheet\"]")

        # overwrite first
        links[0].set("href", css_path)

        # remove the rest
        for link in links[1:]:
            link.getparent().remove(link)

    def update_links(self, root, relbase):
        """ change "internal" wiki links into relative """
        for a in root.cssselect("a"):
            href = a.get("href")
            if href is not None:
                href = urllib.parse.unquote(href)
                # matching full URL is necessary for interlanguage links
                match = re.match("^(https://wiki.archlinux.org)?/title/(?P<title>.+?)(?:#(?P<fragment>.+))?$", str(href))
                if match:
                    title = self.wiki.resolve_redirect(match.group("title"))
                    try:
                        title, fragment = title.split("#", maxsplit=1)
                        # FIXME has to be dot-encoded
                        fragment = fragment.replace(" ", "_")
                    except ValueError:
                        fragment = ""

                    # explicit fragment overrides the redirect
                    if match.group("fragment"):
                        fragment = match.group("fragment")

                    href = self.wiki.get_local_filename(title, relbase)
                    # get_local_filename returns None for skipped pages
                    if href is None:
                        continue
                    if fragment:
                        href += "#" + fragment
                    a.set("href", href)

        for i in root.cssselect("img"):
            src = i.get("src")
            if src and src.startswith("/images/"):
                src = os.path.join(relbase, "File:" + os.path.split(src)[1])
                i.set("src", src)

    def fix_footer(self, root):
        """
        move content from 'div.printfooter' into item in '#footer-info'
        (normally 'div.printfooter' is given 'display:none' and is separated
        by the categories list from the real footer)
        """
        for printfooter in root.cssselect("div.printfooter"):
            printfooter.attrib.pop("class")
            printfooter.tag = "li"
            f_list = root.cssselect("#footer-info")[0]
            f_list.insert(0, printfooter)
            br = lxml.etree.Element("br")
            f_list.insert(3, br)
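

# --- Hypothetical usage sketch (not part of the original module) ---
# Optimizer expects an ArchWiki-like object providing resolve_redirect() and
# get_local_filename(); that class lives elsewhere in the project, so the
# stand-in below, the output directory, and the sample HTML are illustrative
# assumptions only.
if __name__ == "__main__":
    class _DummyWiki:
        # minimal stand-in for the ArchWiki instance expected by Optimizer
        def resolve_redirect(self, title):
            return title

        def get_local_filename(self, title, relbase):
            return os.path.join(relbase, title.replace(" ", "_") + ".html")

    base_directory = "/tmp/wiki-output"  # assumed output directory
    optimizer = Optimizer(_DummyWiki(), base_directory)

    sample_html = (
        "<html><head><link rel='stylesheet' href='remote.css'></head>"
        "<body><div id='content'>"
        "<a href='/title/Main page'>Main page</a>"
        "</div></body></html>"
    )
    # title is the output path of the page being optimized; links and CSS
    # references in the result become relative to base_directory
    print(optimizer.optimize(os.path.join(base_directory, "Main_page.html"),
                             sample_html))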