#! /usr/bin/env python3

import os
import re
import lxml.etree
import lxml.html
import urllib.parse


class Optimizer:
    """ Post-processes wiki HTML pages so they can be browsed offline.
    """

    def __init__(self, wiki, base_directory):
        """
        @wiki:           ArchWiki instance to work with
        @base_directory: absolute path to base output directory, used for
                         computation of relative links
        """
        self.wiki = wiki
        self.base_directory = base_directory

    def optimize(self, title, html_content):
        """ Parse @html_content, run all optimization passes on it and return
            the result serialized as an HTML string.

        @title:        path of the page being processed; only its directory
                       part is used, to compute links relative to the base
                       output directory
        @html_content: source HTML of the page
        """
        # path relative from the HTML file to base output directory
        relbase = os.path.relpath(self.base_directory, os.path.dirname(title))

        css_path = os.path.join(relbase, "ArchWikiOffline.css")

        # parse the HTML
        root = lxml.html.document_fromstring(html_content)

        # optimize (every pass modifies the tree in place)
        self.strip_page(root)
        self.fix_layout(root)
        self.replace_css_links(root, css_path)
        self.update_links(root, relbase)
        self.fix_footer(root)

        # return output
        # An explicit HTML5 doctype is required: passing an empty string
        # would emit no document type declaration at all and browsers would
        # render the offline pages in quirks mode.
        return lxml.etree.tostring(root,
                                   pretty_print=True,
                                   encoding="unicode",
                                   method="html",
                                   doctype="<!DOCTYPE html>")

    def strip_page(self, root):
        """ remove elements useless in offline browsing
        """

        for e in root.cssselect("#archnavbar, #mw-navigation, header.mw-header, .vector-sitenotice-container, .vector-page-toolbar"):
            e.getparent().remove(e)

        # strip comments (including IE 6/7 fixes, which are useless for an Arch package)
        lxml.etree.strip_elements(root, lxml.etree.Comment)

        # strip