125 lines
4.5 KiB
Python
125 lines
4.5 KiB
Python
#! /usr/bin/env python3
|
|
|
|
import os
|
|
import re
|
|
import lxml.etree
|
|
import lxml.html
|
|
import urllib.parse
|
|
|
|
class Optimizer:
    """ Post-processes wiki pages (HTML) for offline browsing.

    A page is parsed with lxml, stripped of elements that are useless
    offline, switched to a local stylesheet, and its wiki links and images
    are rewritten to relative local paths.
    """

    # Links to wiki pages, either site-relative ("/title/Foo#Bar") or
    # absolute; matching the full URL is necessary for interlanguage links.
    # The dots of the host name are escaped so only the real host matches.
    _WIKI_LINK_RE = re.compile(
        r"^(?:https://wiki\.archlinux\.org)?/title/"
        r"(?P<title>.+?)(?:#(?P<fragment>.+))?$"
    )

    # Elements removed by strip_page().
    _USELESS_SELECTOR = (
        "#archnavbar, #mw-navigation, header.mw-header, "
        ".vector-sitenotice-container, .vector-page-toolbar"
    )

    def __init__(self, wiki, base_directory):
        """ @wiki: ArchWiki instance to work with
            @base_directory: absolute path to base output directory, used for
                             computation of relative links
        """
        self.wiki = wiki
        self.base_directory = base_directory

    def optimize(self, title, html_content):
        """ Return the optimized HTML document of one page as a string.

            @title: path whose directory part is used to compute the relative
                    path to the base output directory
                    (NOTE(review): presumably the output file name of the
                    page -- confirm against the caller)
            @html_content: HTML source of the page
        """
        # path relative from the HTML file to base output directory
        relbase = os.path.relpath(self.base_directory, os.path.dirname(title))
        css_path = os.path.join(relbase, "ArchWikiOffline.css")

        # parse the HTML
        root = lxml.html.document_fromstring(html_content)

        # optimize
        self.strip_page(root)
        self.fix_layout(root)
        self.replace_css_links(root, css_path)
        self.update_links(root, relbase)
        self.fix_footer(root)

        # return output
        return lxml.etree.tostring(
            root,
            pretty_print=True,
            encoding="unicode",
            method="html",
            doctype="<!DOCTYPE html>",
        )

    @staticmethod
    def _remove_keeping_tail(element):
        """ Remove @element from its parent without losing its tail text.

            In lxml the text following an element (its "tail") belongs to
            that element, so a plain remove() would delete it as well; here
            it is re-attached to the previous sibling or to the parent.
        """
        parent = element.getparent()
        if parent is None:
            # a root element cannot be removed from anything
            return
        tail = element.tail
        if tail:
            previous = element.getprevious()
            if previous is None:
                parent.text = (parent.text or "") + tail
            else:
                previous.tail = (previous.tail or "") + tail
        parent.remove(element)

    def strip_page(self, root):
        """ remove elements useless in offline browsing
        """
        for element in root.cssselect(self._USELESS_SELECTOR):
            self._remove_keeping_tail(element)

        # strip comments (including IE 6/7 fixes, which are useless for an
        # Arch package) and <script> tags; with_tail=False keeps the text
        # that follows them, which strip_elements would delete by default
        lxml.etree.strip_elements(root, lxml.etree.Comment, "script",
                                  with_tail=False)

    def fix_layout(self, root):
        """ fix page layout after removing some elements
        """
        # in case of select-by-id a list with max one element is returned
        for selector in ("#content", "#footer"):
            for element in root.cssselect(selector):
                element.set("style", "margin: 0")

    def replace_css_links(self, root, css_path):
        """ force using local CSS

            The first stylesheet link in <head> is pointed to @css_path and
            all other stylesheet links are removed.  If the page has no
            stylesheet link, one is added to <head> (when <head> exists)
            instead of failing with IndexError.
        """
        links = root.xpath("//head/link[@rel=\"stylesheet\"]")

        if not links:
            heads = root.xpath("//head")
            if heads:
                lxml.etree.SubElement(heads[0], "link",
                                      rel="stylesheet", href=css_path)
            return

        # overwrite first
        links[0].set("href", css_path)

        # remove the rest
        for link in links[1:]:
            link.getparent().remove(link)

    def update_links(self, root, relbase):
        """ change "internal" wiki links into relative

            @relbase: relative path from the page to the base output
                      directory

            Links that do not point to a wiki page, and links to pages for
            which the wiki returns no local file name, are left untouched.
            Images served from "/images/" are pointed to local "File:" files.
        """
        for a in root.cssselect("a"):
            href = a.get("href")
            if href is None:
                continue

            match = self._WIKI_LINK_RE.match(urllib.parse.unquote(href))
            if match is None:
                continue

            # a redirect target may carry its own fragment ("Page#Section")
            title = self.wiki.resolve_redirect(match.group("title"))
            title, _, fragment = title.partition("#")
            # FIXME has to be dot-encoded
            fragment = fragment.replace(" ", "_")
            # explicit fragment overrides the redirect
            fragment = match.group("fragment") or fragment

            local_href = self.wiki.get_local_filename(title, relbase)
            # get_local_filename returns None for skipped pages
            if local_href is None:
                continue
            if fragment:
                local_href += "#" + fragment
            a.set("href", local_href)

        for img in root.cssselect("img"):
            src = img.get("src")
            if src and src.startswith("/images/"):
                img.set("src", os.path.join(relbase, "File:" + os.path.basename(src)))

    def fix_footer(self, root):
        """ move content from 'div.printfooter' into item in '#footer-info'
            (normally 'div.printfooter' is given 'display:none' and is separated by
            the categories list from the real footer)

            Pages without 'div.printfooter' or without '#footer-info' are
            left unchanged.
        """
        printfooters = root.cssselect("div.printfooter")
        if not printfooters:
            return

        # look the target up before touching anything, so that a page without
        # the footer list is not left half-modified
        footer_lists = root.cssselect("#footer-info")
        if not footer_lists:
            return
        f_list = footer_lists[0]

        for printfooter in printfooters:
            printfooter.attrib.pop("class")
            printfooter.tag = "li"
            f_list.insert(0, printfooter)
            # NOTE(review): index 3 is hard-coded; presumably it matches the
            # number of items in the upstream footer -- confirm
            f_list.insert(3, lxml.etree.Element("br"))
|