#! /usr/bin/env python3

import os
import re
import lxml.etree
import lxml.html
import urllib.parse

class Optimizer:
    """ Post-process HTML pages of the wiki for offline browsing.

        The page is parsed with lxml, online-only elements are removed, the
        stylesheet is replaced with a local one and internal links/images are
        rewritten to relative local paths.
    """

    # matching full URL is necessary for interlanguage links;
    # the dots of the host name are escaped so that only the real host matches
    _INTERNAL_LINK_RE = re.compile(
        r"^(https://wiki\.archlinux\.org)?/title/(?P<title>.+?)(?:#(?P<fragment>.+))?$"
    )

    def __init__(self, wiki, base_directory):
        """ @wiki:           ArchWiki instance to work with
            @base_directory: absolute path to base output directory, used for
                             computation of relative links
        """
        self.wiki = wiki
        self.base_directory = base_directory

    def optimize(self, title, html_content):
        """ Run all optimization steps on one page and return the new HTML.

            @title:        used as a file path: its directory is the starting
                           point for relative links (presumably the output
                           path of the HTML file -- confirm against caller)
            @html_content: HTML source of the page
            Returns the serialized HTML document as 'str'.
        """
        # path relative from the HTML file to base output directory
        relbase = os.path.relpath(self.base_directory, os.path.dirname(title))

        css_path = os.path.join(relbase, "ArchWikiOffline.css")

        # parse the HTML
        root = lxml.html.document_fromstring(html_content)

        # optimize
        self.strip_page(root)
        self.fix_layout(root)
        self.replace_css_links(root, css_path)
        self.update_links(root, relbase)
        self.fix_footer(root)

        # return output
        return lxml.etree.tostring(root,
                                   pretty_print=True,
                                   encoding="unicode",
                                   method="html",
                                   doctype="<!DOCTYPE html>")

    def strip_page(self, root):
        """ remove elements useless in offline browsing
        """

        for e in root.cssselect("#archnavbar, #mw-navigation, header.mw-header, .vector-sitenotice-container, .vector-page-toolbar"):
            e.getparent().remove(e)

        # strip comments (including IE 6/7 fixes, which are useless for an Arch package)
        # with_tail=False: keep the text which follows the stripped node,
        # it belongs to the surrounding content, not to the comment
        lxml.etree.strip_elements(root, lxml.etree.Comment, with_tail=False)

        # strip <script> tags (again keeping the text following them)
        lxml.etree.strip_elements(root, "script", with_tail=False)

    def fix_layout(self, root):
        """ fix page layout after removing some elements
        """

        # in case of select-by-id a list with max one element is returned
        for element in root.cssselect("#content"):
            element.set("style", "margin: 0")
        for element in root.cssselect("#footer"):
            element.set("style", "margin: 0")

    def replace_css_links(self, root, css_path):
        """ force using local CSS

            The first stylesheet link in <head> is pointed to @css_path, all
            other stylesheet links are removed. If the page has no stylesheet
            link, a new one is added to <head> (when there is a <head>).
        """

        links = root.xpath("//head/link[@rel=\"stylesheet\"]")

        if not links:
            # nothing to overwrite, add the local stylesheet instead of failing
            heads = root.xpath("//head")
            if heads:
                lxml.etree.SubElement(heads[0], "link", rel="stylesheet", href=css_path)
            return

        first, *rest = links

        # overwrite first
        first.set("href", css_path)

        # remove the rest
        for link in rest:
            link.getparent().remove(link)

    def update_links(self, root, relbase):
        """ change "internal" wiki links into relative

            @relbase: path relative from the HTML file to base output directory

            Links to pages for which 'get_local_filename' returns None are
            left untouched. Sources of images starting with "/images/" are
            pointed to "File:<basename>" under @relbase.
        """

        for a in root.cssselect("a"):
            href = a.get("href")
            if href is None:
                continue
            match = self._INTERNAL_LINK_RE.match(urllib.parse.unquote(href))
            if not match:
                continue

            # the redirect target may carry its own section ("Title#Section")
            target = self.wiki.resolve_redirect(match.group("title"))
            title, _, fragment = target.partition("#")
            # FIXME has to be dot-encoded
            fragment = fragment.replace(" ", "_")
            # explicit fragment overrides the redirect
            if match.group("fragment"):
                fragment = match.group("fragment")

            local = self.wiki.get_local_filename(title, relbase)
            # get_local_filename returns None for skipped pages
            if local is None:
                continue
            if fragment:
                local += "#" + fragment
            a.set("href", local)

        for img in root.cssselect("img"):
            src = img.get("src")
            if src and src.startswith("/images/"):
                img.set("src", os.path.join(relbase, "File:" + os.path.split(src)[1]))

    def fix_footer(self, root):
        """ move content from 'div.printfooter' into item in '#footer-info'
            (normally 'div.printfooter' is given 'display:none' and is separated by
            the categories list from the real footer)

            The page is left untouched when it has no '#footer-info' element.
        """

        footers = root.cssselect("#footer-info")
        if not footers:
            # no place to move the printfooter to, keep it as it is
            return
        f_list = footers[0]

        for printfooter in root.cssselect("div.printfooter"):
            # turn the <div class="printfooter"> into a plain <li> ...
            printfooter.attrib.pop("class", None)
            printfooter.tag = "li"
            # ... and make it the first item of the footer list
            f_list.insert(0, printfooter)
            f_list.insert(3, lxml.etree.Element("br"))