arch-wiki-docs/ArchWiki/converter.py

#! /usr/bin/env python3

import os
import subprocess

# for filter_pre
import lxml.etree
import lxml.html

# for filter_in
import json
import pandocfilters

class PandocError(Exception):
    """Raised when a pandoc subprocess exits with a non-zero return code.

    Attributes:
        retcode: exit status reported by the pandoc process.
        errs: whatever the caller captured from pandoc's stderr (may be None).
    """

    def __init__(self, retcode, errs):
        super().__init__("pandoc failed with return code %s\nstderr:\n%s" % (retcode, errs))
        # keep the raw values so callers can inspect them without parsing the message
        self.retcode = retcode
        self.errs = errs

class ManFilter:
    """Filter hooks that adapt wiki HTML pages for conversion to man pages.

    A converter applies the hooks in this order: ``filter_pre`` on the raw
    HTML, ``filter_in`` on pandoc's JSON AST and ``filter_post`` on the final
    output text.
    """

    # name of the output format; forwarded to pandocfilters.walk in filter_in
    format = "man"

    def filter_pre(self, instring):
        """Preprocess the HTML source before it is handed to pandoc.

        Renumbers the headers inside ``#bodyContent`` so that the first one
        becomes ``h1`` and prepends an ``h1`` title to the ``#catlinks`` and
        ``#footer`` elements.

        :param instring: HTML document as a string
        :returns: the modified HTML document serialized as a string
        """
        root = lxml.html.fromstring(instring)

        # force headers to start from level 1; pages without #bodyContent are
        # left as they are instead of aborting the conversion with IndexError
        containers = root.cssselect("#bodyContent")
        if containers:
            headers = containers[0].cssselect("h1, h2, h3, h4, h5, h6")
            if headers:
                top_level = int(headers[0].tag[-1])
                for h in headers:
                    level = int(h.tag[-1]) - top_level + 1
                    # headers shallower than the first one are clamped to h1
                    h.tag = "h%d" % max(level, 1)

        # add some headers to distinguish divs in output formats like man
        for selector, title in (("#catlinks", "Categories"), ("#footer", "Notes")):
            for element in root.cssselect(selector):
                h = lxml.etree.Element("h1")
                h.text = title
                element.insert(0, h)

        return lxml.etree.tostring(root, encoding="unicode", method="html", doctype="<!DOCTYPE html>")

    def filter_in(self, instring):
        """Rewrite pandoc's JSON AST: strip the ``.html`` suffix from link targets.

        :param instring: JSON document produced by ``pandoc -t json``
        :returns: the altered JSON document as a string
        """
        def _filter(key, value, format, meta):
            # remove HTML specific stuff; every other element is left untouched
            if key != "Link":
                return None

            # The target pair [href, title] is always the last item of a Link;
            # older pandoc puts only the inline content in front of it, newer
            # pandoc puts an attribute triple and the inline content there.
            *leading, target = value
            href, title = target
            if href.endswith(".html"):
                href = href[:-5]
            # FIXME: stripping the relative path prefix ("./", "../") is not
            #        implemented; a naive startswith() check will not work.
            #        Or just leave the full path?
            return pandocfilters.Link(*(leading + [[href, title]]))

        # TODO: header levels are normalized in filter_pre, but doing it on the
        #       AST ("Header" elements) would be more generic since html may not
        #       be the only input format.

        doc = json.loads(instring)
        # newer pandoc emits an object with a "meta" key, older pandoc emits a
        # two-element list whose first item holds the metadata under "unMeta"
        if isinstance(doc, dict):
            meta = doc.get("meta", {})
        else:
            meta = doc[0]["unMeta"]
        altered = pandocfilters.walk(doc, _filter, self.format, meta)
        return json.dumps(altered)

    def filter_post(self, instring):
        """Postprocess the final pandoc output; currently returns it unchanged."""
        return instring

class Converter:
    """Convert a tree of HTML files into another format by piping them through pandoc.

    Every ``.html`` file below ``input_dir`` is processed as: ``filter_pre`` ->
    pandoc (html to json) -> ``filter_in`` -> pandoc (json to output format) ->
    ``filter_post``, and written to the mirrored path below ``output_dir``.
    """

    def __init__(self, filter_inst, input_dir, output_dir, output_format):
        """Store the configuration and create the output directory.

        :param filter_inst: object providing ``filter_pre``, ``filter_in`` and
                            ``filter_post`` methods
        :param input_dir: root of the directory tree to convert
        :param output_dir: root of the directory tree to write to
        :param output_format: pandoc writer name, also used as the file extension
        """
        self.filter = filter_inst
        self.input_dir = os.path.abspath(input_dir)
        self.output_dir = os.path.abspath(output_dir)
        self.output_format = output_format

        # ensure output directory always exists (including missing parents)
        os.makedirs(self.output_dir, exist_ok=True)

    def convert(self):
        """Convert all ``.html`` files below ``input_dir``.

        Files with another extension are skipped. A ``PandocError`` for one
        file is reported and does not stop the run; a summary of the failed
        files is printed at the end.
        """
        failed = []

        for path, _dirs, files in os.walk(self.input_dir):
            # mirror the position of `path` relative to the input root
            outdir = os.path.normpath(
                os.path.join(self.output_dir, os.path.relpath(path, self.input_dir)))
            for f in files:
                infile = os.path.join(path, f)
                if not infile.endswith(".html"):
                    print("  [skip conv]   %s" % infile)
                    continue

                outfile = os.path.splitext(os.path.join(outdir, f))[0] + "." + self.output_format
                try:
                    self.convert_file(infile, outfile)
                except PandocError as e:
                    failed.append(infile)
                    print(e)
                    print("  [conv failed] %s" % infile)

        if failed:
            print("failed to convert %d pages:" % len(failed))
            for f in failed:
                print("  %s" % f)

    def convert_file(self, infile, outfile):
        """Convert the single file ``infile`` and write the result to ``outfile``.

        :raises PandocError: when one of the pandoc invocations fails
        """
        print("  [converting]  %s" % infile)

        # ensure that target directory exists (necessary for subpages)
        target_dir = os.path.dirname(outfile)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(infile, "r") as source:
            content = source.read()

        content = self.filter.filter_pre(content)
        content = self.pandoc_first(content)
        content = self.filter.filter_in(content)
        content = self.pandoc_last(content)
        content = self.filter.filter_post(content)

        with open(outfile, "w") as target:
            target.write(content)

    def run_pandoc(self, cmd, instring):
        """Run ``cmd``, feed it ``instring`` on stdin and return its stdout.

        :param cmd: argument list (executed directly) or a command string
                    (executed through the shell)
        :param instring: text written to the standard input of the process
        :returns: text read from the standard output of the process
        :raises PandocError: when the process exits with a non-zero code; the
                             captured stderr is included in the message
        """
        import sys  # local import: only needed to forward warnings below

        # a plain string keeps going through the shell, a list avoids the shell
        popen = subprocess.Popen(cmd, shell=isinstance(cmd, str), universal_newlines=True,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        outs, errs = popen.communicate(instring)

        if popen.returncode != 0:
            raise PandocError(popen.returncode, errs)

        # stderr is captured for the error message above, so pass warnings of
        # successful runs on to keep them visible
        if errs:
            sys.stderr.write(errs)

        return outs

    def pandoc_first(self, instring):
        """Parse HTML with pandoc and return its JSON representation."""
        return self.run_pandoc(["pandoc", "-s", "-f", "html", "-t", "json"], instring)

    def pandoc_last(self, instring):
        """Render pandoc's JSON representation in the configured output format."""
        return self.run_pandoc(["pandoc", "-s", "-f", "json", "-t", self.output_format], instring)

if __name__ == "__main__":
    # convert the pages below ./wiki/ into man pages below ./output/
    Converter(ManFilter(), "./wiki/", "./output/", "man").convert()