#! /usr/bin/env python3
"""Convert a tree of wiki HTML pages to another format (man by default) via pandoc.

Each ``.html`` file goes through a three-stage pipeline: an HTML-level
pre-filter (lxml), a pandoc JSON AST filter (pandocfilters) and a
post-filter on the final text.
"""

import os
import subprocess
import sys

# for filter_pre
import lxml.etree
import lxml.html

# for filter_in
import json
import pandocfilters


class PandocError(Exception):
    """Raised when a pandoc subprocess exits with a non-zero return code.

    Attributes:
        retcode: exit status of the pandoc process.
        errs: text captured from the process' stderr (may be None).
    """

    def __init__(self, retcode, errs):
        super().__init__("pandoc failed with return code %s\nstderr:\n%s" % (retcode, errs))
        self.retcode = retcode
        self.errs = errs


class ManFilter:
    """Filter set used when producing man pages."""

    # passed to pandocfilters.walk as the target format
    format = "man"

    def filter_pre(self, instring):
        """Normalize the raw HTML before it is handed to pandoc.

        Headings inside ``#bodyContent`` are shifted so that the first one
        becomes ``h1``, and an ``h1`` title is inserted at the start of every
        ``#catlinks`` and ``#footer`` element.

        Args:
            instring: HTML document as a string.

        Returns:
            The modified document serialized as an HTML string.
        """
        root = lxml.html.fromstring(instring)

        # force headers to start from level 1; fall back to the whole document
        # when the page has no #bodyContent element instead of raising IndexError
        body = root.cssselect("#bodyContent")
        content = body[0] if body else root
        headers = content.cssselect("h1, h2, h3, h4, h5, h6")
        if headers:
            # tags are "h1".."h6", so the last character is the level
            top_level = int(headers[0].tag[-1])
            for header in headers:
                level = int(header.tag[-1]) - top_level + 1
                # headings shallower than the first one are clamped to h1
                header.tag = "h%d" % max(level, 1)

        # add some headers to distinguish divs in output formats like man
        for selector, title in (("#catlinks", "Categories"), ("#footer", "Notes")):
            for element in root.cssselect(selector):
                heading = lxml.etree.Element("h1")
                heading.text = title
                element.insert(0, heading)

        return lxml.etree.tostring(root, encoding="unicode", method="html", doctype="")

    def filter_in(self, instring):
        """Filter the pandoc JSON AST between the two pandoc invocations.

        Currently strips the ``.html`` suffix from link targets.

        Args:
            instring: pandoc document serialized as JSON.

        Returns:
            The altered document serialized as JSON.
        """

        def _filter(key, value, fmt, meta):
            # remove HTML specific stuff
            if key == "Link":
                # The target ([url, title]) is always the last item of the
                # value; whatever precedes it (inlines, and attributes in
                # newer pandoc versions) is passed through untouched.
                leading = list(value[:-1])
                href, title = value[-1]
                # remove .html suffix
                if href.endswith(".html"):
                    href = href[:-5]
                # FIXME: this stupid detection will not work
                #        or just leave the full path?
                # if href.startswith("./"):
                #     href = href[2:]
                # elif href.startswith("../"):
                #     href = href[3:]
                return pandocfilters.Link(*(leading + [[href, title]]))

            # TODO: it's implemented in filter_pre, but could be useful anyway since html may not be
            #       the only input format; the most generic way should be implemented
            # if key == "Header":
            #     level, classes, internal = value
            #
            #     # record top level
            #     if self.heading_top_level == 0:
            #         self.heading_top_level = level
            #
            #     # ensure we start from h1 in output
            #     if level > self.heading_top_level:
            #         level -= self.heading_top_level
            #
            #     return pandocfilters.Header(level, classes, internal)

            # returning None leaves the element unchanged
            return None

        doc = json.loads(instring)
        # Newer pandoc emits a dict with a "meta" key, older versions a
        # two-element list whose first item holds "unMeta".
        if isinstance(doc, dict):
            meta = doc.get("meta", {})
        else:
            meta = doc[0]["unMeta"]
        altered = pandocfilters.walk(doc, _filter, self.format, meta)
        return json.dumps(altered)

    def filter_post(self, instring):
        """Post-process the final pandoc output; currently a no-op."""
        return instring


class Converter:
    """Walk an input directory and convert every ``.html`` file with pandoc."""

    def __init__(self, filter_inst, input_dir, output_dir, output_format):
        """Store the configuration and make sure the output directory exists.

        Args:
            filter_inst: object providing filter_pre, filter_in and filter_post.
            input_dir: directory containing the HTML pages.
            output_dir: directory where converted files are written.
            output_format: pandoc output format, also used as file extension.
        """
        self.filter = filter_inst
        self.input_dir = os.path.abspath(input_dir)
        self.output_dir = os.path.abspath(output_dir)
        self.output_format = output_format

        # ensure output directory always exists (including missing parents)
        os.makedirs(self.output_dir, exist_ok=True)

    def convert(self):
        """Convert all ``.html`` files below the input directory.

        Files with other extensions are skipped. Pandoc failures are
        reported and collected instead of aborting the run.

        Returns:
            List of input paths whose conversion failed.
        """
        failed = []

        for path, dirs, files in os.walk(self.input_dir):
            # mirror the input directory layout below the output directory
            outdir = os.path.normpath(
                os.path.join(self.output_dir, os.path.relpath(path, self.input_dir)))
            for name in files:
                infile = os.path.join(path, name)
                if not infile.endswith(".html"):
                    print("  [skip conv]   %s" % infile)
                    continue

                outfile = os.path.splitext(os.path.join(outdir, name))[0] + "." + self.output_format
                try:
                    self.convert_file(infile, outfile)
                except PandocError as e:
                    failed.append(infile)
                    print(e)
                    print("  [conv failed] %s" % infile)

        if failed:
            print("failed to convert %d pages:" % len(failed))
            for infile in failed:
                print("  %s" % infile)

        return failed

    def convert_file(self, infile, outfile):
        """Run one file through the filter/pandoc pipeline.

        Args:
            infile: path of the HTML file to read.
            outfile: path of the file to write; its directory is created.

        Raises:
            PandocError: if either pandoc invocation fails.
        """
        print("  [converting]  %s" % infile)

        # ensure that target directory exists (necessary for subpages)
        target_dir = os.path.dirname(outfile)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(infile, "r") as source:
            content = source.read()

        content = self.filter.filter_pre(content)
        content = self.pandoc_first(content)
        content = self.filter.filter_in(content)
        content = self.pandoc_last(content)
        content = self.filter.filter_post(content)

        with open(outfile, "w") as target:
            target.write(content)

    def run_pandoc(self, cmd, instring):
        """Run a command, feed it ``instring`` on stdin and return its stdout.

        Args:
            cmd: argument list (run without a shell), or a command string
                (run through the shell, kept for backward compatibility).
            instring: text written to the process' stdin.

        Returns:
            Text read from the process' stdout.

        Raises:
            PandocError: if the process exits with a non-zero return code;
                the exception carries the captured stderr.
        """
        popen = subprocess.Popen(cmd, shell=isinstance(cmd, str), universal_newlines=True,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
        outs, errs = popen.communicate(instring)

        if popen.returncode != 0:
            raise PandocError(popen.returncode, errs)

        # stderr is captured for error reporting; forward it on success so
        # that pandoc warnings remain visible on the console
        if errs:
            sys.stderr.write(errs)

        return outs

    def pandoc_first(self, instring):
        """Convert HTML to pandoc's JSON AST."""
        return self.run_pandoc(["pandoc", "-s", "-f", "html", "-t", "json"], instring)

    def pandoc_last(self, instring):
        """Convert pandoc's JSON AST to the configured output format."""
        return self.run_pandoc(["pandoc", "-s", "-f", "json", "-t", str(self.output_format)],
                               instring)


def main():
    """Convert ./wiki/ to man pages in ./output/."""
    converter = Converter(ManFilter(), "./wiki/", "./output/", "man")
    converter.convert()


if __name__ == "__main__":
    main()