arch-wiki-docs/ArchWiki/downloader.py


#! /usr/bin/env python3

import os
import datetime

import requests
from requests.packages.urllib3.util.retry import Retry

class Downloader:
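    # MediaWiki API query: enumerate all non-redirect pages ("allpages"
    # generator) in a given namespace, including each page's canonical URL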
    query_allpages = {
        "action": "query",
        "generator": "allpages",
        "gaplimit": "max",
        "gapfilterredir": "nonredirects",
        "gapnamespace": "0",
        "prop": "info",
        "inprop": "url",
        "continue": "",
    }
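
    # MediaWiki API query: enumerate all images with their URL and
    # upload timestamp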
    query_allimages = {
        "action": "query",
        "list": "allimages",
        "ailimit": "max",
        "aiprop": "url|timestamp",
        "continue": "",
    }
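
    # stylesheets to download for offline browsing, mapped to local file names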
    css_links = {
        "https://wiki.archlinux.org/load.php?lang=en&modules=site.styles|skins.vector.icons,styles|zzz.ext.archLinux.styles&only=styles&skin=vector-2022": "ArchWikiOffline.css",
    }

    def __init__(self, wiki, output_directory, epoch, *, optimizer=None):
        """ Parameters:
            @wiki: ArchWiki instance to work with
            @output_directory: where to store the downloaded files
            @epoch: force an update of every file older than this date (must be
                    an instance of 'datetime.datetime')
            @optimizer: optional object whose 'optimize(fname, text)' method is
                    called for HTML post-processing
        """
        self.wiki = wiki
        self.output_directory = output_directory
        self.epoch = epoch
        self.optimizer = optimizer

        # ensure that the output directory always exists
        if not os.path.isdir(self.output_directory):
            os.mkdir(self.output_directory)

        # list of valid files (everything else is deleted by clean_output_directory)
        self.files = []

        self.session = requests.Session()
        # granular control over requests' retries: https://stackoverflow.com/a/35504626
        # (up to 3 retries with exponential backoff on 429 and 5xx responses)
        retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        adapter = requests.adapters.HTTPAdapter(max_retries=retries)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

    def needs_update(self, fname, timestamp):
        """ Determine if it is necessary to download a page.
        """
        if not os.path.exists(fname):
            return True
        # compare the local file's modification time (in UTC) against both the
        # wiki's last-touched timestamp and the forced-update epoch
        local = datetime.datetime.utcfromtimestamp(os.path.getmtime(fname))
        if local < timestamp or local < self.epoch:
            return True
        return False

    def process_namespace(self, namespace):
        """ Walk all pages in the given namespace and download them if necessary.
        """
        print(f"Processing namespace {namespace}...")
        query = self.query_allpages.copy()
        query["gapnamespace"] = namespace
        for pages_snippet in self.wiki.query_continue(query):
            for page in sorted(pages_snippet["pages"].values(), key=lambda d: d["title"]):
                title = page["title"]
                fname = self.wiki.get_local_filename(title, self.output_directory)
                if not fname:
                    print(f" [skipping] {title}")
                    continue
                self.files.append(fname)
                timestamp = self.wiki.parse_date(page["touched"])
                if self.needs_update(fname, timestamp):
                    print(f" [downloading] {title}")
                    fullurl = page["fullurl"]
                    r = self.session.get(fullurl)
                    if self.optimizer is not None:
                        text = self.optimizer.optimize(fname, r.text)
                    else:
                        text = r.text
                    # ensure that the target directory exists (necessary for subpages)
                    os.makedirs(os.path.dirname(fname), exist_ok=True)
                    with open(fname, "w") as fd:
                        fd.write(text)
                else:
                    print(f" [up-to-date] {title}")

    def download_css(self):
        print("Downloading CSS...")
        for link, dest in self.css_links.items():
            print(" ", dest)
            fname = os.path.join(self.output_directory, dest)
            if fname:
                self.files.append(fname)
            r = self.session.get(link)
            with open(fname, "w") as fd:
                fd.write(r.text)

    def download_images(self):
        print("Downloading images...")
        query = self.query_allimages.copy()
        for images_snippet in self.wiki.query_continue(query):
            for image in images_snippet["allimages"]:
                title = image["title"]
                fname = self.wiki.get_local_filename(title, self.output_directory)
                if not fname:
                    print(f" [skipping] {title}")
                    continue
                self.files.append(fname)
                timestamp = self.wiki.parse_date(image["timestamp"])
                if self.needs_update(fname, timestamp):
                    print(f" [downloading] {title}")
                    r = self.session.get(image["url"])
                    # images are binary, so write raw bytes
                    with open(fname, "wb") as fd:
                        fd.write(r.content)
                else:
                    print(f" [up-to-date] {title}")

    def clean_output_directory(self):
        """ Walk output_directory and delete all files not found on the wiki.
            Should be run _after_ downloading, otherwise all files will be deleted!
        """
        print("Deleting unwanted files (deleted/moved on the wiki)...")
        valid_files = self.files.copy()
        # walk bottom-up so that emptied subdirectories can be removed as we go
        for path, dirs, files in os.walk(self.output_directory, topdown=False):
            # handle files
            for f in files:
                fpath = os.path.join(path, f)
                if fpath not in valid_files:
                    print(f" [deleting] {fpath}")
                    os.unlink(fpath)
            # remove empty directories
            if len(os.listdir(path)) == 0:
                print(f" [deleting] {path}/")
                os.rmdir(path)
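

# A minimal usage sketch (illustrative, not part of the original module).
# It assumes an ArchWiki class providing the query_continue(),
# get_local_filename() and parse_date() methods used above; the import path,
# namespace choice and epoch value below are hypothetical.
if __name__ == "__main__":
    from ArchWiki import ArchWiki  # hypothetical import path

    wiki = ArchWiki()
    epoch = datetime.datetime(2024, 1, 1)  # force re-download of older files
    downloader = Downloader(wiki, "./wiki", epoch)
    downloader.process_namespace("0")  # Main namespace; gapnamespace is a string
    downloader.download_css()
    downloader.download_images()
    downloader.clean_output_directory()  # run last: deletes files not seen above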