feat: Cache downloaded inventories as local files
oprypin committed Nov 12, 2023
1 parent b3edf89 commit 78e7e26
Showing 3 changed files with 84 additions and 9 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -29,11 +29,13 @@ classifiers = [
     "Typing :: Typed",
 ]
 dependencies = [
+    "click>=7.0",
     "Jinja2>=2.11.1",
     "Markdown>=3.3",
     "MarkupSafe>=1.1",
     "mkdocs>=1.4",
     "mkdocs-autorefs>=0.3.1",
+    "platformdirs>=2.2.0",
     "pymdown-extensions>=6.3",
     "importlib-metadata>=4.6; python_version < '3.10'",
     "typing-extensions>=4.1; python_version < '3.10'",
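The two new runtime dependencies back the cache module added below: platformdirs picks the per-user cache directory, and click supplies click.open_file(..., atomic=True) for crash-safe writes. For orientation, this is roughly where the cache will live (typical platformdirs defaults, an assumption; exact paths vary per OS and environment):

import platformdirs

# Typically ~/.cache/mkdocs on Linux and ~/Library/Caches/mkdocs on macOS
# (based on platformdirs' documented defaults).
print(platformdirs.user_cache_dir("mkdocs"))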
76 changes: 76 additions & 0 deletions src/mkdocstrings/_cache.py
@@ -0,0 +1,76 @@
import datetime
import gzip
import hashlib
import os
import urllib.parse
import urllib.request
from typing import BinaryIO, Callable

import click
import platformdirs

from mkdocstrings.loggers import get_logger

log = get_logger(__name__)


def download_url_with_gz(url: str) -> bytes:
    req = urllib.request.Request(  # noqa: S310
        url,
        headers={"Accept-Encoding": "gzip", "User-Agent": "mkdocstrings/0.15.0"},
    )
    with urllib.request.urlopen(req) as resp:  # noqa: S310
        content: BinaryIO = resp
        if "gzip" in resp.headers.get("content-encoding", ""):
            content = gzip.GzipFile(fileobj=resp)  # type: ignore[assignment]
        return content.read()


# This is mostly a copy of https://github.com/mkdocs/mkdocs/blob/master/mkdocs/utils/cache.py
# In the future maybe they can be deduplicated.


def download_and_cache_url(
    url: str,
    download: Callable[[str], bytes],
    cache_duration: datetime.timedelta,
    comment: bytes = b"# ",
) -> bytes:
    """Downloads a file from the URL, stores it under ~/.cache/, and returns its content.

    For tracking the age of the content, a prefix is inserted into the stored file, rather than relying on mtime.

    Args:
        url: URL to use.
        download: Callback that will accept the URL and actually perform the download.
        cache_duration: How long to consider the URL content cached.
        comment: The appropriate comment prefix for this file format.
    """
    directory = os.path.join(platformdirs.user_cache_dir("mkdocs"), "mkdocstrings_url_cache")
    name_hash = hashlib.sha256(url.encode()).hexdigest()[:32]
    path = os.path.join(directory, name_hash + os.path.splitext(url)[1])

    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    prefix = b"%s%s downloaded at timestamp " % (comment, url.encode())
    # Check for cached file and try to return it
    if os.path.isfile(path):
        try:
            with open(path, "rb") as f:
                line = f.readline()
                if line.startswith(prefix):
                    line = line[len(prefix) :]
                    timestamp = int(line)
                    if datetime.timedelta(seconds=(now - timestamp)) <= cache_duration:
                        log.debug(f"Using cached '{path}' for '{url}'")
                        return f.read()
        except (OSError, ValueError) as e:
            log.debug(f"{type(e).__name__}: {e}")

    # Download and cache the file
    log.debug(f"Downloading '{url}' to '{path}'")
    content = download(url)
    os.makedirs(directory, exist_ok=True)
    with click.open_file(path, "wb", atomic=True) as f:
        f.write(b"%s%d\n" % (prefix, now))
        f.write(content)
    return content
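Taken together: download_url_with_gz fetches a URL (transparently unwrapping gzip content-encoding), and download_and_cache_url wraps any such downloader with an on-disk cache keyed by the SHA-256 of the URL. A minimal sketch of the round trip, using a stub downloader and a made-up URL (both hypothetical) so it runs offline:

import datetime
from mkdocstrings._cache import download_and_cache_url

def fake_download(url: str) -> bytes:
    # Stand-in for download_url_with_gz so the sketch needs no network.
    print(f"actually downloading {url}")
    return b"fake inventory payload"

url = "https://example.com/objects.inv"  # illustrative URL
one_day = datetime.timedelta(days=1)
first = download_and_cache_url(url, fake_download, one_day)   # downloads, writes cache file
second = download_and_cache_url(url, fake_download, one_day)  # fresh cache: read from disk, no print
assert first == second == b"fake inventory payload"

On the first call (assuming a cold cache) the stub runs and the bytes are written behind a one-line timestamp header; the second call finds the file, sees the timestamp is within one day, and returns the remainder of the file without invoking the downloader at all.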
15 changes: 6 additions & 9 deletions src/mkdocstrings/plugin.py
@@ -14,20 +14,21 @@
 
 from __future__ import annotations
 
+import datetime
 import functools
-import gzip
 import os
 import sys
 from concurrent import futures
-from typing import TYPE_CHECKING, Any, BinaryIO, Callable, Iterable, List, Mapping, Tuple, TypeVar
-from urllib import request
+from io import BytesIO
+from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Mapping, Tuple, TypeVar
 
 from mkdocs.config import Config
 from mkdocs.config import config_options as opt
 from mkdocs.plugins import BasePlugin
 from mkdocs.utils import write_file
 from mkdocs_autorefs.plugin import AutorefsPlugin
 
+from mkdocstrings._cache import download_and_cache_url, download_url_with_gz
 from mkdocstrings.extension import MkdocstringsExtension
 from mkdocstrings.handlers.base import BaseHandler, Handlers
 from mkdocstrings.loggers import get_logger
@@ -317,11 +318,7 @@ def _load_inventory(cls, loader: InventoryLoaderType, url: str, **kwargs: Any) -
             A mapping from identifier to absolute URL.
         """
         log.debug(f"Downloading inventory from {url!r}")
-        req = request.Request(url, headers={"Accept-Encoding": "gzip", "User-Agent": "mkdocstrings/0.15.0"})
-        with request.urlopen(req) as resp:  # noqa: S310 (URL audit OK: comes from a checked-in config)
-            content: BinaryIO = resp
-            if "gzip" in resp.headers.get("content-encoding", ""):
-                content = gzip.GzipFile(fileobj=resp)  # type: ignore[assignment]
-            result = dict(loader(content, url=url, **kwargs))
+        content = download_and_cache_url(url, download_url_with_gz, datetime.timedelta(days=1))
+        result = dict(loader(BytesIO(content), url=url, **kwargs))
         log.debug(f"Loaded inventory from {url!r}: {len(result)} items")
         return result

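Net effect: each inventory URL is downloaded at most once per day per user, and loaders receive the bytes through a BytesIO exactly as they previously received the HTTP response object. If a stale inventory needs refreshing before the day is up, deleting the cache directory forces a re-download on the next build; a sketch (the path is derived from platformdirs, so it varies per OS):

import os
import shutil

import platformdirs

# Remove all cached inventories; the next build re-downloads them.
cache_dir = os.path.join(platformdirs.user_cache_dir("mkdocs"), "mkdocstrings_url_cache")
shutil.rmtree(cache_dir, ignore_errors=True)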