linkcheck: Improve robustness during anchor checks #12197

Draft: wants to merge 6 commits into base: master
14 changes: 14 additions & 0 deletions doc/usage/configuration.rst
@@ -2997,6 +2997,20 @@ Options for the linkcheck builder
 
    .. versionadded:: 7.3
 
+.. confval:: linkcheck_parse_leniently
+
+   Allow more lenient HTML parsing for specific URI patterns.
+
+   After the ``linkcheck`` builder fetches content from a URI, it may also
+   attempt to parse that content, to check whether it contains the anchor
+   target referenced by the source hyperlink.
+
+   By default, retrieved HTML is checked for parsing and validity errors. To
+   relax those constraints, set this option to a list of regular expressions
+   matching URIs for which lenient parsing should be used instead.
+
+   .. versionadded:: 7.3

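For illustration, a project could opt in to lenient parsing from ``conf.py`` as sketched below; the URL pattern is a placeholder, not one taken from this changeset:

    # conf.py -- hypothetical configuration
    linkcheck_anchors = True
    linkcheck_parse_leniently = [
        r'https://example\.org/api/.*',  # pages served as bare HTML fragments
    ]

Patterns are tested with ``re.match`` against the request URL (the URI with any ``#fragment`` removed), mirroring how ``linkcheck_anchors_ignore_for_url`` is applied.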

Options for the XML builder
---------------------------
30 changes: 26 additions & 4 deletions sphinx/builders/linkcheck.py
@@ -325,6 +325,8 @@ def __init__(self, config: Config,
             map(re.compile, config.linkcheck_anchors_ignore))
         self.anchors_ignore_for_url: list[re.Pattern[str]] = list(
             map(re.compile, config.linkcheck_anchors_ignore_for_url))
+        self.parse_leniently: list[re.Pattern[str]] = list(
+            map(re.compile, config.linkcheck_parse_leniently))
         self.documents_exclude: list[re.Pattern[str]] = list(
             map(re.compile, config.linkcheck_exclude_documents))
         self.auth = [(re.compile(pattern), auth_info) for pattern, auth_info
@@ -436,6 +438,8 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]:
                     anchor = ''
                     break
             anchor = unquote(anchor)
+        if delimiter and anchor:
+            lenient = any(rex.match(req_url) for rex in self.parse_leniently)

         # handle non-ASCII URIs
         try:
@@ -471,9 +475,13 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]:
                     _user_agent=self.user_agent,
                     _tls_info=(self.tls_verify, self.tls_cacerts),
                 ) as response:
-                    if (self.check_anchors and response.ok and anchor
-                            and not contains_anchor(response, anchor)):
-                        raise Exception(__(f'Anchor {quote(anchor)!r} not found'))
+                    if anchor and self.check_anchors and response.ok:
+                        try:
+                            found = contains_anchor(response, anchor, lenient=lenient)
+                        except UnicodeDecodeError:
+                            return 'ignored', 'unable to decode response content', 0
+                        if not found:
+                            return 'broken', f'Anchor {quote(anchor)!r} not found', 0
 
                     # Copy data we need from the (closed) response
                     status_code = response.status_code
@@ -617,7 +625,7 @@ def _get_request_headers(
     return {}


-def contains_anchor(response: Response, anchor: str) -> bool:
+def contains_anchor(response: Response, anchor: str, *, lenient: bool = False) -> bool:
     """Determine if an anchor is contained within an HTTP response."""
     parser = AnchorCheckParser(anchor)
     # Read file in chunks. If we find a matching anchor, we break
@@ -630,6 +638,10 @@ def contains_anchor(response: Response, anchor: str) -> bool:
         if parser.found:
             break
     parser.close()
+
+    if parser.errors and not lenient:
+        raise ValueError(parser.errors[0])
+
     return parser.found


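To see what the new ``lenient`` flag changes in practice, here is a minimal sketch (the server URL and anchor name are illustrative, matching the test handlers below): strict mode surfaces the parser's validity complaint, while lenient mode only reports whether the anchor was found.

    import requests

    def probe(lenient: bool) -> None:
        # A fresh response is needed on each call, because contains_anchor()
        # consumes the body via response.iter_content().
        with requests.get('http://localhost:7777/valid', stream=True) as response:
            try:
                print(contains_anchor(response, 'valid-anchor', lenient=lenient))
            except ValueError as err:
                print(f'strict parse error: {err}')

    probe(lenient=False)  # parse error: start tag seen before a doctype declaration
    probe(lenient=True)   # True -- the anchor is found despite the invalid document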
@@ -641,8 +653,17 @@ def __init__(self, search_anchor: str) -> None:
 
         self.search_anchor = search_anchor
         self.found = False
+        self.errors: list[str] = []
+        self.decl: str | None = None
+
+    def handle_decl(self, decl: str) -> None:
+        self.decl = decl
 
     def handle_starttag(self, tag: Any, attrs: Any) -> None:
+        if self.errors:
+            return
+        if self.decl is None:
+            self.errors.append(f'encountered start tag "{tag}" before a doctype declaration')
         for key, value in attrs:
             if key in ('id', 'name') and value == self.search_anchor:
                 self.found = True
@@ -708,6 +729,7 @@ def setup(app: Sphinx) -> ExtensionMetadata:
     # commonly used for dynamic pages
     app.add_config_value('linkcheck_anchors_ignore', ['^!'], '')
     app.add_config_value('linkcheck_anchors_ignore_for_url', (), '', (tuple, list))
+    app.add_config_value('linkcheck_parse_leniently', (), '', (tuple, list))
     app.add_config_value('linkcheck_rate_limit_timeout', 300.0, '')
     app.add_config_value('linkcheck_allow_unauthorized', True, '')
    app.add_config_value('linkcheck_report_timeouts_as_broken', True, '', bool)
111 changes: 107 additions & 4 deletions tests/test_builders/test_build_linkcheck.py
@@ -11,6 +11,7 @@
 from base64 import b64encode
 from http.server import BaseHTTPRequestHandler
 from queue import Queue
+from typing import TYPE_CHECKING
 from unittest import mock
 
 import docutils
@@ -33,6 +34,9 @@
 
 ts_re = re.compile(r".*\[(?P<ts>.*)\].*")
 
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
 
 class DefaultsHandler(BaseHTTPRequestHandler):
     protocol_version = "HTTP/1.1"
@@ -266,6 +270,43 @@
 
 
 class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
     protocol_version = 'HTTP/1.1'
 
+    def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
+
+        def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
+            """Encode a bytestring into a format suitable for HTTP chunked-transfer.
+
+            https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding
+            """
+            yield f'{len(chunk):X}'.encode('ascii')
+            yield b'\r\n'
+            yield chunk
+            yield b'\r\n'
+
+        buffer = b''
+        for char in content:
+            buffer += char.encode('utf-8')
+            if len(buffer) >= max_chunk_size:
+                chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
+                yield from _encode_chunk(chunk)
+
+        # Flush remaining bytes, if any
+        if buffer:
+            yield from _encode_chunk(buffer)
+
+        # Emit a final empty chunk to close the stream
+        yield from _encode_chunk(b'')
+
+    def _send_chunked(self, content: str) -> bool:
+        for chunk in self._chunk_content(content, max_chunk_size=20):
+            try:
+                self.wfile.write(chunk)
+            except (BrokenPipeError, ConnectionResetError) as e:
+                self.log_message(str(e))
+                return False
+        return True
+
     def do_HEAD(self):
         if self.path in {'/valid', '/ignored'}:
             self.send_response(200, "OK")
@@ -274,14 +315,31 @@
         self.end_headers()
 
     def do_GET(self):
-        self.do_HEAD()
         if self.path == '/valid':
-            self.wfile.write(b"<h1 id='valid-anchor'>valid anchor</h1>\n")
+            self.send_response(200, 'OK')
+            content = "<h1 id='valid-anchor'>valid anchor</h1>\n"
         elif self.path == '/ignored':
-            self.wfile.write(b"no anchor but page exists\n")
+            self.send_response(200, 'OK')
+            content = 'no anchor but page exists\n'
+        else:
+            self.send_response(404, 'Not Found')
+            content = 'not found\n'
+        self.send_header('Transfer-Encoding', 'chunked')
+        self.end_headers()
+        self._send_chunked(content)


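As a side note on the chunked handler above: for a body of ``'valid anchor'`` and ``max_chunk_size=8``, ``_chunk_content`` yields the byte sequence sketched below (chunk sizes are hexadecimal, per the chunked transfer-coding; the values are worked out here for illustration, not taken from the tests, which use ``max_chunk_size=20``):

    # Wire format produced for _chunk_content('valid anchor', max_chunk_size=8)
    frames = [
        b'8\r\n', b'valid an', b'\r\n',  # first full 8-byte chunk, size in hex
        b'4\r\n', b'chor', b'\r\n',      # 4-byte remainder, flushed after the loop
        b'0\r\n', b'\r\n',               # zero-length chunk terminates the stream
    ]
    assert b''.join(frames) == b'8\r\nvalid an\r\n4\r\nchor\r\n0\r\n\r\n'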
-@pytest.mark.sphinx('linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True)
+@pytest.mark.sphinx(
+    'linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True,
+    confoverrides={
+        'linkcheck_anchors_ignore_for_url': [
+            'http://localhost:7777/ignored',  # existing page
+            'http://localhost:7777/invalid',  # unknown page
+        ],
+        'linkcheck_parse_leniently': [
+            r'http://localhost:7777/valid',  # incomplete HTML doc
+        ],
+    })
 def test_anchors_ignored_for_url(app):
     with serve_application(app, AnchorsIgnoreForUrlHandler) as address:
-        app.config.linkcheck_anchors_ignore_for_url = [  # type: ignore[attr-defined]
@@ -341,6 +399,51 @@
)


+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_incomplete_html_anchor(app):
+    class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'this is <div id="anchor">not</div> a valid HTML document'
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, IncompleteHTMLDocumentHandler) as address:
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'broken'
+    assert row['info'] == 'encountered start tag "div" before a doctype declaration'


+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_decoding_error_anchor_ignored(app):
+    class NonASCIIHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'\x80\x00\x80\x00'  # byte-string that cannot be decoded as text
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, NonASCIIHandler) as address:
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'ignored'


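For context, each line of the linkcheck builder's ``output.json`` is a single JSON object, which is why the tests above assert on one line and then ``json.loads`` it. A row for the decoding-error case would look roughly like the sketch below (the ``filename``, ``lineno``, and ``uri`` values are illustrative, not taken from the tests):

    # Roughly the shape of an output.json row for the decoding-error test
    row = {
        "filename": "index.rst",
        "lineno": 1,
        "status": "ignored",
        "code": 0,
        "uri": "http://localhost:7777/#anchor",
        "info": "unable to decode response content",
    }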
 def custom_handler(valid_credentials=(), success_criteria=lambda _: True):
     """
     Returns an HTTP request handler that authenticates the client and then determines