linkcheck: Improve robustness during anchor checks #12197

Draft: wants to merge 6 commits into base: master
14 changes: 14 additions & 0 deletions doc/usage/configuration.rst
@@ -2997,6 +2997,20 @@ Options for the linkcheck builder
 
    .. versionadded:: 7.3
 
+.. confval:: linkcheck_parse_leniently
+
+   Allow more lenient HTML parsing for specific URI patterns.
+
+   After the ``linkcheck`` builder fetches content from a URI, it may also
+   attempt to parse that content, to check whether it contains the anchor
+   target referenced by the source hyperlink.
+
+   By default, retrieved HTML is checked for parsing and validity errors. To
+   relax those constraints, set this option to a list of regular expressions
+   matching URIs for which lenient parsing should be used instead.
+
+   .. versionadded:: 7.3

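For illustration, a project could opt in to lenient parsing from ``conf.py`` as sketched below; the URL pattern is a placeholder, not one taken from this changeset:

    # conf.py -- hypothetical configuration
    linkcheck_anchors = True
    linkcheck_parse_leniently = [
        r'https://example\.org/api/.*',  # pages served as bare HTML fragments
    ]

Patterns are tested with ``re.match`` against the request URL (the URI with any ``#fragment`` removed), mirroring how ``linkcheck_anchors_ignore_for_url`` is applied.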

Options for the XML builder
---------------------------
30 changes: 26 additions & 4 deletions sphinx/builders/linkcheck.py
@@ -325,6 +325,8 @@ def __init__(self, config: Config,
             map(re.compile, config.linkcheck_anchors_ignore))
         self.anchors_ignore_for_url: list[re.Pattern[str]] = list(
             map(re.compile, config.linkcheck_anchors_ignore_for_url))
+        self.parse_leniently: list[re.Pattern[str]] = list(
+            map(re.compile, config.linkcheck_parse_leniently))
         self.documents_exclude: list[re.Pattern[str]] = list(
             map(re.compile, config.linkcheck_exclude_documents))
         self.auth = [(re.compile(pattern), auth_info) for pattern, auth_info
@@ -436,6 +438,8 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]:
                     anchor = ''
                     break
             anchor = unquote(anchor)
+        if delimiter and anchor:
+            lenient = any(rex.match(req_url) for rex in self.parse_leniently)

         # handle non-ASCII URIs
         try:
@@ -471,9 +475,13 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]:
                     _user_agent=self.user_agent,
                     _tls_info=(self.tls_verify, self.tls_cacerts),
                 ) as response:
-                    if (self.check_anchors and response.ok and anchor
-                            and not contains_anchor(response, anchor)):
-                        raise Exception(__(f'Anchor {quote(anchor)!r} not found'))
+                    if anchor and self.check_anchors and response.ok:
+                        try:
+                            found = contains_anchor(response, anchor, lenient=lenient)
+                        except UnicodeDecodeError:
+                            return 'ignored', 'unable to decode response content', 0
+                        if not found:
+                            return 'broken', f'Anchor {quote(anchor)!r} not found', 0
 
                     # Copy data we need from the (closed) response
                     status_code = response.status_code
@@ -617,7 +625,7 @@ def _get_request_headers(
     return {}


-def contains_anchor(response: Response, anchor: str) -> bool:
+def contains_anchor(response: Response, anchor: str, *, lenient: bool = False) -> bool:
     """Determine if an anchor is contained within an HTTP response."""
     parser = AnchorCheckParser(anchor)
     # Read file in chunks. If we find a matching anchor, we break
@@ -630,6 +638,10 @@ def contains_anchor(response: Response, anchor: str) -> bool:
         if parser.found:
             break
     parser.close()
+
+    if parser.errors and not lenient:
+        raise ValueError(parser.errors[0])
+
     return parser.found


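To see what the new ``lenient`` flag changes in practice, here is a minimal sketch (the server URL and anchor name are illustrative, matching the test handlers below): strict mode surfaces the parser's validity complaint, while lenient mode only reports whether the anchor was found.

    import requests

    def probe(lenient: bool) -> None:
        # A fresh response is needed on each call, because contains_anchor()
        # consumes the body via response.iter_content().
        with requests.get('http://localhost:7777/valid', stream=True) as response:
            try:
                print(contains_anchor(response, 'valid-anchor', lenient=lenient))
            except ValueError as err:
                print(f'strict parse error: {err}')

    probe(lenient=False)  # parse error: start tag seen before a doctype declaration
    probe(lenient=True)   # True -- the anchor is found despite the invalid document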
@@ -641,8 +653,17 @@ def __init__(self, search_anchor: str) -> None:
 
         self.search_anchor = search_anchor
         self.found = False
+        self.errors: list[str] = []
+        self.decl: str | None = None
+
+    def handle_decl(self, decl: str) -> None:
+        self.decl = decl
 
     def handle_starttag(self, tag: Any, attrs: Any) -> None:
+        if self.errors:
+            return
+        if self.decl is None:
+            self.errors.append(f'encountered start tag "{tag}" before a doctype declaration')
         for key, value in attrs:
             if key in ('id', 'name') and value == self.search_anchor:
                 self.found = True
@@ -708,6 +729,7 @@ def setup(app: Sphinx) -> ExtensionMetadata:
     # commonly used for dynamic pages
     app.add_config_value('linkcheck_anchors_ignore', ['^!'], '')
     app.add_config_value('linkcheck_anchors_ignore_for_url', (), '', (tuple, list))
+    app.add_config_value('linkcheck_parse_leniently', (), '', (tuple, list))
     app.add_config_value('linkcheck_rate_limit_timeout', 300.0, '')
     app.add_config_value('linkcheck_allow_unauthorized', True, '')
    app.add_config_value('linkcheck_report_timeouts_as_broken', True, '', bool)
111 changes: 107 additions & 4 deletions tests/test_builders/test_build_linkcheck.py
@@ -11,6 +11,7 @@
 from base64 import b64encode
 from http.server import BaseHTTPRequestHandler
 from queue import Queue
+from typing import TYPE_CHECKING
 from unittest import mock
 
 import docutils
@@ -33,6 +34,9 @@
 
 ts_re = re.compile(r".*\[(?P<ts>.*)\].*")
 
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
 
 class DefaultsHandler(BaseHTTPRequestHandler):
     protocol_version = "HTTP/1.1"
@@ -266,6 +270,43 @@
 
 
 class AnchorsIgnoreForUrlHandler(BaseHTTPRequestHandler):
     protocol_version = 'HTTP/1.1'
 
+    def _chunk_content(self, content: str, *, max_chunk_size: int) -> Iterable[bytes]:
+
+        def _encode_chunk(chunk: bytes) -> Iterable[bytes]:
+            """Encode a bytestring into a format suitable for HTTP chunked-transfer.
+
+            https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding
+            """
+            yield f'{len(chunk):X}'.encode('ascii')
+            yield b'\r\n'
+            yield chunk
+            yield b'\r\n'
+
+        buffer = b''
+        for char in content:
+            buffer += char.encode('utf-8')
+            if len(buffer) >= max_chunk_size:
+                chunk, buffer = buffer[:max_chunk_size], buffer[max_chunk_size:]
+                yield from _encode_chunk(chunk)
+
+        # Flush remaining bytes, if any
+        if buffer:
+            yield from _encode_chunk(buffer)
+
+        # Emit a final empty chunk to close the stream
+        yield from _encode_chunk(b'')
+
+    def _send_chunked(self, content: str) -> bool:
+        for chunk in self._chunk_content(content, max_chunk_size=20):
+            try:
+                self.wfile.write(chunk)
+            except (BrokenPipeError, ConnectionResetError) as e:
+                self.log_message(str(e))
+                return False
+        return True
+
     def do_HEAD(self):
         if self.path in {'/valid', '/ignored'}:
             self.send_response(200, "OK")
@@ -274,14 +315,31 @@
         self.end_headers()
 
     def do_GET(self):
-        self.do_HEAD()
         if self.path == '/valid':
-            self.wfile.write(b"<h1 id='valid-anchor'>valid anchor</h1>\n")
+            self.send_response(200, 'OK')
+            content = "<h1 id='valid-anchor'>valid anchor</h1>\n"
         elif self.path == '/ignored':
-            self.wfile.write(b"no anchor but page exists\n")
+            self.send_response(200, 'OK')
+            content = 'no anchor but page exists\n'
+        else:
+            self.send_response(404, 'Not Found')
+            content = 'not found\n'
+        self.send_header('Transfer-Encoding', 'chunked')
+        self.end_headers()
+        self._send_chunked(content)


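As a side note on the chunked handler above: for a body of ``'valid anchor'`` and ``max_chunk_size=8``, ``_chunk_content`` yields the byte sequence sketched below (chunk sizes are hexadecimal, per the chunked transfer-coding; the values are worked out here for illustration, not taken from the tests, which use ``max_chunk_size=20``):

    # Wire format produced for _chunk_content('valid anchor', max_chunk_size=8)
    frames = [
        b'8\r\n', b'valid an', b'\r\n',  # first full 8-byte chunk, size in hex
        b'4\r\n', b'chor', b'\r\n',      # 4-byte remainder, flushed after the loop
        b'0\r\n', b'\r\n',               # zero-length chunk terminates the stream
    ]
    assert b''.join(frames) == b'8\r\nvalid an\r\n4\r\nchor\r\n0\r\n\r\n'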
-@pytest.mark.sphinx('linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True)
+@pytest.mark.sphinx(
+    'linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True,
+    confoverrides={
+        'linkcheck_anchors_ignore_for_url': [
+            'http://localhost:7777/ignored',  # existing page
+            'http://localhost:7777/invalid',  # unknown page
+        ],
+        'linkcheck_parse_leniently': [
+            r'http://localhost:7777/valid',  # incomplete HTML doc
+        ],
+    })
 def test_anchors_ignored_for_url(app):
     with serve_application(app, AnchorsIgnoreForUrlHandler) as address:
-        app.config.linkcheck_anchors_ignore_for_url = [  # type: ignore[attr-defined]
@@ -341,6 +399,51 @@
)


+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_incomplete_html_anchor(app):
+    class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'this is <div id="anchor">not</div> a valid HTML document'
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, IncompleteHTMLDocumentHandler) as address:
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'broken'
+    assert row['info'] == 'encountered start tag "div" before a doctype declaration'


+@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
+def test_decoding_error_anchor_ignored(app):
+    class NonASCIIHandler(BaseHTTPRequestHandler):
+        protocol_version = 'HTTP/1.1'
+
+        def do_GET(self):
+            content = b'\x80\x00\x80\x00'  # byte-string that cannot be decoded as text
+            self.send_response(200, 'OK')
+            self.send_header('Content-Length', str(len(content)))
+            self.end_headers()
+            self.wfile.write(content)
+
+    with serve_application(app, NonASCIIHandler) as address:
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    assert len(content.splitlines()) == 1
+
+    row = json.loads(content)
+    assert row['status'] == 'ignored'


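For context, each line of the linkcheck builder's ``output.json`` is a single JSON object, which is why the tests above assert on one line and then ``json.loads`` it. A row for the decoding-error case would look roughly like the sketch below (the ``filename``, ``lineno``, and ``uri`` values are illustrative, not taken from the tests):

    # Roughly the shape of an output.json row for the decoding-error test
    row = {
        "filename": "index.rst",
        "lineno": 1,
        "status": "ignored",
        "code": 0,
        "uri": "http://localhost:7777/#anchor",
        "info": "unable to decode response content",
    }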
 def custom_handler(valid_credentials=(), success_criteria=lambda _: True):
     """
     Returns an HTTP request handler that authenticates the client and then determines