Refactor: linkcheck builder: reduce the lifetime of the 'response' variable #11432

Merged
178 changes: 96 additions & 82 deletions sphinx/builders/linkcheck.py
@@ -13,8 +13,8 @@
from os import path
from queue import PriorityQueue, Queue
from threading import Thread
from typing import Any, Generator, NamedTuple, Tuple, Union, cast
from urllib.parse import unquote, urlparse, urlunparse
from typing import Any, Callable, Generator, Iterator, NamedTuple, Tuple, Union, cast
from urllib.parse import unquote, urlparse, urlsplit, urlunparse

from docutils import nodes
from requests import Response
@@ -72,7 +72,7 @@ class RateLimit(NamedTuple):


class AnchorCheckParser(HTMLParser):
"""Specialized HTML parser that looks for a specific anchor."""
"""Specialised HTML parser that looks for a specific anchor."""

def __init__(self, search_anchor: str) -> None:
super().__init__()
@@ -87,11 +87,10 @@ def handle_starttag(self, tag: Any, attrs: Any) -> None:
break


def check_anchor(response: requests.requests.Response, anchor: str) -> bool:
"""Reads HTML data from a response object `response` searching for `anchor`.
Returns True if anchor was found, False otherwise.
"""
parser = AnchorCheckParser(anchor)
def contains_anchor(response: Response, anchor: str) -> bool:
"""Determine if an anchor is contained within an HTTP response."""

parser = AnchorCheckParser(unquote(anchor))
# Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
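
As a usage sketch of the streaming anchor check above (plain `requests` rather than Sphinx's wrapper, a placeholder URL and anchor, and assuming `contains_anchor` is importable from `sphinx.builders.linkcheck` once this change lands):

```python
import requests

from sphinx.builders.linkcheck import contains_anchor  # helper added in this diff

# Stream the document and stop downloading as soon as the anchor is found.
with requests.get("https://example.org/doc.html", stream=True, timeout=30) as response:
    response.raise_for_status()
    if not contains_anchor(response, "installation"):
        print("anchor 'installation' not found")
```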
@@ -271,7 +270,7 @@ def run(self) -> None:
kwargs['timeout'] = self.config.linkcheck_timeout

def get_request_headers() -> dict[str, str]:
url = urlparse(uri)
url = urlsplit(uri)
candidates = [f"{url.scheme}://{url.netloc}",
f"{url.scheme}://{url.netloc}/",
uri,
@@ -286,16 +285,11 @@ def get_request_headers() -> dict[str, str]:
return {}

def check_uri() -> tuple[str, str, int]:
# split off anchor
if '#' in uri:
req_url, anchor = uri.split('#', 1)
for rex in self.anchors_ignore:
if rex.match(anchor):
anchor = None
break
else:
req_url = uri
anchor = None
req_url, delimiter, anchor = uri.partition('#')
for rex in self.anchors_ignore if delimiter and anchor else []:
if rex.match(anchor):
anchor = ''
break

# handle non-ASCII URIs
try:
@@ -313,71 +307,83 @@ def check_uri() -> tuple[str, str, int]:
# update request headers for the URL
kwargs['headers'] = get_request_headers()

try:
if anchor and self.config.linkcheck_anchors:
# Read the whole document and see if #anchor exists
with requests.get(req_url, stream=True, config=self.config, auth=auth_info,
**kwargs) as response:
response.raise_for_status()
found = check_anchor(response, unquote(anchor))

if not found:
raise Exception(__("Anchor '%s' not found") % anchor)
else:
try:
# try a HEAD request first, which should be easier on
# the server and the network
with requests.head(req_url, allow_redirects=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
# Linkcheck HTTP request logic:
#
# - Attempt HTTP HEAD before HTTP GET unless page content is required.
# - Follow server-issued HTTP redirects.
# - Respect server-issued HTTP 429 back-offs.
error_message = None
status_code = -1
response_url = retry_after = ''
for retrieval_method, retrieval_kwargs in _retrieval_methods(
self.config.linkcheck_anchors, anchor,
):
try:
with retrieval_method(url=req_url, auth=auth_info, config=self.config,
**retrieval_kwargs, **kwargs) as response:
if response.ok and anchor and not contains_anchor(response, anchor):
raise Exception(__(f'Anchor {anchor!r} not found'))
Comment on lines +322 to +325 (Contributor Author):

It looks like I introduced a regression with the rewrite of these lines - and the regression is reported here: #11532 (comment)

Basically: when anchor checking is not enabled -- that is, self.linkcheck_anchors is False -- the first retrieval method returned is an HTTP HEAD request. It can have response.ok, and the anchor value may be non-empty -- because we continue to extract (partition) the anchor from the uri regardless of whether anchor linkchecking will be performed. The HTTP HEAD response doesn't contain a content body, so contains_anchor will return false, and the exception is raised.

I'll prepare a fixup/patch for this and some test coverage to cover this situation within the next few days.

cc @adamtheturtle
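
For what it's worth, a minimal sketch of the kind of guard that would avoid this: only treat a missing anchor as an error when anchor checking is actually enabled, so a body-less HTTP HEAD response can never trip the check. Names here are illustrative (`check_anchors` is a plain parameter standing in for the config flag); `contains_anchor` is the helper added in this diff.

```python
from requests import Response

from sphinx.builders.linkcheck import contains_anchor  # helper added in this diff


def anchor_missing(response: Response, anchor: str, check_anchors: bool) -> bool:
    """Sketch: report a missing anchor only when anchor checking is enabled.

    When check_anchors is False the response may come from an HTTP HEAD
    request, which carries no body, so searching it would always "fail".
    """
    if not (check_anchors and anchor and response.ok):
        return False
    return not contains_anchor(response, anchor)
```

Wired into `check_uri()`, the condition on the highlighted lines would then read roughly `if anchor_missing(response, anchor, self.config.linkcheck_anchors): raise ...`, though the actual fixup may differ.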


# Copy data we need from the (closed) response
status_code = response.status_code
redirect_status_code = response.history[-1].status_code if response.history else None # NoQA: E501
retry_after = response.headers.get('Retry-After')
response_url = f'{response.url}'
response.raise_for_status()
del response
break

except (ConnectionError, TooManyRedirects) as err:
# Servers drop the connection on HEAD requests, causing
# ConnectionError.
except (ConnectionError, HTTPError, TooManyRedirects) as err:
if isinstance(err, HTTPError) and err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
with requests.get(req_url, stream=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
return 'working', ' - unauthorized', 0
elif err.response.status_code == 429:
next_check = self.limit_rate(err.response)
if next_check is not None:
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', str(err), 0
elif err.response.status_code == 503:
# We'll take "Service Unavailable" as ignored.
return 'ignored', str(err), 0
else:
error_message = str(err)
continue

except HTTPError as err:
error_message = str(err)

# Unauthorised: the reference probably exists
if status_code == 401:
return 'working', 'unauthorized', 0

# Rate limiting; back-off if allowed, or report failure otherwise
if status_code == 429:
if next_check := self.limit_rate(response_url, retry_after):
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', error_message, 0

# Don't claim success/failure during server-side outages
if status_code == 503:
return 'ignored', 'service unavailable', 0

# For most HTTP failures, continue attempting alternate retrieval methods
continue

except Exception as err:
# Unhandled exception (intermittent or permanent); report that
# the link is broken.
return 'broken', str(err), 0
except Exception as err:
return 'broken', str(err), 0

else:
netloc = urlparse(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass
if response.url.rstrip('/') == req_url.rstrip('/'):
# All available retrieval methods have been exhausted; report
# that the link is broken.
return 'broken', error_message, 0

# Success; clear rate limits for the origin
netloc = urlsplit(req_url).netloc
try:
del self.rate_limits[netloc]
except KeyError:
pass

if ((response_url.rstrip('/') == req_url.rstrip('/'))
or allowed_redirect(req_url, response_url)):
return 'working', '', 0
elif redirect_status_code is not None:
return 'redirected', response_url, redirect_status_code
else:
new_url = response.url
if anchor:
new_url += '#' + anchor

if allowed_redirect(req_url, new_url):
return 'working', '', 0
elif response.history:
# history contains any redirects, get last
code = response.history[-1].status_code
return 'redirected', new_url, code
else:
return 'redirected', new_url, 0
return 'redirected', response_url, 0

def allowed_redirect(url: str, new_url: str) -> bool:
return any(
@@ -428,7 +434,7 @@ def check(docname: str) -> tuple[str, str, int]:

if uri is None:
break
netloc = urlparse(uri).netloc
netloc = urlsplit(uri).netloc
try:
# Refresh rate limit.
# When there are many links in the queue, workers are all stuck waiting
self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def limit_rate(self, response: Response) -> float | None:
def limit_rate(self, response_url: str, retry_after: str) -> float | None:
next_check = None
retry_after = response.headers.get("Retry-After")
if retry_after:
try:
# Integer: time to wait before next attempt.
Expand All @@ -471,7 +476,7 @@ def limit_rate(self, response: Response) -> float | None:
delay = (until - datetime.now(timezone.utc)).total_seconds()
else:
next_check = time.time() + delay
netloc = urlparse(response.url).netloc
netloc = urlsplit(response_url).netloc
if next_check is None:
max_delay = self.config.linkcheck_rate_limit_timeout
try:
return next_check


def _retrieval_methods(
linkcheck_anchors: bool,
anchor: str,
) -> Iterator[tuple[Callable, dict[str, bool]]]:
if not linkcheck_anchors or not anchor:
yield requests.head, {'allow_redirects': True}
yield requests.get, {'stream': True}
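
To illustrate what this generator yields in each mode (a sketch; it assumes `_retrieval_methods` and the `requests` wrapper are importable as shown once this change lands):

```python
from sphinx.builders.linkcheck import _retrieval_methods
from sphinx.util import requests

# Anchor checking disabled, or no anchor present: probe with HEAD first,
# then fall back to a streaming GET.
assert [m for m, _ in _retrieval_methods(False, '')] == [requests.head, requests.get]

# Anchor checking enabled and an anchor present: skip HEAD entirely, since
# the response body is needed to search for the anchor.
assert [m for m, _ in _retrieval_methods(True, 'section')] == [requests.get]
```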


class HyperlinkCollector(SphinxPostTransform):
builders = ('linkcheck',)
default_priority = 800
10 changes: 5 additions & 5 deletions tests/test_build_linkcheck.py
@@ -637,14 +637,14 @@ class FakeResponse:
def test_limit_rate_default_sleep(app):
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {})
with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check == 60.0


def test_limit_rate_user_max_delay(app):
app.config.linkcheck_rate_limit_timeout = 0.0
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(), {})
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check is None


@@ -653,7 +653,7 @@ def test_limit_rate_doubles_previous_wait_time(app):
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
rate_limits)
with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check == 120.0


@@ -663,7 +663,7 @@ def test_limit_rate_clips_wait_time_to_max_time(app):
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
rate_limits)
with mock.patch('time.time', return_value=0.0):
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check == 90.0


@@ -672,7 +672,7 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):
rate_limits = {"localhost": RateLimit(90.0, 0.0)}
worker = HyperlinkAvailabilityCheckWorker(app.env, app.config, Queue(), Queue(),
rate_limits)
next_check = worker.limit_rate(FakeResponse())
next_check = worker.limit_rate(FakeResponse.url, FakeResponse.headers.get("Retry-After"))
assert next_check is None

