Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

linkcheck builder: begin using session-based HTTP requests #11340

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f538440
linkcheck builder: close streamed HTTP response objects when no respo…
jayaddison Apr 11, 2023
524368f
Add test coverage for anchor-in-document linkchecking
jayaddison Apr 11, 2023
9ec505f
refactor: use requests.Response objects as context managers during li…
jayaddison Apr 11, 2023
c708cc9
cleanup: remove unused HTML test data file
jayaddison Apr 11, 2023
4eb7896
Merge branch 'master' into issue-11317/linkcheck-close-streamed-http-…
jayaddison Apr 11, 2023
95dd08a
Nitpick: remove duplicated HTML tag-closing character
jayaddison Apr 11, 2023
2aceccc
Merge branch 'master' into issue-11317/linkcheck-close-streamed-http-…
jayaddison Apr 16, 2023
a3a340b
continuous integration: unit test workflow: simplify default PYTHONWA…
jayaddison Apr 16, 2023
692fdef
Revert "refactor: use requests.Response objects as context managers d…
jayaddison Apr 16, 2023
6fd82b3
Fixup: do not close the response before anchor-checking has been perf…
jayaddison Apr 16, 2023
98df982
Fixup: close the response after an HTTP HEAD response is received
jayaddison Apr 16, 2023
ec415ea
Revert "Fixup: close the response after an HTTP HEAD response is rece…
jayaddison Apr 16, 2023
5769478
Revert "Fixup: do not close the response before anchor-checking has b…
jayaddison Apr 16, 2023
8269c6d
Revert "Revert "refactor: use requests.Response objects as context ma…
jayaddison Apr 16, 2023
89442fc
Merge branch 'master' into issue-11317/linkcheck-close-streamed-http-…
jayaddison Apr 21, 2023
a6d7a77
Merge branch 'master' into issue-11317/linkcheck-close-streamed-http-…
jayaddison Apr 25, 2023
b6d8006
Merge branch 'master' into issue-11317/linkcheck-close-streamed-http-…
jayaddison Apr 26, 2023
69a6828
Merge branch 'master' into issue-11317/linkcheck-close-streamed-http-…
jayaddison Apr 27, 2023
5fafefa
linkcheck builder: begin using session-based HTTP requests
jayaddison Apr 16, 2023
c6191a1
tests: linkcheck builder: update test webserver handlers from HTTP/1.…
jayaddison Apr 17, 2023
616c55b
tests: use threaded Python basic HTTP server implementation during un…
jayaddison Apr 18, 2023
2615fe4
tests: linkcheck: ConnectionResetHandler: close the connection by set…
jayaddison Apr 18, 2023
982f862
tests: linkcheck: make_redirect_handler: send zero-valued content-len…
jayaddison Apr 18, 2023
a6863bb
tests: linkcheck: add connection-count measurement
jayaddison Apr 18, 2023
01c250e
tests: linkcheck: reduce worker count to one for 'test_TooManyRedirec…
jayaddison Apr 25, 2023
bd2ed0b
tests: linkcheck: capture_headers_handler: relocate the (client) head…
jayaddison Apr 21, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Expand Up @@ -13,7 +13,7 @@ env:
FORCE_COLOR: "1"
PYTHONDEVMODE: "1" # -X dev
PYTHONWARNDEFAULTENCODING: "1" # -X warn_default_encoding
PYTHONWARNINGS: "error,always:unclosed:ResourceWarning::" # default: all warnings as errors, except ResourceWarnings about unclosed items
PYTHONWARNINGS: "error"

jobs:
ubuntu:
Expand Down
31 changes: 17 additions & 14 deletions sphinx/builders/linkcheck.py
Expand Up @@ -262,6 +262,7 @@ def __init__(self, env: BuildEnvironment, config: Config, rqueue: Queue[CheckRes
for doc in self.config.linkcheck_exclude_documents]
self.auth = [(re.compile(pattern), auth_info) for pattern, auth_info
in self.config.linkcheck_auth]
self._session = requests._Session()

super().__init__(daemon=True)

Expand Down Expand Up @@ -316,32 +317,32 @@ def check_uri() -> tuple[str, str, int]:
try:
if anchor and self.config.linkcheck_anchors:
# Read the whole document and see if #anchor exists
response = requests.get(req_url, stream=True, config=self.config,
auth=auth_info, **kwargs)
response.raise_for_status()
found = check_anchor(response, unquote(anchor))
with self._session.get(req_url, stream=True, config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
found = check_anchor(response, unquote(anchor))

if not found:
raise Exception(__("Anchor '%s' not found") % anchor)
if not found:
raise Exception(__("Anchor '%s' not found") % anchor)
else:
try:
# try a HEAD request first, which should be easier on
# the server and the network
response = requests.head(req_url, allow_redirects=True,
config=self.config, auth=auth_info,
**kwargs)
response.raise_for_status()
with self._session.head(req_url, allow_redirects=True,
config=self.config, auth=auth_info,
**kwargs) as response:
response.raise_for_status()
# Servers drop the connection on HEAD requests, causing
# ConnectionError.
except (ConnectionError, HTTPError, TooManyRedirects) as err:
if isinstance(err, HTTPError) and err.response.status_code == 429:
raise
# retry with GET request if that fails, some servers
# don't like HEAD requests.
response = requests.get(req_url, stream=True,
config=self.config,
auth=auth_info, **kwargs)
response.raise_for_status()
with self._session.get(req_url, stream=True,
config=self.config,
auth=auth_info, **kwargs) as response:
response.raise_for_status()
except HTTPError as err:
if err.response.status_code == 401:
# We'll take "Unauthorized" as working.
Expand Down Expand Up @@ -424,6 +425,8 @@ def check(docname: str) -> tuple[str, str, int]:
check_request = self.wqueue.get()
next_check, hyperlink = check_request
if hyperlink is None:
# An empty hyperlink is a signal to shut down the worker; clean up resources here
self._session.close()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we close here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to ensure that the requests session's resources are cleaned up when the linkchecker completes -- some warnings appeared during testing to indicate that they weren't -- and this seemed to be a good place for it (the None hyperlink is used as a marker (sentinel?) to indicate that the worker should shutdown).

break

uri, docname, lineno = hyperlink
Expand Down
33 changes: 33 additions & 0 deletions sphinx/util/requests.py
Expand Up @@ -90,3 +90,36 @@ def head(url: str, **kwargs: Any) -> requests.Response:

with ignore_insecure_warning(**kwargs):
return requests.head(url, **kwargs)


class _Session(requests.Session):
    """A ``requests.Session`` that applies Sphinx defaults to every request.

    Both :meth:`get` and :meth:`head` ensure a ``User-Agent`` header is set
    and, when a Sphinx ``config`` keyword argument is supplied, configure TLS
    certificate verification from it.
    """

    def _apply_defaults(self, url: str, kwargs: dict[str, Any]) -> None:
        """Populate *kwargs* in place with Sphinx's request defaults.

        Pops the ``config`` entry (if any) from *kwargs*; when present it is
        used to derive the TLS CA certificate (``verify``) and User-Agent,
        otherwise the module-level default User-Agent is used.
        """
        headers = kwargs.setdefault('headers', {})
        config = kwargs.pop('config', None)
        if config:
            kwargs.setdefault('verify', _get_tls_cacert(url, config))
            headers.setdefault('User-Agent', _get_user_agent(config))
        else:
            headers.setdefault('User-Agent', useragent_header[0][1])

    def get(self, url: str, **kwargs: Any) -> requests.Response:  # type: ignore
        """Sends a GET request like requests.get().

        This sets up User-Agent header and TLS verification automatically."""
        self._apply_defaults(url, kwargs)
        with ignore_insecure_warning(**kwargs):
            return super().get(url, **kwargs)

    def head(self, url: str, **kwargs: Any) -> requests.Response:  # type: ignore
        """Sends a HEAD request like requests.head().

        This sets up User-Agent header and TLS verification automatically."""
        self._apply_defaults(url, kwargs)
        with ignore_insecure_warning(**kwargs):
            return super().head(url, **kwargs)
@@ -0,0 +1,3 @@
exclude_patterns = ['_build']
linkcheck_timeout = 0.075
linkcheck_workers = 1
@@ -0,0 +1 @@
`local server <http://localhost:7777/>`_
1 change: 1 addition & 0 deletions tests/roots/test-linkcheck/conf.py
Expand Up @@ -2,3 +2,4 @@
exclude_patterns = ['_build']
linkcheck_anchors = True
linkcheck_timeout = 0.075
linkcheck_workers = 1
1 change: 1 addition & 0 deletions tests/roots/test-linkcheck/links.rst
Expand Up @@ -11,3 +11,4 @@ Some additional anchors to exercise ignore code
.. image:: http://localhost:7777/image.png
.. figure:: http://localhost:7777/image2.png

* `Valid anchored url <http://localhost:7777/anchor.html#found>`_
105 changes: 94 additions & 11 deletions tests/test_build_linkcheck.py
Expand Up @@ -15,6 +15,7 @@
from unittest import mock

import pytest
from urllib3.poolmanager import PoolManager

from sphinx.builders.linkcheck import HyperlinkAvailabilityCheckWorker, RateLimit
from sphinx.testing.util import strip_escseq
Expand All @@ -27,24 +28,77 @@


class DefaultsHandler(http.server.BaseHTTPRequestHandler):
    """Serve the default linkcheck test site: a root page and an anchor page.

    Known paths ("" and "anchor.html") answer 200; everything else is 404.
    """

    protocol_version = "HTTP/1.1"

    def do_HEAD(self):
        if self.path[1:].rstrip() == "":
            self.send_response(200, "OK")
            self.end_headers()
        elif self.path[1:].rstrip() == "anchor.html":
            self.send_response(200, "OK")
            self.end_headers()
        else:
            self.send_response(404, "Not Found")
            self.end_headers()

    def do_GET(self):
        # Choose the response body for the requested resource; an empty body
        # marks the resource as unknown and results in a 404.
        if self.path[1:].rstrip() == "":
            content = b"ok\n\n"
        elif self.path[1:].rstrip() == "anchor.html":
            doc = '<!DOCTYPE html><html><body><a id="found"></a></body></html>'
            content = doc.encode('utf-8')
        else:
            content = b""

        if content:
            # An accurate Content-Length keeps HTTP/1.1 keep-alive usable.
            self.send_response(200, "OK")
            self.send_header("Content-Length", str(len(content)))
            self.end_headers()
            self.wfile.write(content)
        else:
            self.send_response(404, "Not Found")
            self.end_headers()


class ConnectionMeasurement:
    """Measure the number of distinct host connections created during linkchecking"""

    def __init__(self):
        self.connections = set()
        self.urllib3_connection_from_url = PoolManager.connection_from_url

        def record_connection(pool_manager, url):
            # Call through to the unpatched urllib3 implementation and
            # remember every connection object it hands back.
            connection = self.urllib3_connection_from_url(pool_manager, url)
            self.connections.add(connection)
            return connection

        self.patcher = mock.patch.object(
            target=PoolManager,
            attribute='connection_from_url',
            new=record_connection,
        )

    def __enter__(self):
        self.patcher.start()
        return self

    def __exit__(self, *args, **kwargs):
        # Close every connection we observed, then remove the patch.
        for observed in self.connections:
            observed.close()
        self.patcher.stop()

    @property
    def connection_count(self):
        # Distinct connections seen while the patch was active.
        return len(self.connections)


@pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True)
def test_defaults(app):
with http_server(DefaultsHandler):
app.build()
with ConnectionMeasurement() as m:
app.build()
assert m.connection_count == 1

# Text output
assert (app.outdir / 'output.txt').exists()
Expand All @@ -69,8 +123,8 @@ def test_defaults(app):
for attr in ("filename", "lineno", "status", "code", "uri", "info"):
assert attr in row

assert len(content.splitlines()) == 9
assert len(rows) == 9
assert len(content.splitlines()) == 10
assert len(rows) == 10
# the output order of the rows is not stable
# due to possible variance in network latency
rowsby = {row["uri"]: row for row in rows}
Expand All @@ -95,6 +149,15 @@ def test_defaults(app):
assert rowsby["http://localhost:7777#does-not-exist"]["info"] == "Anchor 'does-not-exist' not found"
# images should fail
assert "Not Found for url: http://localhost:7777/image.png" in rowsby["http://localhost:7777/image.png"]["info"]
# anchor should be found
assert rowsby['http://localhost:7777/anchor.html#found'] == {
'filename': 'links.rst',
'lineno': 14,
'status': 'working',
'code': 0,
'uri': 'http://localhost:7777/anchor.html#found',
'info': '',
}


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-too-many-retries', freshenv=True)
Expand Down Expand Up @@ -166,6 +229,8 @@ def test_anchors_ignored(app):
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
def test_raises_for_invalid_status(app):
class InternalServerErrorHandler(http.server.BaseHTTPRequestHandler):
protocol_version = "HTTP/1.1"

def do_GET(self):
self.send_error(500, "Internal Server Error")

Expand All @@ -181,13 +246,15 @@ def do_GET(self):

def capture_headers_handler(records):
    """Return a request-handler class that appends each request's raw header
    block (as a string) to *records* and replies 200 OK with an empty body."""
    class HeadersDumperHandler(http.server.BaseHTTPRequestHandler):
        protocol_version = "HTTP/1.1"

        def do_HEAD(self):
            self.do_GET()

        def do_GET(self):
            self.send_response(200, "OK")
            self.end_headers()
            # Capture the client's request headers for later assertions.
            records.append(self.headers.as_string())
    return HeadersDumperHandler


Expand Down Expand Up @@ -276,6 +343,8 @@ def test_linkcheck_request_headers_default(app):

def make_redirect_handler(*, support_head):
class RedirectOnceHandler(http.server.BaseHTTPRequestHandler):
protocol_version = "HTTP/1.1"

def do_HEAD(self):
if support_head:
self.do_GET()
Expand All @@ -289,6 +358,7 @@ def do_GET(self):
else:
self.send_response(302, "Found")
self.send_header("Location", "http://localhost:7777/?redirected=1")
self.send_header("Content-Length", "0") # see: https://github.com/psf/requests/issues/6407
self.end_headers()

def log_date_time_string(self):
Expand Down Expand Up @@ -366,13 +436,18 @@ def test_linkcheck_allowed_redirects(app, warning):


class OKHandler(http.server.BaseHTTPRequestHandler):
    """Respond 200 OK to every HEAD request and serve ``ok\\n`` for GET."""

    protocol_version = "HTTP/1.1"

    def do_HEAD(self):
        self.send_response(200, "OK")
        self.end_headers()

    def do_GET(self):
        content = b"ok\n"
        self.send_response(200, "OK")
        # An accurate Content-Length keeps HTTP/1.1 keep-alive usable.
        self.send_header("Content-Length", str(len(content)))
        self.end_headers()
        self.wfile.write(content)


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-https', freshenv=True)
Expand Down Expand Up @@ -477,18 +552,22 @@ def test_connect_to_selfsigned_nonexistent_cert_file(app):


class InfiniteRedirectOnHeadHandler(http.server.BaseHTTPRequestHandler):
    """Redirect every HEAD request back to the server root; serve GET normally.

    Exercises the linkchecker's fallback from a redirect-looping HEAD
    request to a plain GET request.
    """

    protocol_version = "HTTP/1.1"

    def do_HEAD(self):
        # HEAD always points back at the root, producing a redirect loop.
        self.send_response(302, "Found")
        self.send_header("Location", "http://localhost:7777/")
        self.end_headers()

    def do_GET(self):
        content = b"ok\n"
        self.send_response(200, "OK")
        self.send_header("Content-Length", str(len(content)))
        self.end_headers()
        self.wfile.write(content)


@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True)
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-too-many-redirects', freshenv=True)
def test_TooManyRedirects_on_HEAD(app, monkeypatch):
import requests.sessions

Expand All @@ -511,6 +590,8 @@ def test_TooManyRedirects_on_HEAD(app, monkeypatch):

def make_retry_after_handler(responses):
class RetryAfterHandler(http.server.BaseHTTPRequestHandler):
protocol_version = "HTTP/1.1"

def do_HEAD(self):
status, retry_after = responses.pop(0)
self.send_response(status)
Expand Down Expand Up @@ -662,8 +743,10 @@ def test_limit_rate_bails_out_after_waiting_max_time(app):


class ConnectionResetHandler(http.server.BaseHTTPRequestHandler):
protocol_version = "HTTP/1.1"

def do_HEAD(self):
self.connection.close()
self.close_connection = True

def do_GET(self):
self.send_response(200, "OK")
Expand Down
2 changes: 1 addition & 1 deletion tests/utils.py
Expand Up @@ -19,7 +19,7 @@
class HttpServerThread(threading.Thread):
def __init__(self, handler, *args, **kwargs):
super().__init__(*args, **kwargs)
self.server = http.server.HTTPServer(("localhost", 7777), handler)
self.server = http.server.ThreadingHTTPServer(("localhost", 7777), handler)

    def run(self):
        # Serve requests until shutdown() is called elsewhere; the short
        # poll interval keeps test teardown responsive.
        self.server.serve_forever(poll_interval=0.001)
Expand Down