diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8db64c118d0..07fb0680036 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ env: FORCE_COLOR: "1" PYTHONDEVMODE: "1" # -X dev PYTHONWARNDEFAULTENCODING: "1" # -X warn_default_encoding - PYTHONWARNINGS: "error,always:unclosed:ResourceWarning::" # default: all warnings as errors, except ResourceWarnings about unclosed items + PYTHONWARNINGS: "error" jobs: ubuntu: diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index e828fc20988..47852ce45fc 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -262,6 +262,7 @@ def __init__(self, env: BuildEnvironment, config: Config, rqueue: Queue[CheckRes for doc in self.config.linkcheck_exclude_documents] self.auth = [(re.compile(pattern), auth_info) for pattern, auth_info in self.config.linkcheck_auth] + self._session = requests._Session() super().__init__(daemon=True) @@ -316,21 +317,21 @@ def check_uri() -> tuple[str, str, int]: try: if anchor and self.config.linkcheck_anchors: # Read the whole document and see if #anchor exists - response = requests.get(req_url, stream=True, config=self.config, - auth=auth_info, **kwargs) - response.raise_for_status() - found = check_anchor(response, unquote(anchor)) + with self._session.get(req_url, stream=True, config=self.config, + auth=auth_info, **kwargs) as response: + response.raise_for_status() + found = check_anchor(response, unquote(anchor)) - if not found: - raise Exception(__("Anchor '%s' not found") % anchor) + if not found: + raise Exception(__("Anchor '%s' not found") % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network - response = requests.head(req_url, allow_redirects=True, - config=self.config, auth=auth_info, - **kwargs) - response.raise_for_status() + with self._session.head(req_url, allow_redirects=True, + config=self.config, auth=auth_info, + **kwargs) as response: + response.raise_for_status() # Servers drop the connection on HEAD requests, causing # ConnectionError. except (ConnectionError, HTTPError, TooManyRedirects) as err: @@ -338,10 +339,10 @@ def check_uri() -> tuple[str, str, int]: raise # retry with GET request if that fails, some servers # don't like HEAD requests. - response = requests.get(req_url, stream=True, - config=self.config, - auth=auth_info, **kwargs) - response.raise_for_status() + with self._session.get(req_url, stream=True, + config=self.config, + auth=auth_info, **kwargs) as response: + response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. @@ -424,6 +425,8 @@ def check(docname: str) -> tuple[str, str, int]: check_request = self.wqueue.get() next_check, hyperlink = check_request if hyperlink is None: + # An empty hyperlink is a signal to shutdown the worker; cleanup resources here + self._session.close() break uri, docname, lineno = hyperlink diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index c64754fa2a1..9da08099fc9 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -90,3 +90,36 @@ def head(url: str, **kwargs: Any) -> requests.Response: with ignore_insecure_warning(**kwargs): return requests.head(url, **kwargs) + + +class _Session(requests.Session): + + def get(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore + """Sends a GET request like requests.get(). + + This sets up User-Agent header and TLS verification automatically.""" + headers = kwargs.setdefault('headers', {}) + config = kwargs.pop('config', None) + if config: + kwargs.setdefault('verify', _get_tls_cacert(url, config)) + headers.setdefault('User-Agent', _get_user_agent(config)) + else: + headers.setdefault('User-Agent', useragent_header[0][1]) + + with ignore_insecure_warning(**kwargs): + return super().get(url, **kwargs) + + def head(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore + """Sends a HEAD request like requests.head(). + + This sets up User-Agent header and TLS verification automatically.""" + headers = kwargs.setdefault('headers', {}) + config = kwargs.pop('config', None) + if config: + kwargs.setdefault('verify', _get_tls_cacert(url, config)) + headers.setdefault('User-Agent', _get_user_agent(config)) + else: + headers.setdefault('User-Agent', useragent_header[0][1]) + + with ignore_insecure_warning(**kwargs): + return super().head(url, **kwargs) diff --git a/tests/roots/test-linkcheck-localserver-too-many-redirects/conf.py b/tests/roots/test-linkcheck-localserver-too-many-redirects/conf.py new file mode 100644 index 00000000000..aea9add395b --- /dev/null +++ b/tests/roots/test-linkcheck-localserver-too-many-redirects/conf.py @@ -0,0 +1,3 @@ +exclude_patterns = ['_build'] +linkcheck_timeout = 0.075 +linkcheck_workers = 1 diff --git a/tests/roots/test-linkcheck-localserver-too-many-redirects/index.rst b/tests/roots/test-linkcheck-localserver-too-many-redirects/index.rst new file mode 100644 index 00000000000..c617e942fd2 --- /dev/null +++ b/tests/roots/test-linkcheck-localserver-too-many-redirects/index.rst @@ -0,0 +1 @@ +`local server `_ diff --git a/tests/roots/test-linkcheck/conf.py b/tests/roots/test-linkcheck/conf.py index 0c197b6f03b..1ee90b7b52d 100644 --- a/tests/roots/test-linkcheck/conf.py +++ b/tests/roots/test-linkcheck/conf.py @@ -2,3 +2,4 @@ exclude_patterns = ['_build'] linkcheck_anchors = True linkcheck_timeout = 0.075 +linkcheck_workers = 1 diff --git a/tests/roots/test-linkcheck/links.rst b/tests/roots/test-linkcheck/links.rst index 49afba2b39a..88c757e03d9 100644 --- a/tests/roots/test-linkcheck/links.rst +++ b/tests/roots/test-linkcheck/links.rst @@ -11,3 +11,4 @@ Some additional anchors to exercise ignore code .. image:: http://localhost:7777/image.png .. figure:: http://localhost:7777/image2.png +* `Valid anchored url `_ diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index acfebd64999..8c25ddf0396 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -15,6 +15,7 @@ from unittest import mock import pytest +from urllib3.poolmanager import PoolManager from sphinx.builders.linkcheck import HyperlinkAvailabilityCheckWorker, RateLimit from sphinx.testing.util import strip_escseq @@ -27,24 +28,77 @@ class DefaultsHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): if self.path[1:].rstrip() == "": self.send_response(200, "OK") self.end_headers() + elif self.path[1:].rstrip() == "anchor.html": + self.send_response(200, "OK") + self.end_headers() else: self.send_response(404, "Not Found") self.end_headers() def do_GET(self): - self.do_HEAD() if self.path[1:].rstrip() == "": - self.wfile.write(b"ok\n\n") + content = b"ok\n\n" + elif self.path[1:].rstrip() == "anchor.html": + doc = '' + content = doc.encode('utf-8') + else: + content = b"" + + if content: + self.send_response(200, "OK") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + else: + self.send_response(404, "Not Found") + self.end_headers() + + +class ConnectionMeasurement: + """Measure the number of distinct host connections created during linkchecking""" + + def __init__(self): + self.connections = set() + self.urllib3_connection_from_url = PoolManager.connection_from_url + self.patcher = mock.patch.object( + target=PoolManager, + attribute='connection_from_url', + new=self._collect_connections(), + ) + + def _collect_connections(self): + def connection_collector(obj, url): + connection = self.urllib3_connection_from_url(obj, url) + self.connections.add(connection) + return connection + return connection_collector + + def __enter__(self): + self.patcher.start() + return self + + def __exit__(self, *args, **kwargs): + for connection in self.connections: + connection.close() + self.patcher.stop() + + @property + def connection_count(self): + return len(self.connections) @pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True) def test_defaults(app): with http_server(DefaultsHandler): - app.build() + with ConnectionMeasurement() as m: + app.build() + assert m.connection_count == 1 # Text output assert (app.outdir / 'output.txt').exists() @@ -69,8 +123,8 @@ def test_defaults(app): for attr in ("filename", "lineno", "status", "code", "uri", "info"): assert attr in row - assert len(content.splitlines()) == 9 - assert len(rows) == 9 + assert len(content.splitlines()) == 10 + assert len(rows) == 10 # the output order of the rows is not stable # due to possible variance in network latency rowsby = {row["uri"]: row for row in rows} @@ -95,6 +149,15 @@ def test_defaults(app): assert rowsby["http://localhost:7777#does-not-exist"]["info"] == "Anchor 'does-not-exist' not found" # images should fail assert "Not Found for url: http://localhost:7777/image.png" in rowsby["http://localhost:7777/image.png"]["info"] + # anchor should be found + assert rowsby['http://localhost:7777/anchor.html#found'] == { + 'filename': 'links.rst', + 'lineno': 14, + 'status': 'working', + 'code': 0, + 'uri': 'http://localhost:7777/anchor.html#found', + 'info': '', + } @pytest.mark.sphinx('linkcheck', testroot='linkcheck-too-many-retries', freshenv=True) @@ -166,6 +229,8 @@ def test_anchors_ignored(app): @pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True) def test_raises_for_invalid_status(app): class InternalServerErrorHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_GET(self): self.send_error(500, "Internal Server Error") @@ -181,13 +246,15 @@ def do_GET(self): def capture_headers_handler(records): class HeadersDumperHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): self.do_GET() def do_GET(self): + records.append(self.headers.as_string()) self.send_response(200, "OK") self.end_headers() - records.append(self.headers.as_string()) return HeadersDumperHandler @@ -276,6 +343,8 @@ def test_linkcheck_request_headers_default(app): def make_redirect_handler(*, support_head): class RedirectOnceHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): if support_head: self.do_GET() @@ -289,6 +358,7 @@ def do_GET(self): else: self.send_response(302, "Found") self.send_header("Location", "http://localhost:7777/?redirected=1") + self.send_header("Content-Length", "0") # see: https://github.com/psf/requests/issues/6407 self.end_headers() def log_date_time_string(self): @@ -366,13 +436,18 @@ def test_linkcheck_allowed_redirects(app, warning): class OKHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): self.send_response(200, "OK") self.end_headers() def do_GET(self): - self.do_HEAD() - self.wfile.write(b"ok\n") + content = b"ok\n" + self.send_response(200, "OK") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) @pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-https', freshenv=True) @@ -477,18 +552,22 @@ def test_connect_to_selfsigned_nonexistent_cert_file(app): class InfiniteRedirectOnHeadHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): self.send_response(302, "Found") self.send_header("Location", "http://localhost:7777/") self.end_headers() def do_GET(self): + content = b"ok\n" self.send_response(200, "OK") + self.send_header("Content-Length", str(len(content))) self.end_headers() - self.wfile.write(b"ok\n") + self.wfile.write(content) -@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True) +@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-too-many-redirects', freshenv=True) def test_TooManyRedirects_on_HEAD(app, monkeypatch): import requests.sessions @@ -511,6 +590,8 @@ def test_TooManyRedirects_on_HEAD(app, monkeypatch): def make_retry_after_handler(responses): class RetryAfterHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): status, retry_after = responses.pop(0) self.send_response(status) @@ -662,8 +743,10 @@ def test_limit_rate_bails_out_after_waiting_max_time(app): class ConnectionResetHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): - self.connection.close() + self.close_connection = True def do_GET(self): self.send_response(200, "OK") diff --git a/tests/utils.py b/tests/utils.py index 429bbd2b2e2..32636b7936c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -19,7 +19,7 @@ class HttpServerThread(threading.Thread): def __init__(self, handler, *args, **kwargs): super().__init__(*args, **kwargs) - self.server = http.server.HTTPServer(("localhost", 7777), handler) + self.server = http.server.ThreadingHTTPServer(("localhost", 7777), handler) def run(self): self.server.serve_forever(poll_interval=0.001)