From f538440efb2a02edfb8568cc79f8665b0d6b0245 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 11 Apr 2023 16:05:40 +0100 Subject: [PATCH 01/20] linkcheck builder: close streamed HTTP response objects when no response-content reads are required --- sphinx/builders/linkcheck.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index e828fc20988..694b6fc2440 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -318,6 +318,7 @@ def check_uri() -> tuple[str, str, int]: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) + response.close() # no HTTP body reads required; close the response response.raise_for_status() found = check_anchor(response, unquote(anchor)) @@ -341,6 +342,7 @@ def check_uri() -> tuple[str, str, int]: response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) + response.close() # no HTTP body reads required; close the response response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: From 524368fc617b5fd72225f6798c705933e3e01192 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 11 Apr 2023 19:12:58 +0100 Subject: [PATCH 02/20] Add test coverage for anchor-in-document linkchecking --- tests/roots/test-linkcheck/anchor.html | 5 +++++ tests/roots/test-linkcheck/links.rst | 1 + tests/test_build_linkcheck.py | 19 +++++++++++++++++-- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 tests/roots/test-linkcheck/anchor.html diff --git a/tests/roots/test-linkcheck/anchor.html b/tests/roots/test-linkcheck/anchor.html new file mode 100644 index 00000000000..ad7ef5ac1ee --- /dev/null +++ b/tests/roots/test-linkcheck/anchor.html @@ -0,0 +1,5 @@ + + + + + diff --git a/tests/roots/test-linkcheck/links.rst b/tests/roots/test-linkcheck/links.rst index 49afba2b39a..88c757e03d9 100644 --- a/tests/roots/test-linkcheck/links.rst +++ b/tests/roots/test-linkcheck/links.rst @@ -11,3 +11,4 @@ Some additional anchors to exercise ignore code .. image:: http://localhost:7777/image.png .. figure:: http://localhost:7777/image2.png +* `Valid anchored url `_ diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index acfebd64999..718a2796464 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -31,6 +31,9 @@ def do_HEAD(self): if self.path[1:].rstrip() == "": self.send_response(200, "OK") self.end_headers() + elif self.path[1:].rstrip() == "anchor.html": + self.send_response(200, "OK") + self.end_headers() else: self.send_response(404, "Not Found") self.end_headers() @@ -39,6 +42,9 @@ def do_GET(self): self.do_HEAD() if self.path[1:].rstrip() == "": self.wfile.write(b"ok\n\n") + elif self.path[1:].rstrip() == "anchor.html": + doc = '>' + self.wfile.write(doc.encode('utf-8')) @pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True) @@ -69,8 +75,8 @@ def test_defaults(app): for attr in ("filename", "lineno", "status", "code", "uri", "info"): assert attr in row - assert len(content.splitlines()) == 9 - assert len(rows) == 9 + assert len(content.splitlines()) == 10 + assert len(rows) == 10 # the output order of the rows is not stable # due to possible variance in network latency rowsby = {row["uri"]: row for row in rows} @@ -95,6 +101,15 @@ def test_defaults(app): assert rowsby["http://localhost:7777#does-not-exist"]["info"] == "Anchor 'does-not-exist' not found" # images should fail assert "Not Found for url: http://localhost:7777/image.png" in rowsby["http://localhost:7777/image.png"]["info"] + # anchor should be found + assert rowsby['http://localhost:7777/anchor.html#found'] == { + 'filename': 'links.rst', + 'lineno': 14, + 'status': 'working', + 'code': 0, + 'uri': 'http://localhost:7777/anchor.html#found', + 'info': '', + } @pytest.mark.sphinx('linkcheck', testroot='linkcheck-too-many-retries', freshenv=True) From 9ec505f019f98c97e5c8ee976ee9875f75acecb0 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 11 Apr 2023 17:31:40 +0100 Subject: [PATCH 03/20] refactor: use requests.Response objects as context managers during linkchecking This ensures that the close method of the response is called when the response variable goes out-of-scope --- sphinx/builders/linkcheck.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 694b6fc2440..652c2a01bc3 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -316,22 +316,20 @@ def check_uri() -> tuple[str, str, int]: try: if anchor and self.config.linkcheck_anchors: # Read the whole document and see if #anchor exists - response = requests.get(req_url, stream=True, config=self.config, - auth=auth_info, **kwargs) - response.close() # no HTTP body reads required; close the response - response.raise_for_status() - found = check_anchor(response, unquote(anchor)) - - if not found: - raise Exception(__("Anchor '%s' not found") % anchor) + with requests.get(req_url, stream=True, config=self.config, auth=auth_info, + **kwargs) as response: + response.raise_for_status() + found = check_anchor(response, unquote(anchor)) + + if not found: + raise Exception(__("Anchor '%s' not found") % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network - response = requests.head(req_url, allow_redirects=True, - config=self.config, auth=auth_info, - **kwargs) - response.raise_for_status() + with requests.head(req_url, allow_redirects=True, config=self.config, + auth=auth_info, **kwargs) as response: + response.raise_for_status() # Servers drop the connection on HEAD requests, causing # ConnectionError. except (ConnectionError, HTTPError, TooManyRedirects) as err: @@ -339,11 +337,9 @@ def check_uri() -> tuple[str, str, int]: raise # retry with GET request if that fails, some servers # don't like HEAD requests. - response = requests.get(req_url, stream=True, - config=self.config, - auth=auth_info, **kwargs) - response.close() # no HTTP body reads required; close the response - response.raise_for_status() + with requests.get(req_url, stream=True, config=self.config, + auth=auth_info, **kwargs) as response: + response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. From c708cc9c43d3b4bb3c121ca9690aabde1fe33170 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 11 Apr 2023 19:17:55 +0100 Subject: [PATCH 04/20] cleanup: remove unused HTML test data file --- tests/roots/test-linkcheck/anchor.html | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 tests/roots/test-linkcheck/anchor.html diff --git a/tests/roots/test-linkcheck/anchor.html b/tests/roots/test-linkcheck/anchor.html deleted file mode 100644 index ad7ef5ac1ee..00000000000 --- a/tests/roots/test-linkcheck/anchor.html +++ /dev/null @@ -1,5 +0,0 @@ - - - - - From 95dd08aa268b26eeaf6215c049401c05dd82a49b Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 11 Apr 2023 19:43:00 +0100 Subject: [PATCH 05/20] Nitpick: remove duplicated HTML tag-closing character --- tests/test_build_linkcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 718a2796464..260cf2c4214 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -43,7 +43,7 @@ def do_GET(self): if self.path[1:].rstrip() == "": self.wfile.write(b"ok\n\n") elif self.path[1:].rstrip() == "anchor.html": - doc = '>' + doc = '' self.wfile.write(doc.encode('utf-8')) From a3a340b8b85e0401c011120f54e2ea648234dbde Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 14:11:42 +0100 Subject: [PATCH 06/20] continuous integration: unit test workflow: simplify default PYTHONWARNINGS configuration --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8db64c118d0..07fb0680036 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ env: FORCE_COLOR: "1" PYTHONDEVMODE: "1" # -X dev PYTHONWARNDEFAULTENCODING: "1" # -X warn_default_encoding - PYTHONWARNINGS: "error,always:unclosed:ResourceWarning::" # default: all warnings as errors, except ResourceWarnings about unclosed items + PYTHONWARNINGS: "error" jobs: ubuntu: From 692fdefa1ce2fd647b7eaa9b6424cbc0ed4c4cd3 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 14:38:09 +0100 Subject: [PATCH 07/20] Revert "refactor: use requests.Response objects as context managers during linkchecking" This reverts commit 9ec505f019f98c97e5c8ee976ee9875f75acecb0. --- sphinx/builders/linkcheck.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 652c2a01bc3..694b6fc2440 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -316,20 +316,22 @@ def check_uri() -> tuple[str, str, int]: try: if anchor and self.config.linkcheck_anchors: # Read the whole document and see if #anchor exists - with requests.get(req_url, stream=True, config=self.config, auth=auth_info, - **kwargs) as response: - response.raise_for_status() - found = check_anchor(response, unquote(anchor)) - - if not found: - raise Exception(__("Anchor '%s' not found") % anchor) + response = requests.get(req_url, stream=True, config=self.config, + auth=auth_info, **kwargs) + response.close() # no HTTP body reads required; close the response + response.raise_for_status() + found = check_anchor(response, unquote(anchor)) + + if not found: + raise Exception(__("Anchor '%s' not found") % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network - with requests.head(req_url, allow_redirects=True, config=self.config, - auth=auth_info, **kwargs) as response: - response.raise_for_status() + response = requests.head(req_url, allow_redirects=True, + config=self.config, auth=auth_info, + **kwargs) + response.raise_for_status() # Servers drop the connection on HEAD requests, causing # ConnectionError. except (ConnectionError, HTTPError, TooManyRedirects) as err: @@ -337,9 +339,11 @@ def check_uri() -> tuple[str, str, int]: raise # retry with GET request if that fails, some servers # don't like HEAD requests. - with requests.get(req_url, stream=True, config=self.config, - auth=auth_info, **kwargs) as response: - response.raise_for_status() + response = requests.get(req_url, stream=True, + config=self.config, + auth=auth_info, **kwargs) + response.close() # no HTTP body reads required; close the response + response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. From 6fd82b337f92835bf06140162040d76c05eb5214 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 14:40:50 +0100 Subject: [PATCH 08/20] Fixup: do not close the response before anchor-checking has been performed --- sphinx/builders/linkcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 694b6fc2440..22aa2a58a3c 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -318,9 +318,9 @@ def check_uri() -> tuple[str, str, int]: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) - response.close() # no HTTP body reads required; close the response response.raise_for_status() found = check_anchor(response, unquote(anchor)) + response.close() # HTTP body reads complete; close the response if not found: raise Exception(__("Anchor '%s' not found") % anchor) From 98df982443686d9b441f6ad9776f69750488ca17 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 14:41:55 +0100 Subject: [PATCH 09/20] Fixup: close the response after an HTTP HEAD response is received --- sphinx/builders/linkcheck.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 22aa2a58a3c..7aa34324a16 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -332,6 +332,7 @@ def check_uri() -> tuple[str, str, int]: config=self.config, auth=auth_info, **kwargs) response.raise_for_status() + response.close() # no HTTP body reads required; close the response # Servers drop the connection on HEAD requests, causing # ConnectionError. except (ConnectionError, HTTPError, TooManyRedirects) as err: From ec415ea6b8a6a45b972439b7a62f6c271166a0a4 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 15:00:30 +0100 Subject: [PATCH 10/20] Revert "Fixup: close the response after an HTTP HEAD response is received" This reverts commit 98df982443686d9b441f6ad9776f69750488ca17. --- sphinx/builders/linkcheck.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 7aa34324a16..22aa2a58a3c 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -332,7 +332,6 @@ def check_uri() -> tuple[str, str, int]: config=self.config, auth=auth_info, **kwargs) response.raise_for_status() - response.close() # no HTTP body reads required; close the response # Servers drop the connection on HEAD requests, causing # ConnectionError. except (ConnectionError, HTTPError, TooManyRedirects) as err: From 5769478b2067cd78255b05da32cf37fce81e6e7c Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 15:00:31 +0100 Subject: [PATCH 11/20] Revert "Fixup: do not close the response before anchor-checking has been performed" This reverts commit 6fd82b337f92835bf06140162040d76c05eb5214. --- sphinx/builders/linkcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 22aa2a58a3c..694b6fc2440 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -318,9 +318,9 @@ def check_uri() -> tuple[str, str, int]: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) + response.close() # no HTTP body reads required; close the response response.raise_for_status() found = check_anchor(response, unquote(anchor)) - response.close() # HTTP body reads complete; close the response if not found: raise Exception(__("Anchor '%s' not found") % anchor) From 8269c6defdf3f13f061aed5eda0260d6784d72a4 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 15:00:31 +0100 Subject: [PATCH 12/20] Revert "Revert "refactor: use requests.Response objects as context managers during linkchecking"" This reverts commit 692fdefa1ce2fd647b7eaa9b6424cbc0ed4c4cd3. --- sphinx/builders/linkcheck.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 694b6fc2440..652c2a01bc3 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -316,22 +316,20 @@ def check_uri() -> tuple[str, str, int]: try: if anchor and self.config.linkcheck_anchors: # Read the whole document and see if #anchor exists - response = requests.get(req_url, stream=True, config=self.config, - auth=auth_info, **kwargs) - response.close() # no HTTP body reads required; close the response - response.raise_for_status() - found = check_anchor(response, unquote(anchor)) - - if not found: - raise Exception(__("Anchor '%s' not found") % anchor) + with requests.get(req_url, stream=True, config=self.config, auth=auth_info, + **kwargs) as response: + response.raise_for_status() + found = check_anchor(response, unquote(anchor)) + + if not found: + raise Exception(__("Anchor '%s' not found") % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network - response = requests.head(req_url, allow_redirects=True, - config=self.config, auth=auth_info, - **kwargs) - response.raise_for_status() + with requests.head(req_url, allow_redirects=True, config=self.config, + auth=auth_info, **kwargs) as response: + response.raise_for_status() # Servers drop the connection on HEAD requests, causing # ConnectionError. except (ConnectionError, HTTPError, TooManyRedirects) as err: @@ -339,11 +337,9 @@ def check_uri() -> tuple[str, str, int]: raise # retry with GET request if that fails, some servers # don't like HEAD requests. - response = requests.get(req_url, stream=True, - config=self.config, - auth=auth_info, **kwargs) - response.close() # no HTTP body reads required; close the response - response.raise_for_status() + with requests.get(req_url, stream=True, config=self.config, + auth=auth_info, **kwargs) as response: + response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. From 5fafefa48087ff98b6fcbf3a9829dd7c71bc1d45 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 16 Apr 2023 12:28:15 +0100 Subject: [PATCH 13/20] linkcheck builder: begin using session-based HTTP requests --- sphinx/builders/linkcheck.py | 17 +++++++++++------ sphinx/util/requests.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 652c2a01bc3..47852ce45fc 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -262,6 +262,7 @@ def __init__(self, env: BuildEnvironment, config: Config, rqueue: Queue[CheckRes for doc in self.config.linkcheck_exclude_documents] self.auth = [(re.compile(pattern), auth_info) for pattern, auth_info in self.config.linkcheck_auth] + self._session = requests._Session() super().__init__(daemon=True) @@ -316,8 +317,8 @@ def check_uri() -> tuple[str, str, int]: try: if anchor and self.config.linkcheck_anchors: # Read the whole document and see if #anchor exists - with requests.get(req_url, stream=True, config=self.config, auth=auth_info, - **kwargs) as response: + with self._session.get(req_url, stream=True, config=self.config, + auth=auth_info, **kwargs) as response: response.raise_for_status() found = check_anchor(response, unquote(anchor)) @@ -327,8 +328,9 @@ def check_uri() -> tuple[str, str, int]: try: # try a HEAD request first, which should be easier on # the server and the network - with requests.head(req_url, allow_redirects=True, config=self.config, - auth=auth_info, **kwargs) as response: + with self._session.head(req_url, allow_redirects=True, + config=self.config, auth=auth_info, + **kwargs) as response: response.raise_for_status() # Servers drop the connection on HEAD requests, causing # ConnectionError. @@ -337,8 +339,9 @@ def check_uri() -> tuple[str, str, int]: raise # retry with GET request if that fails, some servers # don't like HEAD requests. - with requests.get(req_url, stream=True, config=self.config, - auth=auth_info, **kwargs) as response: + with self._session.get(req_url, stream=True, + config=self.config, + auth=auth_info, **kwargs) as response: response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: @@ -422,6 +425,8 @@ def check(docname: str) -> tuple[str, str, int]: check_request = self.wqueue.get() next_check, hyperlink = check_request if hyperlink is None: + # An empty hyperlink is a signal to shutdown the worker; cleanup resources here + self._session.close() break uri, docname, lineno = hyperlink diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index c64754fa2a1..9da08099fc9 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -90,3 +90,36 @@ def head(url: str, **kwargs: Any) -> requests.Response: with ignore_insecure_warning(**kwargs): return requests.head(url, **kwargs) + + +class _Session(requests.Session): + + def get(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore + """Sends a GET request like requests.get(). + + This sets up User-Agent header and TLS verification automatically.""" + headers = kwargs.setdefault('headers', {}) + config = kwargs.pop('config', None) + if config: + kwargs.setdefault('verify', _get_tls_cacert(url, config)) + headers.setdefault('User-Agent', _get_user_agent(config)) + else: + headers.setdefault('User-Agent', useragent_header[0][1]) + + with ignore_insecure_warning(**kwargs): + return super().get(url, **kwargs) + + def head(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore + """Sends a HEAD request like requests.head(). + + This sets up User-Agent header and TLS verification automatically.""" + headers = kwargs.setdefault('headers', {}) + config = kwargs.pop('config', None) + if config: + kwargs.setdefault('verify', _get_tls_cacert(url, config)) + headers.setdefault('User-Agent', _get_user_agent(config)) + else: + headers.setdefault('User-Agent', useragent_header[0][1]) + + with ignore_insecure_warning(**kwargs): + return super().head(url, **kwargs) From c6191a14d95bd900b728a4b1c9e2fc0313d83eff Mon Sep 17 00:00:00 2001 From: James Addison Date: Mon, 17 Apr 2023 16:55:23 +0100 Subject: [PATCH 14/20] tests: linkcheck builder: update test webserver handlers from HTTP/1.0 protocol (default) to HTTP/1.1 --- tests/test_build_linkcheck.py | 43 ++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 260cf2c4214..5c12fada679 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -27,6 +27,8 @@ class DefaultsHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): if self.path[1:].rstrip() == "": self.send_response(200, "OK") @@ -39,12 +41,22 @@ def do_HEAD(self): self.end_headers() def do_GET(self): - self.do_HEAD() if self.path[1:].rstrip() == "": - self.wfile.write(b"ok\n\n") + content = b"ok\n\n" elif self.path[1:].rstrip() == "anchor.html": doc = '' - self.wfile.write(doc.encode('utf-8')) + content = doc.encode('utf-8') + else: + content = b"" + + if content: + self.send_response(200, "OK") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) + else: + self.send_response(404, "Not Found") + self.end_headers() @pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True) @@ -181,6 +193,8 @@ def test_anchors_ignored(app): @pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True) def test_raises_for_invalid_status(app): class InternalServerErrorHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_GET(self): self.send_error(500, "Internal Server Error") @@ -196,6 +210,8 @@ def do_GET(self): def capture_headers_handler(records): class HeadersDumperHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): self.do_GET() @@ -291,6 +307,8 @@ def test_linkcheck_request_headers_default(app): def make_redirect_handler(*, support_head): class RedirectOnceHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): if support_head: self.do_GET() @@ -381,13 +399,18 @@ def test_linkcheck_allowed_redirects(app, warning): class OKHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): self.send_response(200, "OK") self.end_headers() def do_GET(self): - self.do_HEAD() - self.wfile.write(b"ok\n") + content = b"ok\n" + self.send_response(200, "OK") + self.send_header("Content-Length", str(len(content))) + self.end_headers() + self.wfile.write(content) @pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-https', freshenv=True) @@ -492,15 +515,19 @@ def test_connect_to_selfsigned_nonexistent_cert_file(app): class InfiniteRedirectOnHeadHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): self.send_response(302, "Found") self.send_header("Location", "http://localhost:7777/") self.end_headers() def do_GET(self): + content = b"ok\n" self.send_response(200, "OK") + self.send_header("Content-Length", str(len(content))) self.end_headers() - self.wfile.write(b"ok\n") + self.wfile.write(content) @pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True) @@ -526,6 +553,8 @@ def test_TooManyRedirects_on_HEAD(app, monkeypatch): def make_retry_after_handler(responses): class RetryAfterHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): status, retry_after = responses.pop(0) self.send_response(status) @@ -677,6 +706,8 @@ def test_limit_rate_bails_out_after_waiting_max_time(app): class ConnectionResetHandler(http.server.BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + def do_HEAD(self): self.connection.close() From 616c55bec976b5c8ccf533f6a305451f7e59a21f Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 18 Apr 2023 14:36:40 +0100 Subject: [PATCH 15/20] tests: use threaded Python basic HTTP server implementation during unit tests Ref: https://docs.python.org/3/library/http.server.html#http.server.ThreadingHTTPServer --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 429bbd2b2e2..32636b7936c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -19,7 +19,7 @@ class HttpServerThread(threading.Thread): def __init__(self, handler, *args, **kwargs): super().__init__(*args, **kwargs) - self.server = http.server.HTTPServer(("localhost", 7777), handler) + self.server = http.server.ThreadingHTTPServer(("localhost", 7777), handler) def run(self): self.server.serve_forever(poll_interval=0.001) From 2615fe46cb68c8f282a4f47b5610d4c6b1c1f914 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 18 Apr 2023 15:10:43 +0100 Subject: [PATCH 16/20] tests: linkcheck: ConnectionResetHandler: close the connection by setting the 'close_connection' handler attribute instead of calling connection.close Ref: https://docs.python.org/3/library/http.server.html#http.server.BaseHTTPRequestHandler.close_connection --- tests/test_build_linkcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 5c12fada679..95cd59eb357 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -709,7 +709,7 @@ class ConnectionResetHandler(http.server.BaseHTTPRequestHandler): protocol_version = "HTTP/1.1" def do_HEAD(self): - self.connection.close() + self.close_connection = True def do_GET(self): self.send_response(200, "OK") From 982f862acc0c6922165378c4dbfc0e313f1eeb28 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 18 Apr 2023 15:24:47 +0100 Subject: [PATCH 17/20] tests: linkcheck: make_redirect_handler: send zero-valued content-length response header during redirects --- tests/test_build_linkcheck.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 95cd59eb357..7a1348a061c 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -322,6 +322,7 @@ def do_GET(self): else: self.send_response(302, "Found") self.send_header("Location", "http://localhost:7777/?redirected=1") + self.send_header("Content-Length", "0") # see: https://github.com/psf/requests/issues/6407 self.end_headers() def log_date_time_string(self): From a6863bbdc01d2a5274abd124d07485e81fd1dfbc Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 18 Apr 2023 17:41:07 +0100 Subject: [PATCH 18/20] tests: linkcheck: add connection-count measurement --- tests/roots/test-linkcheck/conf.py | 1 + tests/test_build_linkcheck.py | 38 +++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/roots/test-linkcheck/conf.py b/tests/roots/test-linkcheck/conf.py index 0c197b6f03b..1ee90b7b52d 100644 --- a/tests/roots/test-linkcheck/conf.py +++ b/tests/roots/test-linkcheck/conf.py @@ -2,3 +2,4 @@ exclude_patterns = ['_build'] linkcheck_anchors = True linkcheck_timeout = 0.075 +linkcheck_workers = 1 diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 7a1348a061c..548f1b46743 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -15,6 +15,7 @@ from unittest import mock import pytest +from urllib3.poolmanager import PoolManager from sphinx.builders.linkcheck import HyperlinkAvailabilityCheckWorker, RateLimit from sphinx.testing.util import strip_escseq @@ -59,10 +60,45 @@ def do_GET(self): self.end_headers() +class ConnectionMeasurement: + """Measure the number of distinct host connections created during linkchecking""" + + def __init__(self): + self.connections = set() + self.urllib3_connection_from_url = PoolManager.connection_from_url + self.patcher = mock.patch.object( + target=PoolManager, + attribute='connection_from_url', + new=self._collect_connections(), + ) + + def _collect_connections(self): + def connection_collector(obj, url): + connection = self.urllib3_connection_from_url(obj, url) + self.connections.add(connection) + return connection + return connection_collector + + def __enter__(self): + self.patcher.start() + return self + + def __exit__(self, *args, **kwargs): + for connection in self.connections: + connection.close() + self.patcher.stop() + + @property + def connection_count(self): + return len(self.connections) + + @pytest.mark.sphinx('linkcheck', testroot='linkcheck', freshenv=True) def test_defaults(app): with http_server(DefaultsHandler): - app.build() + with ConnectionMeasurement() as m: + app.build() + assert m.connection_count == 1 # Text output assert (app.outdir / 'output.txt').exists() From 01c250e7d9e365ff99f4c1222675e388a42d0e59 Mon Sep 17 00:00:00 2001 From: James Addison Date: Tue, 25 Apr 2023 16:41:03 +0100 Subject: [PATCH 19/20] tests: linkcheck: reduce worker count to one for 'test_TooManyRedirects_on_HEAD' --- .../test-linkcheck-localserver-too-many-redirects/conf.py | 3 +++ .../test-linkcheck-localserver-too-many-redirects/index.rst | 1 + tests/test_build_linkcheck.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 tests/roots/test-linkcheck-localserver-too-many-redirects/conf.py create mode 100644 tests/roots/test-linkcheck-localserver-too-many-redirects/index.rst diff --git a/tests/roots/test-linkcheck-localserver-too-many-redirects/conf.py b/tests/roots/test-linkcheck-localserver-too-many-redirects/conf.py new file mode 100644 index 00000000000..aea9add395b --- /dev/null +++ b/tests/roots/test-linkcheck-localserver-too-many-redirects/conf.py @@ -0,0 +1,3 @@ +exclude_patterns = ['_build'] +linkcheck_timeout = 0.075 +linkcheck_workers = 1 diff --git a/tests/roots/test-linkcheck-localserver-too-many-redirects/index.rst b/tests/roots/test-linkcheck-localserver-too-many-redirects/index.rst new file mode 100644 index 00000000000..c617e942fd2 --- /dev/null +++ b/tests/roots/test-linkcheck-localserver-too-many-redirects/index.rst @@ -0,0 +1 @@ +`local server `_ diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 548f1b46743..6962606f0c1 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -567,7 +567,7 @@ def do_GET(self): self.wfile.write(content) -@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver', freshenv=True) +@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-too-many-redirects', freshenv=True) def test_TooManyRedirects_on_HEAD(app, monkeypatch): import requests.sessions From bd2ed0bfd5a89d3dd2efaad8dd6cbe1519f9037d Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 21 Apr 2023 11:29:05 +0100 Subject: [PATCH 20/20] tests: linkcheck: capture_headers_handler: relocate the (client) header capture to before any server-side communication has been initiated by the handler (cherry picked from commit 4d485aee6f304d432121f57c375a479f9674cc5b) --- tests/test_build_linkcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 6962606f0c1..8c25ddf0396 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -252,9 +252,9 @@ def do_HEAD(self): self.do_GET() def do_GET(self): + records.append(self.headers.as_string()) self.send_response(200, "OK") self.end_headers() - records.append(self.headers.as_string()) return HeadersDumperHandler