From 2869378e79b848a6a1e4df0587987697ff09b46c Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 18:00:19 +0100 Subject: [PATCH 01/10] linkcheck builder: begin using session-based HTTP requests --- sphinx/builders/linkcheck.py | 17 ++++++++------ sphinx/util/requests.py | 43 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 428669349fa..2747d7f71dc 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -279,12 +279,16 @@ def __init__(self, config: Config, self.tls_verify = config.tls_verify self.tls_cacerts = config.tls_cacerts + self._session = requests._Session() + super().__init__(daemon=True) def run(self) -> None: while True: next_check, hyperlink = self.wqueue.get() if hyperlink is None: + # An empty hyperlink is a signal to shutdown the worker; cleanup resources here + self._session.close() break uri, docname, _docpath, lineno = hyperlink @@ -346,6 +350,11 @@ def _check(self, docname: str, uri: str, hyperlink: Hyperlink) -> tuple[str, str return status, info, code + def _retrieval_methods(self, check_anchors: bool, anchor: str) -> Iterator[tuple[Callable, dict]]: + if not check_anchors or not anchor: + yield self._session.head, {'allow_redirects': True} + yield self._session.get, {'stream': True} + def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: req_url, delimiter, anchor = uri.partition('#') for rex in self.anchors_ignore if delimiter and anchor else []: @@ -377,7 +386,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: error_message = None status_code = -1 response_url = retry_after = '' - for retrieval_method, kwargs in _retrieval_methods(self.check_anchors, anchor): + for retrieval_method, kwargs in self._retrieval_methods(self.check_anchors, anchor): try: with retrieval_method( url=req_url, auth=auth_info, @@ -508,12 +517,6 @@ def _get_request_headers( return {} -def _retrieval_methods(check_anchors: bool, anchor: str) -> Iterator[tuple[Callable, dict]]: - if not check_anchors or not anchor: - yield requests.head, {'allow_redirects': True} - yield requests.get, {'stream': True} - - def contains_anchor(response: Response, anchor: str) -> bool: """Determine if an anchor is contained within an HTTP response.""" diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index fb89d1237b7..ab54105ffd7 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -77,3 +77,46 @@ def head(url: str, with ignore_insecure_warning(verify): return requests.head(url, **kwargs) + + +class _Session(requests.Session): + + def get(self, + url: str, + _user_agent: str = '', + _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] + **kwargs: Any) -> requests.Response: + """Sends a GET request like requests.get(). + + This sets up User-Agent header and TLS verification automatically.""" + headers = kwargs.setdefault('headers', {}) + headers.setdefault('User-Agent', _user_agent or _USER_AGENT) + if _tls_info: + tls_verify, tls_cacerts = _tls_info + verify = bool(kwargs.get('verify', tls_verify)) + kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) + else: + verify = kwargs.get('verify', True) + + with ignore_insecure_warning(verify): + return super().get(url, **kwargs) + + def head(self, + url: str, + _user_agent: str = '', + _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] + **kwargs: Any) -> requests.Response: + """Sends a HEAD request like requests.head(). + + This sets up User-Agent header and TLS verification automatically.""" + headers = kwargs.setdefault('headers', {}) + headers.setdefault('User-Agent', _user_agent or _USER_AGENT) + if _tls_info: + tls_verify, tls_cacerts = _tls_info + verify = bool(kwargs.get('verify', tls_verify)) + kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) + else: + verify = kwargs.get('verify', True) + + with ignore_insecure_warning(verify): + return super().head(url, **kwargs) From 779c1485934a69195cf868624c206890e76536bb Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 18:05:46 +0100 Subject: [PATCH 02/10] linkcheck builder tests: reduce acceptable connection count threshold --- tests/test_build_linkcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_build_linkcheck.py b/tests/test_build_linkcheck.py index 0d032f947c8..cd0daaa50ae 100644 --- a/tests/test_build_linkcheck.py +++ b/tests/test_build_linkcheck.py @@ -104,7 +104,7 @@ def test_defaults(app): with http_server(DefaultsHandler): with ConnectionMeasurement() as m: app.build() - assert m.connection_count <= 10 + assert m.connection_count <= 5 # Text output assert (app.outdir / 'output.txt').exists() From 4fb1f47c7a4ee7249dc5143a37f490d9d4209180 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 18:13:17 +0100 Subject: [PATCH 03/10] linting: fixup: correct under-intended lines --- sphinx/util/requests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index ab54105ffd7..a565e6b3947 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -102,10 +102,10 @@ def get(self, return super().get(url, **kwargs) def head(self, - url: str, - _user_agent: str = '', - _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] - **kwargs: Any) -> requests.Response: + url: str, + _user_agent: str = '', + _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] + **kwargs: Any) -> requests.Response: """Sends a HEAD request like requests.head(). This sets up User-Agent header and TLS verification automatically.""" From 31fec3273c02ed4e20779c99307b971fcc57bc31 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 18:20:17 +0100 Subject: [PATCH 04/10] linting: adjust method signatures and their formatting to fit within flake8 style guidance --- sphinx/builders/linkcheck.py | 4 +++- sphinx/util/requests.py | 24 ++++++++---------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 2747d7f71dc..b3777afdb0b 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -350,7 +350,9 @@ def _check(self, docname: str, uri: str, hyperlink: Hyperlink) -> tuple[str, str return status, info, code - def _retrieval_methods(self, check_anchors: bool, anchor: str) -> Iterator[tuple[Callable, dict]]: + def _retrieval_methods(self, + check_anchors: bool, + anchor: str) -> Iterator[tuple[Callable, dict]]: if not check_anchors or not anchor: yield self._session.head, {'allow_redirects': True} yield self._session.get, {'stream': True} diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index a565e6b3947..408efccd821 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -81,18 +81,14 @@ def head(url: str, class _Session(requests.Session): - def get(self, - url: str, - _user_agent: str = '', - _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] - **kwargs: Any) -> requests.Response: + def get(self, url: str, **kwargs: Any) -> requests.Response: """Sends a GET request like requests.get(). This sets up User-Agent header and TLS verification automatically.""" headers = kwargs.setdefault('headers', {}) - headers.setdefault('User-Agent', _user_agent or _USER_AGENT) - if _tls_info: - tls_verify, tls_cacerts = _tls_info + headers.setdefault('User-Agent', kwargs.pop('_user_agent', _USER_AGENT)) + if '_tls_info' in kwargs: + tls_verify, tls_cacerts = kwargs.pop('_tls_info') verify = bool(kwargs.get('verify', tls_verify)) kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) else: @@ -101,18 +97,14 @@ def get(self, with ignore_insecure_warning(verify): return super().get(url, **kwargs) - def head(self, - url: str, - _user_agent: str = '', - _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] - **kwargs: Any) -> requests.Response: + def head(self, url: str, **kwargs: Any) -> requests.Response: """Sends a HEAD request like requests.head(). This sets up User-Agent header and TLS verification automatically.""" headers = kwargs.setdefault('headers', {}) - headers.setdefault('User-Agent', _user_agent or _USER_AGENT) - if _tls_info: - tls_verify, tls_cacerts = _tls_info + headers.setdefault('User-Agent', kwargs.pop('_user_agent', _USER_AGENT)) + if '_tls_info' in kwargs: + tls_verify, tls_cacerts = kwargs.pop('_tls_info') verify = bool(kwargs.get('verify', tls_verify)) kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) else: From b0e0ccd766dd6cef8d8d506015303eed7026017d Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 18:22:18 +0100 Subject: [PATCH 05/10] linting: add mypy typecheck override for Session request methods (head, get) --- sphinx/util/requests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index 408efccd821..7741c830a8e 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -81,7 +81,7 @@ def head(url: str, class _Session(requests.Session): - def get(self, url: str, **kwargs: Any) -> requests.Response: + def get(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore[override] """Sends a GET request like requests.get(). This sets up User-Agent header and TLS verification automatically.""" @@ -97,7 +97,7 @@ def get(self, url: str, **kwargs: Any) -> requests.Response: with ignore_insecure_warning(verify): return super().get(url, **kwargs) - def head(self, url: str, **kwargs: Any) -> requests.Response: + def head(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore[override] """Sends a HEAD request like requests.head(). This sets up User-Agent header and TLS verification automatically.""" From b7d7fcfc9ba65680cb41e6366fe8da34b9f7cd3a Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 21:55:28 +0100 Subject: [PATCH 06/10] util.requests: consolidate onetime and session-based HTTP request code paths --- sphinx/util/requests.py | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index 7741c830a8e..0540a24bbb2 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -39,44 +39,20 @@ def _get_tls_cacert(url: str, certs: str | dict[str, str] | None) -> str | bool: return certs.get(hostname, True) -def get(url: str, - _user_agent: str = '', - _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] - **kwargs: Any) -> requests.Response: +def get(url: str, **kwargs: Any) -> requests.Response: """Sends a HEAD request like requests.head(). This sets up User-Agent header and TLS verification automatically.""" - headers = kwargs.setdefault('headers', {}) - headers.setdefault('User-Agent', _user_agent or _USER_AGENT) - if _tls_info: - tls_verify, tls_cacerts = _tls_info - verify = bool(kwargs.get('verify', tls_verify)) - kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) - else: - verify = kwargs.get('verify', True) - - with ignore_insecure_warning(verify): - return requests.get(url, **kwargs) + with _Session() as session: + return session.get(url, **kwargs) -def head(url: str, - _user_agent: str = '', - _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] - **kwargs: Any) -> requests.Response: +def head(url: str, **kwargs: Any) -> requests.Response: """Sends a HEAD request like requests.head(). This sets up User-Agent header and TLS verification automatically.""" - headers = kwargs.setdefault('headers', {}) - headers.setdefault('User-Agent', _user_agent or _USER_AGENT) - if _tls_info: - tls_verify, tls_cacerts = _tls_info - verify = bool(kwargs.get('verify', tls_verify)) - kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) - else: - verify = kwargs.get('verify', True) - - with ignore_insecure_warning(verify): - return requests.head(url, **kwargs) + with _Session() as session: + return session.head(url, **kwargs) class _Session(requests.Session): From 4792c87ac15253b8508565379543df4163716fa3 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 22:05:59 +0100 Subject: [PATCH 07/10] util.requests: restore type hints for '_user_agent' and '_tls_info' HTTP request method parameters --- sphinx/util/requests.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index 0540a24bbb2..ca9352f2301 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -57,7 +57,13 @@ def head(url: str, **kwargs: Any) -> requests.Response: class _Session(requests.Session): - def get(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore[override] + def get( # type: ignore[override] + self, + url: str, + _user_agent: str = '', + _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] + **kwargs: Any + ) -> requests.Response: """Sends a GET request like requests.get(). This sets up User-Agent header and TLS verification automatically.""" @@ -73,7 +79,13 @@ def get(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore[ove with ignore_insecure_warning(verify): return super().get(url, **kwargs) - def head(self, url: str, **kwargs: Any) -> requests.Response: # type: ignore[override] + def head( # type: ignore[override] + self, + url: str, + _user_agent: str = '', + _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] + **kwargs: Any + ) -> requests.Response: """Sends a HEAD request like requests.head(). This sets up User-Agent header and TLS verification automatically.""" From 018df61b39bac5e082d9837596603f640d06fc5f Mon Sep 17 00:00:00 2001 From: James Addison Date: Sun, 23 Jul 2023 22:09:27 +0100 Subject: [PATCH 08/10] util.requests: fixup: correctly read _user_agent and _tls_info HTTP request method arguments Partially reverts commit 31fec3273c02ed4e20779c99307b971fcc57bc31. --- sphinx/util/requests.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index ca9352f2301..a4c0273a8ee 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -68,9 +68,9 @@ def get( # type: ignore[override] This sets up User-Agent header and TLS verification automatically.""" headers = kwargs.setdefault('headers', {}) - headers.setdefault('User-Agent', kwargs.pop('_user_agent', _USER_AGENT)) - if '_tls_info' in kwargs: - tls_verify, tls_cacerts = kwargs.pop('_tls_info') + headers.setdefault('User-Agent', _user_agent or _USER_AGENT) + if _tls_info: + tls_verify, tls_cacerts = _tls_info verify = bool(kwargs.get('verify', tls_verify)) kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) else: @@ -90,9 +90,9 @@ def head( # type: ignore[override] This sets up User-Agent header and TLS verification automatically.""" headers = kwargs.setdefault('headers', {}) - headers.setdefault('User-Agent', kwargs.pop('_user_agent', _USER_AGENT)) - if '_tls_info' in kwargs: - tls_verify, tls_cacerts = kwargs.pop('_tls_info') + headers.setdefault('User-Agent', _user_agent or _USER_AGENT) + if _tls_info: + tls_verify, tls_cacerts = _tls_info verify = bool(kwargs.get('verify', tls_verify)) kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) else: From 927062ee4dc73a2ba2fa40ada43bdb9312e645d9 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 23 Jul 2023 22:11:53 +0100 Subject: [PATCH 09/10] Override UA and TLS on ``Session.request()`` --- sphinx/util/requests.py | 51 +++++++++-------------------------------- 1 file changed, 11 insertions(+), 40 deletions(-) diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index a4c0273a8ee..08bf3b6434d 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -3,8 +3,7 @@ from __future__ import annotations import warnings -from contextlib import contextmanager -from typing import Any, Iterator +from typing import Any from urllib.parse import urlsplit import requests @@ -16,15 +15,6 @@ f'Sphinx/{sphinx.__version__}') -@contextmanager -def ignore_insecure_warning(verify: bool) -> Iterator[None]: - with warnings.catch_warnings(): - if not verify: - # ignore InsecureRequestWarning if verify=False - warnings.filterwarnings("ignore", category=InsecureRequestWarning) - yield - - def _get_tls_cacert(url: str, certs: str | dict[str, str] | None) -> str | bool: """Get additional CA cert for a specific URL.""" if not certs: @@ -40,7 +30,7 @@ def _get_tls_cacert(url: str, certs: str | dict[str, str] | None) -> str | bool: def get(url: str, **kwargs: Any) -> requests.Response: - """Sends a HEAD request like requests.head(). + """Sends a GET request like requests.get(). This sets up User-Agent header and TLS verification automatically.""" with _Session() as session: @@ -56,15 +46,13 @@ def head(url: str, **kwargs: Any) -> requests.Response: class _Session(requests.Session): - - def get( # type: ignore[override] - self, - url: str, + def request( # type: ignore[override] + self, method: str, url: str, _user_agent: str = '', _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] **kwargs: Any ) -> requests.Response: - """Sends a GET request like requests.get(). + """Sends a request with an HTTP verb and url. This sets up User-Agent header and TLS verification automatically.""" headers = kwargs.setdefault('headers', {}) @@ -76,27 +64,10 @@ def get( # type: ignore[override] else: verify = kwargs.get('verify', True) - with ignore_insecure_warning(verify): - return super().get(url, **kwargs) + if verify: + return super().request(method, url, **kwargs) - def head( # type: ignore[override] - self, - url: str, - _user_agent: str = '', - _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] - **kwargs: Any - ) -> requests.Response: - """Sends a HEAD request like requests.head(). - - This sets up User-Agent header and TLS verification automatically.""" - headers = kwargs.setdefault('headers', {}) - headers.setdefault('User-Agent', _user_agent or _USER_AGENT) - if _tls_info: - tls_verify, tls_cacerts = _tls_info - verify = bool(kwargs.get('verify', tls_verify)) - kwargs.setdefault('verify', verify and _get_tls_cacert(url, tls_cacerts)) - else: - verify = kwargs.get('verify', True) - - with ignore_insecure_warning(verify): - return super().head(url, **kwargs) + with warnings.catch_warnings(): + # ignore InsecureRequestWarning if verify=False + warnings.filterwarnings("ignore", category=InsecureRequestWarning) + return super().request(method, url, **kwargs) From 330dce957d90a52909cf1ff70b0afc69d6ed8961 Mon Sep 17 00:00:00 2001 From: Adam Turner <9087854+aa-turner@users.noreply.github.com> Date: Sun, 23 Jul 2023 22:15:13 +0100 Subject: [PATCH 10/10] Ruff --- sphinx/util/requests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sphinx/util/requests.py b/sphinx/util/requests.py index 08bf3b6434d..ec3d8d2d5b8 100644 --- a/sphinx/util/requests.py +++ b/sphinx/util/requests.py @@ -50,7 +50,7 @@ def request( # type: ignore[override] self, method: str, url: str, _user_agent: str = '', _tls_info: tuple[bool, str | dict[str, str] | None] = (), # type: ignore[assignment] - **kwargs: Any + **kwargs: Any, ) -> requests.Response: """Sends a request with an HTTP verb and url.