Remove chardet/charset-normalizer. Add fallback_charset_resolver Clie…

…ntSession parameter. (aio-libs#7561) Co-authored-by: Sam Bull <git@sambull.org> (cherry picked from commit 6755796)
john-parton · Sep 7, 2023 · e55796d · e55796d
1 parent 5946c74
commit e55796d
Show file tree

Hide file tree

Showing 8 changed files with 110 additions and 98 deletions.
diff --git a/CHANGES/7561.feature b/CHANGES/7561.feature
@@ -0,0 +1,2 @@
+Replace automatic character set detection with a `fallback_charset_resolver` parameter
+in `ClientSession` to allow user-supplied character set detection functions.
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
@@ -163,6 +163,7 @@ Jesus Cea
 Jian Zeng
 Jinkyu Yi
 Joel Watts
+John Parton
 Jon Nabozny
 Jonas Krüger Svensson
 Jonas Obrist

diff --git a/aiohttp/client.py b/aiohttp/client.py
@@ -88,6 +88,11 @@
 from .tracing import Trace, TraceConfig
 from .typedefs import Final, JSONEncoder, LooseCookies, LooseHeaders, StrOrURL
 
+try:
+    import cchardet as chardet
+except ImportError:  # pragma: no cover
+    import charset_normalizer as chardet  # type: ignore[no-redef]
+
 __all__ = (
     # client_exceptions
     "ClientConnectionError",
@@ -159,6 +164,7 @@ class ClientTimeout:
 DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)
 
 _RetType = TypeVar("_RetType")
+_CharsetResolver = Callable[[ClientResponse, bytes], str]
 
 
 class ClientSession:
@@ -220,6 +226,10 @@ def __init__(
         requote_redirect_url: bool = True,
         trace_configs: Optional[List[TraceConfig]] = None,
         read_bufsize: int = 2**16,
+        fallback_charset_resolver: _CharsetResolver = lambda r, b: chardet.detect(b)[
+            "encoding"
+        ]
+        or "utf-8",
     ) -> None:
         if loop is None:
             if connector is not None:
@@ -313,6 +323,8 @@ def __init__(
         for trace_config in self._trace_configs:
             trace_config.freeze()
 
+        self._resolve_charset = fallback_charset_resolver
+
     def __init_subclass__(cls: Type["ClientSession"]) -> None:
         warnings.warn(
             "Inheritance class {} from ClientSession "

diff --git a/aiohttp/client_reqrep.py b/aiohttp/client_reqrep.py
@@ -1,5 +1,6 @@
 import asyncio
 import codecs
+import contextlib
 import functools
 import io
 import re
@@ -12,6 +13,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Callable,
     Dict,
     Iterable,
     List,
@@ -66,11 +68,6 @@
     ssl = None  # type: ignore[assignment]
     SSLContext = object  # type: ignore[misc,assignment]
 
-try:
-    import cchardet as chardet
-except ImportError:  # pragma: no cover
-    import charset_normalizer as chardet  # type: ignore[no-redef]
-
 
 __all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")
 
@@ -722,8 +719,8 @@ class ClientResponse(HeadersMixin):
     _raw_headers: RawHeaders = None  # type: ignore[assignment] # Response raw headers
 
     _connection = None  # current connection
-    _source_traceback = None
-    # setted up by ClientRequest after ClientResponse object creation
+    _source_traceback: Optional[traceback.StackSummary] = None
+    # set up by ClientRequest after ClientResponse object creation
     # post-init stage allows to not change ctor signature
     _closed = True  # to allow __del__ for non-initialized properly response
     _released = False
@@ -760,6 +757,15 @@ def __init__(
         self._loop = loop
         # store a reference to session #1985
         self._session: Optional[ClientSession] = session
+        # Save reference to _resolve_charset, so that get_encoding() will still
+        # work after the response has finished reading the body.
+        if session is None:
+            # TODO: Fix session=None in tests (see ClientRequest.__init__).
+            self._resolve_charset: Callable[
+                ["ClientResponse", bytes], str
+            ] = lambda *_: "utf-8"
+        else:
+            self._resolve_charset = session._resolve_charset
         if loop.get_debug():
             self._source_traceback = traceback.extract_stack(sys._getframe(1))
 
@@ -1053,27 +1059,22 @@ def get_encoding(self) -> str:
 
         encoding = mimetype.parameters.get("charset")
         if encoding:
-            try:
-                codecs.lookup(encoding)
-            except LookupError:
-                encoding = None
-        if not encoding:
-            if mimetype.type == "application" and (
-                mimetype.subtype == "json" or mimetype.subtype == "rdap"
-            ):
-                # RFC 7159 states that the default encoding is UTF-8.
-                # RFC 7483 defines application/rdap+json
-                encoding = "utf-8"
-            elif self._body is None:
-                raise RuntimeError(
-                    "Cannot guess the encoding of " "a not yet read body"
-                )
-            else:
-                encoding = chardet.detect(self._body)["encoding"]
-        if not encoding:
-            encoding = "utf-8"
+            with contextlib.suppress(LookupError):
+                return codecs.lookup(encoding).name
+
+        if mimetype.type == "application" and (
+            mimetype.subtype == "json" or mimetype.subtype == "rdap"
+        ):
+            # RFC 7159 states that the default encoding is UTF-8.
+            # RFC 7483 defines application/rdap+json
+            return "utf-8"
+
+        if self._body is None:
+            raise RuntimeError(
+                "Cannot compute fallback encoding of a not yet read body"
+            )
 
-        return encoding
+        return self._resolve_charset(self, self._body)
 
     async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
         """Read response payload and decode."""

diff --git a/docs/client_advanced.rst b/docs/client_advanced.rst
@@ -640,3 +640,38 @@ are changed so that aiohttp itself can wait on the underlying
 connection to close. Please follow issue `#1925
 <https://github.com/aio-libs/aiohttp/issues/1925>`_ for the progress
 on this.
+
+HTTP Pipelining
+---------------
+
+aiohttp does not support HTTP/HTTPS pipelining.
+
+
+Character Set Detection
+-----------------------
+
+If you encounter a :exc:`UnicodeDecodeError` when using :meth:`ClientResponse.text()`
+this may be because the response does not include the charset needed
+to decode the body.
+
+If you know the correct encoding for a request, you can simply specify
+the encoding as a parameter (e.g. ``resp.text("windows-1252")``).
+
+Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
+can be used to introduce charset guessing functionality. When a charset is not found
+in the Content-Type header, this function will be called to get the charset encoding. For
+example, this can be used with the ``chardetng_py`` library::
+
+    from chardetng_py import detect
+
+    def charset_resolver(resp: ClientResponse, body: bytes) -> str:
+        tld = resp.url.host.rsplit(".", maxsplit=1)[-1]
+        return detect(body, allow_utf8=True, tld=tld.encode())
+
+    ClientSession(fallback_charset_resolver=charset_resolver)
+
+Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::
+
+    from charset_normalizer import detect
+
+    ClientSession(fallback_charset_resolver=lambda r, b: detect(b)["encoding"] or "utf-8")
diff --git a/docs/client_reference.rst b/docs/client_reference.rst
@@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
                          read_bufsize=2**16, \
                          requote_redirect_url=False, \
                          trust_env=False, \
-                         trace_configs=None)
+                         trace_configs=None, \
+                         fallback_charset_resolver=lambda r, b: "utf-8")
 
    The class for creating client sessions and making requests.
 
@@ -200,6 +201,16 @@ The client session supports the context manager protocol for self closing.
                          disabling.  See :ref:`aiohttp-client-tracing-reference` for
                          more information.
 
+   :param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
+      A :term:`callable` that accepts a :class:`ClientResponse` and the
+      :class:`bytes` contents, and returns a :class:`str` which will be used as
+      the encoding parameter to :meth:`bytes.decode()`.
+
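+      This function will be called when the charset is not known (e.g. not specified in the
+      Content-Type header). The default function detects the charset with
+      *cchardet* (or *charset-normalizer* if *cchardet* is not installed) and
+      falls back to ``utf-8`` when detection finds nothing.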
+
+      .. versionadded:: 3.8.6
+
    .. attribute:: closed
 
       ``True`` if the session has been closed, ``False`` otherwise.
@@ -1400,12 +1411,8 @@ Response object
       Read response's body and return decoded :class:`str` using
       specified *encoding* parameter.
 
-      If *encoding* is ``None`` content encoding is autocalculated
-      using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
-      header is not provided by server.
-
-      :term:`cchardet` is used with fallback to :term:`charset-normalizer` if
-      *cchardet* is not available.
+      If *encoding* is ``None`` content encoding is determined from the
+      Content-Type header, or using the ``fallback_charset_resolver`` function.
 
       Close underlying connection if data reading gets an error,
       release connection otherwise.
@@ -1414,10 +1421,9 @@ Response object
                            ``None`` for encoding autodetection
                            (default).
 
-      :return str: decoded *BODY*
 
-      :raise LookupError: if the encoding detected by cchardet is
-                          unknown by Python (e.g. VISCII).
+      :raises: :exc:`UnicodeDecodeError` if decoding fails. See also
+               :meth:`get_encoding`.
 
       .. note::
 
@@ -1430,18 +1436,14 @@ Response object
 
             await resp.text('ISO-8859-1')
 
-   .. comethod:: json(*, encoding=None, loads=json.loads, \
+   .. method:: json(*, encoding=None, loads=json.loads, \
                       content_type='application/json')
 
       Read response's body as *JSON*, return :class:`dict` using
       specified *encoding* and *loader*. If data is not still available
-      a ``read`` call will be done,
+      a ``read`` call will be done.
 
-      If *encoding* is ``None`` content encoding is autocalculated
-      using :term:`cchardet` or :term:`charset-normalizer` as fallback if
-      *cchardet* is not available.
-
-      if response's `content-type` does not match `content_type` parameter
+      If response's `content-type` does not match `content_type` parameter
       :exc:`aiohttp.ContentTypeError` get raised.
       To disable content type check pass ``None`` value.
 
@@ -1473,17 +1475,9 @@ Response object
 
    .. method:: get_encoding()
 
-      Automatically detect content encoding using ``charset`` info in
-      ``Content-Type`` HTTP header. If this info is not exists or there
-      are no appropriate codecs for encoding then :term:`cchardet` /
-      :term:`charset-normalizer` is used.
-
-      Beware that it is not always safe to use the result of this function to
-      decode a response. Some encodings detected by cchardet are not known by
-      Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.
-
-      :raise RuntimeError: if called before the body has been read,
-                           for :term:`cchardet` usage
+      Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
+      If no charset is present or the charset is not understood by Python, the
+      ``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.
 
       .. versionadded:: 3.0
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -162,14 +162,6 @@ Dependencies
 - *charset-normalizer*
 - *multidict*
 - *yarl*
-- *Optional* :term:`cchardet` as faster replacement for
-  :term:`charset-normalizer`.
-
-  Install it explicitly via:
-
-  .. code-block:: bash
-
-     $ pip install cchardet
 
 - *Optional* :term:`aiodns` for fast DNS resolving. The
   library is highly recommended.

diff --git a/tests/test_client_response.py b/tests/test_client_response.py
@@ -2,6 +2,7 @@
 
 import gc
 import sys
+from typing import Any
 from unittest import mock
 
 import pytest
@@ -440,7 +441,11 @@ def side_effect(*args, **kwargs):
     assert not response.get_encoding.called
 
 
-async def test_text_detect_encoding(loop, session) -> None:
+@pytest.mark.parametrize("content_type", ("text/plain", "text/plain;charset=invalid"))
+async def test_text_charset_resolver(
+    content_type: str, loop: Any, session: Any
+) -> None:
+    session._resolve_charset = lambda r, b: "cp1251"
     response = ClientResponse(
         "get",
         URL("http://def-cl-resp.org"),
@@ -458,43 +463,15 @@ def side_effect(*args, **kwargs):
         fut.set_result('{"тест": "пройден"}'.encode("cp1251"))
         return fut
 
-    response._headers = {"Content-Type": "text/plain"}
+    response._headers = {"Content-Type": content_type}
     content = response.content = mock.Mock()
     content.read.side_effect = side_effect
 
     await response.read()
     res = await response.text()
     assert res == '{"тест": "пройден"}'
     assert response._connection is None
-
-
-async def test_text_detect_encoding_if_invalid_charset(loop, session) -> None:
-    response = ClientResponse(
-        "get",
-        URL("http://def-cl-resp.org"),
-        request_info=mock.Mock(),
-        writer=mock.Mock(),
-        continue100=None,
-        timer=TimerNoop(),
-        traces=[],
-        loop=loop,
-        session=session,
-    )
-
-    def side_effect(*args, **kwargs):
-        fut = loop.create_future()
-        fut.set_result('{"тест": "пройден"}'.encode("cp1251"))
-        return fut
-
-    response._headers = {"Content-Type": "text/plain;charset=invalid"}
-    content = response.content = mock.Mock()
-    content.read.side_effect = side_effect
-
-    await response.read()
-    res = await response.text()
-    assert res == '{"тест": "пройден"}'
-    assert response._connection is None
-    assert response.get_encoding().lower() in ("windows-1251", "maccyrillic")
+    assert response.get_encoding() == "cp1251"
 
 
 async def test_get_encoding_body_none(loop, session) -> None:
@@ -521,7 +498,7 @@ def side_effect(*args, **kwargs):
 
     with pytest.raises(
         RuntimeError,
-        match="^Cannot guess the encoding of a not yet read body$",
+        match="^Cannot compute fallback encoding of a not yet read body$",
     ):
         response.get_encoding()
     assert response.closed
@@ -742,9 +719,7 @@ def test_get_encoding_unknown(loop, session) -> None:
     )
 
     response._headers = {"Content-Type": "application/json"}
-    with mock.patch("aiohttp.client_reqrep.chardet") as m_chardet:
-        m_chardet.detect.return_value = {"encoding": None}
-        assert response.get_encoding() == "utf-8"
+    assert response.get_encoding() == "utf-8"
 
 
 def test_raise_for_status_2xx() -> None: