Skip to content

Commit

Permalink
Remove chardet/charset-normalizer. Add fallback_charset_resolver Clie…
Browse files Browse the repository at this point in the history
…ntSession parameter. (aio-libs#7561)

Co-authored-by: Sam Bull <git@sambull.org>
(cherry picked from commit 6755796)
  • Loading branch information
john-parton committed Sep 7, 2023
1 parent 5946c74 commit e55796d
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 98 deletions.
2 changes: 2 additions & 0 deletions CHANGES/7561.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Replace automatic character set detection with a `fallback_charset_resolver` parameter
in `ClientSession` to allow user-supplied character set detection functions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ Jesus Cea
Jian Zeng
Jinkyu Yi
Joel Watts
John Parton
Jon Nabozny
Jonas Krüger Svensson
Jonas Obrist
Expand Down
12 changes: 12 additions & 0 deletions aiohttp/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
from .tracing import Trace, TraceConfig
from .typedefs import Final, JSONEncoder, LooseCookies, LooseHeaders, StrOrURL

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]

__all__ = (
# client_exceptions
"ClientConnectionError",
Expand Down Expand Up @@ -159,6 +164,7 @@ class ClientTimeout:
DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)

_RetType = TypeVar("_RetType")
_CharsetResolver = Callable[[ClientResponse, bytes], str]


class ClientSession:
Expand Down Expand Up @@ -220,6 +226,10 @@ def __init__(
requote_redirect_url: bool = True,
trace_configs: Optional[List[TraceConfig]] = None,
read_bufsize: int = 2**16,
fallback_charset_resolver: _CharsetResolver = lambda r, b: chardet.detect(b)[
"encoding"
]
or "utf-8",
) -> None:
if loop is None:
if connector is not None:
Expand Down Expand Up @@ -313,6 +323,8 @@ def __init__(
for trace_config in self._trace_configs:
trace_config.freeze()

self._resolve_charset = fallback_charset_resolver

def __init_subclass__(cls: Type["ClientSession"]) -> None:
warnings.warn(
"Inheritance class {} from ClientSession "
Expand Down
55 changes: 28 additions & 27 deletions aiohttp/client_reqrep.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import codecs
import contextlib
import functools
import io
import re
Expand All @@ -12,6 +13,7 @@
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -66,11 +68,6 @@
ssl = None # type: ignore[assignment]
SSLContext = object # type: ignore[misc,assignment]

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]


__all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")

Expand Down Expand Up @@ -722,8 +719,8 @@ class ClientResponse(HeadersMixin):
_raw_headers: RawHeaders = None # type: ignore[assignment] # Response raw headers

_connection = None # current connection
_source_traceback = None
# setted up by ClientRequest after ClientResponse object creation
_source_traceback: Optional[traceback.StackSummary] = None
# set up by ClientRequest after ClientResponse object creation
# the post-init stage avoids having to change the ctor signature
_closed = True # to allow __del__ for non-initialized properly response
_released = False
Expand Down Expand Up @@ -760,6 +757,15 @@ def __init__(
self._loop = loop
# store a reference to session #1985
self._session: Optional[ClientSession] = session
# Save reference to _resolve_charset, so that get_encoding() will still
# work after the response has finished reading the body.
if session is None:
# TODO: Fix session=None in tests (see ClientRequest.__init__).
self._resolve_charset: Callable[
["ClientResponse", bytes], str
] = lambda *_: "utf-8"
else:
self._resolve_charset = session._resolve_charset
if loop.get_debug():
self._source_traceback = traceback.extract_stack(sys._getframe(1))

Expand Down Expand Up @@ -1053,27 +1059,22 @@ def get_encoding(self) -> str:

encoding = mimetype.parameters.get("charset")
if encoding:
try:
codecs.lookup(encoding)
except LookupError:
encoding = None
if not encoding:
if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
encoding = "utf-8"
elif self._body is None:
raise RuntimeError(
"Cannot guess the encoding of " "a not yet read body"
)
else:
encoding = chardet.detect(self._body)["encoding"]
if not encoding:
encoding = "utf-8"
with contextlib.suppress(LookupError):
return codecs.lookup(encoding).name

if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
return "utf-8"

if self._body is None:
raise RuntimeError(
"Cannot compute fallback encoding of a not yet read body"
)

return encoding
return self._resolve_charset(self, self._body)

async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
"""Read response payload and decode."""
Expand Down
35 changes: 35 additions & 0 deletions docs/client_advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -640,3 +640,38 @@ are changed so that aiohttp itself can wait on the underlying
connection to close. Please follow issue `#1925
<https://github.com/aio-libs/aiohttp/issues/1925>`_ for the progress
on this.

HTTP Pipelining
---------------

aiohttp does not support HTTP/HTTPS pipelining.


Character Set Detection
-----------------------

If you encounter a :exc:`UnicodeDecodeError` when using :meth:`ClientResponse.text()`,
this may be because the response does not include the charset needed
to decode the body.

If you know the correct encoding for a request, you can simply specify
the encoding as a parameter (e.g. ``resp.text("windows-1252")``).

Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
can be used to introduce charset guessing functionality. When a charset is not found
in the Content-Type header, this function will be called to get the charset encoding. For
example, this can be used with the ``chardetng_py`` library::

from chardetng_py import detect

def charset_resolver(resp: ClientResponse, body: bytes) -> str:
    """Guess the charset of *body*, using the URL's top-level domain as a hint."""
    # NOTE(review): yarl types ``URL.host`` as optional; guard so a missing
    # host degrades to "no hint" instead of raising AttributeError.
    host = resp.url.host or ""
    tld = host.rsplit(".", maxsplit=1)[-1]
    # chardetng_py takes the TLD hint as bytes (not str); pass None when
    # there is no usable TLD.
    return detect(body, allow_utf8=True, tld=tld.encode() or None)

ClientSession(fallback_charset_resolver=charset_resolver)

Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::

from charset_normalizer import detect

ClientSession(fallback_charset_resolver=lambda r, b: detect(b)["encoding"] or "utf-8")
50 changes: 22 additions & 28 deletions docs/client_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
read_bufsize=2**16, \
requote_redirect_url=False, \
trust_env=False, \
trace_configs=None)
trace_configs=None, \
fallback_charset_resolver=lambda r, b: "utf-8")

The class for creating client sessions and making requests.

Expand Down Expand Up @@ -200,6 +201,16 @@ The client session supports the context manager protocol for self closing.
disabling. See :ref:`aiohttp-client-tracing-reference` for
more information.

:param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
A :term:`callable` that accepts a :class:`ClientResponse` and the
:class:`bytes` contents, and returns a :class:`str` which will be used as
the encoding parameter to :meth:`bytes.decode()`.

This function will be called when the charset is not known (e.g. not specified in the
Content-Type header). The default function simply defaults to ``utf-8``.

.. versionadded:: 3.8.6

.. attribute:: closed

``True`` if the session has been closed, ``False`` otherwise.
Expand Down Expand Up @@ -1400,12 +1411,8 @@ Response object
Read response's body and return decoded :class:`str` using
specified *encoding* parameter.

If *encoding* is ``None`` content encoding is autocalculated
using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
header is not provided by server.

:term:`cchardet` is used with fallback to :term:`charset-normalizer` if
*cchardet* is not available.
If *encoding* is ``None`` content encoding is determined from the
Content-Type header, or using the ``fallback_charset_resolver`` function.

Close underlying connection if data reading gets an error,
release connection otherwise.
Expand All @@ -1414,10 +1421,9 @@ Response object
``None`` for encoding autodetection
(default).

:return str: decoded *BODY*

:raise LookupError: if the encoding detected by cchardet is
unknown by Python (e.g. VISCII).
:raises: :exc:`UnicodeDecodeError` if decoding fails. See also
:meth:`get_encoding`.

.. note::

Expand All @@ -1430,18 +1436,14 @@ Response object

await resp.text('ISO-8859-1')

.. comethod:: json(*, encoding=None, loads=json.loads, \
.. method:: json(*, encoding=None, loads=json.loads, \
content_type='application/json')

Read response's body as *JSON*, return :class:`dict` using
specified *encoding* and *loader*. If data is not still available
a ``read`` call will be done,
a ``read`` call will be done.

If *encoding* is ``None`` content encoding is autocalculated
using :term:`cchardet` or :term:`charset-normalizer` as fallback if
*cchardet* is not available.

if response's `content-type` does not match `content_type` parameter
If response's `content-type` does not match `content_type` parameter
:exc:`aiohttp.ContentTypeError` is raised.
To disable content type check pass ``None`` value.

Expand Down Expand Up @@ -1473,17 +1475,9 @@ Response object

.. method:: get_encoding()

Automatically detect content encoding using ``charset`` info in
``Content-Type`` HTTP header. If this info is not exists or there
are no appropriate codecs for encoding then :term:`cchardet` /
:term:`charset-normalizer` is used.

Beware that it is not always safe to use the result of this function to
decode a response. Some encodings detected by cchardet are not known by
Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.

:raise RuntimeError: if called before the body has been read,
for :term:`cchardet` usage
Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
If no charset is present or the charset is not understood by Python, the
``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.

.. versionadded:: 3.0

Expand Down
8 changes: 0 additions & 8 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,6 @@ Dependencies
- *charset-normalizer*
- *multidict*
- *yarl*
- *Optional* :term:`cchardet` as faster replacement for
:term:`charset-normalizer`.

Install it explicitly via:

.. code-block:: bash
$ pip install cchardet

- *Optional* :term:`aiodns` for fast DNS resolving. The
library is highly recommended.
Expand Down
45 changes: 10 additions & 35 deletions tests/test_client_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import gc
import sys
from typing import Any
from unittest import mock

import pytest
Expand Down Expand Up @@ -440,7 +441,11 @@ def side_effect(*args, **kwargs):
assert not response.get_encoding.called


async def test_text_detect_encoding(loop, session) -> None:
@pytest.mark.parametrize("content_type", ("text/plain", "text/plain;charset=invalid"))
async def test_text_charset_resolver(
content_type: str, loop: Any, session: Any
) -> None:
session._resolve_charset = lambda r, b: "cp1251"
response = ClientResponse(
"get",
URL("http://def-cl-resp.org"),
Expand All @@ -458,43 +463,15 @@ def side_effect(*args, **kwargs):
fut.set_result('{"тест": "пройден"}'.encode("cp1251"))
return fut

response._headers = {"Content-Type": "text/plain"}
response._headers = {"Content-Type": content_type}
content = response.content = mock.Mock()
content.read.side_effect = side_effect

await response.read()
res = await response.text()
assert res == '{"тест": "пройден"}'
assert response._connection is None


async def test_text_detect_encoding_if_invalid_charset(loop, session) -> None:
response = ClientResponse(
"get",
URL("http://def-cl-resp.org"),
request_info=mock.Mock(),
writer=mock.Mock(),
continue100=None,
timer=TimerNoop(),
traces=[],
loop=loop,
session=session,
)

def side_effect(*args, **kwargs):
fut = loop.create_future()
fut.set_result('{"тест": "пройден"}'.encode("cp1251"))
return fut

response._headers = {"Content-Type": "text/plain;charset=invalid"}
content = response.content = mock.Mock()
content.read.side_effect = side_effect

await response.read()
res = await response.text()
assert res == '{"тест": "пройден"}'
assert response._connection is None
assert response.get_encoding().lower() in ("windows-1251", "maccyrillic")
assert response.get_encoding() == "cp1251"


async def test_get_encoding_body_none(loop, session) -> None:
Expand All @@ -521,7 +498,7 @@ def side_effect(*args, **kwargs):

with pytest.raises(
RuntimeError,
match="^Cannot guess the encoding of a not yet read body$",
match="^Cannot compute fallback encoding of a not yet read body$",
):
response.get_encoding()
assert response.closed
Expand Down Expand Up @@ -742,9 +719,7 @@ def test_get_encoding_unknown(loop, session) -> None:
)

response._headers = {"Content-Type": "application/json"}
with mock.patch("aiohttp.client_reqrep.chardet") as m_chardet:
m_chardet.detect.return_value = {"encoding": None}
assert response.get_encoding() == "utf-8"
assert response.get_encoding() == "utf-8"


def test_raise_for_status_2xx() -> None:
Expand Down

0 comments on commit e55796d

Please sign in to comment.