Skip to content

Commit

Permalink
Remove chardet/charset-normalizer. (#7589)
Browse files Browse the repository at this point in the history
Add fallback_charset_resolver ClientSession parameter. (#7561)

Co-authored-by: Sam Bull <git@sambull.org>
(cherry picked from commit 6755796)

---------

Co-authored-by: Sam Bull <git@sambull.org>
  • Loading branch information
john-parton and Dreamsorcerer committed Sep 9, 2023
1 parent 5946c74 commit b30c0cd
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 98 deletions.
2 changes: 2 additions & 0 deletions CHANGES/7561.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Replace automatic character set detection with a `fallback_charset_resolver` parameter
in `ClientSession` to allow user-supplied character set detection functions.
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ Jesus Cea
Jian Zeng
Jinkyu Yi
Joel Watts
John Parton
Jon Nabozny
Jonas Krüger Svensson
Jonas Obrist
Expand Down
26 changes: 26 additions & 0 deletions aiohttp/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
from .tracing import Trace, TraceConfig
from .typedefs import Final, JSONEncoder, LooseCookies, LooseHeaders, StrOrURL

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]

__all__ = (
# client_exceptions
"ClientConnectionError",
Expand Down Expand Up @@ -159,6 +164,22 @@ class ClientTimeout:
DEFAULT_TIMEOUT: Final[ClientTimeout] = ClientTimeout(total=5 * 60)

_RetType = TypeVar("_RetType")
_CharsetResolver = Callable[[ClientResponse, bytes], str]


def _default_fallback_charset_resolver(response: ClientResponse, body: bytes) -> str:

ret: str = chardet.detect(body)["encoding"] or "utf-8"

if ret != "utf-8":
warnings.warn(
"Automatic charset detection will be removed in 3.9, see: "
"https://docs.aiohttp.org/en/stable/client_advanced.html#character-set-detection", # noqa: E501
DeprecationWarning,
stacklevel=3,
)

return ret


class ClientSession:
Expand Down Expand Up @@ -220,6 +241,9 @@ def __init__(
requote_redirect_url: bool = True,
trace_configs: Optional[List[TraceConfig]] = None,
read_bufsize: int = 2**16,
fallback_charset_resolver: _CharsetResolver = (
_default_fallback_charset_resolver
),
) -> None:
if loop is None:
if connector is not None:
Expand Down Expand Up @@ -313,6 +337,8 @@ def __init__(
for trace_config in self._trace_configs:
trace_config.freeze()

self._resolve_charset = fallback_charset_resolver

def __init_subclass__(cls: Type["ClientSession"]) -> None:
warnings.warn(
"Inheritance class {} from ClientSession "
Expand Down
55 changes: 28 additions & 27 deletions aiohttp/client_reqrep.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import codecs
import contextlib
import functools
import io
import re
Expand All @@ -12,6 +13,7 @@
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Expand Down Expand Up @@ -66,11 +68,6 @@
ssl = None # type: ignore[assignment]
SSLContext = object # type: ignore[misc,assignment]

try:
import cchardet as chardet
except ImportError: # pragma: no cover
import charset_normalizer as chardet # type: ignore[no-redef]


__all__ = ("ClientRequest", "ClientResponse", "RequestInfo", "Fingerprint")

Expand Down Expand Up @@ -722,8 +719,8 @@ class ClientResponse(HeadersMixin):
_raw_headers: RawHeaders = None # type: ignore[assignment] # Response raw headers

_connection = None # current connection
_source_traceback = None
# setted up by ClientRequest after ClientResponse object creation
_source_traceback: Optional[traceback.StackSummary] = None
# set up by ClientRequest after ClientResponse object creation
# post-init stage allows to not change ctor signature
_closed = True # to allow __del__ for non-initialized properly response
_released = False
Expand Down Expand Up @@ -760,6 +757,15 @@ def __init__(
self._loop = loop
# store a reference to session #1985
self._session: Optional[ClientSession] = session
# Save reference to _resolve_charset, so that get_encoding() will still
# work after the response has finished reading the body.
if session is None:
# TODO: Fix session=None in tests (see ClientRequest.__init__).
self._resolve_charset: Callable[
["ClientResponse", bytes], str
] = lambda *_: "utf-8"
else:
self._resolve_charset = session._resolve_charset
if loop.get_debug():
self._source_traceback = traceback.extract_stack(sys._getframe(1))

Expand Down Expand Up @@ -1053,27 +1059,22 @@ def get_encoding(self) -> str:

encoding = mimetype.parameters.get("charset")
if encoding:
try:
codecs.lookup(encoding)
except LookupError:
encoding = None
if not encoding:
if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
encoding = "utf-8"
elif self._body is None:
raise RuntimeError(
"Cannot guess the encoding of " "a not yet read body"
)
else:
encoding = chardet.detect(self._body)["encoding"]
if not encoding:
encoding = "utf-8"
with contextlib.suppress(LookupError):
return codecs.lookup(encoding).name

if mimetype.type == "application" and (
mimetype.subtype == "json" or mimetype.subtype == "rdap"
):
# RFC 7159 states that the default encoding is UTF-8.
# RFC 7483 defines application/rdap+json
return "utf-8"

if self._body is None:
raise RuntimeError(
"Cannot compute fallback encoding of a not yet read body"
)

return encoding
return self._resolve_charset(self, self._body)

async def text(self, encoding: Optional[str] = None, errors: str = "strict") -> str:
"""Read response payload and decode."""
Expand Down
30 changes: 30 additions & 0 deletions docs/client_advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -640,3 +640,33 @@ are changed so that aiohttp itself can wait on the underlying
connection to close. Please follow issue `#1925
<https://github.com/aio-libs/aiohttp/issues/1925>`_ for the progress
on this.


Character Set Detection
-----------------------

If you encounter an 'Automatic charset detection will be removed' warning
when using :meth:`ClientResponse.text()` this may be because the response
does not include the charset needed to decode the body.

If you know the correct encoding for a request, you can simply specify
the encoding as a parameter (e.g. ``resp.text("windows-1252")``).

Alternatively, :class:`ClientSession` accepts a ``fallback_charset_resolver`` parameter which
can be used to reintroduce charset guessing functionality. When a charset is not found
in the Content-Type header, this function will be called to get the charset encoding. For
example, this can be used with the ``chardetng_py`` library.::

from chardetng_py import detect

def charset_resolver(resp: ClientResponse, body: bytes) -> str:
tld = resp.url.host.rsplit(".", maxsplit=1)[-1]
return detect(body, allow_utf8=True, tld=tld)

ClientSession(fallback_charset_resolver=charset_resolver)

Or, if ``chardetng_py`` doesn't work for you, then ``charset-normalizer`` is another option::

from charset_normalizer import detect

ClientSession(fallback_charset_resolver=lamba r, b: detect(b)["encoding"] or "utf-8")
51 changes: 23 additions & 28 deletions docs/client_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ The client session supports the context manager protocol for self closing.
read_bufsize=2**16, \
requote_redirect_url=False, \
trust_env=False, \
trace_configs=None)
trace_configs=None, \
fallback_charset_resolver=_chardet_resolver)

The class for creating client sessions and making requests.

Expand Down Expand Up @@ -200,6 +201,18 @@ The client session supports the context manager protocol for self closing.
disabling. See :ref:`aiohttp-client-tracing-reference` for
more information.

:param Callable[[ClientResponse,bytes],str] fallback_charset_resolver:
A :term:`callable` that accepts a :class:`ClientResponse` and the
:class:`bytes` contents, and returns a :class:`str` which will be used as
the encoding parameter to :meth:`bytes.decode()`.

This function will be called when the charset is not known (e.g. not specified in the
Content-Type header). The default function in 3.8.6 calls ``chardetng``
or ``charset-normaliser``. In 3.9+ this be replaced with a function that
simply defaults to ``utf-8``.

.. versionadded:: 3.8.6

.. attribute:: closed

``True`` if the session has been closed, ``False`` otherwise.
Expand Down Expand Up @@ -1400,12 +1413,8 @@ Response object
Read response's body and return decoded :class:`str` using
specified *encoding* parameter.

If *encoding* is ``None`` content encoding is autocalculated
using ``Content-Type`` HTTP header and *charset-normalizer* tool if the
header is not provided by server.

:term:`cchardet` is used with fallback to :term:`charset-normalizer` if
*cchardet* is not available.
If *encoding* is ``None`` content encoding is determined from the
Content-Type header, or using the ``fallback_charset_resolver`` function.

Close underlying connection if data reading gets an error,
release connection otherwise.
Expand All @@ -1414,10 +1423,7 @@ Response object
``None`` for encoding autodetection
(default).

:return str: decoded *BODY*

:raise LookupError: if the encoding detected by cchardet is
unknown by Python (e.g. VISCII).

.. note::

Expand All @@ -1430,18 +1436,15 @@ Response object

await resp.text('ISO-8859-1')

.. comethod:: json(*, encoding=None, loads=json.loads, \
.. method:: json(*, encoding=None, loads=json.loads, \
content_type='application/json')
:async:

Read response's body as *JSON*, return :class:`dict` using
specified *encoding* and *loader*. If data is not still available
a ``read`` call will be done,
a ``read`` call will be done.

If *encoding* is ``None`` content encoding is autocalculated
using :term:`cchardet` or :term:`charset-normalizer` as fallback if
*cchardet* is not available.

if response's `content-type` does not match `content_type` parameter
If response's `content-type` does not match `content_type` parameter
:exc:`aiohttp.ContentTypeError` get raised.
To disable content type check pass ``None`` value.

Expand Down Expand Up @@ -1473,17 +1476,9 @@ Response object

.. method:: get_encoding()

Automatically detect content encoding using ``charset`` info in
``Content-Type`` HTTP header. If this info is not exists or there
are no appropriate codecs for encoding then :term:`cchardet` /
:term:`charset-normalizer` is used.

Beware that it is not always safe to use the result of this function to
decode a response. Some encodings detected by cchardet are not known by
Python (e.g. VISCII). *charset-normalizer* is not concerned by that issue.

:raise RuntimeError: if called before the body has been read,
for :term:`cchardet` usage
Retrieve content encoding using ``charset`` info in ``Content-Type`` HTTP header.
If no charset is present or the charset is not understood by Python, the
``fallback_charset_resolver`` function associated with the ``ClientSession`` is called.

.. versionadded:: 3.0

Expand Down
8 changes: 0 additions & 8 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,6 @@ Dependencies
- *charset-normalizer*
- *multidict*
- *yarl*
- *Optional* :term:`cchardet` as faster replacement for
:term:`charset-normalizer`.

Install it explicitly via:

.. code-block:: bash
$ pip install cchardet

- *Optional* :term:`aiodns` for fast DNS resolving. The
library is highly recommended.
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ filterwarnings =
# can be dropped with the next release of `certify`, specifically
# `certify > 2022.06.15`.
ignore:path is deprecated. Use files.. instead. Refer to https.//importlib-resources.readthedocs.io/en/latest/using.html#migrating-from-legacy for migration advice.:DeprecationWarning:certifi.core
ignore:Automatic charset detection will be removed in 3.9:DeprecationWarning
junit_suite_name = aiohttp_test_suite
norecursedirs = dist docs build .tox .eggs
minversion = 3.8.2
Expand Down

0 comments on commit b30c0cd

Please sign in to comment.