sklearn/datasets/tests/test_openml.py

"""Test the openml loader."""
import gzip
import json
import os
import re
from functools import partial
from io import BytesIO
from urllib.error import HTTPError

import numpy as np
import scipy.sparse
import pytest

import sklearn
from sklearn import config_context
from sklearn.utils import Bunch, check_pandas_support
from sklearn.utils.fixes import _open_binary
from sklearn.utils._testing import (
    SkipTest,
    assert_allclose,
    assert_array_equal,
    fails_if_pypy,
)

from sklearn.datasets import fetch_openml as fetch_openml_orig
from sklearn.datasets._openml import (
    _OPENML_PREFIX,
    _open_openml_url,
    _get_local_path,
    _retry_with_clean_cache,
)


OPENML_TEST_DATA_MODULE = "sklearn.datasets.tests.data.openml"
# if True, urlopen will be monkey patched to only use local files
test_offline = True


class _MockHTTPResponse:
    def __init__(self, data, is_gzip):
        self.data = data
        self.is_gzip = is_gzip

    def read(self, amt=-1):
        return self.data.read(amt)

    def close(self):
        self.data.close()

    def info(self):
        if self.is_gzip:
            return {"Content-Encoding": "gzip"}
        return {}

    def __iter__(self):
        return iter(self.data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False


# Disable the disk-based cache when testing `fetch_openml`:
# the mock data in sklearn/datasets/tests/data/openml/ is not always consistent
# with the version on openml.org. If one were to load the dataset outside of
# the tests, it may result in data that does not represent openml.org.
fetch_openml = partial(fetch_openml_orig, data_home=None)


def _monkey_patch_webbased_functions(context, data_id, gzip_response):
    # monkey patches the urlopen function. Important note: Do NOT use this
    # in combination with a regular cache directory, as the files that are
    # stored as cache should not be mixed up with real openml datasets
    url_prefix_data_description = "https://openml.org/api/v1/json/data/"
    url_prefix_data_features = "https://openml.org/api/v1/json/data/features/"
    url_prefix_download_data = "https://openml.org/data/v1/"
    url_prefix_data_list = "https://openml.org/api/v1/json/data/list/"

    path_suffix = ".gz"
    read_fn = gzip.open

    data_module = OPENML_TEST_DATA_MODULE + "." + f"id_{data_id}"

    def _file_name(url, suffix):
        output = (
            re.sub(r"\W", "-", url[len("https://openml.org/") :]) + suffix + path_suffix
        )
        # Shorten the filenames to have better compatibility with windows 10
        # and filenames > 260 characters
        return (
            output.replace("-json-data-list", "-jdl")
            .replace("-json-data-features", "-jdf")
            .replace("-json-data-qualities", "-jdq")
            .replace("-json-data", "-jd")
            .replace("-data_name", "-dn")
            .replace("-download", "-dl")
            .replace("-limit", "-l")
            .replace("-data_version", "-dv")
            .replace("-status", "-s")
            .replace("-deactivated", "-dact")
            .replace("-active", "-act")
        )

    def _mock_urlopen_shared(url, has_gzip_header, expected_prefix, suffix):
        assert url.startswith(expected_prefix)

        data_file_name = _file_name(url, suffix)

        with _open_binary(data_module, data_file_name) as f:
            if has_gzip_header and gzip_response:
                fp = BytesIO(f.read())
                return _MockHTTPResponse(fp, True)
            else:
                decompressed_f = read_fn(f, "rb")
                fp = BytesIO(decompressed_f.read())
                return _MockHTTPResponse(fp, False)

    def _mock_urlopen_data_description(url, has_gzip_header):
        return _mock_urlopen_shared(
            url=url,
            has_gzip_header=has_gzip_header,
            expected_prefix=url_prefix_data_description,
            suffix=".json",
        )

    def _mock_urlopen_data_features(url, has_gzip_header):
        return _mock_urlopen_shared(
            url=url,
            has_gzip_header=has_gzip_header,
            expected_prefix=url_prefix_data_features,
            suffix=".json",
        )

    def _mock_urlopen_download_data(url, has_gzip_header):
        return _mock_urlopen_shared(
            url=url,
            has_gzip_header=has_gzip_header,
            expected_prefix=url_prefix_download_data,
            suffix=".arff",
        )

    def _mock_urlopen_data_list(url, has_gzip_header):
        assert url.startswith(url_prefix_data_list)

        data_file_name = _file_name(url, ".json")

        # load the file itself, to simulate a http error
        with _open_binary(data_module, data_file_name) as f:
            decompressed_f = read_fn(f, "rb")
            decoded_s = decompressed_f.read().decode("utf-8")
            json_data = json.loads(decoded_s)
        if "error" in json_data:
            raise HTTPError(
                url=None, code=412, msg="Simulated mock error", hdrs=None, fp=None
            )

        with _open_binary(data_module, data_file_name) as f:
            if has_gzip_header:
                fp = BytesIO(f.read())
                return _MockHTTPResponse(fp, True)
            else:
                decompressed_f = read_fn(f, "rb")
                fp = BytesIO(decompressed_f.read())
                return _MockHTTPResponse(fp, False)

    def _mock_urlopen(request, *args, **kwargs):
        url = request.get_full_url()
        has_gzip_header = request.get_header("Accept-encoding") == "gzip"
        if url.startswith(url_prefix_data_list):
            return _mock_urlopen_data_list(url, has_gzip_header)
        elif url.startswith(url_prefix_data_features):
            return _mock_urlopen_data_features(url, has_gzip_header)
        elif url.startswith(url_prefix_download_data):
            return _mock_urlopen_download_data(url, has_gzip_header)
        elif url.startswith(url_prefix_data_description):
            return _mock_urlopen_data_description(url, has_gzip_header)
        else:
            raise ValueError("Unknown mocking URL pattern: %s" % url)

    # XXX: Global variable
    if test_offline:
        context.setattr(sklearn.datasets._openml, "urlopen", _mock_urlopen)


###############################################################################
# Test the behaviour of `fetch_openml` depending of the input parameters.

# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize(
    "data_id, dataset_params, n_samples, n_features, n_targets",
    [
        # iris
        (61, {"data_id": 61}, 150, 4, 1),
        (61, {"name": "iris", "version": 1}, 150, 4, 1),
        # anneal
        (2, {"data_id": 2}, 11, 38, 1),
        (2, {"name": "anneal", "version": 1}, 11, 38, 1),
        # cpu
        (561, {"data_id": 561}, 209, 7, 1),
        (561, {"name": "cpu", "version": 1}, 209, 7, 1),
        # emotions
        (40589, {"data_id": 40589}, 13, 72, 6),
        # adult-census
        (1119, {"data_id": 1119}, 10, 14, 1),
        (1119, {"name": "adult-census"}, 10, 14, 1),
        # miceprotein
        (40966, {"data_id": 40966}, 7, 77, 1),
        (40966, {"name": "MiceProtein"}, 7, 77, 1),
        # titanic
        (40945, {"data_id": 40945}, 1309, 13, 1),
    ],
)
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
@pytest.mark.parametrize("gzip_response", [True, False])
def test_fetch_openml_as_frame_true(
    monkeypatch,
    data_id,
    dataset_params,
    n_samples,
    n_features,
    n_targets,
    parser,
    gzip_response,
):
    """Check the behaviour of `fetch_openml` with `as_frame=True`.

    Fetch by ID and/or name (depending if the file was previously cached).
    """
    pd = pytest.importorskip("pandas")

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=gzip_response)
    bunch = fetch_openml(
        as_frame=True,
        cache=False,
        parser=parser,
        **dataset_params,
    )

    assert int(bunch.details["id"]) == data_id
    assert isinstance(bunch, Bunch)

    assert isinstance(bunch.frame, pd.DataFrame)
    assert bunch.frame.shape == (n_samples, n_features + n_targets)

    assert isinstance(bunch.data, pd.DataFrame)
    assert bunch.data.shape == (n_samples, n_features)

    if n_targets == 1:
        assert isinstance(bunch.target, pd.Series)
        assert bunch.target.shape == (n_samples,)
    else:
        assert isinstance(bunch.target, pd.DataFrame)
        assert bunch.target.shape == (n_samples, n_targets)

    assert bunch.categories is None


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize(
    "data_id, dataset_params, n_samples, n_features, n_targets",
    [
        # iris
        (61, {"data_id": 61}, 150, 4, 1),
        (61, {"name": "iris", "version": 1}, 150, 4, 1),
        # anneal
        (2, {"data_id": 2}, 11, 38, 1),
        (2, {"name": "anneal", "version": 1}, 11, 38, 1),
        # cpu
        (561, {"data_id": 561}, 209, 7, 1),
        (561, {"name": "cpu", "version": 1}, 209, 7, 1),
        # emotions
        (40589, {"data_id": 40589}, 13, 72, 6),
        # adult-census
        (1119, {"data_id": 1119}, 10, 14, 1),
        (1119, {"name": "adult-census"}, 10, 14, 1),
        # miceprotein
        (40966, {"data_id": 40966}, 7, 77, 1),
        (40966, {"name": "MiceProtein"}, 7, 77, 1),
    ],
)
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
def test_fetch_openml_as_frame_false(
    monkeypatch,
    data_id,
    dataset_params,
    n_samples,
    n_features,
    n_targets,
    parser,
):
    """Check the behaviour of `fetch_openml` with `as_frame=False`.

    Fetch both by ID and/or name + version.
    """
    pytest.importorskip("pandas")

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=True)
    bunch = fetch_openml(
        as_frame=False,
        cache=False,
        parser=parser,
        **dataset_params,
    )
    assert int(bunch.details["id"]) == data_id
    assert isinstance(bunch, Bunch)

    assert bunch.frame is None

    assert isinstance(bunch.data, np.ndarray)
    assert bunch.data.shape == (n_samples, n_features)

    assert isinstance(bunch.target, np.ndarray)
    if n_targets == 1:
        assert bunch.target.shape == (n_samples,)
    else:
        assert bunch.target.shape == (n_samples, n_targets)

    assert isinstance(bunch.categories, dict)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize("data_id", [61, 1119, 40945])
def test_fetch_openml_consistency_parser(monkeypatch, data_id):
    """Check the consistency of the LIAC-ARFF and pandas parsers."""
    pd = pytest.importorskip("pandas")

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=True)
    bunch_liac = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        parser="liac-arff",
    )
    bunch_pandas = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        parser="pandas",
    )

    # The data frames for the input features should match up to some numerical
    # dtype conversions (e.g. float64 <=> Int64) due to limitations of the
    # LIAC-ARFF parser.
    data_liac, data_pandas = bunch_liac.data, bunch_pandas.data

    def convert_numerical_dtypes(series):
        pandas_series = data_pandas[series.name]
        if pd.api.types.is_numeric_dtype(pandas_series):
            return series.astype(pandas_series.dtype)
        else:
            return series

    data_liac_with_fixed_dtypes = data_liac.apply(convert_numerical_dtypes)
    pd.testing.assert_frame_equal(data_liac_with_fixed_dtypes, data_pandas)

    # Let's also check that the .frame attributes also match
    frame_liac, frame_pandas = bunch_liac.frame, bunch_pandas.frame

    # Note that the .frame attribute is a superset of the .data attribute:
    pd.testing.assert_frame_equal(frame_pandas[bunch_pandas.feature_names], data_pandas)

    # However the remaining columns, typically the target(s), are not necessarily
    # dtyped similarly by both parsers due to limitations of the LIAC-ARFF parser.
    # Therefore, extra dtype conversions are required for those columns:

    def convert_numerical_and_categorical_dtypes(series):
        pandas_series = frame_pandas[series.name]
        if pd.api.types.is_numeric_dtype(pandas_series):
            return series.astype(pandas_series.dtype)
        elif pd.api.types.is_categorical_dtype(pandas_series):
            # Compare categorical features by converting categorical liac uses
            # strings to denote the categories, we rename the categories to make
            # them comparable to the pandas parser. Fixing this behavior in
            # LIAC-ARFF would allow to check the consistency in the future but
            # we do not plan to maintain the LIAC-ARFF on the long term.
            return series.cat.rename_categories(pandas_series.cat.categories)
        else:
            return series

    frame_liac_with_fixed_dtypes = frame_liac.apply(
        convert_numerical_and_categorical_dtypes
    )
    pd.testing.assert_frame_equal(frame_liac_with_fixed_dtypes, frame_pandas)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
def test_fetch_openml_equivalence_array_dataframe(monkeypatch, parser):
    """Check the equivalence of the dataset when using `as_frame=False` and
    `as_frame=True`.
    """
    pytest.importorskip("pandas")

    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=True)
    bunch_as_frame_true = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        parser=parser,
    )

    bunch_as_frame_false = fetch_openml(
        data_id=data_id,
        as_frame=False,
        cache=False,
        parser=parser,
    )

    assert_allclose(bunch_as_frame_false.data, bunch_as_frame_true.data)
    assert_array_equal(bunch_as_frame_false.target, bunch_as_frame_true.target)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
def test_fetch_openml_iris_pandas(monkeypatch, parser):
    """Check fetching on a numerical only dataset with string labels."""
    pd = pytest.importorskip("pandas")
    CategoricalDtype = pd.api.types.CategoricalDtype
    data_id = 61
    data_shape = (150, 4)
    target_shape = (150,)
    frame_shape = (150, 5)

    target_dtype = CategoricalDtype(
        ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
    )
    data_dtypes = [np.float64] * 4
    data_names = ["sepallength", "sepalwidth", "petallength", "petalwidth"]
    target_name = "class"

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    bunch = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        parser=parser,
    )
    data = bunch.data
    target = bunch.target
    frame = bunch.frame

    assert isinstance(data, pd.DataFrame)
    assert np.all(data.dtypes == data_dtypes)
    assert data.shape == data_shape
    assert np.all(data.columns == data_names)
    assert np.all(bunch.feature_names == data_names)
    assert bunch.target_names == [target_name]

    assert isinstance(target, pd.Series)
    assert target.dtype == target_dtype
    assert target.shape == target_shape
    assert target.name == target_name
    assert target.index.is_unique

    assert isinstance(frame, pd.DataFrame)
    assert frame.shape == frame_shape
    assert np.all(frame.dtypes == data_dtypes + [target_dtype])
    assert frame.index.is_unique


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
@pytest.mark.parametrize("target_column", ["petalwidth", ["petalwidth", "petallength"]])
def test_fetch_openml_forcing_targets(monkeypatch, parser, target_column):
    """Check that we can force the target to not be the default target."""
    pd = pytest.importorskip("pandas")

    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    bunch_forcing_target = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        target_column=target_column,
        parser=parser,
    )
    bunch_default = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        parser=parser,
    )

    pd.testing.assert_frame_equal(bunch_forcing_target.frame, bunch_default.frame)
    if isinstance(target_column, list):
        pd.testing.assert_index_equal(
            bunch_forcing_target.target.columns, pd.Index(target_column)
        )
        assert bunch_forcing_target.data.shape == (150, 3)
    else:
        assert bunch_forcing_target.target.name == target_column
        assert bunch_forcing_target.data.shape == (150, 4)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize("data_id", [61, 2, 561, 40589, 1119])
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
def test_fetch_openml_equivalence_frame_return_X_y(monkeypatch, data_id, parser):
    """Check the behaviour of `return_X_y=True` when `as_frame=True`."""
    pd = pytest.importorskip("pandas")

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=True)
    bunch = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        return_X_y=False,
        parser=parser,
    )
    X, y = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        return_X_y=True,
        parser=parser,
    )

    pd.testing.assert_frame_equal(bunch.data, X)
    if isinstance(y, pd.Series):
        pd.testing.assert_series_equal(bunch.target, y)
    else:
        pd.testing.assert_frame_equal(bunch.target, y)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize("data_id", [61, 561, 40589, 1119])
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
def test_fetch_openml_equivalence_array_return_X_y(monkeypatch, data_id, parser):
    """Check the behaviour of `return_X_y=True` when `as_frame=False`."""
    pytest.importorskip("pandas")

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=True)
    bunch = fetch_openml(
        data_id=data_id,
        as_frame=False,
        cache=False,
        return_X_y=False,
        parser=parser,
    )
    X, y = fetch_openml(
        data_id=data_id,
        as_frame=False,
        cache=False,
        return_X_y=True,
        parser=parser,
    )

    assert_array_equal(bunch.data, X)
    assert_array_equal(bunch.target, y)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
def test_fetch_openml_difference_parsers(monkeypatch):
    """Check the difference between liac-arff and pandas parser."""
    pytest.importorskip("pandas")

    data_id = 1119
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=True)
    # When `as_frame=False`, the categories will be ordinally encoded with
    # liac-arff parser while this is not the case with pandas parser.
    as_frame = False
    bunch_liac_arff = fetch_openml(
        data_id=data_id,
        as_frame=as_frame,
        cache=False,
        parser="liac-arff",
    )
    bunch_pandas = fetch_openml(
        data_id=data_id,
        as_frame=as_frame,
        cache=False,
        parser="pandas",
    )

    assert bunch_liac_arff.data.dtype.kind == "f"
    assert bunch_pandas.data.dtype == "O"


###############################################################################
# Test the ARFF parsing on several dataset to check if detect the correct
# types (categories, intgers, floats).


@pytest.fixture(scope="module")
def datasets_column_names():
    """Returns the columns names for each dataset."""
    return {
        61: ["sepallength", "sepalwidth", "petallength", "petalwidth", "class"],
        2: [
            "family",
            "product-type",
            "steel",
            "carbon",
            "hardness",
            "temper_rolling",
            "condition",
            "formability",
            "strength",
            "non-ageing",
            "surface-finish",
            "surface-quality",
            "enamelability",
            "bc",
            "bf",
            "bt",
            "bw%2Fme",
            "bl",
            "m",
            "chrom",
            "phos",
            "cbond",
            "marvi",
            "exptl",
            "ferro",
            "corr",
            "blue%2Fbright%2Fvarn%2Fclean",
            "lustre",
            "jurofm",
            "s",
            "p",
            "shape",
            "thick",
            "width",
            "len",
            "oil",
            "bore",
            "packing",
            "class",
        ],
        561: ["vendor", "MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX", "class"],
        40589: [
            "Mean_Acc1298_Mean_Mem40_Centroid",
            "Mean_Acc1298_Mean_Mem40_Rolloff",
            "Mean_Acc1298_Mean_Mem40_Flux",
            "Mean_Acc1298_Mean_Mem40_MFCC_0",
            "Mean_Acc1298_Mean_Mem40_MFCC_1",
            "Mean_Acc1298_Mean_Mem40_MFCC_2",
            "Mean_Acc1298_Mean_Mem40_MFCC_3",
            "Mean_Acc1298_Mean_Mem40_MFCC_4",
            "Mean_Acc1298_Mean_Mem40_MFCC_5",
            "Mean_Acc1298_Mean_Mem40_MFCC_6",
            "Mean_Acc1298_Mean_Mem40_MFCC_7",
            "Mean_Acc1298_Mean_Mem40_MFCC_8",
            "Mean_Acc1298_Mean_Mem40_MFCC_9",
            "Mean_Acc1298_Mean_Mem40_MFCC_10",
            "Mean_Acc1298_Mean_Mem40_MFCC_11",
            "Mean_Acc1298_Mean_Mem40_MFCC_12",
            "Mean_Acc1298_Std_Mem40_Centroid",
            "Mean_Acc1298_Std_Mem40_Rolloff",
            "Mean_Acc1298_Std_Mem40_Flux",
            "Mean_Acc1298_Std_Mem40_MFCC_0",
            "Mean_Acc1298_Std_Mem40_MFCC_1",
            "Mean_Acc1298_Std_Mem40_MFCC_2",
            "Mean_Acc1298_Std_Mem40_MFCC_3",
            "Mean_Acc1298_Std_Mem40_MFCC_4",
            "Mean_Acc1298_Std_Mem40_MFCC_5",
            "Mean_Acc1298_Std_Mem40_MFCC_6",
            "Mean_Acc1298_Std_Mem40_MFCC_7",
            "Mean_Acc1298_Std_Mem40_MFCC_8",
            "Mean_Acc1298_Std_Mem40_MFCC_9",
            "Mean_Acc1298_Std_Mem40_MFCC_10",
            "Mean_Acc1298_Std_Mem40_MFCC_11",
            "Mean_Acc1298_Std_Mem40_MFCC_12",
            "Std_Acc1298_Mean_Mem40_Centroid",
            "Std_Acc1298_Mean_Mem40_Rolloff",
            "Std_Acc1298_Mean_Mem40_Flux",
            "Std_Acc1298_Mean_Mem40_MFCC_0",
            "Std_Acc1298_Mean_Mem40_MFCC_1",
            "Std_Acc1298_Mean_Mem40_MFCC_2",
            "Std_Acc1298_Mean_Mem40_MFCC_3",
            "Std_Acc1298_Mean_Mem40_MFCC_4",
            "Std_Acc1298_Mean_Mem40_MFCC_5",
            "Std_Acc1298_Mean_Mem40_MFCC_6",
            "Std_Acc1298_Mean_Mem40_MFCC_7",
            "Std_Acc1298_Mean_Mem40_MFCC_8",
            "Std_Acc1298_Mean_Mem40_MFCC_9",
            "Std_Acc1298_Mean_Mem40_MFCC_10",
            "Std_Acc1298_Mean_Mem40_MFCC_11",
            "Std_Acc1298_Mean_Mem40_MFCC_12",
            "Std_Acc1298_Std_Mem40_Centroid",
            "Std_Acc1298_Std_Mem40_Rolloff",
            "Std_Acc1298_Std_Mem40_Flux",
            "Std_Acc1298_Std_Mem40_MFCC_0",
            "Std_Acc1298_Std_Mem40_MFCC_1",
            "Std_Acc1298_Std_Mem40_MFCC_2",
            "Std_Acc1298_Std_Mem40_MFCC_3",
            "Std_Acc1298_Std_Mem40_MFCC_4",
            "Std_Acc1298_Std_Mem40_MFCC_5",
            "Std_Acc1298_Std_Mem40_MFCC_6",
            "Std_Acc1298_Std_Mem40_MFCC_7",
            "Std_Acc1298_Std_Mem40_MFCC_8",
            "Std_Acc1298_Std_Mem40_MFCC_9",
            "Std_Acc1298_Std_Mem40_MFCC_10",
            "Std_Acc1298_Std_Mem40_MFCC_11",
            "Std_Acc1298_Std_Mem40_MFCC_12",
            "BH_LowPeakAmp",
            "BH_LowPeakBPM",
            "BH_HighPeakAmp",
            "BH_HighPeakBPM",
            "BH_HighLowRatio",
            "BHSUM1",
            "BHSUM2",
            "BHSUM3",
            "amazed.suprised",
            "happy.pleased",
            "relaxing.calm",
            "quiet.still",
            "sad.lonely",
            "angry.aggresive",
        ],
        1119: [
            "age",
            "workclass",
            "fnlwgt:",
            "education:",
            "education-num:",
            "marital-status:",
            "occupation:",
            "relationship:",
            "race:",
            "sex:",
            "capital-gain:",
            "capital-loss:",
            "hours-per-week:",
            "native-country:",
            "class",
        ],
        40966: [
            "DYRK1A_N",
            "ITSN1_N",
            "BDNF_N",
            "NR1_N",
            "NR2A_N",
            "pAKT_N",
            "pBRAF_N",
            "pCAMKII_N",
            "pCREB_N",
            "pELK_N",
            "pERK_N",
            "pJNK_N",
            "PKCA_N",
            "pMEK_N",
            "pNR1_N",
            "pNR2A_N",
            "pNR2B_N",
            "pPKCAB_N",
            "pRSK_N",
            "AKT_N",
            "BRAF_N",
            "CAMKII_N",
            "CREB_N",
            "ELK_N",
            "ERK_N",
            "GSK3B_N",
            "JNK_N",
            "MEK_N",
            "TRKA_N",
            "RSK_N",
            "APP_N",
            "Bcatenin_N",
            "SOD1_N",
            "MTOR_N",
            "P38_N",
            "pMTOR_N",
            "DSCR1_N",
            "AMPKA_N",
            "NR2B_N",
            "pNUMB_N",
            "RAPTOR_N",
            "TIAM1_N",
            "pP70S6_N",
            "NUMB_N",
            "P70S6_N",
            "pGSK3B_N",
            "pPKCG_N",
            "CDK5_N",
            "S6_N",
            "ADARB1_N",
            "AcetylH3K9_N",
            "RRP1_N",
            "BAX_N",
            "ARC_N",
            "ERBB4_N",
            "nNOS_N",
            "Tau_N",
            "GFAP_N",
            "GluR3_N",
            "GluR4_N",
            "IL1B_N",
            "P3525_N",
            "pCASP9_N",
            "PSD95_N",
            "SNCA_N",
            "Ubiquitin_N",
            "pGSK3B_Tyr216_N",
            "SHH_N",
            "BAD_N",
            "BCL2_N",
            "pS6_N",
            "pCFOS_N",
            "SYP_N",
            "H3AcK18_N",
            "EGR1_N",
            "H3MeK4_N",
            "CaNA_N",
            "class",
        ],
        40945: [
            "pclass",
            "survived",
            "name",
            "sex",
            "age",
            "sibsp",
            "parch",
            "ticket",
            "fare",
            "cabin",
            "embarked",
            "boat",
            "body",
            "home.dest",
        ],
    }


@pytest.fixture(scope="module")
def datasets_missing_values():
    return {
        61: {},
        2: {
            "family": 11,
            "temper_rolling": 9,
            "condition": 2,
            "formability": 4,
            "non-ageing": 10,
            "surface-finish": 11,
            "enamelability": 11,
            "bc": 11,
            "bf": 10,
            "bt": 11,
            "bw%2Fme": 8,
            "bl": 9,
            "m": 11,
            "chrom": 11,
            "phos": 11,
            "cbond": 10,
            "marvi": 11,
            "exptl": 11,
            "ferro": 11,
            "corr": 11,
            "blue%2Fbright%2Fvarn%2Fclean": 11,
            "lustre": 8,
            "jurofm": 11,
            "s": 11,
            "p": 11,
            "oil": 10,
            "packing": 11,
        },
        561: {},
        40589: {},
        1119: {},
        40966: {"BCL2_N": 7},
        40945: {
            "age": 263,
            "fare": 1,
            "cabin": 1014,
            "embarked": 2,
            "boat": 823,
            "body": 1188,
            "home.dest": 564,
        },
    }


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize(
    "data_id, parser, expected_n_categories, expected_n_floats, expected_n_ints",
    [
        # iris dataset
        (61, "liac-arff", 1, 4, 0),
        (61, "pandas", 1, 4, 0),
        # anneal dataset
        (2, "liac-arff", 33, 6, 0),
        (2, "pandas", 33, 2, 4),
        # cpu dataset
        (561, "liac-arff", 1, 7, 0),
        (561, "pandas", 1, 0, 7),
        # emotions dataset
        (40589, "liac-arff", 6, 72, 0),
        (40589, "pandas", 6, 69, 3),
        # adult-census dataset
        (1119, "liac-arff", 9, 6, 0),
        (1119, "pandas", 9, 0, 6),
        # miceprotein
        # 1 column has only missing values with object dtype
        (40966, "liac-arff", 1, 76, 0),
        # with casting it will be transformed to either float or Int64
        (40966, "pandas", 1, 77, 0),
        # titanic
        (40945, "liac-arff", 3, 5, 0),
        (40945, "pandas", 3, 3, 3),
    ],
)
@pytest.mark.parametrize("gzip_response", [True, False])
def test_fetch_openml_types_inference(
    monkeypatch,
    data_id,
    parser,
    expected_n_categories,
    expected_n_floats,
    expected_n_ints,
    gzip_response,
    datasets_column_names,
    datasets_missing_values,
):
    """Check that `fetch_openml` infer the right number of categories, integers, and
    floats."""
    pd = pytest.importorskip("pandas")
    CategoricalDtype = pd.api.types.CategoricalDtype

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=gzip_response)

    bunch = fetch_openml(
        data_id=data_id,
        as_frame=True,
        cache=False,
        parser=parser,
    )
    frame = bunch.frame

    n_categories = len(
        [dtype for dtype in frame.dtypes if isinstance(dtype, CategoricalDtype)]
    )
    n_floats = len([dtype for dtype in frame.dtypes if dtype.kind == "f"])
    n_ints = len([dtype for dtype in frame.dtypes if dtype.kind == "i"])

    assert n_categories == expected_n_categories
    assert n_floats == expected_n_floats
    assert n_ints == expected_n_ints

    assert frame.columns.tolist() == datasets_column_names[data_id]

    frame_feature_to_n_nan = frame.isna().sum().to_dict()
    for name, n_missing in frame_feature_to_n_nan.items():
        expected_missing = datasets_missing_values[data_id].get(name, 0)
        assert n_missing == expected_missing


###############################################################################
# Test some more specific behaviour

# TODO(1.4): remove this filterwarning decorator
@pytest.mark.filterwarnings("ignore:The default value of `parser` will change")
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"parser": "unknown"}, "`parser` must be one of"),
        ({"as_frame": "unknown"}, "`as_frame` must be one of"),
    ],
)
def test_fetch_openml_validation_parameter(monkeypatch, params, err_msg):
    data_id = 1119
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    with pytest.raises(ValueError, match=err_msg):
        fetch_openml(data_id=data_id, **params)


@pytest.mark.parametrize(
    "params",
    [
        {"as_frame": True, "parser": "auto"},
        {"as_frame": "auto", "parser": "auto"},
        {"as_frame": False, "parser": "pandas"},
    ],
)
def test_fetch_openml_requires_pandas_error(monkeypatch, params):
    """Check that we raise the proper errors when we require pandas."""
    data_id = 1119
    try:
        check_pandas_support("test_fetch_openml_requires_pandas")
    except ImportError:
        _monkey_patch_webbased_functions(monkeypatch, data_id, True)
        err_msg = "requires pandas to be installed. Alternatively, explicitely"
        with pytest.raises(ImportError, match=err_msg):
            fetch_openml(data_id=data_id, **params)
    else:
        raise SkipTest("This test requires pandas to not be installed.")


# TODO(1.4): move this parameter option in`test_fetch_openml_requires_pandas_error`
def test_fetch_openml_requires_pandas_in_future(monkeypatch):
    """Check that we raise a warning that pandas will be required in the future."""
    params = {"as_frame": False, "parser": "auto"}
    data_id = 1119
    try:
        check_pandas_support("test_fetch_openml_requires_pandas")
    except ImportError:
        _monkey_patch_webbased_functions(monkeypatch, data_id, True)
        warn_msg = (
            "From version 1.4, `parser='auto'` with `as_frame=False` will use pandas"
        )
        with pytest.warns(FutureWarning, match=warn_msg):
            fetch_openml(data_id=data_id, **params)
    else:
        raise SkipTest("This test requires pandas to not be installed.")


@pytest.mark.filterwarnings("ignore:Version 1 of dataset Australian is inactive")
# TODO(1.4): remove this filterwarning decorator for `parser`
@pytest.mark.filterwarnings("ignore:The default value of `parser` will change")
@pytest.mark.parametrize(
    "params, err_msg",
    [
        (
            {"parser": "pandas"},
            "Sparse ARFF datasets cannot be loaded with parser='pandas'",
        ),
        (
            {"as_frame": True},
            "Sparse ARFF datasets cannot be loaded with as_frame=True.",
        ),
        (
            {"parser": "pandas", "as_frame": True},
            "Sparse ARFF datasets cannot be loaded with as_frame=True.",
        ),
    ],
)
def test_fetch_openml_sparse_arff_error(monkeypatch, params, err_msg):
    """Check that we raise the expected error for sparse ARFF datasets and
    a wrong set of incompatible parameters.
    """
    pytest.importorskip("pandas")
    data_id = 292

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    with pytest.raises(ValueError, match=err_msg):
        fetch_openml(
            data_id=data_id,
            cache=False,
            **params,
        )


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.filterwarnings("ignore:Version 1 of dataset Australian is inactive")
@pytest.mark.parametrize(
    "data_id, data_type",
    [
        (61, "dataframe"),  # iris dataset version 1
        (292, "sparse"),  # Australian dataset version 1
    ],
)
def test_fetch_openml_auto_mode(monkeypatch, data_id, data_type):
    """Check the auto mode of `fetch_openml`."""
    pd = pytest.importorskip("pandas")

    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
    data = fetch_openml(data_id=data_id, as_frame="auto", parser="auto", cache=False)
    klass = pd.DataFrame if data_type == "dataframe" else scipy.sparse.csr_matrix
    assert isinstance(data.data, klass)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch):
    """Check that we raise a warning regarding the working memory when using
    LIAC-ARFF parser."""
    pytest.importorskip("pandas")

    data_id = 1119
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    msg = "Could not adhere to working_memory config."
    with pytest.warns(UserWarning, match=msg):
        with config_context(working_memory=1e-6):
            fetch_openml(
                data_id=data_id,
                as_frame=True,
                cache=False,
                parser="liac-arff",
            )


@pytest.mark.parametrize("gzip_response", [True, False])
def test_fetch_openml_iris_warn_multiple_version(monkeypatch, gzip_response):
    """Check that a warning is raised when multiple versions exist and no version is
    requested."""
    data_id = 61
    data_name = "iris"

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    msg = (
        "Multiple active versions of the dataset matching the name"
        " iris exist. Versions may be fundamentally different, "
        "returning version 1."
    )
    with pytest.warns(UserWarning, match=msg):
        fetch_openml(
            name=data_name,
            as_frame=False,
            cache=False,
            parser="liac-arff",
        )


@pytest.mark.parametrize("gzip_response", [True, False])
def test_fetch_openml_no_target(monkeypatch, gzip_response):
    """Check that we can get a dataset without target."""
    data_id = 61
    target_column = None
    expected_observations = 150
    expected_features = 5

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    data = fetch_openml(
        data_id=data_id,
        target_column=target_column,
        cache=False,
        as_frame=False,
        parser="liac-arff",
    )
    assert data.data.shape == (expected_observations, expected_features)
    assert data.target is None


@pytest.mark.parametrize("gzip_response", [True, False])
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
def test_missing_values_pandas(monkeypatch, gzip_response, parser):
    """check that missing values in categories are compatible with pandas
    categorical"""
    pytest.importorskip("pandas")

    data_id = 42585
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response=gzip_response)
    penguins = fetch_openml(
        data_id=data_id,
        cache=False,
        as_frame=True,
        parser=parser,
    )

    cat_dtype = penguins.data.dtypes["sex"]
    # there are nans in the categorical
    assert penguins.data["sex"].isna().any()
    assert_array_equal(cat_dtype.categories, ["FEMALE", "MALE", "_"])


@pytest.mark.parametrize("gzip_response", [True, False])
@pytest.mark.parametrize(
    "dataset_params",
    [
        {"data_id": 40675},
        {"data_id": None, "name": "glass2", "version": 1},
    ],
)
def test_fetch_openml_inactive(monkeypatch, gzip_response, dataset_params):
    """Check that we raise a warning when the dataset is inactive."""
    data_id = 40675
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    msg = "Version 1 of dataset glass2 is inactive,"
    with pytest.warns(UserWarning, match=msg):
        glass2 = fetch_openml(
            cache=False, as_frame=False, parser="liac-arff", **dataset_params
        )
    assert glass2.data.shape == (163, 9)
    assert glass2.details["id"] == "40675"


@pytest.mark.parametrize("gzip_response", [True, False])
@pytest.mark.parametrize(
    "data_id, params, err_type, err_msg",
    [
        (40675, {"name": "glass2"}, ValueError, "No active dataset glass2 found"),
        (
            61,
            {"data_id": 61, "target_column": ["sepalwidth", "class"]},
            ValueError,
            "Can only handle homogeneous multi-target datasets",
        ),
        (
            40945,
            {"data_id": 40945, "as_frame": False},
            ValueError,
            "STRING attributes are not supported for array representation. Try"
            " as_frame=True",
        ),
        (
            2,
            {"data_id": 2, "target_column": "family", "as_frame": True},
            ValueError,
            "Target column 'family'",
        ),
        (
            2,
            {"data_id": 2, "target_column": "family", "as_frame": False},
            ValueError,
            "Target column 'family'",
        ),
        (
            61,
            {"data_id": 61, "target_column": "undefined"},
            KeyError,
            "Could not find target_column='undefined'",
        ),
        (
            61,
            {"data_id": 61, "target_column": ["undefined", "class"]},
            KeyError,
            "Could not find target_column='undefined'",
        ),
    ],
)
@pytest.mark.parametrize("parser", ["liac-arff", "pandas"])
def test_fetch_openml_error(
    monkeypatch, gzip_response, data_id, params, err_type, err_msg, parser
):
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    if params.get("as_frame", True) or parser == "pandas":
        pytest.importorskip("pandas")
    with pytest.raises(err_type, match=err_msg):
        fetch_openml(cache=False, parser=parser, **params)


@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        (
            {"data_id": -1, "name": None, "version": "version"},
            ValueError,
            "Dataset data_id=-1 and version=version passed, but you can only",
        ),
        (
            {"data_id": -1, "name": "nAmE"},
            ValueError,
            "Dataset data_id=-1 and name=name passed, but you can only",
        ),
        (
            {"data_id": -1, "name": "nAmE", "version": "version"},
            ValueError,
            "Dataset data_id=-1 and name=name passed, but you can only",
        ),
        (
            {},
            ValueError,
            "Neither name nor data_id are provided. Please provide name or data_id.",
        ),
    ],
)
def test_fetch_openml_raises_illegal_argument(params, err_type, err_msg):
    with pytest.raises(err_type, match=err_msg):
        fetch_openml(**params)


@pytest.mark.parametrize("gzip_response", [True, False])
def test_warn_ignore_attribute(monkeypatch, gzip_response):
    data_id = 40966
    expected_row_id_msg = "target_column='{}' has flag is_row_identifier."
    expected_ignore_msg = "target_column='{}' has flag is_ignore."
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # single column test
    target_col = "MouseID"
    msg = expected_row_id_msg.format(target_col)
    with pytest.warns(UserWarning, match=msg):
        fetch_openml(
            data_id=data_id,
            target_column=target_col,
            cache=False,
            as_frame=False,
            parser="liac-arff",
        )
    target_col = "Genotype"
    msg = expected_ignore_msg.format(target_col)
    with pytest.warns(UserWarning, match=msg):
        fetch_openml(
            data_id=data_id,
            target_column=target_col,
            cache=False,
            as_frame=False,
            parser="liac-arff",
        )
    # multi column test
    target_col = "MouseID"
    msg = expected_row_id_msg.format(target_col)
    with pytest.warns(UserWarning, match=msg):
        fetch_openml(
            data_id=data_id,
            target_column=[target_col, "class"],
            cache=False,
            as_frame=False,
            parser="liac-arff",
        )
    target_col = "Genotype"
    msg = expected_ignore_msg.format(target_col)
    with pytest.warns(UserWarning, match=msg):
        fetch_openml(
            data_id=data_id,
            target_column=[target_col, "class"],
            cache=False,
            as_frame=False,
            parser="liac-arff",
        )


@pytest.mark.parametrize("gzip_response", [True, False])
def test_dataset_with_openml_error(monkeypatch, gzip_response):
    data_id = 1
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    msg = "OpenML registered a problem with the dataset. It might be unusable. Error:"
    with pytest.warns(UserWarning, match=msg):
        fetch_openml(data_id=data_id, cache=False, as_frame=False, parser="liac-arff")


@pytest.mark.parametrize("gzip_response", [True, False])
def test_dataset_with_openml_warning(monkeypatch, gzip_response):
    data_id = 3
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    msg = "OpenML raised a warning on the dataset. It might be unusable. Warning:"
    with pytest.warns(UserWarning, match=msg):
        fetch_openml(data_id=data_id, cache=False, as_frame=False, parser="liac-arff")


###############################################################################
# Test cache, retry mechanisms, checksum, etc.


@pytest.mark.parametrize("gzip_response", [True, False])
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    data_id = 61

    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
    # first fill the cache
    response1 = _open_openml_url(openml_path, cache_directory)
    # assert file exists
    location = _get_local_path(openml_path, cache_directory)
    assert os.path.isfile(location)
    # redownload, to utilize cache
    response2 = _open_openml_url(openml_path, cache_directory)
    assert response1.read() == response2.read()


@pytest.mark.parametrize("write_to_disk", [True, False])
def test_open_openml_url_unlinks_local_path(monkeypatch, tmpdir, write_to_disk):
    data_id = 61
    openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
    location = _get_local_path(openml_path, cache_directory)

    def _mock_urlopen(request, *args, **kwargs):
        if write_to_disk:
            with open(location, "w") as f:
                f.write("")
        raise ValueError("Invalid request")

    monkeypatch.setattr(sklearn.datasets._openml, "urlopen", _mock_urlopen)

    with pytest.raises(ValueError, match="Invalid request"):
        _open_openml_url(openml_path, cache_directory)

    assert not os.path.exists(location)


def test_retry_with_clean_cache(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
    location = _get_local_path(openml_path, cache_directory)
    os.makedirs(os.path.dirname(location))

    with open(location, "w") as f:
        f.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # The first call will raise an error since location exists
        if os.path.exists(location):
            raise Exception("File exist!")
        return 1

    warn_msg = "Invalid cache, redownloading file"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        result = _load_data()
    assert result == 1


def test_retry_with_clean_cache_http_error(tmpdir):
    data_id = 61
    openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir("scikit_learn_data"))

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        raise HTTPError(
            url=None, code=412, msg="Simulated mock error", hdrs=None, fp=None
        )

    error_msg = "Simulated mock error"
    with pytest.raises(HTTPError, match=error_msg):
        _load_data()


@pytest.mark.parametrize("gzip_response", [True, False])
def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
    def _mock_urlopen_raise(request, *args, **kwargs):
        raise ValueError(
            "This mechanism intends to test correct cache"
            "handling. As such, urlopen should never be "
            "accessed. URL: %s"
            % request.get_full_url()
        )

    data_id = 61
    cache_directory = str(tmpdir.mkdir("scikit_learn_data"))
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    X_fetched, y_fetched = fetch_openml(
        data_id=data_id,
        cache=True,
        data_home=cache_directory,
        return_X_y=True,
        as_frame=False,
        parser="liac-arff",
    )

    monkeypatch.setattr(sklearn.datasets._openml, "urlopen", _mock_urlopen_raise)

    X_cached, y_cached = fetch_openml(
        data_id=data_id,
        cache=True,
        data_home=cache_directory,
        return_X_y=True,
        as_frame=False,
        parser="liac-arff",
    )
    np.testing.assert_array_equal(X_fetched, X_cached)
    np.testing.assert_array_equal(y_fetched, y_cached)


# Known failure of PyPy for OpenML. See the following issue:
# https://github.com/scikit-learn/scikit-learn/issues/18906
@fails_if_pypy
@pytest.mark.parametrize(
    "as_frame, parser",
    [
        (True, "liac-arff"),
        (False, "liac-arff"),
        (True, "pandas"),
        (False, "pandas"),
    ],
)
def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir, parser):
    """Check that the checksum is working as expected."""
    if as_frame or parser == "pandas":
        pytest.importorskip("pandas")

    data_id = 2
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    # create a temporary modified arff file
    original_data_module = OPENML_TEST_DATA_MODULE + "." + f"id_{data_id}"
    original_data_file_name = "data-v1-dl-1666876.arff.gz"
    corrupt_copy_path = tmpdir / "test_invalid_checksum.arff"
    with _open_binary(original_data_module, original_data_file_name) as orig_file:
        orig_gzip = gzip.open(orig_file, "rb")
        data = bytearray(orig_gzip.read())
        data[len(data) - 1] = 37

    with gzip.GzipFile(corrupt_copy_path, "wb") as modified_gzip:
        modified_gzip.write(data)

    # Requests are already mocked by monkey_patch_webbased_functions.
    # We want to re-use that mock for all requests except file download,
    # hence creating a thin mock over the original mock
    mocked_openml_url = sklearn.datasets._openml.urlopen

    def swap_file_mock(request, *args, **kwargs):
        url = request.get_full_url()
        if url.endswith("data/v1/download/1666876"):
            with open(corrupt_copy_path, "rb") as f:
                corrupted_data = f.read()
            return _MockHTTPResponse(BytesIO(corrupted_data), is_gzip=True)
        else:
            return mocked_openml_url(request)

    monkeypatch.setattr(sklearn.datasets._openml, "urlopen", swap_file_mock)

    # validate failed checksum
    with pytest.raises(ValueError) as exc:
        sklearn.datasets.fetch_openml(
            data_id=data_id, cache=False, as_frame=as_frame, parser=parser
        )
    # exception message should have file-path
    assert exc.match("1666876")


def test_open_openml_url_retry_on_network_error(monkeypatch):
    def _mock_urlopen_network_error(request, *args, **kwargs):
        raise HTTPError("", 404, "Simulated network error", None, None)

    monkeypatch.setattr(
        sklearn.datasets._openml, "urlopen", _mock_urlopen_network_error
    )

    invalid_openml_url = "invalid-url"

    with pytest.warns(
        UserWarning,
        match=re.escape(
            "A network error occurred while downloading"
            f" {_OPENML_PREFIX + invalid_openml_url}. Retrying..."
        ),
    ) as record:
        with pytest.raises(HTTPError, match="Simulated network error"):
            _open_openml_url(invalid_openml_url, None, delay=0)
        assert len(record) == 3


###############################################################################
# Non-regressiont tests


@pytest.mark.parametrize("gzip_response", [True, False])
@pytest.mark.parametrize("parser", ("liac-arff", "pandas"))
def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response, parser):
    """Check that we can load the "zoo" dataset.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14340
    """
    if parser == "pandas":
        pytest.importorskip("pandas")
    data_id = 62
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    dataset = sklearn.datasets.fetch_openml(
        data_id=data_id, cache=False, as_frame=False, parser=parser
    )
    assert dataset is not None
    # The dataset has 17 features, including 1 ignored (animal),
    # so we assert that we don't have the ignored feature in the final Bunch
    assert dataset["data"].shape == (101, 16)
    assert "animal" not in dataset["feature_names"]


def test_fetch_openml_strip_quotes(monkeypatch):
    """Check that we strip the single quotes when used as a string delimiter.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/23381
    """
    pd = pytest.importorskip("pandas")
    data_id = 40966
    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

    common_params = {"as_frame": True, "cache": False, "data_id": data_id}
    mice_pandas = fetch_openml(parser="pandas", **common_params)
    mice_liac_arff = fetch_openml(parser="liac-arff", **common_params)
    pd.testing.assert_series_equal(mice_pandas.target, mice_liac_arff.target)
    assert not mice_pandas.target.str.startswith("'").any()
    assert not mice_pandas.target.str.endswith("'").any()

    # similar behaviour should be observed when the column is not the target
    mice_pandas = fetch_openml(parser="pandas", target_column="NUMB_N", **common_params)
    mice_liac_arff = fetch_openml(
        parser="liac-arff", target_column="NUMB_N", **common_params
    )
    pd.testing.assert_series_equal(
        mice_pandas.frame["class"], mice_liac_arff.frame["class"]
    )
    assert not mice_pandas.frame["class"].str.startswith("'").any()
    assert not mice_pandas.frame["class"].str.endswith("'").any()


def test_fetch_openml_leading_whitespace(monkeypatch):
    """Check that we can strip leading whitespace in pandas parser.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25311
    """
    pd = pytest.importorskip("pandas")
    data_id = 1590
    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

    common_params = {"as_frame": True, "cache": False, "data_id": data_id}
    adult_pandas = fetch_openml(parser="pandas", **common_params)
    adult_liac_arff = fetch_openml(parser="liac-arff", **common_params)
    pd.testing.assert_series_equal(
        adult_pandas.frame["class"], adult_liac_arff.frame["class"]
    )


def test_fetch_openml_quotechar_escapechar(monkeypatch):
    """Check that we can handle escapechar and single/double quotechar.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25478
    """
    pd = pytest.importorskip("pandas")
    data_id = 42074
    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

    common_params = {"as_frame": True, "cache": False, "data_id": data_id}
    adult_pandas = fetch_openml(parser="pandas", **common_params)
    adult_liac_arff = fetch_openml(parser="liac-arff", **common_params)
    pd.testing.assert_frame_equal(adult_pandas.frame, adult_liac_arff.frame)


###############################################################################
# Deprecation-changed parameters

# TODO(1.4): remove this test
def test_fetch_openml_deprecation_parser(monkeypatch):
    """Check that we raise a deprecation warning for parser parameter."""
    pytest.importorskip("pandas")
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

    with pytest.warns(FutureWarning, match="The default value of `parser` will change"):
        sklearn.datasets.fetch_openml(data_id=data_id)