Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX add retry mechanism to handle quotechar in read_csv #25511

Merged
merged 10 commits into from Mar 6, 2023
6 changes: 5 additions & 1 deletion doc/whats_new/v1.2.rst
Expand Up @@ -129,11 +129,15 @@ Changelog
:mod:`sklearn.datasets`
.......................

- |Fix| Fix an inconsistency in :func:`datasets.fetch_openml` between liac-arff
- |Fix| Fixes an inconsistency in :func:`datasets.fetch_openml` between liac-arff
and pandas parser when a leading space is introduced after the delimiter.
The ARFF specs requires to ignore the leading space.
:pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.

- |Fix| Fixes a bug in :func:`datasets.fetch_openml` when using `parser="pandas"`
where single quote and backslash escape characters were not properly handled.
:pr:`25511` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.decomposition`
............................

Expand Down
52 changes: 41 additions & 11 deletions sklearn/datasets/_arff_parser.py
Expand Up @@ -302,6 +302,7 @@ def _pandas_arff_parser(
openml_columns_info,
feature_names_to_select,
target_names_to_select,
read_csv_kwargs=None,
):
"""ARFF parser using `pandas.read_csv`.

Expand Down Expand Up @@ -331,6 +332,10 @@ def _pandas_arff_parser(
target_names_to_select : list of str
A list of the target names to be selected to build `y`.

read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
the default options.

Returns
-------
X : {ndarray, sparse matrix, dataframe}
Expand Down Expand Up @@ -363,18 +368,37 @@ def _pandas_arff_parser(
dtypes[name] = "Int64"
elif column_dtype.lower() == "nominal":
dtypes[name] = "category"
# since we will not pass `names` when reading the ARFF file, we need to translate
# `dtypes` from column names to column indices to pass to `pandas.read_csv`
dtypes_positional = {
col_idx: dtypes[name]
for col_idx, name in enumerate(openml_columns_info)
if name in dtypes
}

# ARFF represents missing values with "?"
frame = pd.read_csv(
gzip_file,
header=None,
na_values=["?"], # missing values are represented by `?`
comment="%", # skip line starting by `%` since they are comments
quotechar='"', # delimiter to use for quoted strings
names=[name for name in openml_columns_info],
dtype=dtypes,
skipinitialspace=True, # skip spaces after delimiter to follow ARFF specs
)
default_read_csv_kwargs = {
"header": None,
"index_col": False, # always force pandas to not use the first column as index
"na_values": ["?"], # missing values are represented by `?`
"comment": "%", # skip line starting by `%` since they are comments
"quotechar": '"', # delimiter to use for quoted strings
"skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs
"escapechar": "\\",
"dtype": dtypes_positional,
}
read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
frame = pd.read_csv(gzip_file, **read_csv_kwargs)
thomasjpfan marked this conversation as resolved.
Show resolved Hide resolved
try:
# Setting the columns while reading the file will select the N first columns
# and not raise a ParserError. Instead, we set the columns after reading the
# file and raise a ParserError if the number of columns does not match the
# number of columns in the metadata given by OpenML.
frame.columns = [name for name in openml_columns_info]
except ValueError as exc:
raise pd.errors.ParserError(
"The number of columns provided by OpenML does not match the number of "
"columns inferred by pandas when reading the file."
) from exc

columns_to_select = feature_names_to_select + target_names_to_select
columns_to_keep = [col for col in frame.columns if col in columns_to_select]
Expand Down Expand Up @@ -431,6 +455,7 @@ def load_arff_from_gzip_file(
feature_names_to_select,
target_names_to_select,
shape=None,
read_csv_kwargs=None,
):
"""Load a compressed ARFF file using a given parser.

Expand Down Expand Up @@ -461,6 +486,10 @@ def load_arff_from_gzip_file(
target_names_to_select : list of str
A list of the target names to be selected.

read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
the default options.

Returns
-------
X : {ndarray, sparse matrix, dataframe}
Expand Down Expand Up @@ -493,6 +522,7 @@ def load_arff_from_gzip_file(
openml_columns_info,
feature_names_to_select,
target_names_to_select,
read_csv_kwargs,
)
else:
raise ValueError(
Expand Down
72 changes: 54 additions & 18 deletions sklearn/datasets/_openml.py
Expand Up @@ -37,10 +37,15 @@ def _get_local_path(openml_path: str, data_home: str) -> str:
return os.path.join(data_home, "openml.org", openml_path + ".gz")


def _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable:
def _retry_with_clean_cache(
openml_path: str,
data_home: Optional[str],
no_retry_exception: Optional[Exception] = None,
) -> Callable:
"""If the first call to the decorated function fails, the local cached
file is removed, and the function is called again. If ``data_home`` is
``None``, then the function is called once.
``None``, then the function is called once. A specific exception class can be
provided via the `no_retry_exception` parameter to avoid retrying on it.
"""

def decorator(f):
Expand All @@ -52,7 +57,11 @@ def wrapper(*args, **kw):
return f(*args, **kw)
except URLError:
raise
except Exception:
except Exception as exc:
if no_retry_exception is not None and isinstance(
exc, no_retry_exception
):
raise
warn("Invalid cache, redownloading file", RuntimeWarning)
local_path = _get_local_path(openml_path, data_home)
if os.path.exists(local_path):
Expand Down Expand Up @@ -216,7 +225,7 @@ def _get_json_content_from_openml_api(
An exception otherwise.
"""

@_retry_with_clean_cache(url, data_home)
@_retry_with_clean_cache(url, data_home=data_home)
def _load_json():
with closing(
_open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
Expand Down Expand Up @@ -492,20 +501,39 @@ def _load_arff_response(
"and retry..."
)

gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
with closing(gzip_file):
def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
    """Download (or read from cache) the gzipped ARFF file and parse it.

    The gzip handle is always closed once parsing finishes, whether the
    parser succeeds or raises.
    """
    gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
    try:
        return load_arff_from_gzip_file(gzip_file, **arff_params)
    finally:
        # Equivalent to wrapping the handle in `contextlib.closing`.
        gzip_file.close()

X, y, frame, categories = load_arff_from_gzip_file(
gzip_file,
parser=parser,
output_type=output_type,
openml_columns_info=openml_columns_info,
feature_names_to_select=feature_names_to_select,
target_names_to_select=target_names_to_select,
shape=shape,
arff_params = dict(
parser=parser,
output_type=output_type,
openml_columns_info=openml_columns_info,
feature_names_to_select=feature_names_to_select,
target_names_to_select=target_names_to_select,
shape=shape,
)
try:
X, y, frame, categories = _open_url_and_load_gzip_file(
url, data_home, n_retries, delay, arff_params
)
except Exception as exc:
if parser == "pandas":
from pandas.errors import ParserError

if isinstance(exc, ParserError):
# A parsing error could come from providing the wrong quotechar
# to pandas. By default, we use a double quote. Thus, we retry
# with a single quote before to raise the error.
arff_params["read_csv_kwargs"] = {"quotechar": "'"}
X, y, frame, categories = _open_url_and_load_gzip_file(
url, data_home, n_retries, delay, arff_params
)
else:
raise

return X, y, frame, categories
return X, y, frame, categories


def _download_data_to_bunch(
Expand Down Expand Up @@ -605,9 +633,17 @@ def _download_data_to_bunch(
"values. Missing values are not supported for target columns."
)

X, y, frame, categories = _retry_with_clean_cache(url, data_home)(
_load_arff_response
)(
no_retry_exception = None
if parser == "pandas":
# If we get a ParserError with pandas, then we don't want to retry and we raise
# early.
from pandas.errors import ParserError

no_retry_exception = ParserError

X, y, frame, categories = _retry_with_clean_cache(
url, data_home, no_retry_exception
)(_load_arff_response)(
url,
data_home,
parser=parser,
Expand Down
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
16 changes: 16 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Expand Up @@ -1617,6 +1617,22 @@ def test_fetch_openml_leading_whitespace(monkeypatch):
)


def test_fetch_openml_quotechar_escapechar(monkeypatch):
    """Both parsers must agree on data containing escapechar and quotechar.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25478
    """
    pd = pytest.importorskip("pandas")
    data_id = 42074
    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

    # Fetch the same dataset with each parser and compare the resulting frames.
    shared_kwargs = dict(as_frame=True, cache=False, data_id=data_id)
    result_pandas = fetch_openml(parser="pandas", **shared_kwargs)
    result_liac = fetch_openml(parser="liac-arff", **shared_kwargs)
    pd.testing.assert_frame_equal(result_pandas.frame, result_liac.frame)


###############################################################################
# Deprecation-changed parameters

Expand Down