Skip to content

Commit

Permalink
FIX add retry mechanism to handle quotechar in read_csv (#25511)
Browse files Browse the repository at this point in the history
  • Loading branch information
glemaitre authored and jeremiedbb committed Mar 8, 2023
1 parent fd6006d commit 12f1675
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 30 deletions.
6 changes: 5 additions & 1 deletion doc/whats_new/v1.2.rst
Expand Up @@ -146,11 +146,15 @@ Changelog
:mod:`sklearn.datasets`
.......................

- |Fix| Fix an inconsistency in :func:`datasets.fetch_openml` between liac-arff
- |Fix| Fixes an inconsistency in :func:`datasets.fetch_openml` between liac-arff
and pandas parser when a leading space is introduced after the delimiter.
The ARFF specs require ignoring the leading space.
:pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.

- |Fix| Fixes a bug in :func:`datasets.fetch_openml` when using `parser="pandas"`
where single quote and backslash escape characters were not properly handled.
:pr:`25511` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.decomposition`
............................

Expand Down
52 changes: 41 additions & 11 deletions sklearn/datasets/_arff_parser.py
Expand Up @@ -302,6 +302,7 @@ def _pandas_arff_parser(
openml_columns_info,
feature_names_to_select,
target_names_to_select,
read_csv_kwargs=None,
):
"""ARFF parser using `pandas.read_csv`.
Expand Down Expand Up @@ -331,6 +332,10 @@ def _pandas_arff_parser(
target_names_to_select : list of str
A list of the target names to be selected to build `y`.
read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
the default options.
Returns
-------
X : {ndarray, sparse matrix, dataframe}
Expand Down Expand Up @@ -363,18 +368,37 @@ def _pandas_arff_parser(
dtypes[name] = "Int64"
elif column_dtype.lower() == "nominal":
dtypes[name] = "category"
# since we will not pass `names` when reading the ARFF file, we need to translate
# `dtypes` from column names to column indices to pass to `pandas.read_csv`
dtypes_positional = {
col_idx: dtypes[name]
for col_idx, name in enumerate(openml_columns_info)
if name in dtypes
}

# ARFF represents missing values with "?"
frame = pd.read_csv(
gzip_file,
header=None,
na_values=["?"], # missing values are represented by `?`
comment="%", # skip line starting by `%` since they are comments
quotechar='"', # delimiter to use for quoted strings
names=[name for name in openml_columns_info],
dtype=dtypes,
skipinitialspace=True, # skip spaces after delimiter to follow ARFF specs
)
default_read_csv_kwargs = {
"header": None,
"index_col": False, # always force pandas to not use the first column as index
"na_values": ["?"], # missing values are represented by `?`
"comment": "%", # skip line starting by `%` since they are comments
"quotechar": '"', # delimiter to use for quoted strings
"skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs
"escapechar": "\\",
"dtype": dtypes_positional,
}
read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
frame = pd.read_csv(gzip_file, **read_csv_kwargs)
try:
# Setting the columns while reading the file will select the first N columns
# and not raise a ParserError. Instead, we set the columns after reading the
# file and raise a ParserError if the number of columns does not match the
# number of columns in the metadata given by OpenML.
frame.columns = [name for name in openml_columns_info]
except ValueError as exc:
raise pd.errors.ParserError(
"The number of columns provided by OpenML does not match the number of "
"columns inferred by pandas when reading the file."
) from exc

columns_to_select = feature_names_to_select + target_names_to_select
columns_to_keep = [col for col in frame.columns if col in columns_to_select]
Expand Down Expand Up @@ -431,6 +455,7 @@ def load_arff_from_gzip_file(
feature_names_to_select,
target_names_to_select,
shape=None,
read_csv_kwargs=None,
):
"""Load a compressed ARFF file using a given parser.
Expand Down Expand Up @@ -461,6 +486,10 @@ def load_arff_from_gzip_file(
target_names_to_select : list of str
A list of the target names to be selected.
read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
the default options.
Returns
-------
X : {ndarray, sparse matrix, dataframe}
Expand Down Expand Up @@ -493,6 +522,7 @@ def load_arff_from_gzip_file(
openml_columns_info,
feature_names_to_select,
target_names_to_select,
read_csv_kwargs,
)
else:
raise ValueError(
Expand Down
72 changes: 54 additions & 18 deletions sklearn/datasets/_openml.py
Expand Up @@ -37,10 +37,15 @@ def _get_local_path(openml_path: str, data_home: str) -> str:
return os.path.join(data_home, "openml.org", openml_path + ".gz")


def _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable:
def _retry_with_clean_cache(
openml_path: str,
data_home: Optional[str],
no_retry_exception: Optional[Exception] = None,
) -> Callable:
"""If the first call to the decorated function fails, the local cached
file is removed, and the function is called again. If ``data_home`` is
``None``, then the function is called once.
``None``, then the function is called once. A specific exception that should
not trigger a retry can be provided via the `no_retry_exception` parameter.
"""

def decorator(f):
Expand All @@ -52,7 +57,11 @@ def wrapper(*args, **kw):
return f(*args, **kw)
except URLError:
raise
except Exception:
except Exception as exc:
if no_retry_exception is not None and isinstance(
exc, no_retry_exception
):
raise
warn("Invalid cache, redownloading file", RuntimeWarning)
local_path = _get_local_path(openml_path, data_home)
if os.path.exists(local_path):
Expand Down Expand Up @@ -216,7 +225,7 @@ def _get_json_content_from_openml_api(
An exception otherwise.
"""

@_retry_with_clean_cache(url, data_home)
@_retry_with_clean_cache(url, data_home=data_home)
def _load_json():
with closing(
_open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
Expand Down Expand Up @@ -492,20 +501,39 @@ def _load_arff_response(
"and retry..."
)

gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
with closing(gzip_file):
def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
with closing(gzip_file):
return load_arff_from_gzip_file(gzip_file, **arff_params)

X, y, frame, categories = load_arff_from_gzip_file(
gzip_file,
parser=parser,
output_type=output_type,
openml_columns_info=openml_columns_info,
feature_names_to_select=feature_names_to_select,
target_names_to_select=target_names_to_select,
shape=shape,
arff_params = dict(
parser=parser,
output_type=output_type,
openml_columns_info=openml_columns_info,
feature_names_to_select=feature_names_to_select,
target_names_to_select=target_names_to_select,
shape=shape,
)
try:
X, y, frame, categories = _open_url_and_load_gzip_file(
url, data_home, n_retries, delay, arff_params
)
except Exception as exc:
if parser == "pandas":
from pandas.errors import ParserError

if isinstance(exc, ParserError):
# A parsing error could come from providing the wrong quotechar
# to pandas. By default, we use a double quote. Thus, we retry
# with a single quote before raising the error.
arff_params["read_csv_kwargs"] = {"quotechar": "'"}
X, y, frame, categories = _open_url_and_load_gzip_file(
url, data_home, n_retries, delay, arff_params
)
else:
raise

return X, y, frame, categories
return X, y, frame, categories


def _download_data_to_bunch(
Expand Down Expand Up @@ -605,9 +633,17 @@ def _download_data_to_bunch(
"values. Missing values are not supported for target columns."
)

X, y, frame, categories = _retry_with_clean_cache(url, data_home)(
_load_arff_response
)(
no_retry_exception = None
if parser == "pandas":
# If we get a ParserError with pandas, then we don't want to retry and we raise
# early.
from pandas.errors import ParserError

no_retry_exception = ParserError

X, y, frame, categories = _retry_with_clean_cache(
url, data_home, no_retry_exception
)(_load_arff_response)(
url,
data_home,
parser=parser,
Expand Down
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
16 changes: 16 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Expand Up @@ -1617,6 +1617,22 @@ def test_fetch_openml_leading_whitespace(monkeypatch):
)


def test_fetch_openml_quotechar_escapechar(monkeypatch):
    """Check that we can handle escapechar and single/double quotechar.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25478
    """
    pd = pytest.importorskip("pandas")
    # Dataset 42074 contains single quotes and backslash escape characters;
    # serve it from the local test fixtures instead of hitting openml.org.
    data_id = 42074
    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

    common_params = {"as_frame": True, "cache": False, "data_id": data_id}
    # Both parsers must produce identical frames despite the quoting/escaping.
    adult_pandas = fetch_openml(parser="pandas", **common_params)
    adult_liac_arff = fetch_openml(parser="liac-arff", **common_params)
    pd.testing.assert_frame_equal(adult_pandas.frame, adult_liac_arff.frame)


###############################################################################
# Deprecation-changed parameters

Expand Down

0 comments on commit 12f1675

Please sign in to comment.