diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
index a6295b31b5acb..582dc9b299ab5 100644
--- a/doc/whats_new/v1.2.rst
+++ b/doc/whats_new/v1.2.rst
@@ -146,11 +146,15 @@ Changelog
 :mod:`sklearn.datasets`
 .......................
 
-- |Fix| Fix an inconsistency in :func:`datasets.fetch_openml` between liac-arff
+- |Fix| Fixes an inconsistency in :func:`datasets.fetch_openml` between liac-arff
   and pandas parser when a leading space is introduced after the delimiter.
   The ARFF specs requires to ignore the leading space.
   :pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- |Fix| Fixes a bug in :func:`datasets.fetch_openml` when using `parser="pandas"`
+  where single quote and backslash escape characters were not properly handled.
+  :pr:`25511` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.decomposition`
 ............................
 
diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py
index a624cbe367a97..aeb325fc1e50c 100644
--- a/sklearn/datasets/_arff_parser.py
+++ b/sklearn/datasets/_arff_parser.py
@@ -302,6 +302,7 @@ def _pandas_arff_parser(
     openml_columns_info,
     feature_names_to_select,
     target_names_to_select,
+    read_csv_kwargs=None,
 ):
     """ARFF parser using `pandas.read_csv`.
 
@@ -331,6 +332,10 @@ def _pandas_arff_parser(
     target_names_to_select : list of str
         A list of the target names to be selected to build `y`.
 
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
+        the default options.
+
     Returns
     -------
     X : {ndarray, sparse matrix, dataframe}
@@ -363,18 +368,37 @@ def _pandas_arff_parser(
             dtypes[name] = "Int64"
         elif column_dtype.lower() == "nominal":
             dtypes[name] = "category"
+    # since we will not pass `names` when reading the ARFF file, we need to translate
+    # `dtypes` from column names to column indices to pass to `pandas.read_csv`
+    dtypes_positional = {
+        col_idx: dtypes[name]
+        for col_idx, name in enumerate(openml_columns_info)
+        if name in dtypes
+    }
 
-    # ARFF represents missing values with "?"
-    frame = pd.read_csv(
-        gzip_file,
-        header=None,
-        na_values=["?"],  # missing values are represented by `?`
-        comment="%",  # skip line starting by `%` since they are comments
-        quotechar='"',  # delimiter to use for quoted strings
-        names=[name for name in openml_columns_info],
-        dtype=dtypes,
-        skipinitialspace=True,  # skip spaces after delimiter to follow ARFF specs
-    )
+    default_read_csv_kwargs = {
+        "header": None,
+        "index_col": False,  # always force pandas to not use the first column as index
+        "na_values": ["?"],  # missing values are represented by `?`
+        "comment": "%",  # skip lines starting with `%` since they are comments
+        "quotechar": '"',  # character used to denote quoted strings
+        "skipinitialspace": True,  # skip spaces after delimiter to follow ARFF specs
+        "escapechar": "\\",
+        "dtype": dtypes_positional,
+    }
+    read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
+    frame = pd.read_csv(gzip_file, **read_csv_kwargs)
+    try:
+        # Setting the columns while reading the file would select the first N columns
+        # without raising a ParserError. Instead, we set the columns after reading the
+        # file and raise a ParserError if the number of columns does not match the
+        # number of columns in the metadata given by OpenML.
+        frame.columns = [name for name in openml_columns_info]
+    except ValueError as exc:
+        raise pd.errors.ParserError(
+            "The number of columns provided by OpenML does not match the number of "
+            "columns inferred by pandas when reading the file."
+        ) from exc
 
     columns_to_select = feature_names_to_select + target_names_to_select
     columns_to_keep = [col for col in frame.columns if col in columns_to_select]
@@ -431,6 +455,7 @@ def load_arff_from_gzip_file(
     feature_names_to_select,
     target_names_to_select,
     shape=None,
+    read_csv_kwargs=None,
 ):
     """Load a compressed ARFF file using a given parser.
 
@@ -461,6 +486,10 @@ def load_arff_from_gzip_file(
     target_names_to_select : list of str
         A list of the target names to be selected.
 
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
+        the default options.
+
     Returns
     -------
     X : {ndarray, sparse matrix, dataframe}
@@ -493,6 +522,7 @@ def load_arff_from_gzip_file(
             openml_columns_info,
             feature_names_to_select,
             target_names_to_select,
+            read_csv_kwargs,
         )
     else:
         raise ValueError(
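To make the new `read_csv_kwargs` plumbing in `_pandas_arff_parser` concrete, here is a small self-contained sketch (not part of the diff; the sample `data`, `expected_columns`, and `parse` helper are invented for illustration): caller-supplied kwargs are merged over the defaults, and assigning `frame.columns` after reading turns a column-count mismatch into an error instead of silently mislabeling columns.

```python
from io import StringIO

import pandas as pd

# Illustrative defaults in the spirit of `_pandas_arff_parser`.
default_read_csv_kwargs = {
    "header": None,
    "na_values": ["?"],  # ARFF marks missing values with `?`
    "comment": "%",  # ARFF comment lines start with `%`
}

data = "% a comment line\n1,a,?\n2,b,3\n"
expected_columns = ["id", "letter", "value"]  # e.g. from OpenML metadata


def parse(data, read_csv_kwargs=None):
    # Caller-supplied kwargs win over the defaults, as in the diff.
    kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
    frame = pd.read_csv(StringIO(data), **kwargs)
    # Assigning the names afterwards raises a ValueError on a length
    # mismatch, which the diff then converts into a ParserError.
    frame.columns = expected_columns
    return frame


print(parse(data))
```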
diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py
index be85e72d822b0..cee95d740a3c2 100644
--- a/sklearn/datasets/_openml.py
+++ b/sklearn/datasets/_openml.py
@@ -37,10 +37,15 @@ def _get_local_path(openml_path: str, data_home: str) -> str:
     return os.path.join(data_home, "openml.org", openml_path + ".gz")
 
 
-def _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable:
+def _retry_with_clean_cache(
+    openml_path: str,
+    data_home: Optional[str],
+    no_retry_exception: Optional[Exception] = None,
+) -> Callable:
     """If the first call to the decorated function fails, the local cached
     file is removed, and the function is called again. If ``data_home`` is
-    ``None``, then the function is called once.
+    ``None``, then the function is called once. A specific exception can be
+    passed via the `no_retry_exception` parameter to avoid retrying on it.
     """
 
     def decorator(f):
@@ -52,7 +57,11 @@ def wrapper(*args, **kw):
                 return f(*args, **kw)
             except URLError:
                 raise
-            except Exception:
+            except Exception as exc:
+                if no_retry_exception is not None and isinstance(
+                    exc, no_retry_exception
+                ):
+                    raise
                 warn("Invalid cache, redownloading file", RuntimeWarning)
                 local_path = _get_local_path(openml_path, data_home)
                 if os.path.exists(local_path):
@@ -216,7 +225,7 @@ def _get_json_content_from_openml_api(
     An exception otherwise.
     """
 
-    @_retry_with_clean_cache(url, data_home)
+    @_retry_with_clean_cache(url, data_home=data_home)
     def _load_json():
         with closing(
             _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
@@ -492,20 +501,41 @@ def _load_arff_response(
             "and retry..."
         )
 
-    gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
-    with closing(gzip_file):
+    def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
+        gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
+        with closing(gzip_file):
+            return load_arff_from_gzip_file(gzip_file, **arff_params)
 
-        X, y, frame, categories = load_arff_from_gzip_file(
-            gzip_file,
-            parser=parser,
-            output_type=output_type,
-            openml_columns_info=openml_columns_info,
-            feature_names_to_select=feature_names_to_select,
-            target_names_to_select=target_names_to_select,
-            shape=shape,
+    arff_params = dict(
+        parser=parser,
+        output_type=output_type,
+        openml_columns_info=openml_columns_info,
+        feature_names_to_select=feature_names_to_select,
+        target_names_to_select=target_names_to_select,
+        shape=shape,
+    )
+    try:
+        X, y, frame, categories = _open_url_and_load_gzip_file(
+            url, data_home, n_retries, delay, arff_params
         )
+    except Exception as exc:
+        if parser == "pandas":
+            from pandas.errors import ParserError
+
+            if isinstance(exc, ParserError):
+                # A parsing error could come from providing the wrong quotechar
+                # to pandas. By default, we use a double quote. Thus, we retry
+                # with a single quote before raising the error.
+                arff_params["read_csv_kwargs"] = {"quotechar": "'"}
+                X, y, frame, categories = _open_url_and_load_gzip_file(
+                    url, data_home, n_retries, delay, arff_params
+                )
+            else:
+                raise
+        else:
+            raise
 
-    return X, y, frame, categories
+    return X, y, frame, categories
 
 
 def _download_data_to_bunch(
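The fallback above is easier to see on a toy payload. A minimal sketch, assuming an in-memory CSV instead of an OpenML ARFF stream (the `data` string and `read` helper are invented; only the `quotechar`/`escapechar` values mirror the diff):

```python
from io import StringIO

import pandas as pd
from pandas.errors import ParserError

# The second row uses single quotes and a backslash-escaped quote, similar
# in spirit to the OpenML dataset 42074 exercised by the regression test.
data = "1,plain,2\n0,'it\\'s, quoted',3\n"

default_kwargs = {
    "header": None,
    "quotechar": '"',  # double quote is tried first, as in the diff
    "escapechar": "\\",  # let `\` escape quotes inside quoted fields
}


def read(read_csv_kwargs=None):
    kwargs = {**default_kwargs, **(read_csv_kwargs or {})}
    return pd.read_csv(StringIO(data), **kwargs)


try:
    frame = read()
except ParserError:
    # With the wrong quotechar, the comma inside the quoted field makes the
    # second row look like it has four fields; retry with a single quote.
    frame = read(read_csv_kwargs={"quotechar": "'"})

print(frame)  # the second row parses as [0, "it's, quoted", 3]
```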
@@ -605,9 +633,17 @@ def _download_data_to_bunch(
             "values. Missing values are not supported for target columns."
         )
 
-    X, y, frame, categories = _retry_with_clean_cache(url, data_home)(
-        _load_arff_response
-    )(
+    no_retry_exception = None
+    if parser == "pandas":
+        # If we get a ParserError with pandas, we don't want to retry;
+        # we raise early.
+        from pandas.errors import ParserError
+
+        no_retry_exception = ParserError
+
+    X, y, frame, categories = _retry_with_clean_cache(
+        url, data_home, no_retry_exception
+    )(_load_arff_response)(
         url,
         data_home,
         parser=parser,
diff --git a/sklearn/datasets/tests/data/openml/id_42074/__init__.py b/sklearn/datasets/tests/data/openml/id_42074/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sklearn/datasets/tests/data/openml/id_42074/api-v1-jd-42074.json.gz b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jd-42074.json.gz
new file mode 100644
index 0000000000000..8bfe157eb6dfe
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jd-42074.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdf-42074.json.gz b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdf-42074.json.gz
new file mode 100644
index 0000000000000..622baf5bda2aa
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdf-42074.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdq-42074.json.gz b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdq-42074.json.gz
new file mode 100644
index 0000000000000..241c38b25714e
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdq-42074.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/id_42074/data-v1-dl-21552912.arff.gz b/sklearn/datasets/tests/data/openml/id_42074/data-v1-dl-21552912.arff.gz
new file mode 100644
index 0000000000000..72be9d36cad74
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/id_42074/data-v1-dl-21552912.arff.gz differ
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
index 73a94da6603c6..d2bf1ce4262fc 100644
--- a/sklearn/datasets/tests/test_openml.py
+++ b/sklearn/datasets/tests/test_openml.py
@@ -1617,6 +1617,22 @@ def test_fetch_openml_leading_whitespace(monkeypatch):
     )
 
 
+def test_fetch_openml_quotechar_escapechar(monkeypatch):
+    """Check that we can handle escapechar and single/double quotechar.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/25478
+    """
+    pd = pytest.importorskip("pandas")
+    data_id = 42074
+    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)
+
+    common_params = {"as_frame": True, "cache": False, "data_id": data_id}
+    adult_pandas = fetch_openml(parser="pandas", **common_params)
+    adult_liac_arff = fetch_openml(parser="liac-arff", **common_params)
+    pd.testing.assert_frame_equal(adult_pandas.frame, adult_liac_arff.frame)
+
+
 ###############################################################################
 # Deprecation-changed parameters
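Finally, the `no_retry_exception` hook added to `_retry_with_clean_cache` can be illustrated in isolation. A minimal sketch, assuming a simplified decorator without the cache handling (`retry_once`, `DeterministicError`, and `flaky` are invented names):

```python
from functools import wraps
from warnings import warn


def retry_once(no_retry_exception=None):
    """Simplified stand-in for `_retry_with_clean_cache` (no cache handling)."""

    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except Exception as exc:
                # A ParserError-like failure is deterministic: re-downloading
                # the file would fail the same way, so re-raise immediately.
                if no_retry_exception is not None and isinstance(
                    exc, no_retry_exception
                ):
                    raise
                warn("first call failed, retrying once", RuntimeWarning)
                return f(*args, **kwargs)

        return wrapper

    return decorator


class DeterministicError(ValueError):
    """Stand-in for `pandas.errors.ParserError`."""


@retry_once(no_retry_exception=DeterministicError)
def flaky():
    raise DeterministicError("retrying would not help")


try:
    flaky()
except DeterministicError as exc:
    print(f"raised without retry: {exc}")
```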