Skip to content

Commit

Permalink
FIX add retry mechanism to handle quotechar in read_csv (#25511)
Browse files Browse the repository at this point in the history
  • Loading branch information
glemaitre authored and jeremiedbb committed Mar 8, 2023
1 parent fd6006d commit 12f1675
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 30 deletions.
6 changes: 5 additions & 1 deletion doc/whats_new/v1.2.rst
Expand Up @@ -146,11 +146,15 @@ Changelog
:mod:`sklearn.datasets`
.......................

- |Fix| Fix an inconsistency in :func:`datasets.fetch_openml` between liac-arff
- |Fix| Fixes an inconsistency in :func:`datasets.fetch_openml` between liac-arff
and pandas parser when a leading space is introduced after the delimiter.
The ARFF specs require ignoring the leading space.
:pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.

- |Fix| Fixes a bug in :func:`datasets.fetch_openml` when using `parser="pandas"`
where single quote and backslash escape characters were not properly handled.
:pr:`25511` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.decomposition`
............................

Expand Down
52 changes: 41 additions & 11 deletions sklearn/datasets/_arff_parser.py
Expand Up @@ -302,6 +302,7 @@ def _pandas_arff_parser(
openml_columns_info,
feature_names_to_select,
target_names_to_select,
read_csv_kwargs=None,
):
"""ARFF parser using `pandas.read_csv`.
Expand Down Expand Up @@ -331,6 +332,10 @@ def _pandas_arff_parser(
target_names_to_select : list of str
A list of the target names to be selected to build `y`.
read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
the default options.
Returns
-------
X : {ndarray, sparse matrix, dataframe}
Expand Down Expand Up @@ -363,18 +368,37 @@ def _pandas_arff_parser(
dtypes[name] = "Int64"
elif column_dtype.lower() == "nominal":
dtypes[name] = "category"
# since we will not pass `names` when reading the ARFF file, we need to translate
# `dtypes` from column names to column indices to pass to `pandas.read_csv`
dtypes_positional = {
col_idx: dtypes[name]
for col_idx, name in enumerate(openml_columns_info)
if name in dtypes
}

# ARFF represents missing values with "?"
frame = pd.read_csv(
gzip_file,
header=None,
na_values=["?"], # missing values are represented by `?`
comment="%", # skip line starting by `%` since they are comments
quotechar='"', # delimiter to use for quoted strings
names=[name for name in openml_columns_info],
dtype=dtypes,
skipinitialspace=True, # skip spaces after delimiter to follow ARFF specs
)
default_read_csv_kwargs = {
"header": None,
"index_col": False, # always force pandas to not use the first column as index
"na_values": ["?"], # missing values are represented by `?`
"comment": "%", # skip line starting by `%` since they are comments
"quotechar": '"', # delimiter to use for quoted strings
"skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs
"escapechar": "\\",
"dtype": dtypes_positional,
}
read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
frame = pd.read_csv(gzip_file, **read_csv_kwargs)
try:
# Setting the columns while reading the file will select the first N columns
# and not raise a ParserError. Instead, we set the columns after reading the
# file and raise a ParserError if the number of columns does not match the
# number of columns in the metadata given by OpenML.
frame.columns = [name for name in openml_columns_info]
except ValueError as exc:
raise pd.errors.ParserError(
"The number of columns provided by OpenML does not match the number of "
"columns inferred by pandas when reading the file."
) from exc

columns_to_select = feature_names_to_select + target_names_to_select
columns_to_keep = [col for col in frame.columns if col in columns_to_select]
Expand Down Expand Up @@ -431,6 +455,7 @@ def load_arff_from_gzip_file(
feature_names_to_select,
target_names_to_select,
shape=None,
read_csv_kwargs=None,
):
"""Load a compressed ARFF file using a given parser.
Expand Down Expand Up @@ -461,6 +486,10 @@ def load_arff_from_gzip_file(
target_names_to_select : list of str
A list of the target names to be selected.
read_csv_kwargs : dict, default=None
Keyword arguments to pass to `pandas.read_csv`. It allows overwriting
the default options.
Returns
-------
X : {ndarray, sparse matrix, dataframe}
Expand Down Expand Up @@ -493,6 +522,7 @@ def load_arff_from_gzip_file(
openml_columns_info,
feature_names_to_select,
target_names_to_select,
read_csv_kwargs,
)
else:
raise ValueError(
Expand Down
72 changes: 54 additions & 18 deletions sklearn/datasets/_openml.py
Expand Up @@ -37,10 +37,15 @@ def _get_local_path(openml_path: str, data_home: str) -> str:
return os.path.join(data_home, "openml.org", openml_path + ".gz")


def _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable:
def _retry_with_clean_cache(
openml_path: str,
data_home: Optional[str],
no_retry_exception: Optional[Exception] = None,
) -> Callable:
"""If the first call to the decorated function fails, the local cached
file is removed, and the function is called again. If ``data_home`` is
``None``, then the function is called once.
``None``, then the function is called once. A specific exception that should
not trigger a retry can be provided via the `no_retry_exception` parameter.
"""

def decorator(f):
Expand All @@ -52,7 +57,11 @@ def wrapper(*args, **kw):
return f(*args, **kw)
except URLError:
raise
except Exception:
except Exception as exc:
if no_retry_exception is not None and isinstance(
exc, no_retry_exception
):
raise
warn("Invalid cache, redownloading file", RuntimeWarning)
local_path = _get_local_path(openml_path, data_home)
if os.path.exists(local_path):
Expand Down Expand Up @@ -216,7 +225,7 @@ def _get_json_content_from_openml_api(
An exception otherwise.
"""

@_retry_with_clean_cache(url, data_home)
@_retry_with_clean_cache(url, data_home=data_home)
def _load_json():
with closing(
_open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
Expand Down Expand Up @@ -492,20 +501,39 @@ def _load_arff_response(
"and retry..."
)

gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
with closing(gzip_file):
def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
with closing(gzip_file):
return load_arff_from_gzip_file(gzip_file, **arff_params)

X, y, frame, categories = load_arff_from_gzip_file(
gzip_file,
parser=parser,
output_type=output_type,
openml_columns_info=openml_columns_info,
feature_names_to_select=feature_names_to_select,
target_names_to_select=target_names_to_select,
shape=shape,
arff_params = dict(
parser=parser,
output_type=output_type,
openml_columns_info=openml_columns_info,
feature_names_to_select=feature_names_to_select,
target_names_to_select=target_names_to_select,
shape=shape,
)
try:
X, y, frame, categories = _open_url_and_load_gzip_file(
url, data_home, n_retries, delay, arff_params
)
except Exception as exc:
if parser == "pandas":
from pandas.errors import ParserError

if isinstance(exc, ParserError):
# A parsing error could come from providing the wrong quotechar
# to pandas. By default, we use a double quote. Thus, we retry
# with a single quote before raising the error.
arff_params["read_csv_kwargs"] = {"quotechar": "'"}
X, y, frame, categories = _open_url_and_load_gzip_file(
url, data_home, n_retries, delay, arff_params
)
else:
raise

return X, y, frame, categories
return X, y, frame, categories


def _download_data_to_bunch(
Expand Down Expand Up @@ -605,9 +633,17 @@ def _download_data_to_bunch(
"values. Missing values are not supported for target columns."
)

X, y, frame, categories = _retry_with_clean_cache(url, data_home)(
_load_arff_response
)(
no_retry_exception = None
if parser == "pandas":
# If we get a ParserError with pandas, then we don't want to retry and we raise
# early.
from pandas.errors import ParserError

no_retry_exception = ParserError

X, y, frame, categories = _retry_with_clean_cache(
url, data_home, no_retry_exception
)(_load_arff_response)(
url,
data_home,
parser=parser,
Expand Down
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
16 changes: 16 additions & 0 deletions sklearn/datasets/tests/test_openml.py
Expand Up @@ -1617,6 +1617,22 @@ def test_fetch_openml_leading_whitespace(monkeypatch):
)


def test_fetch_openml_quotechar_escapechar(monkeypatch):
    """Check that we can handle escapechar and single/double quotechar.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25478
    """
    pd = pytest.importorskip("pandas")
    # Dataset 42074 contains single quotes and backslash escape characters;
    # serve it from the local test fixtures instead of hitting openml.org.
    data_id = 42074
    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)

    common_params = {"as_frame": True, "cache": False, "data_id": data_id}
    # Both parsers must produce identical frames despite the quoting/escaping.
    adult_pandas = fetch_openml(parser="pandas", **common_params)
    adult_liac_arff = fetch_openml(parser="liac-arff", **common_params)
    pd.testing.assert_frame_equal(adult_pandas.frame, adult_liac_arff.frame)


###############################################################################
# Deprecation-changed parameters

Expand Down

0 comments on commit 12f1675

Please sign in to comment.