From 7f4905757f0242f875f9f48787b22a72dfcd02c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 6 Mar 2023 18:12:52 +0100 Subject: [PATCH] FIX add retry mechanism to handle quotechar in read_csv (#25511) --- doc/whats_new/v1.2.rst | 6 +- sklearn/datasets/_arff_parser.py | 52 ++++++++++--- sklearn/datasets/_openml.py | 72 +++++++++++++----- .../tests/data/openml/id_42074/__init__.py | 0 .../openml/id_42074/api-v1-jd-42074.json.gz | Bin 0 -> 584 bytes .../openml/id_42074/api-v1-jdf-42074.json.gz | Bin 0 -> 272 bytes .../openml/id_42074/api-v1-jdq-42074.json.gz | Bin 0 -> 722 bytes .../id_42074/data-v1-dl-21552912.arff.gz | Bin 0 -> 2326 bytes sklearn/datasets/tests/test_openml.py | 16 ++++ 9 files changed, 116 insertions(+), 30 deletions(-) create mode 100644 sklearn/datasets/tests/data/openml/id_42074/__init__.py create mode 100644 sklearn/datasets/tests/data/openml/id_42074/api-v1-jd-42074.json.gz create mode 100644 sklearn/datasets/tests/data/openml/id_42074/api-v1-jdf-42074.json.gz create mode 100644 sklearn/datasets/tests/data/openml/id_42074/api-v1-jdq-42074.json.gz create mode 100644 sklearn/datasets/tests/data/openml/id_42074/data-v1-dl-21552912.arff.gz diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index a6295b31b5acb..582dc9b299ab5 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -146,11 +146,15 @@ Changelog :mod:`sklearn.datasets` ....................... -- |Fix| Fix an inconsistency in :func:`datasets.fetch_openml` between liac-arff +- |Fix| Fixes an inconsistency in :func:`datasets.fetch_openml` between liac-arff and pandas parser when a leading space is introduced after the delimiter. The ARFF specs requires to ignore the leading space. :pr:`25312` by :user:`Guillaume Lemaitre `. +- |Fix| Fixes a bug in :func:`datasets.fetch_openml` when using `parser="pandas"` + where single quote and backslash escape characters were not properly handled. + :pr:`25511` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.decomposition` ............................ diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index a624cbe367a97..aeb325fc1e50c 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -302,6 +302,7 @@ def _pandas_arff_parser( openml_columns_info, feature_names_to_select, target_names_to_select, + read_csv_kwargs=None, ): """ARFF parser using `pandas.read_csv`. @@ -331,6 +332,10 @@ def _pandas_arff_parser( target_names_to_select : list of str A list of the target names to be selected to build `y`. + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite + the default options. + Returns ------- X : {ndarray, sparse matrix, dataframe} @@ -363,18 +368,37 @@ def _pandas_arff_parser( dtypes[name] = "Int64" elif column_dtype.lower() == "nominal": dtypes[name] = "category" + # since we will not pass `names` when reading the ARFF file, we need to translate + # `dtypes` from column names to column indices to pass to `pandas.read_csv` + dtypes_positional = { + col_idx: dtypes[name] + for col_idx, name in enumerate(openml_columns_info) + if name in dtypes + } - # ARFF represents missing values with "?" - frame = pd.read_csv( - gzip_file, - header=None, - na_values=["?"], # missing values are represented by `?` - comment="%", # skip line starting by `%` since they are comments - quotechar='"', # delimiter to use for quoted strings - names=[name for name in openml_columns_info], - dtype=dtypes, - skipinitialspace=True, # skip spaces after delimiter to follow ARFF specs - ) + default_read_csv_kwargs = { + "header": None, + "index_col": False, # always force pandas to not use the first column as index + "na_values": ["?"], # missing values are represented by `?` + "comment": "%", # skip line starting by `%` since they are comments + "quotechar": '"', # delimiter to use for quoted strings + "skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs + "escapechar": "\\", + "dtype": dtypes_positional, + } + read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})} + frame = pd.read_csv(gzip_file, **read_csv_kwargs) + try: + # Setting the columns while reading the file will select the N first columns + # and not raise a ParserError. Instead, we set the columns after reading the + # file and raise a ParserError if the number of columns does not match the + # number of columns in the metadata given by OpenML. + frame.columns = [name for name in openml_columns_info] + except ValueError as exc: + raise pd.errors.ParserError( + "The number of columns provided by OpenML does not match the number of " + "columns inferred by pandas when reading the file." + ) from exc columns_to_select = feature_names_to_select + target_names_to_select columns_to_keep = [col for col in frame.columns if col in columns_to_select] @@ -431,6 +455,7 @@ def load_arff_from_gzip_file( feature_names_to_select, target_names_to_select, shape=None, + read_csv_kwargs=None, ): """Load a compressed ARFF file using a given parser. @@ -461,6 +486,10 @@ def load_arff_from_gzip_file( target_names_to_select : list of str A list of the target names to be selected. + read_csv_kwargs : dict, default=None + Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite + the default options. + Returns ------- X : {ndarray, sparse matrix, dataframe} @@ -493,6 +522,7 @@ def load_arff_from_gzip_file( openml_columns_info, feature_names_to_select, target_names_to_select, + read_csv_kwargs, ) else: raise ValueError( diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index be85e72d822b0..cee95d740a3c2 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -37,10 +37,15 @@ def _get_local_path(openml_path: str, data_home: str) -> str: return os.path.join(data_home, "openml.org", openml_path + ".gz") -def _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable: +def _retry_with_clean_cache( + openml_path: str, + data_home: Optional[str], + no_retry_exception: Optional[Exception] = None, +) -> Callable: """If the first call to the decorated function fails, the local cached file is removed, and the function is called again. If ``data_home`` is - ``None``, then the function is called once. + ``None``, then the function is called once. We can provide a specific + exception to not retry on usign `no_retry_exception` parameter. """ def decorator(f): @@ -52,7 +57,11 @@ def wrapper(*args, **kw): return f(*args, **kw) except URLError: raise - except Exception: + except Exception as exc: + if no_retry_exception is not None and isinstance( + exc, no_retry_exception + ): + raise warn("Invalid cache, redownloading file", RuntimeWarning) local_path = _get_local_path(openml_path, data_home) if os.path.exists(local_path): @@ -216,7 +225,7 @@ def _get_json_content_from_openml_api( An exception otherwise. """ - @_retry_with_clean_cache(url, data_home) + @_retry_with_clean_cache(url, data_home=data_home) def _load_json(): with closing( _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) @@ -492,20 +501,39 @@ def _load_arff_response( "and retry..." ) - gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) - with closing(gzip_file): + def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params): + gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay) + with closing(gzip_file): + return load_arff_from_gzip_file(gzip_file, **arff_params) - X, y, frame, categories = load_arff_from_gzip_file( - gzip_file, - parser=parser, - output_type=output_type, - openml_columns_info=openml_columns_info, - feature_names_to_select=feature_names_to_select, - target_names_to_select=target_names_to_select, - shape=shape, + arff_params = dict( + parser=parser, + output_type=output_type, + openml_columns_info=openml_columns_info, + feature_names_to_select=feature_names_to_select, + target_names_to_select=target_names_to_select, + shape=shape, + ) + try: + X, y, frame, categories = _open_url_and_load_gzip_file( + url, data_home, n_retries, delay, arff_params ) + except Exception as exc: + if parser == "pandas": + from pandas.errors import ParserError + + if isinstance(exc, ParserError): + # A parsing error could come from providing the wrong quotechar + # to pandas. By default, we use a double quote. Thus, we retry + # with a single quote before to raise the error. + arff_params["read_csv_kwargs"] = {"quotechar": "'"} + X, y, frame, categories = _open_url_and_load_gzip_file( + url, data_home, n_retries, delay, arff_params + ) + else: + raise - return X, y, frame, categories + return X, y, frame, categories def _download_data_to_bunch( @@ -605,9 +633,17 @@ def _download_data_to_bunch( "values. Missing values are not supported for target columns." ) - X, y, frame, categories = _retry_with_clean_cache(url, data_home)( - _load_arff_response - )( + no_retry_exception = None + if parser == "pandas": + # If we get a ParserError with pandas, then we don't want to retry and we raise + # early. + from pandas.errors import ParserError + + no_retry_exception = ParserError + + X, y, frame, categories = _retry_with_clean_cache( + url, data_home, no_retry_exception + )(_load_arff_response)( url, data_home, parser=parser, diff --git a/sklearn/datasets/tests/data/openml/id_42074/__init__.py b/sklearn/datasets/tests/data/openml/id_42074/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/datasets/tests/data/openml/id_42074/api-v1-jd-42074.json.gz b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jd-42074.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..8bfe157eb6dfed219c2a73bbe01b955d2d374350 GIT binary patch literal 584 zcmV-O0=NAiiwFp6=-6Wb12i%)H#7jPQ%!H%Fbuu#R|wv<{)`*-*Y~5`)lOo?EAD^EhP91G6xR!%uModm<744oP!J}PN7Fk_J zcafqaU^oi}t$~w(*<$tt#xB)Sj?qnj^c_n{z$QI)0~p|>JCnh=$?lr8N#}V^jIq_#8Q@6tfqe1EnOAPO zBSn_i2?=0Kb07z8mdXpDA&e^0g|t_kQ1@o8ULZqvor$ue8?<*0=SB9I15B|5Y7|o6 zuH4>=frKM<7*KOKV9X3qrwt}l7fL^A-CU?&p+a?`}o@@ zQYN9n8+kJxokXH1P@_z=>6%6D3!9CTbo%dhiu_ z0lt9dL8ZibNR+&Qqg@wg{*KxuOs2;my^zo@?tS5WwQ4JA2Z16>!j(?KicU!3&X}5f zhf8Bt8_^3zWxMw;**faH z4Q@jxoPsSTqZwF>HvRkd1aDszyF4f`@~UM6u%q=O7HzYxs5jM`clD~yIMl4HHsz|O W1uyGrTSm{%AASKJq-kd}1ONbR>JIq; literal 0 HcmV?d00001 diff --git a/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdf-42074.json.gz b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdf-42074.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..622baf5bda2aa2e7e9bde6d60e7d45751cd5c646 GIT binary patch literal 272 zcmV+r0q_1FiwFoJ-q>RT|1>f%H#7jv)Z1=^AP@%NcbV(G@UZIYt27OvI+{sVFax@4 zjPG7-n;_l$(mV1KhA+cmgHBp0dX#o%G|+7DGlu4E15zD6p@9T0pycR6X!WctuP*pn zY1ipUjb&1U3&{v8c|hyJUfvCUXEZ2%)I4XbAA&kmmU==y&8d0(Ko+_eBN^lBibdY% zLk9)y7tcyPt3%T=O_sEZL`wo%7SP-tG9v9-RC(CB-18<)%9>v?R}7ijzwX-g#B$qX zELED9b=a`tEYl0A7hIetP literal 0 HcmV?d00001 diff --git a/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdq-42074.json.gz b/sklearn/datasets/tests/data/openml/id_42074/api-v1-jdq-42074.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..241c38b25714e2cb2797740dd63dc5c6dce8d4a6 GIT binary patch literal 722 zcmV;@0xkU?iwFoK-q>RT|1>f%H#7jHR$WuuFcADNdIpW8(9=;sQ=Q?N1soTmfZLu=t=n7 zjl0L1dAKa2@1GKpAIdytMJUd$v0tBu>1w+KjG$IXfj`5P14CB%Cq79#$<|P7t>ji3 z!NF*srE7xF;*^6({J$rGvkl&#d`Qp7#VwwBZnHXuo zzvy;)wM4b(@eaY`w1`PMUB%)T3hVISLi}+OiXx;p&HQkkaoiB?0-YQqr7`WCf3dZU zCE5v-#r!gxwK5uW)rk%d<1J^9A4XEfm;%P6VfYQ&F12n$snU!KJi54Ws3p1-daUZ1 zVz^(Q$|R3r=J~+YRN=Q>P?gZeYpPZlxU4F|mJ6uX*?bKxHN;5Y#dha;CA0aVUQ3L$ zU2J#!)-qe1!AlVsX*+Cnm#tJbztPqbBVC8>?y9xS<~LTP-VM4XTs%HL0RgbLoQ(|t E08sR5Z~y=R literal 0 HcmV?d00001 diff --git a/sklearn/datasets/tests/data/openml/id_42074/data-v1-dl-21552912.arff.gz b/sklearn/datasets/tests/data/openml/id_42074/data-v1-dl-21552912.arff.gz new file mode 100644 index 0000000000000000000000000000000000000000..72be9d36cad744834fe67aca28428a856d736e34 GIT binary patch literal 2326 zcmV+x3F-D9iwFoZ-`HaS12Qo+H8MFdG61buO>Z2z5xwhI@UoaoYb=doZ#I`$l4Y+G zSvDl?8UYL}YPvby%&^H0$)3^ZwErPF?QL_*A*TSjUhSh#m7Y}6HQT)f$RQfcI2x@5e2`l+cMec@^6jZ0Cz zy|_Mmb9VXQNid~WmuJW45AXVwad_}~sGw`C@x$SlAI+Y4w4z{idTLu-iTnNTy?k|i zeZ2SGRWZ44OjIf8R=djgQ5_x}eEMn#L{`324H+9V(C=xvGS+HE0EgRvYV?3#X~r;U ztu{a*D&v%=C z&O!DU69p{nIk<^vR%F{?n|WK*XsS3*BkxptrLFJOq7Td=<$8`gRcG$ceuFlj&3^gu zud{=rgXyH0yhGrIK24(f9(ixD=%o1OMKL>iK0PQ-$r|v$87d|h)RFq0P~^>|;EQ5% zN^>0?Dsn~r3JvJpr1(~wbh~%ek#WW3(sY0X8$hZ^Z(-^}%`Nskm#XtdONtRtQvlQa zB55}iAcCR!3bjQAt>^(41c3nJMwN{(J)6veB2=v#G#{whmz_b;c;Vy8<3 z7F_D6!-@p3!@~mrHo5SEIb?tlm=xDO_~Q5A6ZTK&48=DP_yiJHlR|tjOkMga-2Joe z_3cF1_l`>Sky1o?8t=FcT%}gG9;l%tlnLlrBw9jL7HMp-_0n+C+zZ;)?5*al2!o(Cvze-&o%3aYuvR@v9s8OBBXTv=Uhv@c z&$$D?rLfX52@>BNJYvotov9nd9m#WrlD4?NWsN%FdQ1wGosi1rUIBqL6livU%3_R( zfE}?AI=xn}1Cnl-3x(_!lI=r>JtrLpK@;ux;N7iZf*z;=mt%!WFB11L3RuOF>io_Qf#mfm)hkTh#cqp%sK1PoK>cE#ohE!Gj!|# zaP%YloL}!S@Ms3SGsQIH`hpelJ;t1evVCN+X>~%R5!NT~eK5`^b>U5z6avKF>%hG~ z;A=9o64Y84C4iliI=TBt-B(Z~C*y@6jMH+IuLg z*r_~^<2ZZxEBGaSlLCpLI@EbwwnR?{F)LH`0s7XLS>-vy6TUns>jAGJUCtC6>k=8;Fz#X+bURV0zppy|4 zk)eR_Xb`==C+;pyGcgOSRyMSw0=(u(H=cudHMSn=A@9hz8?*| zi@t@2YM`0`Kf zy5;jAtP}W(GZ6|1`tbd->D}_70f$VMJV~-gHyCp9fo)(Xh)?Zk>-9~kyR3IurW%y+ z(Ks51@-Ru@F*sui44jmC7&JDBmH$i63u!)A@fiJrkKl=OW#E4GmPMnDIBj%8R=4UM zpZozv{2zS8i4HX?cK7dgz(MnooyTav8LAJZ9!|w+?NKscF0&2Wbqe{f8`ui(^Oqn0 z#%EGiEYTJ|%dVs%()=))C=(z8fi!cDg>|$HU=9?qY;Ycfu^EJ`^Q{IBj=pTf;i!Wg zd{s1d$x~zS;ah{xDZ@bU_!Ko|b>qj}Ul6orIJGlql=AT+PeEaKUPe601E+%(>eMh( w-cZMKd>Y_xb`QUR=&^&(j$NJd#Jl(JyO>N)Y_HX8FlDm$U#@)7WXBQ!0L__th5!Hn literal 0 HcmV?d00001 diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 73a94da6603c6..d2bf1ce4262fc 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1617,6 +1617,22 @@ def test_fetch_openml_leading_whitespace(monkeypatch): ) +def test_fetch_openml_quotechar_escapechar(monkeypatch): + """Check that we can handle escapechar and single/double quotechar. + + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/25478 + """ + pd = pytest.importorskip("pandas") + data_id = 42074 + _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False) + + common_params = {"as_frame": True, "cache": False, "data_id": data_id} + adult_pandas = fetch_openml(parser="pandas", **common_params) + adult_liac_arff = fetch_openml(parser="liac-arff", **common_params) + pd.testing.assert_frame_equal(adult_pandas.frame, adult_liac_arff.frame) + + ############################################################################### # Deprecation-changed parameters