diff --git a/.gitignore b/.gitignore
index 9b158da07a2ec..b8ee8d20322c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,7 @@ doc/samples
 *.prof
 .tox/
 .coverage
+pip-wheel-metadata
 lfw_preprocessed/
 nips2010_pdf/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000..aa8df3c3cbc87
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,22 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v2.3.0
+  hooks:
+  - id: check-yaml
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.7.8
+  hooks:
+  - id: flake8
+    types: [file, python]
+    # only check for unused imports for now, as long as
+    # the code is not fully PEP8 compatible
+    args: [--select=F401]
+- repo: https://github.com/pre-commit/mirrors-mypy
+  rev: v0.730
+  hooks:
+  - id: mypy
+    args:
+    - --ignore-missing-imports
+    files: sklearn/
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index d1849a940d96c..f30db7f0ae08a 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -98,9 +98,6 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     python -m pip install -U pip
     python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist
 
-    # TODO: Remove pin when https://github.com/python-pillow/Pillow/issues/4518 gets fixed
-    python -m pip install "pillow>=4.3.0,!=7.1.0,!=7.1.1"
-
     python -m pip install pandas matplotlib pyamg scikit-image
     # do not install dependencies for lightgbm since it requires scikit-learn
     python -m pip install lightgbm --no-deps
diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py
index 81e99856c6890..eaad1df75475c 100644
--- a/build_tools/generate_authors_table.py
+++ b/build_tools/generate_authors_table.py
@@ -11,6 +11,7 @@
 import getpass
 import time
 from pathlib import Path
+from os import path
 
 print("user:", file=sys.stderr)
 user = input()
@@ -18,7 +19,7 @@
 auth = (user, passwd)
 
 LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4'
-REPO_FOLDER = Path(__file__).parent.parent
+REPO_FOLDER = Path(path.abspath(__file__)).parent.parent
 
 
 def get(url):
@@ -100,7 +101,6 @@ def get_profile(login):
         'Duchesnay': 'Edouard Duchesnay',
         'Lars': 'Lars Buitinck',
         'MechCoder': 'Manoj Kumar',
-        'jeremiedbb': 'Jérémie Du Boisberranger',
     }
     if profile["name"] in missing_names:
         profile["name"] = missing_names[profile["name"]]
diff --git a/conftest.py b/conftest.py
index 2b9e87bf9f292..874931341e195 100644
--- a/conftest.py
+++ b/conftest.py
@@ -99,16 +99,6 @@ def pytest_unconfigure(config):
     del sys._is_pytest_session
 
 
-def pytest_runtest_setup(item):
-    if isinstance(item, DoctestItem):
-        set_config(print_changed_only=True)
-
-
-def pytest_runtest_teardown(item, nextitem):
-    if isinstance(item, DoctestItem):
-        set_config(print_changed_only=False)
-
-
 # TODO: Remove when modules are deprecated in 0.24
 # Configures pytest to ignore deprecated modules.
 collect_ignore_glob = [
diff --git a/doc/about.rst b/doc/about.rst
index a6cdd54eb9201..814a4724d9579 100644
--- a/doc/about.rst
+++ b/doc/about.rst
@@ -271,14 +271,18 @@
-............
+Past Sponsors
+.............
 
 .. raw:: html
+        assert '<pre>hello-world</pre>' in html_label
+        if checked:
+            assert 'checked>' in html_label
+
+
+@pytest.mark.parametrize('est', ['passthrough', 'drop', None])
+def test_get_visual_block_single_str_none(est):
+    # Test estimators that are represented by strings
+    est_html_info = _get_visual_block(est)
+    assert est_html_info.kind == 'single'
+    assert est_html_info.estimators == est
+    assert est_html_info.names == str(est)
+    assert est_html_info.name_details == str(est)
+
+
+def test_get_visual_block_single_estimator():
+    est = LogisticRegression(C=10.0)
+    est_html_info = _get_visual_block(est)
+    assert est_html_info.kind == 'single'
+    assert est_html_info.estimators == est
+    assert est_html_info.names == est.__class__.__name__
+    assert est_html_info.name_details == str(est)
+
+
+def test_get_visual_block_pipeline():
+    pipe = Pipeline([
+        ('imputer', SimpleImputer()),
+        ('do_nothing', 'passthrough'),
+        ('do_nothing_more', None),
+        ('classifier', LogisticRegression())
+    ])
+    est_html_info = _get_visual_block(pipe)
+    assert est_html_info.kind == 'serial'
+    assert est_html_info.estimators == tuple(step[1] for step in pipe.steps)
+    assert est_html_info.names == ['imputer: SimpleImputer',
+                                   'do_nothing: passthrough',
+                                   'do_nothing_more: passthrough',
+                                   'classifier: LogisticRegression']
+    assert est_html_info.name_details == [str(est) for _, est in pipe.steps]
+
+
+def test_get_visual_block_feature_union():
+    f_union = FeatureUnion([
+        ('pca', PCA()), ('svd', TruncatedSVD())
+    ])
+    est_html_info = _get_visual_block(f_union)
+    assert est_html_info.kind == 'parallel'
+    assert est_html_info.names == ('pca', 'svd')
+    assert est_html_info.estimators == tuple(
+        trans[1] for trans in f_union.transformer_list)
+    assert est_html_info.name_details == (None, None)
+
+
+def test_get_visual_block_voting():
+    clf = VotingClassifier([
+        ('log_reg', LogisticRegression()),
+        ('mlp', MLPClassifier())
+    ])
+    est_html_info = _get_visual_block(clf)
+    assert est_html_info.kind == 'parallel'
+    assert est_html_info.estimators == tuple(trans[1]
+                                             for trans in clf.estimators)
+    assert est_html_info.names == ('log_reg', 'mlp')
+    assert est_html_info.name_details == (None, None)
+
+
+def test_get_visual_block_column_transformer():
+    ct = ColumnTransformer([
+        ('pca', PCA(), ['num1', 'num2']),
+        ('svd', TruncatedSVD, [0, 3])
+    ])
+    est_html_info = _get_visual_block(ct)
+    assert est_html_info.kind == 'parallel'
+    assert est_html_info.estimators == tuple(
+        trans[1] for trans in ct.transformers)
+    assert est_html_info.names == ('pca', 'svd')
+    assert est_html_info.name_details == (['num1', 'num2'], [0, 3])
+
+
+def test_estimator_html_repr_pipeline():
+    num_trans = Pipeline(steps=[
+        ('pass', 'passthrough'),
+        ('imputer', SimpleImputer(strategy='median'))
+    ])
+
+    cat_trans = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='constant',
+                                  missing_values='empty')),
+        ('one-hot', OneHotEncoder(drop='first'))
+    ])
+
+    preprocess = ColumnTransformer([
+        ('num', num_trans, ['a', 'b', 'c', 'd', 'e']),
+        ('cat', cat_trans, [0, 1, 2, 3])
+    ])
+
+    feat_u = FeatureUnion([
+        ('pca', PCA(n_components=1)),
+        ('tsvd', Pipeline([('first', TruncatedSVD(n_components=3)),
+                           ('select', SelectPercentile())]))
+    ])
+
+    clf = VotingClassifier([
+        ('lr', LogisticRegression(solver='lbfgs', random_state=1)),
+        ('mlp', MLPClassifier(alpha=0.001))
+    ])
+
+    pipe = Pipeline([
+        ('preprocessor', preprocess), ('feat_u', feat_u), ('classifier', clf)
+    ])
+    html_output = estimator_html_repr(pipe)
+
+    # top level estimators show estimator with changes
+    assert str(pipe) in html_output
+    for _, est in pipe.steps:
+        assert (f'<div class="sk-toggleable__content"><pre>'
+                f'{str(est)}') in html_output
+
+    # low level estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert str(num_trans['pass']) in html_output
+        assert 'passthrough</label>' in html_output
+        assert str(num_trans['imputer']) in html_output
+
+        for _, _, cols in preprocess.transformers:
+            assert f"<pre>{cols}</pre>" in html_output
+
+        # feature union
+        for name, _ in feat_u.transformer_list:
+            assert f"<label>{name}</label>" in html_output
+
+        pca = feat_u.transformer_list[0][1]
+        assert f"<pre>{str(pca)}</pre>" in html_output
+
+        tsvd = feat_u.transformer_list[1][1]
+        first = tsvd['first']
+        select = tsvd['select']
+        assert f"<pre>{str(first)}</pre>" in html_output
+        assert f"<pre>{str(select)}</pre>" in html_output
+
+        # voting classifier
+        for name, est in clf.estimators:
+            assert f"<label>{name}</label>" in html_output
+            assert f"<pre>{str(est)}</pre>" in html_output
+
+
+@pytest.mark.parametrize("final_estimator", [None, LinearSVC()])
+def test_stacking_classifier(final_estimator):
+    estimators = [('mlp', MLPClassifier(alpha=0.001)),
+                  ('tree', DecisionTreeClassifier())]
+    clf = StackingClassifier(
+        estimators=estimators, final_estimator=final_estimator)
+
+    html_output = estimator_html_repr(clf)
+
+    assert str(clf) in html_output
+    # If final_estimator's default changes from LogisticRegression
+    # this should be updated
+    if final_estimator is None:
+        assert "LogisticRegression(" in html_output
+    else:
+        assert final_estimator.__class__.__name__ in html_output
+
+
+@pytest.mark.parametrize("final_estimator", [None, LinearSVR()])
+def test_stacking_regressor(final_estimator):
+    reg = StackingRegressor(
+        estimators=[('svr', LinearSVR())], final_estimator=final_estimator)
+    html_output = estimator_html_repr(reg)
+
+    assert str(reg.estimators[0][0]) in html_output
+    assert "LinearSVR" in html_output
+    if final_estimator is None:
+        assert "RidgeCV" in html_output
+    else:
+        assert final_estimator.__class__.__name__ in html_output
+
+
+def test_birch_duck_typing_meta():
+    # Test duck typing meta-estimators with Birch
+    birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3))
+    html_output = estimator_html_repr(birch)
+
+    # inner estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert f"<pre>{str(birch.n_clusters)}" in html_output
+        assert "AgglomerativeClustering" in html_output
+
+    # outer estimator contains all changes
+    assert f"<pre>{str(birch)}" in html_output
+
+
+def test_ovo_classifier_duck_typing_meta():
+    # Test duck typing meta-estimators with OVO
+    ovo = OneVsOneClassifier(LinearSVC(penalty='l1'))
+    html_output = estimator_html_repr(ovo)
+
+    # inner estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert f"<pre>{str(ovo.estimator)}" in html_output
+        assert "LinearSVC" in html_output
+
+    # outer estimator
+    assert f"<pre>{str(ovo)}" in html_output
+
+
+def test_duck_typing_nested_estimator():
+    # Test duck typing meta-estimators with GP
+    kernel = RationalQuadratic(length_scale=1.0, alpha=0.1)
+    gp = GaussianProcessRegressor(kernel=kernel)
+    html_output = estimator_html_repr(gp)
+
+    assert f"<pre>{str(kernel)}" in html_output
+    assert f"<pre>{str(gp)}" in html_output
+
+
+@pytest.mark.parametrize('print_changed_only', [True, False])
+def test_one_estimator_print_change_only(print_changed_only):
+    pca = PCA(n_components=10)
+
+    with config_context(print_changed_only=print_changed_only):
+        pca_repr = str(pca)
+        html_output = estimator_html_repr(pca)
+        assert pca_repr in html_output
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 2abcbfa3c74e7..80151be8c7e20 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -100,12 +100,12 @@ def check_randomized_svd_low_rank(dtype):
     assert X.shape == (n_samples, n_features)
 
     # compute the singular values of X using the slow exact method
-    U, s, V = linalg.svd(X, full_matrices=False)
+    U, s, Vt = linalg.svd(X, full_matrices=False)
 
     # Convert the singular values to the specific dtype
     U = U.astype(dtype, copy=False)
     s = s.astype(dtype, copy=False)
-    V = V.astype(dtype, copy=False)
+    Vt = Vt.astype(dtype, copy=False)
 
     for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
         # compute the singular values of X using the fast approximate method
@@ -133,7 +133,7 @@ def check_randomized_svd_low_rank(dtype):
         assert_almost_equal(s[:k], sa, decimal=decimal)
 
         # check the singular vectors too (while not checking the sign)
-        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
+        assert_almost_equal(np.dot(U[:, :k], Vt[:k, :]), np.dot(Ua, Va),
                             decimal=decimal)
 
         # check the sparse matrix representation
@@ -306,28 +306,28 @@ def test_randomized_svd_power_iteration_normalizer():
     n_components = 50
 
     # Check that it diverges with many (non-normalized) power iterations
-    U, s, V = randomized_svd(X, n_components, n_iter=2,
-                             power_iteration_normalizer='none')
-    A = X - U.dot(np.diag(s).dot(V))
+    U, s, Vt = randomized_svd(X, n_components, n_iter=2,
+                              power_iteration_normalizer='none')
+    A = X - U.dot(np.diag(s).dot(Vt))
     error_2 = linalg.norm(A, ord='fro')
-    U, s, V = randomized_svd(X, n_components, n_iter=20,
-                             power_iteration_normalizer='none')
-    A = X - U.dot(np.diag(s).dot(V))
+    U, s, Vt = randomized_svd(X, n_components, n_iter=20,
+                              power_iteration_normalizer='none')
+    A = X - U.dot(np.diag(s).dot(Vt))
     error_20 = linalg.norm(A, ord='fro')
     assert np.abs(error_2 - error_20) > 100
 
     for normalizer in ['LU', 'QR', 'auto']:
-        U, s, V = randomized_svd(X, n_components, n_iter=2,
-                                 power_iteration_normalizer=normalizer,
-                                 random_state=0)
-        A = X - U.dot(np.diag(s).dot(V))
+        U, s, Vt = randomized_svd(X, n_components, n_iter=2,
+                                  power_iteration_normalizer=normalizer,
+                                  random_state=0)
+        A = X - U.dot(np.diag(s).dot(Vt))
         error_2 = linalg.norm(A, ord='fro')
 
         for i in [5, 10, 50]:
-            U, s, V = randomized_svd(X, n_components, n_iter=i,
-                                     power_iteration_normalizer=normalizer,
-                                     random_state=0)
-            A = X - U.dot(np.diag(s).dot(V))
+            U, s, Vt = randomized_svd(X, n_components, n_iter=i,
+                                      power_iteration_normalizer=normalizer,
+                                      random_state=0)
+            A = X - U.dot(np.diag(s).dot(Vt))
             error = linalg.norm(A, ord='fro')
             assert 15 > np.abs(error_2 - error)
@@ -355,20 +355,20 @@ def test_svd_flip():
     X = rs.randn(n_samples, n_features)
 
     # Check matrix reconstruction
-    U, S, V = linalg.svd(X, full_matrices=False)
-    U1, V1 = svd_flip(U, V, u_based_decision=False)
+    U, S, Vt = linalg.svd(X, full_matrices=False)
+    U1, V1 = svd_flip(U, Vt, u_based_decision=False)
     assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)
 
     # Check transposed matrix reconstruction
     XT = X.T
-    U, S, V = linalg.svd(XT, full_matrices=False)
-    U2, V2 = svd_flip(U, V, u_based_decision=True)
+    U, S, Vt = linalg.svd(XT, full_matrices=False)
+    U2, V2 = svd_flip(U, Vt, u_based_decision=True)
     assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)
 
     # Check that different flip methods are equivalent under reconstruction
-    U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
+    U_flip1, V_flip1 = svd_flip(U, Vt, u_based_decision=True)
     assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
-    U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
+    U_flip2, V_flip2 = svd_flip(U, Vt, u_based_decision=False)
    assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
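The `V` to `Vt` renames only touch test-local names, but they encode a real convention: `linalg.svd` and `randomized_svd` return the *transposed* right singular vectors. A short standalone sketch of the sign ambiguity that `svd_flip` resolves (array shapes and the RNG seed are arbitrary example values):

```python
# Illustration of the convention behind the V -> Vt rename and svd_flip.
import numpy as np
from scipy import linalg
from sklearn.utils.extmath import svd_flip

rng = np.random.RandomState(0)
X = rng.randn(10, 5)

# scipy returns the transposed right singular vectors, hence "Vt".
U, S, Vt = linalg.svd(X, full_matrices=False)

# (U, Vt) and (-U, -Vt) reconstruct X equally well; svd_flip picks one
# deterministically. With u_based_decision=True, the largest-magnitude
# entry of each column of U is made positive.
U_flipped, Vt_flipped = svd_flip(U, Vt, u_based_decision=True)
np.testing.assert_allclose(U_flipped @ np.diag(S) @ Vt_flipped, X)
assert (U_flipped[np.abs(U_flipped).argmax(axis=0), range(5)] > 0).all()
```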
diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py
index 146ccf781ae8a..866d872a9b65c 100644
--- a/sklearn/utils/tests/test_pprint.py
+++ b/sklearn/utils/tests/test_pprint.py
@@ -174,7 +174,7 @@ def __init__(self, missing_values=np.nan, strategy="mean",
         self.copy = copy
 
 
-def test_basic():
+def test_basic(print_changed_only_false):
     # Basic pprint test
     lr = LogisticRegression()
     expected = """
@@ -189,8 +189,7 @@ def test_basic():
 
 
 def test_changed_only():
-    # Make sure the changed_only param is correctly used
-    set_config(print_changed_only=True)
+    # Make sure the changed_only param is correctly used when True (default)
     lr = LogisticRegression(C=99)
     expected = """LogisticRegression(C=99)"""
     assert lr.__repr__() == expected
@@ -216,10 +215,8 @@ def test_changed_only():
     # make sure array parameters don't throw error (see #13583)
     repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))
 
-    set_config(print_changed_only=False)
-
 
-def test_pipeline():
+def test_pipeline(print_changed_only_false):
     # Render a pipeline object
     pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
     expected = """
@@ -240,7 +237,7 @@ def test_pipeline():
     assert pipeline.__repr__() == expected
 
 
-def test_deeply_nested():
+def test_deeply_nested(print_changed_only_false):
     # Render a deeply nested estimator
     rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
     expected = """
@@ -277,7 +274,7 @@ def test_deeply_nested():
     assert rfe.__repr__() == expected
 
 
-def test_gridsearch():
+def test_gridsearch(print_changed_only_false):
     # render a gridsearch
     param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                    'C': [1, 10, 100, 1000]},
@@ -302,7 +299,7 @@ def test_gridsearch():
     assert gs.__repr__() == expected
 
 
-def test_gridsearch_pipeline():
+def test_gridsearch_pipeline(print_changed_only_false):
     # render a pipeline inside a gridsearch
     pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
 
@@ -372,7 +369,7 @@ def test_gridsearch_pipeline():
     assert repr_ == expected
 
 
-def test_n_max_elements_to_show():
+def test_n_max_elements_to_show(print_changed_only_false):
     n_max_elements_to_show = 30
 
     pp = _EstimatorPrettyPrinter(
@@ -461,7 +458,7 @@ def test_n_max_elements_to_show():
     assert pp.pformat(gs) == expected
 
 
-def test_bruteforce_ellipsis():
+def test_bruteforce_ellipsis(print_changed_only_false):
     # Check that the bruteforce ellipsis (used when the number of non-blank
     # characters exceeds N_CHAR_MAX) renders correctly.
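These pprint tests now request a `print_changed_only_false` fixture instead of toggling global state with `set_config` inline, so the default is restored even when an assertion fails midway. The fixture's definition is not part of this hunk; a hypothetical sketch of what it plausibly looks like (the `config_context` form is an assumption, not the actual conftest code):

```python
# Hypothetical fixture sketch; the real definition lives outside this
# diff, presumably in a conftest.py near these tests.
import pytest

from sklearn import config_context


@pytest.fixture
def print_changed_only_false():
    # Run the test under the non-default repr mode and restore the
    # previous setting afterwards, even if the test fails.
    with config_context(print_changed_only=False):
        yield
```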
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 5f6df9685a25c..bcfd8fcd8d50e 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -63,7 +63,7 @@ def test_as_float_array():
     X = X.astype(np.int64)
     X2 = as_float_array(X, copy=True)
     # Checking that the array wasn't overwritten
-    assert as_float_array(X, False) is not X
+    assert as_float_array(X, copy=False) is not X
     assert X2.dtype == np.float64
     # Test int dtypes <= 32bit
     tested_dtypes = [np.bool,
@@ -349,6 +349,37 @@ def test_check_array():
         check_array(X, dtype="numeric")
 
 
+@pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"])
+@pytest.mark.parametrize("dtype, expected_dtype", [
+    ([np.float32, np.float64], np.float32),
+    (np.float64, np.float64),
+    ("numeric", np.float64),
+])
+def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype):
+    # Test pandas IntegerArray with pd.NA
+    pd = pytest.importorskip('pandas', minversion="1.0")
+
+    X_np = np.array([[1, 2, 3, np.nan, np.nan],
+                     [np.nan, np.nan, 8, 4, 6],
+                     [1, 2, 3, 4, 5]]).T
+
+    # Creates dataframe with IntegerArrays with pd.NA
+    X = pd.DataFrame(X_np, dtype=pd_dtype, columns=['a', 'b', 'c'])
+    # column c has no nans
+    X['c'] = X['c'].astype('float')
+    X_checked = check_array(X, force_all_finite='allow-nan', dtype=dtype)
+    assert_allclose(X_checked, X_np)
+    assert X_checked.dtype == expected_dtype
+
+    X_checked = check_array(X, force_all_finite=False, dtype=dtype)
+    assert_allclose(X_checked, X_np)
+    assert X_checked.dtype == expected_dtype
+
+    msg = "Input contains NaN, infinity"
+    with pytest.raises(ValueError, match=msg):
+        check_array(X, force_all_finite=True)
+
+
 def test_check_array_pandas_dtype_object_conversion():
     # test that data-frame like objects with dtype object
     # get converted
@@ -912,7 +943,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val):
     """Test that check_scalar returns no error/warning if valid inputs are
     provided"""
     with pytest.warns(None) as record:
-        check_scalar(x, "test_name", target_type, min_val, max_val)
+        check_scalar(x, "test_name", target_type=target_type,
+                     min_val=min_val, max_val=max_val)
     assert len(record) == 0
 
@@ -1097,6 +1129,15 @@ def f2(a=1, *, b=1, c=1, d=1):
                        match=r"Pass b=2 as keyword args"):
         f2(1, 2)
 
+    # The * is placed before a keyword-only argument without a default value
+    @_deprecate_positional_args
+    def f3(a, *, b, c=1, d=1):
+        pass
+
+    with pytest.warns(FutureWarning,
+                      match=r"Pass b=2 as keyword args"):
+        f3(1, 2)
+
 
 def test_deprecate_positional_args_warns_for_class():
 
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 4bb50c3deb5e7..7a6ef1e05fdde 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -36,6 +36,44 @@
 warnings.simplefilter('ignore', NonBLASDotWarning)
 
 
+def _deprecate_positional_args(f):
+    """Decorator to issue warnings for positional arguments.
+
+    Using the keyword-only argument syntax in pep 3102, any argument after
+    the * that is passed positionally triggers a FutureWarning.
+
+    Parameters
+    ----------
+    f : function
+        function to check arguments on
+    """
+    sig = signature(f)
+    kwonly_args = []
+    all_args = []
+
+    for name, param in sig.parameters.items():
+        if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
+            all_args.append(name)
+        elif param.kind == Parameter.KEYWORD_ONLY:
+            kwonly_args.append(name)
+
+    @wraps(f)
+    def inner_f(*args, **kwargs):
+        extra_args = len(args) - len(all_args)
+        if extra_args > 0:
+            # ignore first 'self' argument for instance methods
+            args_msg = ['{}={}'.format(name, arg)
+                        for name, arg in zip(kwonly_args[:extra_args],
+                                             args[-extra_args:])]
+            warnings.warn("Pass {} as keyword args. From version 0.25 "
+                          "passing these as positional arguments will "
+                          "result in an error".format(", ".join(args_msg)),
+                          FutureWarning)
+        kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
+        return f(**kwargs)
+    return inner_f
+
+
 def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
     """Like assert_all_finite, but only for ndarray."""
     # validation is also imported in extmath
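To see what the decorator does at call time, a self-contained sketch; `resize` is a made-up example function, and only `_deprecate_positional_args` comes from the diff above:

```python
# Demo of the keyword-only deprecation decorator added in this PR.
import warnings

from sklearn.utils.validation import _deprecate_positional_args


@_deprecate_positional_args
def resize(image, *, width=64, height=64):
    return (width, height)


# Keyword usage is silent and unchanged.
assert resize(None, width=32, height=32) == (32, 32)

# Positional usage still works, but emits the deprecation warning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert resize(None, 32, 32) == (32, 32)
assert any("Pass width=32, height=32 as keyword args" in str(w.message)
           for w in caught)
```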
@@ -67,7 +105,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
             raise ValueError("Input contains NaN")
 
 
-def assert_all_finite(X, allow_nan=False):
+@_deprecate_positional_args
+def assert_all_finite(X, *, allow_nan=False):
     """Throw a ValueError if X contains NaN or infinity.
 
     Parameters
@@ -79,7 +118,8 @@ def assert_all_finite(X, allow_nan=False):
     _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)
 
 
-def as_float_array(X, copy=True, force_all_finite=True):
+@_deprecate_positional_args
+def as_float_array(X, *, copy=True, force_all_finite=True):
     """Converts an array-like to an array of floats.
 
     The new dtype will be np.float32 or np.float64, depending on the original
@@ -95,17 +135,20 @@ def as_float_array(X, copy=True, force_all_finite=True):
         returned if X's dtype is not a floating point type.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in X. The possibilities
-        are:
+        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
+        possibilities are:
 
         - True: Force all values of X to be finite.
-        - False: accept both np.inf and np.nan in X.
-        - 'allow-nan': accept only np.nan values in X. Values cannot be
-          infinite.
+        - False: accepts np.inf, np.nan, pd.NA in X.
+        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values
+          cannot be infinite.
 
         .. versionadded:: 0.20
            ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     Returns
     -------
     XT : {array, sparse matrix}
@@ -113,9 +156,9 @@ def as_float_array(X, copy=True, force_all_finite=True):
     """
     if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and
                                     not sp.issparse(X)):
-        return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64,
-                           copy=copy, force_all_finite=force_all_finite,
-                           ensure_2d=False)
+        return check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+                           dtype=np.float64, copy=copy,
+                           force_all_finite=force_all_finite, ensure_2d=False)
     elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
         return X.copy() if copy else X
     elif X.dtype in [np.float32, np.float64]:  # is numpy array
@@ -277,17 +320,20 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
         be triggered by a conversion.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in X. The possibilities
-        are:
+        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
+        possibilities are:
 
         - True: Force all values of X to be finite.
-        - False: accept both np.inf and np.nan in X.
-        - 'allow-nan': accept only np.nan values in X. Values cannot be
-          infinite.
+        - False: accepts np.inf, np.nan, pd.NA in X.
+        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values
+          cannot be infinite.
 
         .. versionadded:: 0.20
            ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     Returns
     -------
     spmatrix_converted : scipy sparse matrix.
@@ -349,7 +395,8 @@ def _ensure_no_complex_data(array):
                          "{}\n".format(array))
 
 
-def check_array(array, accept_sparse=False, accept_large_sparse=True,
+@_deprecate_positional_args
+def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
                 dtype="numeric", order=None, copy=False, force_all_finite=True,
                 ensure_2d=True, allow_nd=False, ensure_min_samples=1,
                 ensure_min_features=1, estimator=None):
@@ -397,19 +444,20 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
         be triggered by a conversion.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in array. The
+        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
         possibilities are:
 
         - True: Force all values of array to be finite.
-        - False: accept both np.inf and np.nan in array.
-        - 'allow-nan': accept only np.nan values in array. Values cannot
-          be infinite.
-
-        For object dtyped data, only np.nan is checked and not np.inf.
+        - False: accepts np.inf, np.nan, pd.NA in array.
+        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
+          cannot be infinite.
 
         .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     ensure_2d : boolean (default=True)
         Whether to raise a value error if array is not 2D.
 
@@ -450,6 +498,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     # check if the object contains several dtypes (typically a pandas
     # DataFrame), and store them. If not, store None.
     dtypes_orig = None
+    has_pd_integer_array = False
     if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
         # throw warning if columns are sparse. If all columns are sparse, then
         # array.sparse exists and sparsity will be perserved (later).
@@ -466,7 +515,20 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
         # pandas boolean dtype __array__ interface coerces bools to objects
         for i, dtype_iter in enumerate(dtypes_orig):
             if dtype_iter.kind == 'b':
-                dtypes_orig[i] = np.object
+                dtypes_orig[i] = np.dtype(np.object)
+            elif dtype_iter.name.startswith(("Int", "UInt")):
+                # name looks like an Integer Extension Array, now check for
+                # the dtype
+                with suppress(ImportError):
+                    from pandas import (Int8Dtype, Int16Dtype,
+                                        Int32Dtype, Int64Dtype,
+                                        UInt8Dtype, UInt16Dtype,
+                                        UInt32Dtype, UInt64Dtype)
+                    if isinstance(dtype_iter, (Int8Dtype, Int16Dtype,
+                                               Int32Dtype, Int64Dtype,
+                                               UInt8Dtype, UInt16Dtype,
+                                               UInt32Dtype, UInt64Dtype)):
+                        has_pd_integer_array = True
 
         if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
             dtype_orig = np.result_type(*dtypes_orig)
@@ -487,6 +549,10 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
             # list of accepted types.
             dtype = dtype[0]
 
+    if has_pd_integer_array:
+        # convert pandas integer extension arrays to the requested dtype
+        array = array.astype(dtype)
+
     if force_all_finite not in (True, False, 'allow-nan'):
         raise ValueError('force_all_finite should be a bool or "allow-nan"'
                          '. Got {!r} instead'.format(force_all_finite))
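Taken together with the `force_all_finite` docstring updates, the effect is that nullable pandas integer columns are cast to float up front, so `pd.NA` reaches the finiteness check as `np.nan`. A small usage sketch mirroring the new test above (requires pandas >= 1.0; values and column names are arbitrary):

```python
# Usage sketch of pd.NA support in check_array after this PR.
import numpy as np
import pandas as pd

from sklearn.utils.validation import check_array

X = pd.DataFrame({'a': pd.array([1, 2, pd.NA], dtype='Int16')})

# The nullable IntegerArray is cast up front, so pd.NA arrives at the
# finiteness check as np.nan and 'allow-nan' lets it through.
X_checked = check_array(X, force_all_finite='allow-nan')
assert X_checked.dtype == np.float64
assert np.isnan(X_checked[2, 0])

# The default force_all_finite=True still rejects the missing value
# with "Input contains NaN, infinity ...".
try:
    check_array(X)
except ValueError as exc:
    print(exc)
```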
@@ -620,7 +686,8 @@ def _check_large_sparse(X, accept_large_sparse=False):
                              % indices_datatype)
 
 
-def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
+@_deprecate_positional_args
+def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True,
               dtype="numeric", order=None, copy=False, force_all_finite=True,
               ensure_2d=True, allow_nd=False, multi_output=False,
               ensure_min_samples=1, ensure_min_features=1, y_numeric=False,
@@ -670,18 +737,21 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
         be triggered by a conversion.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in X. This parameter
-        does not influence whether y can have np.inf or np.nan values.
+        Whether to raise an error on np.inf, np.nan, pd.NA in X. This
+        parameter does not influence whether y can have np.inf, np.nan,
+        pd.NA values.
         The possibilities are:
 
         - True: Force all values of X to be finite.
-        - False: accept both np.inf and np.nan in X.
-        - 'allow-nan': accept only np.nan values in X. Values cannot be
-          infinite.
+        - False: accepts np.inf, np.nan, pd.NA in X.
+        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values
+          cannot be infinite.
 
         .. versionadded:: 0.20
            ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     ensure_2d : boolean (default=True)
         Whether to raise a value error if X is not 2D.
 
@@ -732,8 +802,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
                     ensure_min_features=ensure_min_features,
                     estimator=estimator)
     if multi_output:
-        y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
-                        dtype=None)
+        y = check_array(y, accept_sparse='csr', force_all_finite=True,
+                        ensure_2d=False, dtype=None)
     else:
         y = column_or_1d(y, warn=True)
         _assert_all_finite(y)
@@ -745,7 +815,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
     return X, y
 
 
-def column_or_1d(y, warn=False):
+@_deprecate_positional_args
+def column_or_1d(y, *, warn=False):
     """ Ravel column or 1d numpy array, else raises an error
 
     Parameters
@@ -825,7 +896,8 @@ def has_fit_parameter(estimator, parameter):
     return parameter in signature(estimator.fit).parameters
 
 
-def check_symmetric(array, tol=1E-10, raise_warning=True,
+@_deprecate_positional_args
+def check_symmetric(array, *, tol=1E-10, raise_warning=True,
                     raise_exception=False):
     """Make sure that array is 2D, square and symmetric.
 
@@ -881,7 +953,8 @@ def check_symmetric(array, tol=1E-10, raise_warning=True,
     return array
 
 
-def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all):
+@_deprecate_positional_args
+def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
     """Perform is_fitted validation for estimator.
 
     Checks if the estimator is fitted by verifying the presence of
@@ -974,7 +1047,7 @@ def check_non_negative(X, whom):
         raise ValueError("Negative values in data passed to %s" % whom)
 
 
-def check_scalar(x, name, target_type, min_val=None, max_val=None):
+def check_scalar(x, name, target_type, *, min_val=None, max_val=None):
     """Validate scalar parameters type and value.
 
     Parameters
@@ -1268,44 +1341,6 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
                          "matrix and an array")
 
 
-def _deprecate_positional_args(f):
-    """Decorator for methods that issues warnings for positional arguments
-
-    Using the keyword-only argument syntax in pep 3102, arguments after the
-    * will issue a warning when passed as a positional argument.
-
-    Parameters
-    ----------
-    f : function
-        function to check arguments on
-    """
-    sig = signature(f)
-    kwonly_args = []
-    all_args = []
-
-    for name, param in sig.parameters.items():
-        if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
-            all_args.append(name)
-        elif param.kind == Parameter.KEYWORD_ONLY:
-            kwonly_args.append(name)
-
-    @wraps(f)
-    def inner_f(*args, **kwargs):
-        extra_args = len(args) - len(all_args)
-        if extra_args > 0:
-            # ignore first 'self' argument for instance methods
-            args_msg = ['{}={}'.format(name, arg)
-                        for name, arg in zip(kwonly_args[:extra_args],
-                                             args[-extra_args:])]
-            warnings.warn("Pass {} as keyword args. From version 0.25 "
-                          "passing these as positional arguments will "
-                          "result in an error".format(", ".join(args_msg)),
-                          FutureWarning)
-        kwargs.update({k: arg for k, arg in zip(all_args, args)})
-        return f(**kwargs)
-    return inner_f
-
-
 def _check_fit_params(X, fit_params, indices=None):
     """Check and validate the parameters passed during `fit`.