Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT Ensure disjoint interval constraints #25797

Merged
merged 4 commits into from
Mar 15, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 7 additions & 7 deletions sklearn/cluster/_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ class OPTICS(ClusterMixin, BaseEstimator):
_parameter_constraints: dict = {
"min_samples": [
Interval(Integral, 2, None, closed="left"),
Interval(Real, 0, 1, closed="both"),
Interval("real_not_int", 0, 1, closed="both"),
Copy link
Member

@jjerphan jjerphan Mar 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naive question: do you think it would be possible to implement RealNotInt which would extend Number or even Real and perform the check?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's doable and I did it in this commit 718523f to show what it would look like.

I don't have a strong opinion about using this instead of 'real_not_int'. I slightly tend toward using RealNotInt. Do you have an opinion @glemaitre @thomasjpfan ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm okay with RealNotInt. It looks a little better when placed next to Integral.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good with me as well.

],
"max_eps": [Interval(Real, 0, None, closed="both")],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
Expand All @@ -245,7 +245,7 @@ class OPTICS(ClusterMixin, BaseEstimator):
"predecessor_correction": ["boolean"],
"min_cluster_size": [
Interval(Integral, 2, None, closed="left"),
Interval(Real, 0, 1, closed="right"),
Interval("real_not_int", 0, 1, closed="right"),
None,
],
"algorithm": [StrOptions({"auto", "brute", "ball_tree", "kd_tree"})],
Expand Down Expand Up @@ -431,7 +431,7 @@ def _compute_core_distances_(X, neighbors, min_samples, working_memory):
"X": [np.ndarray, "sparse matrix"],
"min_samples": [
Interval(Integral, 2, None, closed="left"),
Interval(Real, 0, 1, closed="both"),
Interval("real_not_int", 0, 1, closed="both"),
],
"max_eps": [Interval(Real, 0, None, closed="both")],
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
Expand Down Expand Up @@ -723,12 +723,12 @@ def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):
"predecessor": [np.ndarray],
"ordering": [np.ndarray],
"min_samples": [
Interval(Integral, 1, None, closed="neither"),
Interval(Real, 0, 1, closed="both"),
Interval(Integral, 2, None, closed="left"),
Interval("real_not_int", 0, 1, closed="both"),
],
"min_cluster_size": [
Interval(Integral, 1, None, closed="neither"),
Interval(Real, 0, 1, closed="both"),
Interval(Integral, 2, None, closed="left"),
Interval("real_not_int", 0, 1, closed="both"),
None,
],
"xi": [Interval(Real, 0, 1, closed="both")],
Expand Down
2 changes: 1 addition & 1 deletion sklearn/cluster/tests/test_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def test_minimum_number_of_sample_check():

# Compute OPTICS
X = [[1, 1]]
clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1)
clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1.0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically this breaks backward compatibility, but the docs says int > 1, so I'm okay with being strict here.

min_cluster_size : int > 1 or float between 0 and 1, default=None


# Run the fit
with pytest.raises(ValueError, match=msg):
Expand Down
2 changes: 1 addition & 1 deletion sklearn/decomposition/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ class PCA(_BasePCA):
_parameter_constraints: dict = {
"n_components": [
Interval(Integral, 0, None, closed="left"),
Interval(Real, 0, 1, closed="neither"),
Interval("real_not_int", 0, 1, closed="neither"),
StrOptions({"mle"}),
None,
],
Expand Down
6 changes: 3 additions & 3 deletions sklearn/ensemble/_bagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numbers
import numpy as np
from abc import ABCMeta, abstractmethod
from numbers import Integral, Real
from numbers import Integral
from warnings import warn
from functools import partial

Expand Down Expand Up @@ -248,11 +248,11 @@ class BaseBagging(BaseEnsemble, metaclass=ABCMeta):
"n_estimators": [Interval(Integral, 1, None, closed="left")],
"max_samples": [
Interval(Integral, 1, None, closed="left"),
Interval(Real, 0, 1, closed="right"),
Interval("real_not_int", 0, 1, closed="right"),
],
"max_features": [
Interval(Integral, 1, None, closed="left"),
Interval(Real, 0, 1, closed="right"),
Interval("real_not_int", 0, 1, closed="right"),
],
"bootstrap": ["boolean"],
"bootstrap_features": ["boolean"],
Expand Down
2 changes: 1 addition & 1 deletion sklearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta):
"warm_start": ["boolean"],
"max_samples": [
None,
Interval(Real, 0.0, 1.0, closed="right"),
Interval("real_not_int", 0.0, 1.0, closed="right"),
Interval(Integral, 1, None, closed="left"),
],
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class BaseHistGradientBoosting(BaseEstimator, ABC):
],
"n_iter_no_change": [Interval(Integral, 1, None, closed="left")],
"validation_fraction": [
Interval(Real, 0, 1, closed="neither"),
Interval("real_not_int", 0, 1, closed="neither"),
Interval(Integral, 1, None, closed="left"),
None,
],
Expand Down
2 changes: 1 addition & 1 deletion sklearn/ensemble/_iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ class IsolationForest(OutlierMixin, BaseBagging):
"max_samples": [
StrOptions({"auto"}),
Interval(Integral, 1, None, closed="left"),
Interval(Real, 0, 1, closed="right"),
Interval("real_not_int", 0, 1, closed="right"),
],
"contamination": [
StrOptions({"auto"}),
Expand Down
6 changes: 3 additions & 3 deletions sklearn/feature_extraction/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,8 @@ def _extract_patches(arr, patch_shape=8, extraction_step=1):
"image": [np.ndarray],
"patch_size": [tuple, list],
"max_patches": [
Interval(Real, left=0, right=1, closed="neither"),
Interval(Integral, left=1, right=None, closed="left"),
Interval("real_not_int", 0, 1, closed="neither"),
Interval(Integral, 1, None, closed="left"),
None,
],
"random_state": ["random_state"],
Expand Down Expand Up @@ -542,7 +542,7 @@ class PatchExtractor(TransformerMixin, BaseEstimator):
"patch_size": [tuple, None],
"max_patches": [
None,
Interval(Real, 0, 1, closed="neither"),
Interval("real_not_int", 0, 1, closed="neither"),
Interval(Integral, 1, None, closed="left"),
],
"random_state": ["random_state"],
Expand Down
6 changes: 3 additions & 3 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from collections import defaultdict
from collections.abc import Mapping
from functools import partial
from numbers import Integral, Real
from numbers import Integral
from operator import itemgetter
import re
import unicodedata
Expand Down Expand Up @@ -1148,11 +1148,11 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
"ngram_range": [tuple],
"analyzer": [StrOptions({"word", "char", "char_wb"}), callable],
"max_df": [
Interval(Real, 0, 1, closed="both"),
Interval("real_not_int", 0, 1, closed="both"),
Interval(Integral, 1, None, closed="left"),
],
"min_df": [
Interval(Real, 0, 1, closed="both"),
Interval("real_not_int", 0, 1, closed="both"),
Interval(Integral, 1, None, closed="left"),
],
"max_features": [Interval(Integral, 1, None, closed="left"), None],
Expand Down
6 changes: 3 additions & 3 deletions sklearn/feature_selection/_rfe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"""Recursive feature elimination for feature ranking"""

import numpy as np
from numbers import Integral, Real
from numbers import Integral
from joblib import effective_n_jobs


Expand Down Expand Up @@ -187,12 +187,12 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
"estimator": [HasMethods(["fit"])],
"n_features_to_select": [
None,
Interval(Real, 0, 1, closed="right"),
Interval("real_not_int", 0, 1, closed="right"),
Interval(Integral, 0, None, closed="neither"),
],
"step": [
Interval(Integral, 0, None, closed="neither"),
Interval(Real, 0, 1, closed="neither"),
Interval("real_not_int", 0, 1, closed="neither"),
],
"verbose": ["verbose"],
"importance_getter": [str, callable],
Expand Down
2 changes: 1 addition & 1 deletion sklearn/feature_selection/_sequential.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator
"estimator": [HasMethods(["fit"])],
"n_features_to_select": [
StrOptions({"auto", "warn"}, deprecated={"warn"}),
Interval(Real, 0, 1, closed="right"),
Interval("real_not_int", 0, 1, closed="right"),
Interval(Integral, 0, None, closed="neither"),
Hidden(None),
],
Expand Down
2 changes: 1 addition & 1 deletion sklearn/linear_model/_ransac.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ class RANSACRegressor(
"estimator": [HasMethods(["fit", "score", "predict"]), None],
"min_samples": [
Interval(Integral, 1, None, closed="left"),
Interval(Real, 0, 1, closed="both"),
Interval("real_not_int", 0, 1, closed="both"),
None,
],
"residual_threshold": [Interval(Real, 0, None, closed="left"), None],
Expand Down
4 changes: 2 additions & 2 deletions sklearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -2464,12 +2464,12 @@ def check_cv(cv=5, y=None, *, classifier=False):
@validate_params(
{
"test_size": [
Interval(numbers.Real, 0, 1, closed="neither"),
Interval("real_not_int", 0, 1, closed="neither"),
Interval(numbers.Integral, 1, None, closed="left"),
None,
],
"train_size": [
Interval(numbers.Real, 0, 1, closed="neither"),
Interval("real_not_int", 0, 1, closed="neither"),
Interval(numbers.Integral, 1, None, closed="left"),
None,
],
Expand Down
4 changes: 2 additions & 2 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# License: BSD 3 clause

import numbers
from numbers import Integral, Real
from numbers import Integral
import warnings

import numpy as np
Expand Down Expand Up @@ -493,7 +493,7 @@ class OneHotEncoder(_BaseEncoder):
"max_categories": [Interval(Integral, 1, None, closed="left"), None],
"min_frequency": [
Interval(Integral, 1, None, closed="left"),
Interval(Real, 0, 1, closed="neither"),
Interval("real_not_int", 0, 1, closed="neither"),
None,
],
"sparse": [Hidden(StrOptions({"deprecated"})), "boolean"], # deprecated
Expand Down
18 changes: 17 additions & 1 deletion sklearn/tests/test_public_functions.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from importlib import import_module
from inspect import signature
from numbers import Integral, Real

import pytest

from sklearn.utils._param_validation import generate_invalid_param_val
from sklearn.utils._param_validation import generate_valid_param
from sklearn.utils._param_validation import make_constraint
from sklearn.utils._param_validation import InvalidParameterError
from sklearn.utils._param_validation import Interval


def _get_func_info(func_module):
Expand Down Expand Up @@ -70,6 +72,20 @@ def _check_function_param_validation(
# This parameter is not validated
continue

# Mixing an interval of reals and an interval of integers must be avoided.
if any(
isinstance(constraint, Interval) and constraint.type == Integral
for constraint in constraints
) and any(
isinstance(constraint, Interval) and constraint.type == Real
for constraint in constraints
):
raise ValueError(
f"The constraint for parameter {param_name} of {func_name} can't have a"
" mix of intervals of Integral and Real types. Use the type"
" 'real_not_int' instead of Real."
)

match = (
rf"The '{param_name}' parameter of {func_name} must be .* Got .* instead."
)
Expand All @@ -85,7 +101,7 @@ def _check_function_param_validation(

for constraint in constraints:
try:
bad_value = generate_invalid_param_val(constraint, constraints)
bad_value = generate_invalid_param_val(constraint)
except NotImplementedError:
continue

Expand Down