Fix some docstrings for class_weight=auto/balanced. Totally how I wanted to spend my afternoon....
amueller committed Mar 10, 2015
1 parent 9c3055e commit e99705c
Showing 15 changed files with 147 additions and 120 deletions.
2 changes: 1 addition & 1 deletion doc/modules/svm.rst
@@ -405,7 +405,7 @@ Tips on Practical Use
     approximates the fraction of training errors and support vectors.

   * In :class:`SVC`, if data for classification are unbalanced (e.g. many
-    positive and few negative), set ``class_weight='auto'`` and/or try
+    positive and few negative), set ``class_weight='balanced'`` and/or try
     different penalty parameters ``C``.

   * The underlying :class:`LinearSVC` implementation uses a random
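For context on the tip above: with the 'balanced' preset (introduced around scikit-learn 0.17 as the replacement for 'auto'), the penalty for each class is scaled by ``n_samples / (n_classes * np.bincount(y))``. A minimal sketch on a made-up imbalanced dataset:

    import numpy as np
    from sklearn.svm import SVC

    # Hypothetical imbalanced problem: 90 negatives, 10 positives.
    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)
    y = np.array([0] * 90 + [1] * 10)

    # 'balanced' scales the penalty C for each class by
    # n_samples / (n_classes * np.bincount(y)), so mistakes on the
    # minority class cost proportionally more.
    clf = SVC(class_weight='balanced', C=1.0)
    clf.fit(X, y)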
30 changes: 16 additions & 14 deletions sklearn/ensemble/forest.py
@@ -89,7 +89,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
         curr_sample_weight *= sample_counts

         if class_weight == 'subsample':
-            curr_sample_weight *= compute_sample_weight('auto', y, indices)
+            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
@@ -408,17 +408,17 @@ def _validate_y_class_weight(self, y):
             self.n_classes_.append(classes_k.shape[0])

         if self.class_weight is not None:
-            valid_presets = ('auto', 'subsample')
+            valid_presets = ('auto', 'balanced', 'subsample')
             if isinstance(self.class_weight, six.string_types):
                 if self.class_weight not in valid_presets:
                     raise ValueError('Valid presets for class_weight include '
-                                     '"auto" and "subsample". Given "%s".'
+                                     '"balanced" and "subsample". Given "%s".'
                                      % self.class_weight)
                 if self.warm_start:
-                    warn('class_weight presets "auto" or "subsample" are '
+                    warn('class_weight presets "balanced" or "subsample" are '
                          'not recommended for warm_start if the fitted data '
                          'differs from the full dataset. In order to use '
-                         '"auto" weights, use compute_class_weight("auto", '
+                         '"auto" weights, use compute_class_weight("balanced", '
                          'classes, y). In place of y you can use a large '
                          'enough sample of the full training set target to '
                          'properly estimate the class frequency '
@@ -427,7 +427,7 @@ def _validate_y_class_weight(self, y):

             if self.class_weight != 'subsample' or not self.bootstrap:
                 if self.class_weight == 'subsample':
-                    class_weight = 'auto'
+                    class_weight = 'balanced'
                 else:
                     class_weight = self.class_weight
                 expanded_class_weight = compute_sample_weight(class_weight,
@@ -760,17 +760,18 @@ class RandomForestClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.

-    class_weight : dict, list of dicts, "auto", "subsample" or None, optional
+    class_weight : dict, list of dicts, "balanced", "subsample" or None, optional
         Weights associated with classes in the form ``{class_label: weight}``.
         If not given, all classes are supposed to have weight one. For
         multi-output problems, a list of dicts can be provided in the same
         order as the columns of y.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

-        The "subsample" mode is the same as "auto" except that weights are
+        The "subsample" mode is the same as "balanced" except that weights are
         computed based on the bootstrap sample for every tree grown.

         For multi-output, the weights of each column of y will be multiplied.
@@ -1090,17 +1091,18 @@ class ExtraTreesClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.

-    class_weight : dict, list of dicts, "auto", "subsample" or None, optional
+    class_weight : dict, list of dicts, "balanced", "subsample" or None, optional
         Weights associated with classes in the form ``{class_label: weight}``.
         If not given, all classes are supposed to have weight one. For
         multi-output problems, a list of dicts can be provided in the same
         order as the columns of y.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

-        The "subsample" mode is the same as "auto" except that weights are
+        The "subsample" mode is the same as "balanced" except that weights are
         computed based on the bootstrap sample for every tree grown.

         For multi-output, the weights of each column of y will be multiplied.
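As a sanity check on the formula these docstrings now cite, a small sketch of what compute_sample_weight returns for the 'balanced' preset, and of how 'subsample' restricts the class counts to a tree's bootstrap sample (values worked out by hand; the bootstrap indices are made up):

    import numpy as np
    from sklearn.utils.class_weight import compute_sample_weight

    y = np.array([0, 0, 0, 1])

    # 'balanced': n_samples / (n_classes * np.bincount(y))
    # class 0 -> 4 / (2 * 3) ~= 0.667, class 1 -> 4 / (2 * 1) = 2.0
    print(compute_sample_weight('balanced', y))
    # approx. [0.667, 0.667, 0.667, 2.0]

    # 'subsample' applies the same formula per tree but counts classes on
    # the bootstrap indices only; this particular bootstrap happens to be
    # balanced (two of each class), so every weight comes out 1.0.
    indices = np.array([0, 1, 3, 3])
    print(compute_sample_weight('balanced', y, indices=indices))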
22 changes: 11 additions & 11 deletions sklearn/ensemble/tests/test_forest.py
@@ -329,7 +329,7 @@ def test_parallel():
         yield check_parallel, name, iris.data, iris.target

     for name in FOREST_REGRESSORS:
-            yield check_parallel, name, boston.data, boston.target
+        yield check_parallel, name, boston.data, boston.target


 def check_pickle(name, X, y):
@@ -352,7 +352,7 @@ def test_pickle():
         yield check_pickle, name, iris.data[::2], iris.target[::2]

     for name in FOREST_REGRESSORS:
-            yield check_pickle, name, boston.data[::2], boston.target[::2]
+        yield check_pickle, name, boston.data[::2], boston.target[::2]


 def check_multioutput(name):
@@ -749,10 +749,10 @@ def check_class_weights(name):
     """Check class_weights resemble sample_weights behavior."""
     ForestClassifier = FOREST_CLASSIFIERS[name]

-    # Iris is balanced, so no effect expected for using 'auto' weights
+    # Iris is balanced, so no effect expected for using 'balanced' weights
     clf1 = ForestClassifier(random_state=0)
     clf1.fit(iris.data, iris.target)
-    clf2 = ForestClassifier(class_weight='auto', random_state=0)
+    clf2 = ForestClassifier(class_weight='balanced', random_state=0)
     clf2.fit(iris.data, iris.target)
     assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)

@@ -765,8 +765,8 @@ def check_class_weights(name):
                             random_state=0)
     clf3.fit(iris.data, iris_multi)
     assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_)
-    # Check against multi-output "auto" which should also have no effect
-    clf4 = ForestClassifier(class_weight='auto', random_state=0)
+    # Check against multi-output "balanced" which should also have no effect
+    clf4 = ForestClassifier(class_weight='balanced', random_state=0)
     clf4.fit(iris.data, iris_multi)
     assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_)

@@ -782,7 +782,7 @@ def check_class_weights(name):

     # Check that sample_weight and class_weight are multiplicative
     clf1 = ForestClassifier(random_state=0)
-    clf1.fit(iris.data, iris.target, sample_weight**2)
+    clf1.fit(iris.data, iris.target, sample_weight ** 2)
     clf2 = ForestClassifier(class_weight=class_weight, random_state=0)
     clf2.fit(iris.data, iris.target, sample_weight)
     assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
@@ -793,11 +793,11 @@ def test_class_weights():
         yield check_class_weights, name


-def check_class_weight_auto_and_bootstrap_multi_output(name):
+def check_class_weight_balanced_and_bootstrap_multi_output(name):
     """Test class_weight works for multi-output"""
     ForestClassifier = FOREST_CLASSIFIERS[name]
     _y = np.vstack((y, np.array(y) * 2)).T
-    clf = ForestClassifier(class_weight='auto', random_state=0)
+    clf = ForestClassifier(class_weight='balanced', random_state=0)
     clf.fit(X, _y)
     clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}],
                            random_state=0)
@@ -806,9 +806,9 @@ def check_class_weight_auto_and_bootstrap_multi_output(name):
     clf.fit(X, _y)


-def test_class_weight_auto_and_bootstrap_multi_output():
+def test_class_weight_balanced_and_bootstrap_multi_output():
     for name in FOREST_CLASSIFIERS:
-        yield check_class_weight_auto_and_bootstrap_multi_output, name
+        yield check_class_weight_balanced_and_bootstrap_multi_output, name


 def check_class_weight_errors(name):
60 changes: 35 additions & 25 deletions sklearn/linear_model/logistic.py
@@ -456,11 +456,13 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         is called repeatedly with the same data, as y is modified
         along the path.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     dual : bool
         Dual or primal formulation. Dual formulation is only implemented for
@@ -729,11 +731,13 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
     tol : float
         Tolerance for stopping criteria.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     verbose : int
         For the liblinear and lbfgs solvers set verbose to any positive
@@ -897,11 +901,13 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
         To lessen the effect of regularization on synthetic feature weight
         (and therefore on the intercept) intercept_scaling has to be increased.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     max_iter : int
         Useful only for the newton-cg and lbfgs solvers. Maximum number of
@@ -1147,11 +1153,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
         Specifies if a constant (a.k.a. bias or intercept) should be
         added the decision function.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     cv : integer or cross-validation generator
         The default cross-validation generator used is Stratified K-Folds.
@@ -1182,11 +1190,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
     max_iter : int, optional
         Maximum number of iterations of the optimization algorithm.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     n_jobs : int, optional
         Number of CPU cores used during the cross-validation loop. If given
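The formula spelled out in these docstrings can be verified directly: the 'balanced' preset should fit the same model as an explicit weight dict built from ``n_samples / (n_classes * np.bincount(y))``. A sketch on synthetic data (the dataset and sizes are illustrative only):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # Synthetic, imbalanced binary problem (roughly 90/10).
    X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                               random_state=0)

    clf_preset = LogisticRegression(class_weight='balanced').fit(X, y)

    # The documented formula, written out as an explicit dict.
    counts = np.bincount(y)
    manual = {k: len(y) / (len(counts) * n) for k, n in enumerate(counts)}
    clf_manual = LogisticRegression(class_weight=manual).fit(X, y)

    print(np.allclose(clf_preset.coef_, clf_manual.coef_))  # expect True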
7 changes: 4 additions & 3 deletions sklearn/linear_model/perceptron.py
@@ -44,14 +44,15 @@ class Perceptron(BaseSGDClassifier, _LearntSelectorMixin):
     eta0 : double
         Constant by which the updates are multiplied. Defaults to 1.

-    class_weight : dict, {class_label: weight} or "auto" or None, optional
+    class_weight : dict, {class_label: weight} or "balanced" or None, optional
         Preset for the class_weight fit parameter.

         Weights associated with classes. If not given, all classes
         are supposed to have weight one.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     warm_start : bool, optional
         When set to True, reuse the solution of the previous call to fit as
24 changes: 15 additions & 9 deletions sklearn/linear_model/ridge.py
@@ -21,7 +21,7 @@
 from ..base import RegressorMixin
 from ..utils.extmath import safe_sparse_dot
 from ..utils import check_X_y
-from ..utils import compute_sample_weight, compute_class_weight
+from ..utils import compute_sample_weight
 from ..utils import column_or_1d
 from ..preprocessing import LabelBinarizer
 from ..grid_search import GridSearchCV
@@ -521,10 +521,13 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
         ``(2*C)^-1`` in other linear models such as LogisticRegression or
         LinearSVC.

-    class_weight : dict, optional
-        Weights associated with classes in the form
-        ``{class_label : weight}``. If not given, all classes are
-        supposed to have weight one.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -1008,10 +1011,13 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV):
         If None, Generalized Cross-Validation (efficient Leave-One-Out)
         will be used.

-    class_weight : dict, optional
-        Weights associated with classes in the form
-        ``{class_label : weight}``. If not given, all classes are
-        supposed to have weight one.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     Attributes
     ----------
15 changes: 8 additions & 7 deletions sklearn/linear_model/stochastic_gradient.py
@@ -511,15 +511,15 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
        -------
        self : returns an instance of self.
        """
-        if self.class_weight == 'auto':
-            raise ValueError("class_weight 'auto' is not supported for "
+        if self.class_weight in ['balanced', 'auto']:
+            raise ValueError("class_weight '{0}' is not supported for "
                              "partial_fit. In order to use 'auto' weights, "
-                             "use compute_class_weight('auto', classes, y). "
+                             "use compute_class_weight('{0}', classes, y). "
                              "In place of y you can us a large enough sample "
                              "of the full training set target to properly "
                              "estimate the class frequency distributions. "
                              "Pass the resulting weights as the class_weight "
-                             "parameter.")
+                             "parameter.".format(self.class_weight))
        return self._partial_fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss,
                                 learning_rate=self.learning_rate, n_iter=1,
                                 classes=classes, sample_weight=sample_weight,
@@ -664,14 +664,15 @@ class SGDClassifier(BaseSGDClassifier, _LearntSelectorMixin):
     power_t : double
         The exponent for inverse scaling learning rate [default 0.5].

-    class_weight : dict, {class_label: weight} or "auto" or None, optional
+    class_weight : dict, {class_label: weight} or "balanced" or None, optional
         Preset for the class_weight fit parameter.

         Weights associated with classes. If not given, all classes
         are supposed to have weight one.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     warm_start : bool, optional
         When set to True, reuse the solution of the previous call to fit as
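The error raised in partial_fit above also documents the workaround: since partial_fit sees the data incrementally, the preset cannot estimate class frequencies, so the weights have to be precomputed from a representative sample of targets and passed explicitly. A sketch of that workflow (the target sample and mini-batch below are hypothetical):

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.utils.class_weight import compute_class_weight

    classes = np.array([0, 1])
    # A large enough sample of the training targets (made up here).
    y_sample = np.array([0] * 900 + [1] * 100)

    weights = compute_class_weight('balanced', classes=classes, y=y_sample)
    clf = SGDClassifier(class_weight=dict(zip(classes, weights)))

    # partial_fit accepts explicit weights where the preset would raise.
    rng = np.random.RandomState(0)
    X_batch = rng.randn(50, 5)
    y_batch = rng.randint(0, 2, size=50)
    clf.partial_fit(X_batch, y_batch, classes=classes)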
2 changes: 1 addition & 1 deletion sklearn/linear_model/tests/test_logistic.py
@@ -477,7 +477,7 @@ def test_logistic_regressioncv_class_weights():
     clf_lib.fit(X, y_)
     assert_array_equal(clf_lib.classes_, [0, 1])

-    # Test for class_weight=auto
+    # Test for class_weight=balanced
     X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                                random_state=0)
     clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
