Fix some docstrings for class_weight=auto/balanced. Totally how I wanted to spend my afternoon....
amueller committed Mar 10, 2015
1 parent 9c3055e commit e99705c
Showing 15 changed files with 147 additions and 120 deletions.
2 changes: 1 addition & 1 deletion doc/modules/svm.rst
@@ -405,7 +405,7 @@ Tips on Practical Use
     approximates the fraction of training errors and support vectors.

   * In :class:`SVC`, if data for classification are unbalanced (e.g. many
-    positive and few negative), set ``class_weight='auto'`` and/or try
+    positive and few negative), set ``class_weight='balanced'`` and/or try
     different penalty parameters ``C``.

   * The underlying :class:`LinearSVC` implementation uses a random
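For context on the tip above: with the 'balanced' preset (introduced around scikit-learn 0.17 as the replacement for 'auto'), the penalty for each class is scaled by ``n_samples / (n_classes * np.bincount(y))``. A minimal sketch on a made-up imbalanced dataset:

    import numpy as np
    from sklearn.svm import SVC

    # Hypothetical imbalanced problem: 90 negatives, 10 positives.
    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)
    y = np.array([0] * 90 + [1] * 10)

    # 'balanced' scales the penalty C for each class by
    # n_samples / (n_classes * np.bincount(y)), so mistakes on the
    # minority class cost proportionally more.
    clf = SVC(class_weight='balanced', C=1.0)
    clf.fit(X, y)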
30 changes: 16 additions & 14 deletions sklearn/ensemble/forest.py
@@ -89,7 +89,7 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
         curr_sample_weight *= sample_counts

         if class_weight == 'subsample':
-            curr_sample_weight *= compute_sample_weight('auto', y, indices)
+            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
@@ -408,17 +408,17 @@ def _validate_y_class_weight(self, y):
             self.n_classes_.append(classes_k.shape[0])

         if self.class_weight is not None:
-            valid_presets = ('auto', 'subsample')
+            valid_presets = ('auto', 'balanced', 'subsample')
             if isinstance(self.class_weight, six.string_types):
                 if self.class_weight not in valid_presets:
                     raise ValueError('Valid presets for class_weight include '
-                                     '"auto" and "subsample". Given "%s".'
+                                     '"balanced" and "subsample". Given "%s".'
                                      % self.class_weight)
                 if self.warm_start:
-                    warn('class_weight presets "auto" or "subsample" are '
+                    warn('class_weight presets "balanced" or "subsample" are '
                          'not recommended for warm_start if the fitted data '
                          'differs from the full dataset. In order to use '
-                         '"auto" weights, use compute_class_weight("auto", '
+                         '"auto" weights, use compute_class_weight("balanced", '
                          'classes, y). In place of y you can use a large '
                          'enough sample of the full training set target to '
                          'properly estimate the class frequency '
@@ -427,7 +427,7 @@ def _validate_y_class_weight(self, y):

             if self.class_weight != 'subsample' or not self.bootstrap:
                 if self.class_weight == 'subsample':
-                    class_weight = 'auto'
+                    class_weight = 'balanced'
                 else:
                     class_weight = self.class_weight
                 expanded_class_weight = compute_sample_weight(class_weight,
@@ -760,17 +760,18 @@ class RandomForestClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.

-    class_weight : dict, list of dicts, "auto", "subsample" or None, optional
+    class_weight : dict, list of dicts, "balanced", "subsample" or None, optional
         Weights associated with classes in the form ``{class_label: weight}``.
         If not given, all classes are supposed to have weight one. For
         multi-output problems, a list of dicts can be provided in the same
         order as the columns of y.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

-        The "subsample" mode is the same as "auto" except that weights are
+        The "subsample" mode is the same as "balanced" except that weights are
         computed based on the bootstrap sample for every tree grown.

         For multi-output, the weights of each column of y will be multiplied.
@@ -1090,17 +1091,18 @@ class ExtraTreesClassifier(ForestClassifier):
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest.

-    class_weight : dict, list of dicts, "auto", "subsample" or None, optional
+    class_weight : dict, list of dicts, "balanced", "subsample" or None, optional
         Weights associated with classes in the form ``{class_label: weight}``.
         If not given, all classes are supposed to have weight one. For
         multi-output problems, a list of dicts can be provided in the same
         order as the columns of y.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies in the input data.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

-        The "subsample" mode is the same as "auto" except that weights are
+        The "subsample" mode is the same as "balanced" except that weights are
         computed based on the bootstrap sample for every tree grown.

         For multi-output, the weights of each column of y will be multiplied.
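As a sanity check on the formula these docstrings now cite, a small sketch of what compute_sample_weight returns for the 'balanced' preset, and of how 'subsample' restricts the class counts to a tree's bootstrap sample (values worked out by hand; the bootstrap indices are made up):

    import numpy as np
    from sklearn.utils.class_weight import compute_sample_weight

    y = np.array([0, 0, 0, 1])

    # 'balanced': n_samples / (n_classes * np.bincount(y))
    # class 0 -> 4 / (2 * 3) ~= 0.667, class 1 -> 4 / (2 * 1) = 2.0
    print(compute_sample_weight('balanced', y))
    # approx. [0.667, 0.667, 0.667, 2.0]

    # 'subsample' applies the same formula per tree but counts classes on
    # the bootstrap indices only; this particular bootstrap happens to be
    # balanced (two of each class), so every weight comes out 1.0.
    indices = np.array([0, 1, 3, 3])
    print(compute_sample_weight('balanced', y, indices=indices))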
22 changes: 11 additions & 11 deletions sklearn/ensemble/tests/test_forest.py
@@ -329,7 +329,7 @@ def test_parallel():
         yield check_parallel, name, iris.data, iris.target

     for name in FOREST_REGRESSORS:
-            yield check_parallel, name, boston.data, boston.target
+        yield check_parallel, name, boston.data, boston.target


 def check_pickle(name, X, y):
@@ -352,7 +352,7 @@ def test_pickle():
         yield check_pickle, name, iris.data[::2], iris.target[::2]

     for name in FOREST_REGRESSORS:
-            yield check_pickle, name, boston.data[::2], boston.target[::2]
+        yield check_pickle, name, boston.data[::2], boston.target[::2]


 def check_multioutput(name):
@@ -749,10 +749,10 @@ def check_class_weights(name):
     """Check class_weights resemble sample_weights behavior."""
     ForestClassifier = FOREST_CLASSIFIERS[name]

-    # Iris is balanced, so no effect expected for using 'auto' weights
+    # Iris is balanced, so no effect expected for using 'balanced' weights
     clf1 = ForestClassifier(random_state=0)
     clf1.fit(iris.data, iris.target)
-    clf2 = ForestClassifier(class_weight='auto', random_state=0)
+    clf2 = ForestClassifier(class_weight='balanced', random_state=0)
     clf2.fit(iris.data, iris.target)
     assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)

@@ -765,8 +765,8 @@ def check_class_weights(name):
                             random_state=0)
     clf3.fit(iris.data, iris_multi)
     assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_)
-    # Check against multi-output "auto" which should also have no effect
-    clf4 = ForestClassifier(class_weight='auto', random_state=0)
+    # Check against multi-output "balanced" which should also have no effect
+    clf4 = ForestClassifier(class_weight='balanced', random_state=0)
     clf4.fit(iris.data, iris_multi)
     assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_)

@@ -782,7 +782,7 @@ def check_class_weights(name):

     # Check that sample_weight and class_weight are multiplicative
     clf1 = ForestClassifier(random_state=0)
-    clf1.fit(iris.data, iris.target, sample_weight**2)
+    clf1.fit(iris.data, iris.target, sample_weight ** 2)
     clf2 = ForestClassifier(class_weight=class_weight, random_state=0)
     clf2.fit(iris.data, iris.target, sample_weight)
     assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
@@ -793,11 +793,11 @@ def test_class_weights():
         yield check_class_weights, name


-def check_class_weight_auto_and_bootstrap_multi_output(name):
+def check_class_weight_balanced_and_bootstrap_multi_output(name):
     """Test class_weight works for multi-output"""
     ForestClassifier = FOREST_CLASSIFIERS[name]
     _y = np.vstack((y, np.array(y) * 2)).T
-    clf = ForestClassifier(class_weight='auto', random_state=0)
+    clf = ForestClassifier(class_weight='balanced', random_state=0)
     clf.fit(X, _y)
     clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}],
                            random_state=0)
@@ -806,9 +806,9 @@ def check_class_weight_auto_and_bootstrap_multi_output(name):
     clf.fit(X, _y)


-def test_class_weight_auto_and_bootstrap_multi_output():
+def test_class_weight_balanced_and_bootstrap_multi_output():
     for name in FOREST_CLASSIFIERS:
-        yield check_class_weight_auto_and_bootstrap_multi_output, name
+        yield check_class_weight_balanced_and_bootstrap_multi_output, name


 def check_class_weight_errors(name):
60 changes: 35 additions & 25 deletions sklearn/linear_model/logistic.py
@@ -456,11 +456,13 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
         is called repeatedly with the same data, as y is modified
         along the path.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     dual : bool
         Dual or primal formulation. Dual formulation is only implemented for
@@ -729,11 +731,13 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
     tol : float
         Tolerance for stopping criteria.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     verbose : int
         For the liblinear and lbfgs solvers set verbose to any positive
@@ -897,11 +901,13 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
         To lessen the effect of regularization on synthetic feature weight
         (and therefore on the intercept) intercept_scaling has to be increased.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     max_iter : int
         Useful only for the newton-cg and lbfgs solvers. Maximum number of
@@ -1147,11 +1153,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
         Specifies if a constant (a.k.a. bias or intercept) should be
         added the decision function.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     cv : integer or cross-validation generator
         The default cross-validation generator used is Stratified K-Folds.
@@ -1182,11 +1190,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
     max_iter : int, optional
         Maximum number of iterations of the optimization algorithm.

-    class_weight : {dict, 'auto'}, optional
-        Over-/undersamples the samples of each class according to the given
-        weights. If not given, all classes are supposed to have weight one.
-        The 'auto' mode selects weights inversely proportional to class
-        frequencies in the training set.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     n_jobs : int, optional
         Number of CPU cores used during the cross-validation loop. If given
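The formula spelled out in these docstrings can be verified directly: the 'balanced' preset should fit the same model as an explicit weight dict built from ``n_samples / (n_classes * np.bincount(y))``. A sketch on synthetic data (the dataset and sizes are illustrative only):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    # Synthetic, imbalanced binary problem (roughly 90/10).
    X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                               random_state=0)

    clf_preset = LogisticRegression(class_weight='balanced').fit(X, y)

    # The documented formula, written out as an explicit dict.
    counts = np.bincount(y)
    manual = {k: len(y) / (len(counts) * n) for k, n in enumerate(counts)}
    clf_manual = LogisticRegression(class_weight=manual).fit(X, y)

    print(np.allclose(clf_preset.coef_, clf_manual.coef_))  # expect True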
7 changes: 4 additions & 3 deletions sklearn/linear_model/perceptron.py
@@ -44,14 +44,15 @@ class Perceptron(BaseSGDClassifier, _LearntSelectorMixin):
     eta0 : double
         Constant by which the updates are multiplied. Defaults to 1.

-    class_weight : dict, {class_label: weight} or "auto" or None, optional
+    class_weight : dict, {class_label: weight} or "balanced" or None, optional
         Preset for the class_weight fit parameter.

         Weights associated with classes. If not given, all classes
         are supposed to have weight one.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     warm_start : bool, optional
         When set to True, reuse the solution of the previous call to fit as
24 changes: 15 additions & 9 deletions sklearn/linear_model/ridge.py
@@ -21,7 +21,7 @@
 from ..base import RegressorMixin
 from ..utils.extmath import safe_sparse_dot
 from ..utils import check_X_y
-from ..utils import compute_sample_weight, compute_class_weight
+from ..utils import compute_sample_weight
 from ..utils import column_or_1d
 from ..preprocessing import LabelBinarizer
 from ..grid_search import GridSearchCV
@@ -521,10 +521,13 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
         ``(2*C)^-1`` in other linear models such as LogisticRegression or
         LinearSVC.

-    class_weight : dict, optional
-        Weights associated with classes in the form
-        ``{class_label : weight}``. If not given, all classes are
-        supposed to have weight one.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -1008,10 +1011,13 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV):
         If None, Generalized Cross-Validation (efficient Leave-One-Out)
         will be used.

-    class_weight : dict, optional
-        Weights associated with classes in the form
-        ``{class_label : weight}``. If not given, all classes are
-        supposed to have weight one.
+    class_weight : dict or 'balanced', optional
+        Weights associated with classes in the form ``{class_label: weight}``.
+        If not given, all classes are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     Attributes
     ----------
15 changes: 8 additions & 7 deletions sklearn/linear_model/stochastic_gradient.py
@@ -511,15 +511,15 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
        -------
        self : returns an instance of self.
        """
-        if self.class_weight == 'auto':
-            raise ValueError("class_weight 'auto' is not supported for "
+        if self.class_weight in ['balanced', 'auto']:
+            raise ValueError("class_weight '{0}' is not supported for "
                              "partial_fit. In order to use 'auto' weights, "
-                             "use compute_class_weight('auto', classes, y). "
+                             "use compute_class_weight('{0}', classes, y). "
                              "In place of y you can us a large enough sample "
                              "of the full training set target to properly "
                              "estimate the class frequency distributions. "
                              "Pass the resulting weights as the class_weight "
-                             "parameter.")
+                             "parameter.".format(self.class_weight))
        return self._partial_fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss,
                                 learning_rate=self.learning_rate, n_iter=1,
                                 classes=classes, sample_weight=sample_weight,
@@ -664,14 +664,15 @@ class SGDClassifier(BaseSGDClassifier, _LearntSelectorMixin):
     power_t : double
         The exponent for inverse scaling learning rate [default 0.5].

-    class_weight : dict, {class_label: weight} or "auto" or None, optional
+    class_weight : dict, {class_label: weight} or "balanced" or None, optional
         Preset for the class_weight fit parameter.

         Weights associated with classes. If not given, all classes
         are supposed to have weight one.

-        The "auto" mode uses the values of y to automatically adjust
-        weights inversely proportional to class frequencies.
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``

     warm_start : bool, optional
         When set to True, reuse the solution of the previous call to fit as
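The error raised in partial_fit above also documents the workaround: since partial_fit sees the data incrementally, the preset cannot estimate class frequencies, so the weights have to be precomputed from a representative sample of targets and passed explicitly. A sketch of that workflow (the target sample and mini-batch below are hypothetical):

    import numpy as np
    from sklearn.linear_model import SGDClassifier
    from sklearn.utils.class_weight import compute_class_weight

    classes = np.array([0, 1])
    # A large enough sample of the training targets (made up here).
    y_sample = np.array([0] * 900 + [1] * 100)

    weights = compute_class_weight('balanced', classes=classes, y=y_sample)
    clf = SGDClassifier(class_weight=dict(zip(classes, weights)))

    # partial_fit accepts explicit weights where the preset would raise.
    rng = np.random.RandomState(0)
    X_batch = rng.randn(50, 5)
    y_batch = rng.randint(0, 2, size=50)
    clf.partial_fit(X_batch, y_batch, classes=classes)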
2 changes: 1 addition & 1 deletion sklearn/linear_model/tests/test_logistic.py
@@ -477,7 +477,7 @@ def test_logistic_regressioncv_class_weights():
     clf_lib.fit(X, y_)
     assert_array_equal(clf_lib.classes_, [0, 1])

-    # Test for class_weight=auto
+    # Test for class_weight=balanced
     X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                                random_state=0)
     clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
