[MRG+1] Use more natural class_weight="auto" heuristic #4347

Merged (4 commits) on Jun 1, 2015
2 changes: 1 addition & 1 deletion doc/modules/svm.rst
@@ -405,7 +405,7 @@ Tips on Practical Use
approximates the fraction of training errors and support vectors.

* In :class:`SVC`, if data for classification are unbalanced (e.g. many
positive and few negative), set ``class_weight='auto'`` and/or try
positive and few negative), set ``class_weight='balanced'`` and/or try
different penalty parameters ``C``.

* The underlying :class:`LinearSVC` implementation uses a random
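A minimal sketch of the tip above (not part of this diff; the toy data is made up and it assumes a scikit-learn version where the 'balanced' preset exists, i.e. 0.17 or later):

import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
# 90 samples of the majority class, 10 of the minority class
X = np.r_[rng.randn(90, 2), rng.randn(10, 2) + [2, 2]]
y = np.r_[np.zeros(90), np.ones(10)]

# 'balanced' upweights the rare class inversely to its frequency
clf = SVC(kernel='rbf', class_weight='balanced')
clf.fit(X, y)
print(clf.predict([[2, 2]]))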
8 changes: 7 additions & 1 deletion doc/whats_new.rst
@@ -56,6 +56,11 @@ Enhancements
:class:`linear_model.LogisticRegression`, by avoiding loss computation.
By `Mathieu Blondel`_ and `Tom Dupre la Tour`_.

- The ``class_weight="auto"`` heuristic in classifiers supporting
``class_weight`` was deprecated and replaced by the ``class_weight="balanced"``
option, which has a simpler formula and interpretation.
By Hanna Wallach and `Andreas Müller`_.

Bug fixes
.........

@@ -339,6 +344,7 @@ Enhancements
- :class:`svm.SVC` fitted on sparse input now implements ``decision_function``.
By `Rob Zinkov`_ and `Andreas Müller`_.


Documentation improvements
..........................

@@ -462,7 +468,7 @@ Bug fixes
in GMM. By `Alexis Mignon`_.

- Fixed an error in the computation of conditional probabilities in
:class:`naive_bayes.BernoulliNB`. By `Hanna Wallach`_.
:class:`naive_bayes.BernoulliNB`. By Hanna Wallach.

- Make the method ``radius_neighbors`` of
:class:`neighbors.NearestNeighbors` return the samples lying on the
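A small sketch of the "balanced" heuristic referenced in the entry above (not part of this diff); it just checks the documented formula against compute_class_weight on a made-up label vector:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0, 0, 0, 0, 1, 1])      # 4 vs. 2 samples
classes = np.unique(y)

weights = compute_class_weight('balanced', classes=classes, y=y)
manual = len(y) / (len(classes) * np.bincount(y))
print(weights, manual)                 # both should be [0.75, 1.5]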
2 changes: 1 addition & 1 deletion examples/applications/face_recognition.py
@@ -105,7 +105,7 @@
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
50 changes: 34 additions & 16 deletions sklearn/ensemble/forest.py
@@ -41,7 +41,9 @@ class calls the ``fit`` method of each sub-estimator on random samples

from __future__ import division

import warnings
from warnings import warn

from abc import ABCMeta, abstractmethod

import numpy as np
@@ -89,7 +91,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
curr_sample_weight *= sample_counts

if class_weight == 'subsample':
curr_sample_weight *= compute_sample_weight('auto', y, indices)
with warnings.catch_warnings():
warnings.simplefilter('ignore', DeprecationWarning)
curr_sample_weight *= compute_sample_weight('auto', y, indices)
[Review comment, Member] Shouldn't there be a deprecation warning around here too?

elif class_weight == 'balanced_subsample':
curr_sample_weight *= compute_sample_weight('balanced', y, indices)

tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
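As a rough sketch (not part of the diff) of what the 'balanced_subsample' branch above does: the weights are derived from the bootstrap indices of each tree rather than from the full target vector. The arrays below are made up for illustration.

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y = np.array([0, 0, 0, 0, 1, 1])
indices = np.array([0, 1, 1, 2, 4, 5])     # hypothetical bootstrap sample

# 'balanced' weights are computed on y[indices] only,
# then broadcast back to per-sample weights over all of y
w = compute_sample_weight('balanced', y, indices=indices)
print(w)   # minority-class samples receive larger weights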

@@ -414,30 +420,40 @@ def _validate_y_class_weight(self, y):
self.n_classes_.append(classes_k.shape[0])

if self.class_weight is not None:
valid_presets = ('auto', 'subsample')
valid_presets = ('auto', 'balanced', 'balanced_subsample', 'subsample')
if isinstance(self.class_weight, six.string_types):
if self.class_weight not in valid_presets:
raise ValueError('Valid presets for class_weight include '
'"auto" and "subsample". Given "%s".'
'"balanced" and "balanced_subsample". Given "%s".'
% self.class_weight)
if self.class_weight == "subsample":
warn("class_weight='subsample' is deprecated and will be removed in 0.18."
" It was replaced by class_weight='balanced_subsample' "
"using the balanced strategy.", DeprecationWarning)
if self.warm_start:
warn('class_weight presets "auto" or "subsample" are '
warn('class_weight presets "balanced" or "balanced_subsample" are '
'not recommended for warm_start if the fitted data '
'differs from the full dataset. In order to use '
'"auto" weights, use compute_class_weight("auto", '
'"balanced" weights, use compute_class_weight("balanced", '
'classes, y). In place of y you can use a large '
'enough sample of the full training set target to '
'properly estimate the class frequency '
'distributions. Pass the resulting weights as the '
'class_weight parameter.')

if self.class_weight != 'subsample' or not self.bootstrap:
if (self.class_weight not in ['subsample', 'balanced_subsample'] or
not self.bootstrap):
if self.class_weight == 'subsample':
class_weight = 'auto'
elif self.class_weight == "balanced_subsample":
class_weight = "balanced"
else:
class_weight = self.class_weight
expanded_class_weight = compute_sample_weight(class_weight,
y_original)
with warnings.catch_warnings():
if class_weight == "auto":
warnings.simplefilter('ignore', DeprecationWarning)
expanded_class_weight = compute_sample_weight(class_weight,
y_original)

return y, expanded_class_weight
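The warm_start warning above recommends computing fixed weights once and passing them explicitly. A rough sketch of that recipe (not part of the diff; the sample target array is invented):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier

y_sample = np.array([0] * 900 + [1] * 100)   # large sample of the training targets
classes = np.unique(y_sample)
weights = compute_class_weight('balanced', classes=classes, y=y_sample)

# pass the precomputed weights instead of the "balanced" preset when warm-starting
clf = RandomForestClassifier(warm_start=True,
                             class_weight=dict(zip(classes, weights)))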

@@ -758,17 +774,18 @@ class RandomForestClassifier(ForestClassifier):
and add more estimators to the ensemble, otherwise, just fit a whole
new forest.

class_weight : dict, list of dicts, "auto", "subsample" or None, optional
class_weight : dict, list of dicts, "balanced", "balanced_subsample" or None, optional

Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.

The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

The "subsample" mode is the same as "auto" except that weights are
The "balanced_subsample" mode is the same as "balanced" except that weights are
computed based on the bootstrap sample for every tree grown.

For multi-output, the weights of each column of y will be multiplied.
@@ -1100,17 +1117,18 @@ class ExtraTreesClassifier(ForestClassifier):
and add more estimators to the ensemble, otherwise, just fit a whole
new forest.

class_weight : dict, list of dicts, "auto", "subsample" or None, optional
class_weight : dict, list of dicts, "balanced", "balanced_subsample" or None, optional

Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.

The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

The "subsample" mode is the same as "auto" except that weights are
The "balanced_subsample" mode is the same as "balanced" except that weights are
computed based on the bootstrap sample for every tree grown.

For multi-output, the weights of each column of y will be multiplied.
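A minimal usage sketch of the two presets documented above (not part of the diff; make_classification just provides an imbalanced toy problem):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# roughly 95% of the samples belong to one class
X, y = make_classification(n_samples=500, weights=[0.95, 0.05], random_state=0)

# 'balanced' reweights once from y; 'balanced_subsample' reweights
# from the bootstrap sample of every individual tree
clf = RandomForestClassifier(class_weight='balanced_subsample', random_state=0)
clf.fit(X, y)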
29 changes: 17 additions & 12 deletions sklearn/ensemble/tests/test_forest.py
@@ -24,6 +24,7 @@
from sklearn.utils.testing import assert_greater_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import ignore_warnings

from sklearn import datasets
@@ -329,7 +330,7 @@ def test_parallel():
yield check_parallel, name, iris.data, iris.target

for name in FOREST_REGRESSORS:
yield check_parallel, name, boston.data, boston.target
yield check_parallel, name, boston.data, boston.target


def check_pickle(name, X, y):
@@ -352,7 +353,7 @@ def test_pickle():
yield check_pickle, name, iris.data[::2], iris.target[::2]

for name in FOREST_REGRESSORS:
yield check_pickle, name, boston.data[::2], boston.target[::2]
yield check_pickle, name, boston.data[::2], boston.target[::2]


def check_multioutput(name):
@@ -749,10 +750,10 @@ def check_class_weights(name):
# Check class_weights resemble sample_weights behavior.
ForestClassifier = FOREST_CLASSIFIERS[name]

# Iris is balanced, so no effect expected for using 'auto' weights
# Iris is balanced, so no effect expected for using 'balanced' weights
clf1 = ForestClassifier(random_state=0)
clf1.fit(iris.data, iris.target)
clf2 = ForestClassifier(class_weight='auto', random_state=0)
clf2 = ForestClassifier(class_weight='balanced', random_state=0)
clf2.fit(iris.data, iris.target)
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)

@@ -765,8 +766,8 @@ def check_class_weights(name):
random_state=0)
clf3.fit(iris.data, iris_multi)
assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_)
# Check against multi-output "auto" which should also have no effect
clf4 = ForestClassifier(class_weight='auto', random_state=0)
# Check against multi-output "balanced" which should also have no effect
clf4 = ForestClassifier(class_weight='balanced', random_state=0)
clf4.fit(iris.data, iris_multi)
assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_)

@@ -782,7 +783,7 @@ def check_class_weights(name):

# Check that sample_weight and class_weight are multiplicative
clf1 = ForestClassifier(random_state=0)
clf1.fit(iris.data, iris.target, sample_weight**2)
clf1.fit(iris.data, iris.target, sample_weight ** 2)
clf2 = ForestClassifier(class_weight=class_weight, random_state=0)
clf2.fit(iris.data, iris.target, sample_weight)
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
@@ -793,22 +794,26 @@ def test_class_weights():
yield check_class_weights, name


def check_class_weight_auto_and_bootstrap_multi_output(name):
# Test class_weight works for multi-output
def check_class_weight_balanced_and_bootstrap_multi_output(name):
# Test class_weight works for multi-output
ForestClassifier = FOREST_CLASSIFIERS[name]
_y = np.vstack((y, np.array(y) * 2)).T
clf = ForestClassifier(class_weight='auto', random_state=0)
clf = ForestClassifier(class_weight='balanced', random_state=0)
clf.fit(X, _y)
clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}],
random_state=0)
clf.fit(X, _y)
# smoke test for subsample and balanced subsample
clf = ForestClassifier(class_weight='balanced_subsample', random_state=0)
clf.fit(X, _y)
clf = ForestClassifier(class_weight='subsample', random_state=0)
#assert_warns_message(DeprecationWarning, "balanced_subsample", clf.fit, X, _y)
clf.fit(X, _y)


def test_class_weight_auto_and_bootstrap_multi_output():
def test_class_weight_balanced_and_bootstrap_multi_output():
for name in FOREST_CLASSIFIERS:
yield check_class_weight_auto_and_bootstrap_multi_output, name
yield check_class_weight_balanced_and_bootstrap_multi_output, name


def check_class_weight_errors(name):
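The multiplicativity check in check_class_weights above relies on class weights simply being folded into the per-sample weights; a standalone sketch of the same property (not part of the diff):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
sw = np.random.RandomState(0).uniform(0.5, 1.5, size=len(iris.target))
cw = {0: 1., 1: 2., 2: 1.}

clf_a = RandomForestClassifier(class_weight=cw, random_state=0)
clf_a.fit(iris.data, iris.target, sample_weight=sw)

# equivalent: multiply the class weight into the sample weights by hand
sw_times_cw = sw * np.array([cw[c] for c in iris.target])
clf_b = RandomForestClassifier(random_state=0)
clf_b.fit(iris.data, iris.target, sample_weight=sw_times_cw)

# feature importances should match (up to floating point noise)
print(np.allclose(clf_a.feature_importances_, clf_b.feature_importances_))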
64 changes: 37 additions & 27 deletions sklearn/linear_model/logistic.py
@@ -473,11 +473,13 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
is called repeatedly with the same data, as y is modified
along the path.

class_weight : {dict, 'auto'}, optional
Over-/undersamples the samples of each class according to the given
weights. If not given, all classes are supposed to have weight one.
The 'auto' mode selects weights inversely proportional to class
frequencies in the training set.
class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

dual : bool
Dual or primal formulation. Dual formulation is only implemented for
@@ -734,11 +736,13 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
tol : float
Tolerance for stopping criteria.

class_weight : {dict, 'auto'}, optional
Over-/undersamples the samples of each class according to the given
weights. If not given, all classes are supposed to have weight one.
The 'auto' mode selects weights inversely proportional to class
frequencies in the training set.
class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

verbose : int
For the liblinear and lbfgs solvers set verbose to any positive
@@ -903,11 +907,13 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
To lessen the effect of regularization on synthetic feature weight
(and therefore on the intercept) intercept_scaling has to be increased.

class_weight : {dict, 'auto'}, optional
Over-/undersamples the samples of each class according to the given
weights. If not given, all classes are supposed to have weight one.
The 'auto' mode selects weights inversely proportional to class
frequencies in the training set.
class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

max_iter : int
Useful only for the newton-cg and lbfgs solvers. Maximum number of
@@ -1150,11 +1156,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
Specifies if a constant (a.k.a. bias or intercept) should be
added the decision function.

class_weight : {dict, 'auto'}, optional
Over-/undersamples the samples of each class according to the given
weights. If not given, all classes are supposed to have weight one.
The 'auto' mode selects weights inversely proportional to class
frequencies in the training set.
class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

cv : integer or cross-validation generator
The default cross-validation generator used is Stratified K-Folds.
@@ -1185,11 +1193,13 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
max_iter : int, optional
Maximum number of iterations of the optimization algorithm.

class_weight : {dict, 'auto'}, optional
Over-/undersamples the samples of each class according to the given
weights. If not given, all classes are supposed to have weight one.
The 'auto' mode selects weights inversely proportional to class
frequencies in the training set.
class_weight : dict or 'balanced', optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one.

The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

n_jobs : int, optional
Number of CPU cores used during the cross-validation loop. If given
@@ -1363,9 +1373,9 @@ def fit(self, X, y):
iter_labels = [None]

if self.class_weight and not(isinstance(self.class_weight, dict) or
self.class_weight == 'auto'):
self.class_weight in ['balanced', 'auto']):
raise ValueError("class_weight provided should be a "
"dict or 'auto'")
"dict or 'balanced'")

path_func = delayed(_log_reg_scoring_path)

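The docstring wording added above is shared across the logistic-regression estimators; a small sketch of what it means in practice (not part of the diff; the toy data is invented):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

rng = np.random.RandomState(0)
X = rng.randn(120, 3)
y = np.array([0] * 100 + [1] * 20)           # imbalanced targets

clf = LogisticRegression(class_weight='balanced').fit(X, y)

# the preset is equivalent to an explicit dict built from the documented formula
classes = np.unique(y)
w = compute_class_weight('balanced', classes=classes, y=y)
clf_explicit = LogisticRegression(class_weight=dict(zip(classes, w))).fit(X, y)

print(np.allclose(clf.coef_, clf_explicit.coef_))   # expected: True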
7 changes: 4 additions & 3 deletions sklearn/linear_model/perceptron.py
@@ -44,14 +44,15 @@ class Perceptron(BaseSGDClassifier, _LearntSelectorMixin):
eta0 : double
Constant by which the updates are multiplied. Defaults to 1.

class_weight : dict, {class_label: weight} or "auto" or None, optional
class_weight : dict, {class_label: weight} or "balanced" or None, optional
Preset for the class_weight fit parameter.

Weights associated with classes. If not given, all classes
are supposed to have weight one.

The "auto" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data
as ``n_samples / (n_classes * np.bincount(y))``

warm_start : bool, optional
When set to True, reuse the solution of the previous call to fit as