From 833695a2a63d90aab13dfb28f360457934341e89 Mon Sep 17 00:00:00 2001 From: Amit Aides Date: Mon, 21 Mar 2011 23:07:23 +0200 Subject: [PATCH 01/31] Added Multinomial Naive Bayes classifier --- doc/modules/classes.rst | 1 + scikits/learn/naive_bayes.py | 137 ++++++++++++++++++++++++ scikits/learn/tests/test_naive_bayes.py | 18 ++++ 3 files changed, 156 insertions(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index f0acc6e304cd6..30399a99d78d6 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -98,6 +98,7 @@ Naive Bayes :template: class.rst naive_bayes.GNB + naive_bayes.MNNB Nearest Neighbors diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 8f3a4a53e96d5..82f81414f35d3 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -2,6 +2,8 @@ """ # Author: Vincent Michel +# Amit Aides +# # License: BSD Style. import numpy as np @@ -108,3 +110,138 @@ def predict_log_proba(self, X): aB[sup] = np.exp(logaB[sup]) log_proba -= np.log(np.sum(aB, axis=1))[:, np.newaxis] + B return log_proba + + +class MNNB( BaseEstimator, ClassifierMixin ): + r""" + Multinomial Naive Bayes (MNNB) + + The Multinomial Naive Bayes classifier is suitable for text classification. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vector, where n_samples in the number of samples and + n_features is the number of features. + y : array, shape = [n_samples] + Target vector relative to X + + Parameters + ---------- + alpha_i: float, optional (default=1.0) + smoothing constant. + + alpha_ratio: float, optional (default=1.0) + smoothing ratio. + + Methods + ------- + fit(X, y) : self + Fit the model + + predict(X) : array + Predict using the model. + + predict_proba(X) : array + Predict the probability of each class using the model. + + Examples + -------- + >>> import numpy as np + >>> X = np.random.randint( 5, size=(6, 100) ) + >>> Y = np.array([1, 2, 3, 4, 5, 6]) + >>> from scikits.learn.naive_bayes import MNNB + >>> clf = MNNB() + >>> clf.fit(X, Y) + MNNB(alpha_ratio=1.0, alpha_i=1.0) + >>> print clf.predict(X[2]) + 3 + + See also + -------- + + """ + + def __init__( self, alpha_i=1.0, alpha_ratio=1.0 ): + + self.alpha_i = alpha_i + self.alpha_ratio = alpha_ratio + + def fit(self, X, y ): + """Fit the Multinomial distribution""" + + # + # N_c is the count of all words in all documents of class c. + # N_c_i is the a count of word i in all documents of class c. + # theta_c is the prior empirical probability of a document of class c. + # theta_c_i is the (smoothened) empirical likelihood of word i + # given a document of class c. 
+ # + N_c_i_temp = [] + theta_c = [] + self.unique_y = np.unique(y) + + for yi in self.unique_y: + N_c_i_temp.append( np.sum( X[y==yi, :], 0 ) ) + theta_c.append( np.float(np.sum(y==yi)) / y.size ) + + N_c_i = np.array( N_c_i_temp ) + N_c = np.sum( N_c_i, axis=1 ) + + # + # Smoothing + # + alpha_i = self.alpha_i + alpha = self.alpha_ratio * X.shape[1] + + # + # Estimate the parameters of the distribution + # + self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) + self.theta_c = np.array( theta_c ) + + return self + + def predict(self, X): + """Predict the classification of samples X""" + + y_pred = self.unique_y[np.argmax( self.predict_log_proba( X ), axis=0 )] + return y_pred + + def _joint_log_likelihood(self, X): + """Calculate the posterior log probability of the samples X""" + + joint_log_likelihood = [] + for i in range( self.unique_y.size ): + jointi = np.log( self.theta_c[i] ) + n_ij = np.dot( np.log( self.theta_c_i[i] ), X.T ) + joint_log_likelihood.append( jointi + n_ij ) + + joint_log_likelihood = np.array(joint_log_likelihood) + + return joint_log_likelihood + + def predict_proba(self, X): + """ + Predict the posterior probability of samples X + + Notes + ----- + The values calculated by the joint log likelihood are very small and + might cause numerical underflow when used as the power of an exponent. + This function should be used with caution. It is preferable to use the + predict_log_proba() function instead. + """ + + joint_log_likelihood = self._joint_log_likelihood( X ) + proba = np.exp( joint_log_likelihood ) + proba = proba / np.sum( proba, 1 )[:, np.newaxis] + return proba + + def predict_log_proba(self, X): + """Predict the posterior log probability of samples X""" + + joint_log_likelihood = self._joint_log_likelihood( X ) + return joint_log_likelihood + + diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 0f4f74437e430..2a2b9dfafe41c 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -23,3 +23,21 @@ def test_gnb(): y_pred_log_proba = clf.predict_log_proba(X) assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) +# Data is 6 random points in an 100 dimensional space classified to +# three classes. +X2 = np.random.randint( 5, size=(6, 100) ) +y2 = np.array( [1, 1, 2, 2, 3, 3] ) + +def test_mnnb(): + """ + Multinomial Naive Bayes classification. + + This checks that MNNB implements fit and predict and returns + correct values for a simple toy dataset. + """ + + clf = naive_bayes.MNNB() + y_pred = clf.fit(X2, y2).predict(X2) + + assert_array_equal(y_pred, y2) + From 14a66c031923d05e778cec9bbbb93020680986b0 Mon Sep 17 00:00:00 2001 From: Amit Aides Date: Mon, 21 Mar 2011 23:08:47 +0200 Subject: [PATCH 02/31] Fix to the documentation of the Multinomial Naive Bayes. --- scikits/learn/naive_bayes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 82f81414f35d3..a699cb8f01d44 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -113,7 +113,7 @@ def predict_log_proba(self, X): class MNNB( BaseEstimator, ClassifierMixin ): - r""" + """ Multinomial Naive Bayes (MNNB) The Multinomial Naive Bayes classifier is suitable for text classification. @@ -145,6 +145,9 @@ class MNNB( BaseEstimator, ClassifierMixin ): predict_proba(X) : array Predict the probability of each class using the model. 
+ predict_log_proba(X) : array + Predict the log probability of each class using the model. + Examples -------- >>> import numpy as np @@ -189,7 +192,7 @@ def fit(self, X, y ): N_c = np.sum( N_c_i, axis=1 ) # - # Smoothing + # Smoothing coefficients # alpha_i = self.alpha_i alpha = self.alpha_ratio * X.shape[1] From ceaf255190e6cb67b4ddb08fcd6e5171b591e750 Mon Sep 17 00:00:00 2001 From: Amit Aides Date: Mon, 28 Mar 2011 16:04:46 +0200 Subject: [PATCH 03/31] Pep 8 compliance and cleanup for the multinomial naive bayes --- scikits/learn/naive_bayes.py | 117 ++++++++++++++---------- scikits/learn/tests/test_naive_bayes.py | 10 ++ 2 files changed, 78 insertions(+), 49 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index a699cb8f01d44..d9bb572eb8b80 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -9,6 +9,7 @@ from .base import BaseEstimator, ClassifierMixin +eps = np.finfo(np.float).eps class GNB(BaseEstimator, ClassifierMixin): """ @@ -69,9 +70,9 @@ def fit(self, X, y): proba_y = [] unique_y = np.unique(y) for yi in unique_y: - theta.append(np.mean(X[y==yi, :], 0)) - sigma.append(np.var(X[y==yi, :], 0)) - proba_y.append(np.float(np.sum(y==yi)) / np.size(y)) + theta.append(np.mean(X[y == yi, :], 0)) + sigma.append(np.var(X[y == yi, :], 0)) + proba_y.append(np.float(np.sum(y == yi)) / np.size(y)) self.theta = np.array(theta) self.sigma = np.array(sigma) self.proba_y = np.array(proba_y) @@ -86,8 +87,8 @@ def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in range(np.size(self.unique_y)): jointi = np.log(self.proba_y[i]) - n_ij = - 0.5 * np.sum(np.log(np.pi*self.sigma[i, :])) - n_ij -= 0.5 * np.sum(((X - self.theta[i, :])**2) / \ + n_ij = - 0.5 * np.sum(np.log(np.pi * self.sigma[i, :])) + n_ij -= 0.5 * np.sum(((X - self.theta[i, :]) ** 2) / \ (self.sigma[i, :]), 1) joint_log_likelihood.append(jointi + n_ij) joint_log_likelihood = np.array(joint_log_likelihood).T @@ -112,10 +113,10 @@ def predict_log_proba(self, X): return log_proba -class MNNB( BaseEstimator, ClassifierMixin ): +class MNNB(BaseEstimator, ClassifierMixin): """ Multinomial Naive Bayes (MNNB) - + The Multinomial Naive Bayes classifier is suitable for text classification. Parameters @@ -130,7 +131,7 @@ class MNNB( BaseEstimator, ClassifierMixin ): ---------- alpha_i: float, optional (default=1.0) smoothing constant. - + alpha_ratio: float, optional (default=1.0) smoothing ratio. @@ -147,7 +148,7 @@ class MNNB( BaseEstimator, ClassifierMixin ): predict_log_proba(X) : array Predict the log probability of each class using the model. - + Examples -------- >>> import numpy as np @@ -165,14 +166,14 @@ class MNNB( BaseEstimator, ClassifierMixin ): """ - def __init__( self, alpha_i=1.0, alpha_ratio=1.0 ): - + def __init__(self, alpha_i=1.0, alpha_ratio=1.0): + self.alpha_i = alpha_i self.alpha_ratio = alpha_ratio - - def fit(self, X, y ): + + def fit(self, X, y): """Fit the Multinomial distribution""" - + # # N_c is the count of all words in all documents of class c. # N_c_i is the a count of word i in all documents of class c. 
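
For intuition, here is a minimal NumPy sketch of the smoothed estimates described in the comment above (the toy counts and values are made up, not taken from the patch):

    import numpy as np

    # Toy term-count matrix: 4 documents over a 3-word vocabulary, 2 classes.
    X = np.array([[2, 1, 0],
                  [3, 0, 0],
                  [0, 2, 2],
                  [0, 1, 3]])
    y = np.array([0, 0, 1, 1])
    alpha_i = 1.0

    N_c_i = np.array([X[y == c].sum(axis=0) for c in np.unique(y)])  # word counts per class
    N_c = N_c_i.sum(axis=1)                                          # total words per class
    alpha = alpha_i * X.shape[1]                                      # total smoothing mass

    theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha)     # smoothed P(word i | class c)
    theta_c = np.bincount(y) / float(y.size)                         # empirical class priors

Each row of theta_c_i sums to one, and the alpha_i term keeps words unseen in a class from getting zero probability.
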
@@ -183,68 +184,86 @@ def fit(self, X, y ): N_c_i_temp = [] theta_c = [] self.unique_y = np.unique(y) - + for yi in self.unique_y: - N_c_i_temp.append( np.sum( X[y==yi, :], 0 ) ) - theta_c.append( np.float(np.sum(y==yi)) / y.size ) + N_c_i_temp.append(np.sum(X[y == yi, :], 0)) + theta_c.append(np.float(np.sum(y == yi)) / y.size) + + N_c_i = np.array(N_c_i_temp) + N_c = np.sum(N_c_i, axis=1) - N_c_i = np.array( N_c_i_temp ) - N_c = np.sum( N_c_i, axis=1 ) - # # Smoothing coefficients # alpha_i = self.alpha_i - alpha = self.alpha_ratio * X.shape[1] + alpha = self.alpha_ratio * alpha_i * X.shape[1] # # Estimate the parameters of the distribution # self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) - self.theta_c = np.array( theta_c ) - + self.theta_c = np.array(theta_c) + return self def predict(self, X): """Predict the classification of samples X""" - - y_pred = self.unique_y[np.argmax( self.predict_log_proba( X ), axis=0 )] + + joint_log_likelihood = self._joint_log_likelihood(X) + y_pred = self.unique_y[np.argmax(joint_log_likelihood, axis=0)] + return y_pred def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" - + joint_log_likelihood = [] - for i in range( self.unique_y.size ): - jointi = np.log( self.theta_c[i] ) - n_ij = np.dot( np.log( self.theta_c_i[i] ), X.T ) - joint_log_likelihood.append( jointi + n_ij ) - + for i in range(self.unique_y.size): + jointi = np.log(self.theta_c[i]) + n_ij = np.dot(np.log(self.theta_c_i[i]), X.T) + joint_log_likelihood.append(jointi + n_ij) + joint_log_likelihood = np.array(joint_log_likelihood) - + return joint_log_likelihood - def predict_proba(self, X): - """ - Predict the posterior probability of samples X + def _mininf(self, X, axis=None): + """Calculate the minimum of a matrix ignoring -inf values""" - Notes - ----- - The values calculated by the joint log likelihood are very small and - might cause numerical underflow when used as the power of an exponent. - This function should be used with caution. It is preferable to use the - predict_log_proba() function instead. - """ + A = X.copy() + A[np.isinf(X)] = np.inf + return np.min(X, axis=axis) - joint_log_likelihood = self._joint_log_likelihood( X ) - proba = np.exp( joint_log_likelihood ) - proba = proba / np.sum( proba, 1 )[:, np.newaxis] + def predict_proba(self, X): + """Predict the posterior probability of samples X""" + + joint_log_likelihood = self._joint_log_likelihood(X) + + # + # The _joint_log_likelihood has very low values that create underflow + # in the computation of the exponent. Therefore I 'fix' it by adding + # a minimal value. + # + fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] + loga_fix = joint_log_likelihood - fix + proba_fix = np.exp(loga_fix) + proba = proba_fix / np.sum(proba_fix, 1)[:, np.newaxis] + return proba def predict_log_proba(self, X): """Predict the posterior log probability of samples X""" - - joint_log_likelihood = self._joint_log_likelihood( X ) - return joint_log_likelihood - + joint_log_likelihood = self._joint_log_likelihood(X) + + # + # The _joint_log_likelihood has very low values that create underflow + # in the computation of the exponent. Therefore I 'fix' it by adding + # a minimal value. 
+ # + fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] + loga_fix = joint_log_likelihood - fix + proba_fix = np.exp(loga_fix) + log_proba = loga_fix - np.log(np.sum(proba_fix, axis=1))[:, np.newaxis] + + return log_proba diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 2a2b9dfafe41c..5d4b22eb5cbf9 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -36,8 +36,18 @@ def test_mnnb(): correct values for a simple toy dataset. """ + # + # Check the ability to predict the learning set. + # clf = naive_bayes.MNNB() y_pred = clf.fit(X2, y2).predict(X2) assert_array_equal(y_pred, y2) + # + # Verify that np.log(clf.predict_proba(X)) gives the same results as + # clf.predict_log_proba(X) + # + y_pred_proba = clf.predict_proba(X2) + y_pred_log_proba = clf.predict_log_proba(X2) + assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) From 7ae1566ef2227160d578d1f312b40bcbf14f0bd5 Mon Sep 17 00:00:00 2001 From: Amit Aides Date: Wed, 30 Mar 2011 15:39:18 +0200 Subject: [PATCH 04/31] Some more pep8 --- scikits/learn/naive_bayes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index d9bb572eb8b80..2d71999206801 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -3,13 +3,13 @@ # Author: Vincent Michel # Amit Aides +# Yehuda Finkelstein # # License: BSD Style. import numpy as np from .base import BaseEstimator, ClassifierMixin -eps = np.finfo(np.float).eps class GNB(BaseEstimator, ClassifierMixin): """ @@ -229,11 +229,11 @@ def _joint_log_likelihood(self, X): def _mininf(self, X, axis=None): """Calculate the minimum of a matrix ignoring -inf values""" - + A = X.copy() A[np.isinf(X)] = np.inf return np.min(X, axis=axis) - + def predict_proba(self, X): """Predict the posterior probability of samples X""" From 9e60a3f21f842bf35c8ba4af0f4e79df2b060b0e Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 11 Apr 2011 13:20:43 +0300 Subject: [PATCH 05/31] Added documentation for the Naive Bayes classifiers. --- doc/modules/naive_bayes.rst | 63 ++++++++++++++++++++++++++++++- scikits/learn/naive_bayes.py | 73 ++++++++++++++++++++++++++++++------ 2 files changed, 123 insertions(+), 13 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 860aa445ad78b..2ff871fa549c3 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -7,7 +7,28 @@ Naive Bayes **Naive Bayes** algorithms are a set of supervised learning methods based on applying Baye's theorem with strong (naive) independence -assumptions. +assumptions. Given a class variable :math:`c` and a dependent set +of feature variables :math:`f_1` through :math:`f_n`, the bayes +theorem states the following relationship: + +.. math:: + + p(c \mid f_1,\dots,f_n) \propto p(c) p(\mid f_1,\dots,f_n \mid c) + +Using the naive assumption this relationship is simplified: + +.. math:: + + p(c \mid f_1,\dots,f_n) \propto p(c) \prod_{i=1}^{n} p(f_i \mid c) + + \Downarrow + + \hat{c} = \arg\max_c p(c) \prod_{i=1}^{n} p(f_i \mid c), + +where we used the Maximum a Posteriori estimator. + +The differnt Naive Bayes classifiers differ by the assumption on the +distrubtion of :math:`p(f_i \mid c)`: The advantage of Naive Bayes approaches are: @@ -28,9 +49,49 @@ Gaussian Naive Bayes -------------------- :class:`GNB` implements the Gaussian Naive Bayes algorithm for classification. 
+The likelihood of the features is assumed to be gaussian: +.. math:: + p(f_i \mid c) &= \frac{1}{\sqrt{2\pi\sigma^2_c}} \exp^{-\frac{ (f_i - \mu_c)^2}{2\pi\sigma^2_c}} + +The parameters of the distribution, :math:`\sigma_c` and :math:`\mu_c` are +estimated using maximum likelihood. .. topic:: Examples: * :ref:`example_naive_bayes.py`, + +Multinomial Naive Bayes +----------------------- + +:class:`MNNB` implements the Multinomial Naive Bayes algorithm for classification. +Multinomial Naive Bayes models the distribution of words in a document as a +multinomial. The distribution is parametrized by the vector +:math:`\overline{\theta_c} = (\theta_{c1},\ldots,\theta_{cn})` where :math:`c` +is the class of document, :math:`n` is the size of the vocabulary and :math:`\theta_{ci}` +is the prbability of word :math:`i` appearing in a document of class :math:`c`. +The likelihood of document :math:`d` is, + +.. math:: + + p(d \mid \overline{\theta_c}) &= \frac{ (\sum_i f_i)! }{\prod_i f_i !} \prod_i(\theta_{ci})^{f_i} + +where :math:`f_{i}` is the frequency count of word :math:`i`. It can be shown +that the maximum posterior probability is, + +.. math:: + + \hat{c} = \arg\max_c [ \log p(\overline{\theta_c}) + \sum_i f_i \log \theta_{ci} ] + +The vector of parameters :math:`\overline{\theta_c}` is estimated by a smoothed +version of maximum likelihood, + +.. math:: + + \hat{\theta}_{ci} = \frac{ N_{ci} + \alpha_i }{N_c + \alpha } + +where :math:`N_{ci}` is the number of times word :math:`i` appears in a document +of class :math:`c` and :math:`N_{c}` is the total count of words in a document +of class :math:`c`. The smoothness priors :math:`\alpha_i` and their sum +:math:`\alpha` account for words not seen in the learning samples. diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index db6caafb7cbf8..3cf7bcedae5f8 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -201,10 +201,7 @@ class MNNB(BaseEstimator, ClassifierMixin): Parameters ---------- alpha_i: float, optional (default=1.0) - smoothing constant. - - alpha_ratio: float, optional (default=1.0) - smoothing ratio. + smoothness prior. Methods ------- @@ -228,7 +225,7 @@ class MNNB(BaseEstimator, ClassifierMixin): >>> from scikits.learn.naive_bayes import MNNB >>> clf = MNNB() >>> clf.fit(X, Y) - MNNB(alpha_ratio=1.0, alpha_i=1.0) + MNNB(alpha_i=1.0) >>> print clf.predict(X[2]) 3 @@ -237,13 +234,29 @@ class MNNB(BaseEstimator, ClassifierMixin): """ - def __init__(self, alpha_i=1.0, alpha_ratio=1.0): + def __init__(self, alpha_i=1.0): self.alpha_i = alpha_i - self.alpha_ratio = alpha_ratio def fit(self, X, y): - """Fit the Multinomial distribution""" + """Fit Multinomial Naive Bayes according to X, y + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples + and n_features is the number of features. + + y : array-like, shape = [n_samples] + Target values. + + + Returns + ------- + self : object + Returns self. + """ + # # N_c is the count of all words in all documents of class c. @@ -267,7 +280,7 @@ def fit(self, X, y): # Smoothing coefficients # alpha_i = self.alpha_i - alpha = self.alpha_ratio * alpha_i * X.shape[1] + alpha = alpha_i * X.shape[1] # # Estimate the parameters of the distribution @@ -278,7 +291,17 @@ def fit(self, X, y): return self def predict(self, X): - """Predict the classification of samples X""" + """ + Perform classification on an array of test vectors X. 
+ + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + + Returns + ------- + C : array, shape = [n_samples] + """ joint_log_likelihood = self._joint_log_likelihood(X) y_pred = self.unique_y[np.argmax(joint_log_likelihood, axis=0)] @@ -306,7 +329,20 @@ def _mininf(self, X, axis=None): return np.min(X, axis=axis) def predict_proba(self, X): - """Predict the posterior probability of samples X""" + """ + Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + + Returns + ------- + C : array-like, shape = [n_samples, n_classes] + Returns the probability of the sample for each class in + the model, where classes are ordered by arithmetical + order. + """ joint_log_likelihood = self._joint_log_likelihood(X) @@ -323,7 +359,20 @@ def predict_proba(self, X): return proba def predict_log_proba(self, X): - """Predict the posterior log probability of samples X""" + """ + Return log-probability estimates for the test vector X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + + Returns + ------- + C : array-like, shape = [n_samples, n_classes] + Returns the log-probability of the sample for each class + in the model, where classes are ordered by arithmetical + order. + """ joint_log_likelihood = self._joint_log_likelihood(X) From 5c6a4530a9ff5c8be94a11baa00c73956dac8649 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 11 Apr 2011 21:40:11 +0300 Subject: [PATCH 06/31] Added sparse MNNB and modified the textual examples to benchmark it. --- .../document_classification_20newsgroups.py | 13 +- .../mlcomp_sparse_document_classification.py | 77 ++++--- scikits/learn/naive_bayes/__init__.py | 15 ++ .../learn/{ => naive_bayes}/naive_bayes.py | 2 +- scikits/learn/naive_bayes/sparse/__init__.py | 10 + .../learn/naive_bayes/sparse/naive_bayes.py | 215 ++++++++++++++++++ 6 files changed, 299 insertions(+), 33 deletions(-) create mode 100644 scikits/learn/naive_bayes/__init__.py rename scikits/learn/{ => naive_bayes}/naive_bayes.py (99%) create mode 100644 scikits/learn/naive_bayes/sparse/__init__.py create mode 100644 scikits/learn/naive_bayes/sparse/naive_bayes.py diff --git a/examples/document_classification_20newsgroups.py b/examples/document_classification_20newsgroups.py index 295aaabb21321..76df897b44068 100644 --- a/examples/document_classification_20newsgroups.py +++ b/examples/document_classification_20newsgroups.py @@ -45,6 +45,7 @@ from scikits.learn.linear_model import RidgeClassifier from scikits.learn.svm.sparse import LinearSVC from scikits.learn.linear_model.sparse import SGDClassifier +from scikits.learn.naive_bayes.sparse import MNNB from scikits.learn import metrics @@ -128,9 +129,10 @@ def benchmark(clf): score = metrics.f1_score(y_test, pred) print "f1-score: %0.3f" % score - nnz = clf.coef_.nonzero()[0].shape[0] - print "non-zero coef: %d" % nnz - print + if hasattr(clf, 'coef_'): + nnz = clf.coef_.nonzero()[0].shape[0] + print "non-zero coef: %d" % nnz + print if print_report: print "classification report:" @@ -165,3 +167,8 @@ def benchmark(clf): print "Elastic-Net penalty" sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")) + +# Train sparse MNNB +print 80 * '=' +print "MNNB penalty" +mnnb_results = benchmark(MNNB(alpha_i=.01)) diff --git a/examples/mlcomp_sparse_document_classification.py b/examples/mlcomp_sparse_document_classification.py index d07d9e79e255a..ecfbe587e9077 100644 --- 
a/examples/mlcomp_sparse_document_classification.py +++ b/examples/mlcomp_sparse_document_classification.py @@ -49,6 +49,8 @@ from scikits.learn.linear_model.sparse import SGDClassifier from scikits.learn.metrics import confusion_matrix from scikits.learn.metrics import classification_report +from scikits.learn.naive_bayes.sparse import MNNB + if 'MLCOMP_DATASETS_HOME' not in os.environ: print "Please follow those instructions to get started:" @@ -71,20 +73,6 @@ assert sp.issparse(X_train) y_train = news_train.target -print "Training a linear classifier..." -parameters = { - 'loss': 'hinge', - 'penalty': 'l2', - 'n_iter': 50, - 'alpha': 0.00001, - 'fit_intercept': True, -} -print "parameters:", parameters -t0 = time() -clf = SGDClassifier(**parameters).fit(X_train, y_train) -print "done in %fs" % (time() - t0) -print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100) - print "Loading 20 newsgroups test set... " news_test = load_mlcomp('20news-18828', 'test') t0 = time() @@ -101,22 +89,53 @@ print "done in %fs" % (time() - t0) print "n_samples: %d, n_features: %d" % X_test.shape -print "Predicting the outcomes of the testing set" -t0 = time() -pred = clf.predict(X_test) -print "done in %fs" % (time() - t0) +################################################################################ +# Benchmark classifiers +def benchmark(clf_class, params, name): + print "parameters:", params + t0 = time() + clf = clf_class(**params).fit(X_train, y_train) + print "done in %fs" % (time() - t0) + + if hasattr(clf, 'coef_'): + print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100) + + print "Predicting the outcomes of the testing set" + t0 = time() + pred = clf.predict(X_test) + print "done in %fs" % (time() - t0) + + print "Classification report on test set for classifier:" + print clf + print + print classification_report(y_test, pred, target_names=news_test.target_names) + + cm = confusion_matrix(y_test, pred) + print "Confusion matrix:" + print cm + + # Show confusion matrix + pl.matshow(cm) + pl.title('Confusion matrix of the %s classifier' % name) + pl.colorbar() + + +print "Testbenching a linear classifier..." +parameters = { + 'loss': 'hinge', + 'penalty': 'l2', + 'n_iter': 50, + 'alpha': 0.00001, + 'fit_intercept': True, +} + +benchmark(SGDClassifier, parameters, 'SGD') -print "Classification report on test set for classifier:" -print clf -print -print classification_report(y_test, pred, target_names=news_test.target_names) +print "Testbenching a MNNB classifier..." +parameters = { + 'alpha_i': 0.01 +} -cm = confusion_matrix(y_test, pred) -print "Confusion matrix:" -print cm +benchmark(MNNB, parameters, 'MNNB') -# Show confusion matrix -pl.matshow(cm) -pl.title('Confusion matrix') -pl.colorbar() pl.show() diff --git a/scikits/learn/naive_bayes/__init__.py b/scikits/learn/naive_bayes/__init__.py new file mode 100644 index 0000000000000..8ddff9f4e51d8 --- /dev/null +++ b/scikits/learn/naive_bayes/__init__.py @@ -0,0 +1,15 @@ +""" +Naive Bayes models +================== + +Naive Bayes algorithms are a set of supervised learning methods based on +applying Baye`s theorem with strong (naive) independence assumptions. + +See http://scikit-learn.sourceforge.net/modules/naive_bayes.html for +complete documentation. +""" + +from .naive_bayes import GNB, MNNB + +from . 
import sparse + diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes/naive_bayes.py similarity index 99% rename from scikits/learn/naive_bayes.py rename to scikits/learn/naive_bayes/naive_bayes.py index 3cf7bcedae5f8..c849f547aaf9f 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes/naive_bayes.py @@ -10,7 +10,7 @@ # License: BSD Style. import numpy as np -from .base import BaseEstimator, ClassifierMixin +from ..base import BaseEstimator, ClassifierMixin class GNB(BaseEstimator, ClassifierMixin): diff --git a/scikits/learn/naive_bayes/sparse/__init__.py b/scikits/learn/naive_bayes/sparse/__init__.py new file mode 100644 index 0000000000000..19dc64fbc66b8 --- /dev/null +++ b/scikits/learn/naive_bayes/sparse/__init__.py @@ -0,0 +1,10 @@ +""" +Naive Bayes models with sparse data +=================================== + +scikits.learn.naive_bayes.sparse implements the sparse counterpart +of scikits.learn.naive_bayes.MNNB + +""" + +from .naive_bayes import MNNB diff --git a/scikits/learn/naive_bayes/sparse/naive_bayes.py b/scikits/learn/naive_bayes/sparse/naive_bayes.py new file mode 100644 index 0000000000000..d727c93f62da5 --- /dev/null +++ b/scikits/learn/naive_bayes/sparse/naive_bayes.py @@ -0,0 +1,215 @@ +""" Naives Bayes classifiers for sparse data. +""" + +# Author: Amit Aides +# +# License: BSD Style. +import numpy as np + +from ...base import BaseEstimator, ClassifierMixin + + +class MNNB(BaseEstimator, ClassifierMixin): + """ + Multinomial Naive Bayes for sparse matrices + + The Multinomial Naive Bayes classifier is suitable for text classification. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vector, where n_samples in the number of samples and + n_features is the number of features. + y : array, shape = [n_samples] + Target vector relative to X + + Parameters + ---------- + alpha_i: float, optional (default=1.0) + smoothness prior. + + Methods + ------- + fit(X, y) : self + Fit the model + + predict(X) : array + Predict using the model. + + predict_proba(X) : array + Predict the probability of each class using the model. + + predict_log_proba(X) : array + Predict the log probability of each class using the model. + + Examples + -------- + >>> import numpy as np + >>> X = np.random.randint( 5, size=(6, 100) ) + >>> Y = np.array([1, 2, 3, 4, 5, 6]) + >>> from scikits.learn.naive_bayes import MNNB + >>> clf = MNNB() + >>> clf.fit(X, Y) + MNNB(alpha_i=1.0) + >>> print clf.predict(X[2]) + 3 + + See also + -------- + + """ + + def __init__(self, alpha_i=1.0): + + self.alpha_i = alpha_i + + def fit(self, X, y): + """Fit Multinomial Naive Bayes according to X, y + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples + and n_features is the number of features. + + y : array-like, shape = [n_samples] + Target values. + + + Returns + ------- + self : object + Returns self. + """ + + + # + # N_c is the count of all words in all documents of class c. + # N_c_i is the a count of word i in all documents of class c. + # theta_c is the prior empirical probability of a document of class c. + # theta_c_i is the (smoothened) empirical likelihood of word i + # given a document of class c. 
+ # + N_c_i_temp = [] + theta_c = [] + self.unique_y = np.unique(y) + + for yi in self.unique_y: + row_ind = np.nonzero(y == yi)[0] + N_c_i_temp.append(np.array(X[row_ind, :].sum(axis=0)).ravel()) + theta_c.append(np.float(np.sum(y == yi)) / y.size) + + N_c_i = np.array(N_c_i_temp) + N_c = np.sum(N_c_i, axis=1) + + # + # Smoothing coefficients + # + alpha_i = self.alpha_i + alpha = alpha_i * X.shape[1] + + # + # Estimate the parameters of the distribution + # + self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) + self.theta_c = np.array(theta_c) + + return self + + def predict(self, X): + """ + Perform classification on an array of test vectors X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + + Returns + ------- + C : array, shape = [n_samples] + """ + + joint_log_likelihood = self._joint_log_likelihood(X) + y_pred = self.unique_y[np.argmax(joint_log_likelihood, axis=0)] + + return y_pred + + def _joint_log_likelihood(self, X): + """Calculate the posterior log probability of the samples X""" + + joint_log_likelihood = [] + for i in range(self.unique_y.size): + jointi = np.log(self.theta_c[i]) + n_ij = np.log(self.theta_c_i[i]) * X.T + joint_log_likelihood.append(jointi + n_ij) + + joint_log_likelihood = np.array(joint_log_likelihood) + + return joint_log_likelihood + + def _mininf(self, X, axis=None): + """Calculate the minimum of a matrix ignoring -inf values""" + + A = X.copy() + A[np.isinf(X)] = np.inf + return np.min(X, axis=axis) + + def predict_proba(self, X): + """ + Return probability estimates for the test vector X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + + Returns + ------- + C : array-like, shape = [n_samples, n_classes] + Returns the probability of the sample for each class in + the model, where classes are ordered by arithmetical + order. + """ + + joint_log_likelihood = self._joint_log_likelihood(X) + + # + # The _joint_log_likelihood has very low values that create underflow + # in the computation of the exponent. Therefore I 'fix' it by adding + # a minimal value. + # + fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] + loga_fix = joint_log_likelihood - fix + proba_fix = np.exp(loga_fix) + proba = proba_fix / np.sum(proba_fix, 1)[:, np.newaxis] + + return proba + + def predict_log_proba(self, X): + """ + Return log-probability estimates for the test vector X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + + Returns + ------- + C : array-like, shape = [n_samples, n_classes] + Returns the log-probability of the sample for each class + in the model, where classes are ordered by arithmetical + order. + """ + + joint_log_likelihood = self._joint_log_likelihood(X) + + # + # The _joint_log_likelihood has very low values that create underflow + # in the computation of the exponent. Therefore I 'fix' it by adding + # a minimal value. + # + fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] + loga_fix = joint_log_likelihood - fix + proba_fix = np.exp(loga_fix) + log_proba = loga_fix - np.log(np.sum(proba_fix, axis=1))[:, np.newaxis] + + return log_proba From cdd0ef7a5e17631ae32040b95843eb371af4be7f Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 11 Apr 2011 22:05:58 +0300 Subject: [PATCH 07/31] Modified the Naive Bayes nose tests to the new location of the module and added sparse test. 
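
The sparse test added below boils down to fitting the sparse MNNB on a small random count matrix and checking that the training set is reproduced. A rough usage sketch under that assumption (the import path follows the package layout introduced in the previous patch; MNNB is renamed MultinomialNB in a later commit):

    import numpy as np
    import scipy.sparse as sp
    from scikits.learn.naive_bayes.sparse import MNNB

    X = sp.csr_matrix(np.random.randint(5, size=(6, 100)))  # sparse term counts
    y = np.array([1, 1, 2, 2, 3, 3])

    clf = MNNB(alpha_i=1.0).fit(X, y)
    print clf.predict(X)   # expected to reproduce y on this toy training set
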
--- .../tests/test_naive_bayes.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) rename scikits/learn/{ => naive_bayes}/tests/test_naive_bayes.py (65%) diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/naive_bayes/tests/test_naive_bayes.py similarity index 65% rename from scikits/learn/tests/test_naive_bayes.py rename to scikits/learn/naive_bayes/tests/test_naive_bayes.py index 5d4b22eb5cbf9..5a6f460bfa1b9 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/naive_bayes/tests/test_naive_bayes.py @@ -1,7 +1,9 @@ import numpy as np +import scipy.sparse from numpy.testing import assert_array_equal, assert_array_almost_equal from .. import naive_bayes +from ..sparse import naive_bayes as naive_bayes_sparse # Data is just 6 separable points in the plane X = np.array( [[-2,-1], [-1, -1], [-1, -2], [1,1], [1,2], [2, 1]]) @@ -51,3 +53,30 @@ def test_mnnb(): y_pred_proba = clf.predict_proba(X2) y_pred_log_proba = clf.predict_log_proba(X2) assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) + + +def test_sparse_mnnb(): + """ + Multinomial Naive Bayes classification for sparse data. + + This checks that sparse MNNB implements fit and predict and returns + correct values for a simple toy dataset. + """ + + X2S = scipy.sparse.csr_matrix(X2) + + # + # Check the ability to predict the learning set. + # + clf = naive_bayes_sparse.MNNB() + y_pred = clf.fit(X2S, y2).predict(X2S) + + assert_array_equal(y_pred, y2) + + # + # Verify that np.log(clf.predict_proba(X)) gives the same results as + # clf.predict_log_proba(X) + # + y_pred_proba = clf.predict_proba(X2S) + y_pred_log_proba = clf.predict_log_proba(X2S) + assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) From 3dd1d00532c70cc3f5ebf2db916e072138afa0f1 Mon Sep 17 00:00:00 2001 From: Amit Aides Date: Wed, 11 May 2011 19:20:43 +0300 Subject: [PATCH 08/31] naive bayes name change MNNB->MultinomialNB --- doc/modules/classes.rst | 19 +++++++++++++++- doc/modules/naive_bayes.rst | 2 +- .../document_classification_20newsgroups.py | 8 +++---- .../mlcomp_sparse_document_classification.py | 6 ++--- scikits/learn/naive_bayes/__init__.py | 2 +- scikits/learn/naive_bayes/naive_bayes.py | 22 ++++++++----------- scikits/learn/naive_bayes/sparse/__init__.py | 4 ++-- .../learn/naive_bayes/sparse/naive_bayes.py | 15 ++++++------- scikits/learn/naive_bayes/tests/__init__.py | 0 .../naive_bayes/tests/test_naive_bayes.py | 8 +++---- 10 files changed, 49 insertions(+), 37 deletions(-) create mode 100644 scikits/learn/naive_bayes/tests/__init__.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index bb699009a7983..96847e3549c69 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -148,7 +148,24 @@ Naive Bayes :template: class.rst naive_bayes.GNB - naive_bayes.MNNB + naive_bayes.MultinomialNB + + +For sparse data +--------------- + +.. automodule:: scikits.learn.naive_bayes.sparse + :no-members: + :no-inherited-members: + +.. currentmodule:: scikits.learn + +.. autosummary:: + + :toctree: generated/ + :template: class.rst + + naive_bayes.sparse.MultinomialNB Nearest Neighbors diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 2ff871fa549c3..5e349bcb0731e 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -65,7 +65,7 @@ estimated using maximum likelihood. Multinomial Naive Bayes ----------------------- -:class:`MNNB` implements the Multinomial Naive Bayes algorithm for classification. 
+:class:`MultinomialNB` implements the Multinomial Naive Bayes algorithm for classification. Multinomial Naive Bayes models the distribution of words in a document as a multinomial. The distribution is parametrized by the vector :math:`\overline{\theta_c} = (\theta_{c1},\ldots,\theta_{cn})` where :math:`c` diff --git a/examples/document_classification_20newsgroups.py b/examples/document_classification_20newsgroups.py index 757f2c17a071f..9a558cd0d75dc 100644 --- a/examples/document_classification_20newsgroups.py +++ b/examples/document_classification_20newsgroups.py @@ -45,7 +45,7 @@ from scikits.learn.linear_model import RidgeClassifier from scikits.learn.svm.sparse import LinearSVC from scikits.learn.linear_model.sparse import SGDClassifier -from scikits.learn.naive_bayes.sparse import MNNB +from scikits.learn.naive_bayes.sparse import MultinomialNB from scikits.learn import metrics @@ -168,7 +168,7 @@ def benchmark(clf): sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")) -# Train sparse MNNB +# Train sparse MultinomialNB print 80 * '=' -print "MNNB penalty" -mnnb_results = benchmark(MNNB(alpha_i=.01)) +print "MultinomialNB penalty" +mnnb_results = benchmark(MultinomialNB(alpha_i=.01)) diff --git a/examples/mlcomp_sparse_document_classification.py b/examples/mlcomp_sparse_document_classification.py index 85a71905b58fa..5f8f2444543e4 100644 --- a/examples/mlcomp_sparse_document_classification.py +++ b/examples/mlcomp_sparse_document_classification.py @@ -49,7 +49,7 @@ from scikits.learn.linear_model.sparse import SGDClassifier from scikits.learn.metrics import confusion_matrix from scikits.learn.metrics import classification_report -from scikits.learn.naive_bayes.sparse import MNNB +from scikits.learn.naive_bayes.sparse import MultinomialNB if 'MLCOMP_DATASETS_HOME' not in os.environ: @@ -131,11 +131,11 @@ def benchmark(clf_class, params, name): benchmark(SGDClassifier, parameters, 'SGD') -print "Testbenching a MNNB classifier..." +print "Testbenching a MultinomialNB classifier..." parameters = { 'alpha_i': 0.01 } -benchmark(MNNB, parameters, 'MNNB') +benchmark(MultinomialNB, parameters, 'MultinomialNB') pl.show() diff --git a/scikits/learn/naive_bayes/__init__.py b/scikits/learn/naive_bayes/__init__.py index 8ddff9f4e51d8..0f632219df385 100644 --- a/scikits/learn/naive_bayes/__init__.py +++ b/scikits/learn/naive_bayes/__init__.py @@ -9,7 +9,7 @@ complete documentation. """ -from .naive_bayes import GNB, MNNB +from .naive_bayes import GNB, MultinomialNB from . import sparse diff --git a/scikits/learn/naive_bayes/naive_bayes.py b/scikits/learn/naive_bayes/naive_bayes.py index c849f547aaf9f..f643d16210a73 100644 --- a/scikits/learn/naive_bayes/naive_bayes.py +++ b/scikits/learn/naive_bayes/naive_bayes.py @@ -3,7 +3,7 @@ # Author: Vincent Michel # Minor fixes by Fabian Pedregosa -# MNNB classifier by: +# MultinomialNB classifier by: # Amit Aides & # Yehuda Finkelstein # @@ -122,7 +122,6 @@ def predict(self, X): y_pred = self.unique_y[np.argmax(self.predict_proba(X), 1)] return y_pred - def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in range(np.size(self.unique_y)): @@ -134,7 +133,6 @@ def _joint_log_likelihood(self, X): joint_log_likelihood = np.array(joint_log_likelihood).T return joint_log_likelihood - def predict_proba(self, X): """ Return probability estimates for the test vector X. 
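
Several comments in this series note that the joint log-likelihoods are far too negative to exponentiate directly, which is why predict_proba and predict_log_proba shift them before normalising. A small self-contained illustration of that underflow and of the standard shift trick (the numbers are invented; this is not the exact code in the patch, which shifts by a minimum rather than the maximum):

    import numpy as np

    # Joint log-likelihoods for 2 classes (rows) and 3 samples (columns).
    jll = np.array([[-1000.0, -1001.0, -999.0],
                    [-1002.0, -1000.5, -1003.0]])

    naive = np.exp(jll)                    # underflows to exactly 0.0
    # naive / naive.sum(axis=0) would give 0/0 = nan

    shift = jll - jll.max(axis=0)          # subtract the per-sample maximum
    proba = np.exp(shift) / np.exp(shift).sum(axis=0)
    log_proba = shift - np.log(np.exp(shift).sum(axis=0))

The ratios between classes are unchanged by the shift, so proba still sums to one per sample while staying representable.
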
@@ -156,7 +154,6 @@ def predict_proba(self, X): proba = proba / np.sum(proba, 1)[:, np.newaxis] return proba - def predict_log_proba(self, X): """ Return log-probability estimates for the test vector X. @@ -184,9 +181,9 @@ def predict_log_proba(self, X): return log_proba -class MNNB(BaseEstimator, ClassifierMixin): +class MultinomialNB(BaseEstimator, ClassifierMixin): """ - Multinomial Naive Bayes (MNNB) + Multinomial Naive Bayes (MultinomialNB) The Multinomial Naive Bayes classifier is suitable for text classification. @@ -222,10 +219,10 @@ class MNNB(BaseEstimator, ClassifierMixin): >>> import numpy as np >>> X = np.random.randint( 5, size=(6, 100) ) >>> Y = np.array([1, 2, 3, 4, 5, 6]) - >>> from scikits.learn.naive_bayes import MNNB - >>> clf = MNNB() + >>> from scikits.learn.naive_bayes import MultinomialNB + >>> clf = MultinomialNB() >>> clf.fit(X, Y) - MNNB(alpha_i=1.0) + MultinomialNB(alpha_i=1.0) >>> print clf.predict(X[2]) 3 @@ -257,7 +254,6 @@ def fit(self, X, y): Returns self. """ - # # N_c is the count of all words in all documents of class c. # N_c_i is the a count of word i in all documents of class c. @@ -345,7 +341,7 @@ def predict_proba(self, X): """ joint_log_likelihood = self._joint_log_likelihood(X) - + # # The _joint_log_likelihood has very low values that create underflow # in the computation of the exponent. Therefore I 'fix' it by adding @@ -375,7 +371,7 @@ def predict_log_proba(self, X): """ joint_log_likelihood = self._joint_log_likelihood(X) - + # # The _joint_log_likelihood has very low values that create underflow # in the computation of the exponent. Therefore I 'fix' it by adding @@ -385,5 +381,5 @@ def predict_log_proba(self, X): loga_fix = joint_log_likelihood - fix proba_fix = np.exp(loga_fix) log_proba = loga_fix - np.log(np.sum(proba_fix, axis=1))[:, np.newaxis] - + return log_proba diff --git a/scikits/learn/naive_bayes/sparse/__init__.py b/scikits/learn/naive_bayes/sparse/__init__.py index 19dc64fbc66b8..ab82d37b90301 100644 --- a/scikits/learn/naive_bayes/sparse/__init__.py +++ b/scikits/learn/naive_bayes/sparse/__init__.py @@ -3,8 +3,8 @@ =================================== scikits.learn.naive_bayes.sparse implements the sparse counterpart -of scikits.learn.naive_bayes.MNNB +of scikits.learn.naive_bayes.MultinomialNB """ -from .naive_bayes import MNNB +from .naive_bayes import MultinomialNB diff --git a/scikits/learn/naive_bayes/sparse/naive_bayes.py b/scikits/learn/naive_bayes/sparse/naive_bayes.py index d727c93f62da5..03cd431d97caa 100644 --- a/scikits/learn/naive_bayes/sparse/naive_bayes.py +++ b/scikits/learn/naive_bayes/sparse/naive_bayes.py @@ -9,7 +9,7 @@ from ...base import BaseEstimator, ClassifierMixin -class MNNB(BaseEstimator, ClassifierMixin): +class MultinomialNB(BaseEstimator, ClassifierMixin): """ Multinomial Naive Bayes for sparse matrices @@ -47,10 +47,10 @@ class MNNB(BaseEstimator, ClassifierMixin): >>> import numpy as np >>> X = np.random.randint( 5, size=(6, 100) ) >>> Y = np.array([1, 2, 3, 4, 5, 6]) - >>> from scikits.learn.naive_bayes import MNNB - >>> clf = MNNB() + >>> from scikits.learn.naive_bayes import MultinomialNB + >>> clf = MultinomialNB() >>> clf.fit(X, Y) - MNNB(alpha_i=1.0) + MultinomialNB(alpha_i=1.0) >>> print clf.predict(X[2]) 3 @@ -82,7 +82,6 @@ def fit(self, X, y): Returns self. """ - # # N_c is the count of all words in all documents of class c. # N_c_i is the a count of word i in all documents of class c. 
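
The sparse fit (introduced in patch 06 above) gathers per-class word counts by selecting the rows of one class and summing them; scipy's sparse matrices return a numpy matrix from sum(), hence the np.array(...).ravel() dance. A tiny illustration with made-up counts:

    import numpy as np
    import scipy.sparse as sp

    X = sp.csr_matrix(np.array([[2, 1, 0],
                                [0, 0, 3],
                                [1, 4, 0]]))
    y = np.array([0, 1, 0])

    rows = np.nonzero(y == 0)[0]                       # documents belonging to class 0
    counts = np.array(X[rows, :].sum(axis=0)).ravel()  # array([3, 5, 0])
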
@@ -171,7 +170,7 @@ def predict_proba(self, X): """ joint_log_likelihood = self._joint_log_likelihood(X) - + # # The _joint_log_likelihood has very low values that create underflow # in the computation of the exponent. Therefore I 'fix' it by adding @@ -201,7 +200,7 @@ def predict_log_proba(self, X): """ joint_log_likelihood = self._joint_log_likelihood(X) - + # # The _joint_log_likelihood has very low values that create underflow # in the computation of the exponent. Therefore I 'fix' it by adding @@ -211,5 +210,5 @@ def predict_log_proba(self, X): loga_fix = joint_log_likelihood - fix proba_fix = np.exp(loga_fix) log_proba = loga_fix - np.log(np.sum(proba_fix, axis=1))[:, np.newaxis] - + return log_proba diff --git a/scikits/learn/naive_bayes/tests/__init__.py b/scikits/learn/naive_bayes/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/scikits/learn/naive_bayes/tests/test_naive_bayes.py b/scikits/learn/naive_bayes/tests/test_naive_bayes.py index 5a6f460bfa1b9..16b037c5634bd 100644 --- a/scikits/learn/naive_bayes/tests/test_naive_bayes.py +++ b/scikits/learn/naive_bayes/tests/test_naive_bayes.py @@ -34,14 +34,14 @@ def test_mnnb(): """ Multinomial Naive Bayes classification. - This checks that MNNB implements fit and predict and returns + This checks that MultinomialNB implements fit and predict and returns correct values for a simple toy dataset. """ # # Check the ability to predict the learning set. # - clf = naive_bayes.MNNB() + clf = naive_bayes.MultinomialNB() y_pred = clf.fit(X2, y2).predict(X2) assert_array_equal(y_pred, y2) @@ -59,7 +59,7 @@ def test_sparse_mnnb(): """ Multinomial Naive Bayes classification for sparse data. - This checks that sparse MNNB implements fit and predict and returns + This checks that sparse MultinomialNB implements fit and predict and returns correct values for a simple toy dataset. """ @@ -68,7 +68,7 @@ def test_sparse_mnnb(): # # Check the ability to predict the learning set. # - clf = naive_bayes_sparse.MNNB() + clf = naive_bayes_sparse.MultinomialNB() y_pred = clf.fit(X2S, y2).predict(X2S) assert_array_equal(y_pred, y2) From e83a8ccb51061582374841fa716da77cec4af6ed Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Fri, 20 May 2011 13:31:42 +0200 Subject: [PATCH 09/31] naive bayes: copyedit + rename alpha_i to alpha --- scikits/learn/naive_bayes/__init__.py | 2 +- scikits/learn/naive_bayes/naive_bayes.py | 20 ++++++++----------- .../learn/naive_bayes/sparse/naive_bayes.py | 19 ++++++++---------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/scikits/learn/naive_bayes/__init__.py b/scikits/learn/naive_bayes/__init__.py index 0f632219df385..f89f5b43e1fff 100644 --- a/scikits/learn/naive_bayes/__init__.py +++ b/scikits/learn/naive_bayes/__init__.py @@ -3,7 +3,7 @@ ================== Naive Bayes algorithms are a set of supervised learning methods based on -applying Baye`s theorem with strong (naive) independence assumptions. +applying Bayes' theorem with strong (naive) independence assumptions. See http://scikit-learn.sourceforge.net/modules/naive_bayes.html for complete documentation. diff --git a/scikits/learn/naive_bayes/naive_bayes.py b/scikits/learn/naive_bayes/naive_bayes.py index f643d16210a73..d3c47b6862544 100644 --- a/scikits/learn/naive_bayes/naive_bayes.py +++ b/scikits/learn/naive_bayes/naive_bayes.py @@ -1,4 +1,4 @@ -""" Naives Bayes classifiers. +""" Naive Bayes classifiers. 
""" # Author: Vincent Michel @@ -197,8 +197,9 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Parameters ---------- - alpha_i: float, optional (default=1.0) - smoothness prior. + alpha: float, optional (default=1.0) + Additive (Laplace/Lidstone) smoothing parameter + (0 for no smoothing). Methods ------- @@ -217,7 +218,7 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Examples -------- >>> import numpy as np - >>> X = np.random.randint( 5, size=(6, 100) ) + >>> X = np.random.randint(5, size=(6, 100)) >>> Y = np.array([1, 2, 3, 4, 5, 6]) >>> from scikits.learn.naive_bayes import MultinomialNB >>> clf = MultinomialNB() @@ -225,15 +226,10 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): MultinomialNB(alpha_i=1.0) >>> print clf.predict(X[2]) 3 - - See also - -------- - """ - def __init__(self, alpha_i=1.0): - - self.alpha_i = alpha_i + def __init__(self, alpha=1.0): + self.alpha = alpha def fit(self, X, y): """Fit Multinomial Naive Bayes according to X, y @@ -275,7 +271,7 @@ def fit(self, X, y): # # Smoothing coefficients # - alpha_i = self.alpha_i + alpha_i = self.alpha alpha = alpha_i * X.shape[1] # diff --git a/scikits/learn/naive_bayes/sparse/naive_bayes.py b/scikits/learn/naive_bayes/sparse/naive_bayes.py index 03cd431d97caa..0d15f57545132 100644 --- a/scikits/learn/naive_bayes/sparse/naive_bayes.py +++ b/scikits/learn/naive_bayes/sparse/naive_bayes.py @@ -1,4 +1,4 @@ -""" Naives Bayes classifiers for sparse data. +""" Naive Bayes classifiers for sparse data. """ # Author: Amit Aides @@ -25,8 +25,9 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Parameters ---------- - alpha_i: float, optional (default=1.0) - smoothness prior. + alpha: float, optional (default=1.0) + Additive (Laplace/Lidstone) smoothing parameter + (0 for no smoothing). Methods ------- @@ -50,18 +51,14 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): >>> from scikits.learn.naive_bayes import MultinomialNB >>> clf = MultinomialNB() >>> clf.fit(X, Y) - MultinomialNB(alpha_i=1.0) + MultinomialNB(alpha=1.0) >>> print clf.predict(X[2]) 3 - See also - -------- - """ - def __init__(self, alpha_i=1.0): - - self.alpha_i = alpha_i + def __init__(self, alpha=1.0): + self.alpha = alpha def fit(self, X, y): """Fit Multinomial Naive Bayes according to X, y @@ -104,7 +101,7 @@ def fit(self, X, y): # # Smoothing coefficients # - alpha_i = self.alpha_i + alpha_i = self.alpha alpha = alpha_i * X.shape[1] # From 0b86ee2ff0a69b5a26bbbac6738d75232399ae6f Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Fri, 20 May 2011 15:52:30 +0200 Subject: [PATCH 10/31] ENH: optional and user-settable priors in multinom naive bayes --- scikits/learn/naive_bayes/naive_bayes.py | 61 ++++++++-------- .../learn/naive_bayes/sparse/naive_bayes.py | 69 +++++++++++-------- 2 files changed, 71 insertions(+), 59 deletions(-) diff --git a/scikits/learn/naive_bayes/naive_bayes.py b/scikits/learn/naive_bayes/naive_bayes.py index d3c47b6862544..834f38427d8f1 100644 --- a/scikits/learn/naive_bayes/naive_bayes.py +++ b/scikits/learn/naive_bayes/naive_bayes.py @@ -82,7 +82,6 @@ def fit(self, X, y): y : array-like, shape = [n_samples] Target values. - Returns ------- self : object @@ -187,19 +186,13 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): The Multinomial Naive Bayes classifier is suitable for text classification. - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples in the number of samples and - n_features is the number of features. 
- y : array, shape = [n_samples] - Target vector relative to X - Parameters ---------- alpha: float, optional (default=1.0) Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + use_prior: boolean + Whether to use label prior probabilities or not. Methods ------- @@ -210,10 +203,10 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Predict using the model. predict_proba(X) : array - Predict the probability of each class using the model. + Predict the probability of each label using the model. predict_log_proba(X) : array - Predict the log probability of each class using the model. + Predict the log probability of each label using the model. Examples -------- @@ -228,10 +221,11 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): 3 """ - def __init__(self, alpha=1.0): + def __init__(self, alpha=1.0, use_prior=True): self.alpha = alpha + self.use_prior = use_prior - def fit(self, X, y): + def fit(self, X, y, theta=None): """Fit Multinomial Naive Bayes according to X, y Parameters @@ -243,27 +237,32 @@ def fit(self, X, y): y : array-like, shape = [n_samples] Target values. + theta : array, shape [n_labels * n_features] + Prior probability per label. Returns ------- - self : object + self : object Returns self. """ + compute_priors = theta is None # - # N_c is the count of all words in all documents of class c. - # N_c_i is the a count of word i in all documents of class c. - # theta_c is the prior empirical probability of a document of class c. - # theta_c_i is the (smoothened) empirical likelihood of word i - # given a document of class c. + # N_c is the count of all words in all documents of label c. + # N_c_i is the a count of word i in all documents of label c. + # theta[c] is the prior empirical probability of a document of label c. + # theta_c_i is the (smoothed) empirical likelihood of word i + # given a document of label c. # N_c_i_temp = [] - theta_c = [] + if compute_priors: + theta = [] self.unique_y = np.unique(y) for yi in self.unique_y: N_c_i_temp.append(np.sum(X[y == yi, :], 0)) - theta_c.append(np.float(np.sum(y == yi)) / y.size) + if compute_priors: + theta.append(np.float(np.sum(y == yi)) / y.size) N_c_i = np.array(N_c_i_temp) N_c = np.sum(N_c_i, axis=1) @@ -278,7 +277,7 @@ def fit(self, X, y): # Estimate the parameters of the distribution # self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) - self.theta_c = np.array(theta_c) + self.theta = np.array(theta) return self @@ -305,9 +304,11 @@ def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in range(self.unique_y.size): - jointi = np.log(self.theta_c[i]) n_ij = np.dot(np.log(self.theta_c_i[i]), X.T) - joint_log_likelihood.append(jointi + n_ij) + if self.use_prior: + jointi = np.log(self.theta[i]) + n_ij += jointi + joint_log_likelihood.append(n_ij) joint_log_likelihood = np.array(joint_log_likelihood) @@ -330,9 +331,9 @@ def predict_proba(self, X): Returns ------- - C : array-like, shape = [n_samples, n_classes] - Returns the probability of the sample for each class in - the model, where classes are ordered by arithmetical + C : array-like, shape = [n_samples, n_labels] + Returns the probability of the sample for each label in + the model, where labels are ordered by arithmetical order. 
""" @@ -360,9 +361,9 @@ def predict_log_proba(self, X): Returns ------- - C : array-like, shape = [n_samples, n_classes] - Returns the log-probability of the sample for each class - in the model, where classes are ordered by arithmetical + C : array-like, shape = [n_samples, n_labels] + Returns the log-probability of the sample for each label + in the model, where labels are ordered by arithmetical order. """ diff --git a/scikits/learn/naive_bayes/sparse/naive_bayes.py b/scikits/learn/naive_bayes/sparse/naive_bayes.py index 0d15f57545132..7687b01b0bf53 100644 --- a/scikits/learn/naive_bayes/sparse/naive_bayes.py +++ b/scikits/learn/naive_bayes/sparse/naive_bayes.py @@ -4,9 +4,18 @@ # Author: Amit Aides # # License: BSD Style. -import numpy as np from ...base import BaseEstimator, ClassifierMixin +from ...utils import safe_asanyarray +import numpy as np +from scipy.sparse import issparse + + +def samplearray(X): + if issparse(X): + return X.tocsr() + else: + return np.asanyarray(X) class MultinomialNB(BaseEstimator, ClassifierMixin): @@ -15,19 +24,13 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): The Multinomial Naive Bayes classifier is suitable for text classification. - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples in the number of samples and - n_features is the number of features. - y : array, shape = [n_samples] - Target vector relative to X - Parameters ---------- alpha: float, optional (default=1.0) Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + use_prior: boolean + Whether to use label prior probabilities or not. Methods ------- @@ -38,10 +41,10 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Predict using the model. predict_proba(X) : array - Predict the probability of each class using the model. + Predict the probability of each label using the model. predict_log_proba(X) : array - Predict the log probability of each class using the model. + Predict the log probability of each label using the model. Examples -------- @@ -54,13 +57,13 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): MultinomialNB(alpha=1.0) >>> print clf.predict(X[2]) 3 - """ - def __init__(self, alpha=1.0): + def __init__(self, alpha=1.0, use_prior=True): self.alpha = alpha + self.use_prior = use_prior - def fit(self, X, y): + def fit(self, X, y, theta=None): """Fit Multinomial Naive Bayes according to X, y Parameters @@ -78,22 +81,28 @@ def fit(self, X, y): self : object Returns self. """ + X = samplearray(X) + y = safe_asanyarray(y) + + compute_priors = theta is None # - # N_c is the count of all words in all documents of class c. - # N_c_i is the a count of word i in all documents of class c. - # theta_c is the prior empirical probability of a document of class c. + # N_c is the count of all words in all documents of label c. + # N_c_i is the a count of word i in all documents of label c. + # theta[c] is the prior empirical probability of a document of label c. # theta_c_i is the (smoothened) empirical likelihood of word i - # given a document of class c. + # given a document of label c. 
# N_c_i_temp = [] - theta_c = [] + if compute_priors: + theta = [] self.unique_y = np.unique(y) for yi in self.unique_y: row_ind = np.nonzero(y == yi)[0] N_c_i_temp.append(np.array(X[row_ind, :].sum(axis=0)).ravel()) - theta_c.append(np.float(np.sum(y == yi)) / y.size) + if compute_priors: + theta.append(np.float(np.sum(y == yi)) / y.size) N_c_i = np.array(N_c_i_temp) N_c = np.sum(N_c_i, axis=1) @@ -108,7 +117,7 @@ def fit(self, X, y): # Estimate the parameters of the distribution # self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) - self.theta_c = np.array(theta_c) + self.theta = np.array(theta) return self @@ -135,9 +144,11 @@ def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in range(self.unique_y.size): - jointi = np.log(self.theta_c[i]) n_ij = np.log(self.theta_c_i[i]) * X.T - joint_log_likelihood.append(jointi + n_ij) + if self.use_prior: + jointi = np.log(self.theta[i]) + n_ij += jointi + joint_log_likelihood.append(n_ij) joint_log_likelihood = np.array(joint_log_likelihood) @@ -160,9 +171,9 @@ def predict_proba(self, X): Returns ------- - C : array-like, shape = [n_samples, n_classes] - Returns the probability of the sample for each class in - the model, where classes are ordered by arithmetical + C : array-like, shape = [n_samples, n_labels] + Returns the probability of the sample for each label in + the model, where labels are ordered by arithmetical order. """ @@ -190,9 +201,9 @@ def predict_log_proba(self, X): Returns ------- - C : array-like, shape = [n_samples, n_classes] - Returns the log-probability of the sample for each class - in the model, where classes are ordered by arithmetical + C : array-like, shape = [n_samples, n_labels] + Returns the log-probability of the sample for each label + in the model, where labels are ordered by arithmetical order. """ From 2cc1837430590ad4ab32e3ad837f67d65d020b6c Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Fri, 20 May 2011 17:57:26 +0200 Subject: [PATCH 11/31] naive bayes: minor fixes --- scikits/learn/naive_bayes/naive_bayes.py | 7 +++---- scikits/learn/naive_bayes/sparse/naive_bayes.py | 6 ++++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/scikits/learn/naive_bayes/naive_bayes.py b/scikits/learn/naive_bayes/naive_bayes.py index 834f38427d8f1..014f6775e396c 100644 --- a/scikits/learn/naive_bayes/naive_bayes.py +++ b/scikits/learn/naive_bayes/naive_bayes.py @@ -8,9 +8,9 @@ # Yehuda Finkelstein # # License: BSD Style. -import numpy as np from ..base import BaseEstimator, ClassifierMixin +import numpy as np class GNB(BaseEstimator, ClassifierMixin): @@ -37,7 +37,6 @@ class GNB(BaseEstimator, ClassifierMixin): sigma : array, shape [n_classes * n_features] variance of each feature for the different class - Methods ------- fit(X, y) : self @@ -216,7 +215,7 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): >>> from scikits.learn.naive_bayes import MultinomialNB >>> clf = MultinomialNB() >>> clf.fit(X, Y) - MultinomialNB(alpha_i=1.0) + MultinomialNB(alpha=1.0) >>> print clf.predict(X[2]) 3 """ @@ -242,7 +241,7 @@ def fit(self, X, y, theta=None): Returns ------- - self : object + self : object Returns self. 
""" compute_priors = theta is None diff --git a/scikits/learn/naive_bayes/sparse/naive_bayes.py b/scikits/learn/naive_bayes/sparse/naive_bayes.py index 7687b01b0bf53..352e6c34d0dfb 100644 --- a/scikits/learn/naive_bayes/sparse/naive_bayes.py +++ b/scikits/learn/naive_bayes/sparse/naive_bayes.py @@ -49,7 +49,7 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Examples -------- >>> import numpy as np - >>> X = np.random.randint( 5, size=(6, 100) ) + >>> X = np.random.randint(5, size=(6, 100)) >>> Y = np.array([1, 2, 3, 4, 5, 6]) >>> from scikits.learn.naive_bayes import MultinomialNB >>> clf = MultinomialNB() @@ -75,6 +75,8 @@ def fit(self, X, y, theta=None): y : array-like, shape = [n_samples] Target values. + theta : array, shape [n_labels * n_features] + Prior probability per label. Returns ------- @@ -90,7 +92,7 @@ def fit(self, X, y, theta=None): # N_c is the count of all words in all documents of label c. # N_c_i is the a count of word i in all documents of label c. # theta[c] is the prior empirical probability of a document of label c. - # theta_c_i is the (smoothened) empirical likelihood of word i + # theta_c_i is the (smoothed) empirical likelihood of word i # given a document of label c. # N_c_i_temp = [] From 2a302fdcb65178f53037d05b3b0ad982fe4abda3 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Fri, 20 May 2011 18:08:19 +0200 Subject: [PATCH 12/31] Merge sparse and vanilla naive Bayes --- .../learn/{naive_bayes => }/naive_bayes.py | 40 +++- scikits/learn/naive_bayes/__init__.py | 15 -- scikits/learn/naive_bayes/sparse/__init__.py | 10 - .../learn/naive_bayes/sparse/naive_bayes.py | 224 ------------------ scikits/learn/naive_bayes/tests/__init__.py | 0 .../tests/test_naive_bayes.py | 26 +- 6 files changed, 49 insertions(+), 266 deletions(-) rename scikits/learn/{naive_bayes => }/naive_bayes.py (90%) delete mode 100644 scikits/learn/naive_bayes/__init__.py delete mode 100644 scikits/learn/naive_bayes/sparse/__init__.py delete mode 100644 scikits/learn/naive_bayes/sparse/naive_bayes.py delete mode 100644 scikits/learn/naive_bayes/tests/__init__.py rename scikits/learn/{naive_bayes => }/tests/test_naive_bayes.py (81%) diff --git a/scikits/learn/naive_bayes/naive_bayes.py b/scikits/learn/naive_bayes.py similarity index 90% rename from scikits/learn/naive_bayes/naive_bayes.py rename to scikits/learn/naive_bayes.py index 014f6775e396c..1f08afae6ef5d 100644 --- a/scikits/learn/naive_bayes/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -1,4 +1,12 @@ -""" Naive Bayes classifiers. +""" +Naive Bayes models +================== + +Naive Bayes algorithms are a set of supervised learning methods based on +applying Bayes' theorem with strong (naive) independence assumptions. + +See http://scikit-learn.sourceforge.net/modules/naive_bayes.html for +complete documentation. """ # Author: Vincent Michel @@ -9,8 +17,10 @@ # # License: BSD Style. -from ..base import BaseEstimator, ClassifierMixin +from .base import BaseEstimator, ClassifierMixin +from .utils import safe_asanyarray import numpy as np +from scipy.sparse import issparse class GNB(BaseEstimator, ClassifierMixin): @@ -179,9 +189,16 @@ def predict_log_proba(self, X): return log_proba +def asanyarray_or_csr(X): + if issparse(X): + return X.tocsr(), True + else: + return np.asanyarray(X), False + + class MultinomialNB(BaseEstimator, ClassifierMixin): """ - Multinomial Naive Bayes (MultinomialNB) + Naive Bayes classifier for multinomial models The Multinomial Naive Bayes classifier is suitable for text classification. 
@@ -244,6 +261,9 @@ def fit(self, X, y, theta=None): self : object Returns self. """ + X, self.sparse = asanyarray_or_csr(X) + y = safe_asanyarray(y) + compute_priors = theta is None # @@ -259,7 +279,11 @@ def fit(self, X, y, theta=None): self.unique_y = np.unique(y) for yi in self.unique_y: - N_c_i_temp.append(np.sum(X[y == yi, :], 0)) + if self.sparse: + row_ind = np.nonzero(y == yi)[0] + N_c_i_temp.append(np.array(X[row_ind, :].sum(axis=0)).ravel()) + else: + N_c_i_temp.append(np.sum(X[y == yi, :], 0)) if compute_priors: theta.append(np.float(np.sum(y == yi)) / y.size) @@ -292,6 +316,7 @@ def predict(self, X): ------- C : array, shape = [n_samples] """ + X, _ = asanyarray_or_csr(X) joint_log_likelihood = self._joint_log_likelihood(X) y_pred = self.unique_y[np.argmax(joint_log_likelihood, axis=0)] @@ -303,7 +328,10 @@ def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in range(self.unique_y.size): - n_ij = np.dot(np.log(self.theta_c_i[i]), X.T) + if self.sparse: + n_ij = np.log(self.theta_c_i[i]) * X.T + else: + n_ij = np.dot(np.log(self.theta_c_i[i]), X.T) if self.use_prior: jointi = np.log(self.theta[i]) n_ij += jointi @@ -335,6 +363,7 @@ def predict_proba(self, X): the model, where labels are ordered by arithmetical order. """ + X, _ = asanyarray_or_csr(X) joint_log_likelihood = self._joint_log_likelihood(X) @@ -365,6 +394,7 @@ def predict_log_proba(self, X): in the model, where labels are ordered by arithmetical order. """ + X, _ = asanyarray_or_csr(X) joint_log_likelihood = self._joint_log_likelihood(X) diff --git a/scikits/learn/naive_bayes/__init__.py b/scikits/learn/naive_bayes/__init__.py deleted file mode 100644 index f89f5b43e1fff..0000000000000 --- a/scikits/learn/naive_bayes/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -Naive Bayes models -================== - -Naive Bayes algorithms are a set of supervised learning methods based on -applying Bayes' theorem with strong (naive) independence assumptions. - -See http://scikit-learn.sourceforge.net/modules/naive_bayes.html for -complete documentation. -""" - -from .naive_bayes import GNB, MultinomialNB - -from . import sparse - diff --git a/scikits/learn/naive_bayes/sparse/__init__.py b/scikits/learn/naive_bayes/sparse/__init__.py deleted file mode 100644 index ab82d37b90301..0000000000000 --- a/scikits/learn/naive_bayes/sparse/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Naive Bayes models with sparse data -=================================== - -scikits.learn.naive_bayes.sparse implements the sparse counterpart -of scikits.learn.naive_bayes.MultinomialNB - -""" - -from .naive_bayes import MultinomialNB diff --git a/scikits/learn/naive_bayes/sparse/naive_bayes.py b/scikits/learn/naive_bayes/sparse/naive_bayes.py deleted file mode 100644 index 352e6c34d0dfb..0000000000000 --- a/scikits/learn/naive_bayes/sparse/naive_bayes.py +++ /dev/null @@ -1,224 +0,0 @@ -""" Naive Bayes classifiers for sparse data. -""" - -# Author: Amit Aides -# -# License: BSD Style. - -from ...base import BaseEstimator, ClassifierMixin -from ...utils import safe_asanyarray -import numpy as np -from scipy.sparse import issparse - - -def samplearray(X): - if issparse(X): - return X.tocsr() - else: - return np.asanyarray(X) - - -class MultinomialNB(BaseEstimator, ClassifierMixin): - """ - Multinomial Naive Bayes for sparse matrices - - The Multinomial Naive Bayes classifier is suitable for text classification. 
- - Parameters - ---------- - alpha: float, optional (default=1.0) - Additive (Laplace/Lidstone) smoothing parameter - (0 for no smoothing). - use_prior: boolean - Whether to use label prior probabilities or not. - - Methods - ------- - fit(X, y) : self - Fit the model - - predict(X) : array - Predict using the model. - - predict_proba(X) : array - Predict the probability of each label using the model. - - predict_log_proba(X) : array - Predict the log probability of each label using the model. - - Examples - -------- - >>> import numpy as np - >>> X = np.random.randint(5, size=(6, 100)) - >>> Y = np.array([1, 2, 3, 4, 5, 6]) - >>> from scikits.learn.naive_bayes import MultinomialNB - >>> clf = MultinomialNB() - >>> clf.fit(X, Y) - MultinomialNB(alpha=1.0) - >>> print clf.predict(X[2]) - 3 - """ - - def __init__(self, alpha=1.0, use_prior=True): - self.alpha = alpha - self.use_prior = use_prior - - def fit(self, X, y, theta=None): - """Fit Multinomial Naive Bayes according to X, y - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples - and n_features is the number of features. - - y : array-like, shape = [n_samples] - Target values. - - theta : array, shape [n_labels * n_features] - Prior probability per label. - - Returns - ------- - self : object - Returns self. - """ - X = samplearray(X) - y = safe_asanyarray(y) - - compute_priors = theta is None - - # - # N_c is the count of all words in all documents of label c. - # N_c_i is the a count of word i in all documents of label c. - # theta[c] is the prior empirical probability of a document of label c. - # theta_c_i is the (smoothed) empirical likelihood of word i - # given a document of label c. - # - N_c_i_temp = [] - if compute_priors: - theta = [] - self.unique_y = np.unique(y) - - for yi in self.unique_y: - row_ind = np.nonzero(y == yi)[0] - N_c_i_temp.append(np.array(X[row_ind, :].sum(axis=0)).ravel()) - if compute_priors: - theta.append(np.float(np.sum(y == yi)) / y.size) - - N_c_i = np.array(N_c_i_temp) - N_c = np.sum(N_c_i, axis=1) - - # - # Smoothing coefficients - # - alpha_i = self.alpha - alpha = alpha_i * X.shape[1] - - # - # Estimate the parameters of the distribution - # - self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) - self.theta = np.array(theta) - - return self - - def predict(self, X): - """ - Perform classification on an array of test vectors X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - C : array, shape = [n_samples] - """ - - joint_log_likelihood = self._joint_log_likelihood(X) - y_pred = self.unique_y[np.argmax(joint_log_likelihood, axis=0)] - - return y_pred - - def _joint_log_likelihood(self, X): - """Calculate the posterior log probability of the samples X""" - - joint_log_likelihood = [] - for i in range(self.unique_y.size): - n_ij = np.log(self.theta_c_i[i]) * X.T - if self.use_prior: - jointi = np.log(self.theta[i]) - n_ij += jointi - joint_log_likelihood.append(n_ij) - - joint_log_likelihood = np.array(joint_log_likelihood) - - return joint_log_likelihood - - def _mininf(self, X, axis=None): - """Calculate the minimum of a matrix ignoring -inf values""" - - A = X.copy() - A[np.isinf(X)] = np.inf - return np.min(X, axis=axis) - - def predict_proba(self, X): - """ - Return probability estimates for the test vector X. 
- - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - C : array-like, shape = [n_samples, n_labels] - Returns the probability of the sample for each label in - the model, where labels are ordered by arithmetical - order. - """ - - joint_log_likelihood = self._joint_log_likelihood(X) - - # - # The _joint_log_likelihood has very low values that create underflow - # in the computation of the exponent. Therefore I 'fix' it by adding - # a minimal value. - # - fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] - loga_fix = joint_log_likelihood - fix - proba_fix = np.exp(loga_fix) - proba = proba_fix / np.sum(proba_fix, 1)[:, np.newaxis] - - return proba - - def predict_log_proba(self, X): - """ - Return log-probability estimates for the test vector X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - - Returns - ------- - C : array-like, shape = [n_samples, n_labels] - Returns the log-probability of the sample for each label - in the model, where labels are ordered by arithmetical - order. - """ - - joint_log_likelihood = self._joint_log_likelihood(X) - - # - # The _joint_log_likelihood has very low values that create underflow - # in the computation of the exponent. Therefore I 'fix' it by adding - # a minimal value. - # - fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] - loga_fix = joint_log_likelihood - fix - proba_fix = np.exp(loga_fix) - log_proba = loga_fix - np.log(np.sum(proba_fix, axis=1))[:, np.newaxis] - - return log_proba diff --git a/scikits/learn/naive_bayes/tests/__init__.py b/scikits/learn/naive_bayes/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/scikits/learn/naive_bayes/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py similarity index 81% rename from scikits/learn/naive_bayes/tests/test_naive_bayes.py rename to scikits/learn/tests/test_naive_bayes.py index 16b037c5634bd..2a60abbf70ee0 100644 --- a/scikits/learn/naive_bayes/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -3,11 +3,11 @@ from numpy.testing import assert_array_equal, assert_array_almost_equal from .. import naive_bayes -from ..sparse import naive_bayes as naive_bayes_sparse # Data is just 6 separable points in the plane -X = np.array( [[-2,-1], [-1, -1], [-1, -2], [1,1], [1,2], [2, 1]]) -y = np.array( [1, 1, 1, 2, 2, 2]) +X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) +y = np.array([1, 1, 1, 2, 2, 2]) + def test_gnb(): """ @@ -17,18 +17,20 @@ def test_gnb(): correct values for a simple toy dataset. """ - clf = naive_bayes.GNB() + clf = naive_bayes.GNB() y_pred = clf.fit(X, y).predict(X) assert_array_equal(y_pred, y) y_pred_proba = clf.predict_proba(X) y_pred_log_proba = clf.predict_log_proba(X) assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) - + + # Data is 6 random points in an 100 dimensional space classified to # three classes. -X2 = np.random.randint( 5, size=(6, 100) ) -y2 = np.array( [1, 1, 2, 2, 3, 3] ) +X2 = np.random.randint(5, size=(6, 100)) +y2 = np.array([1, 1, 2, 2, 3, 3]) + def test_mnnb(): """ @@ -41,11 +43,11 @@ def test_mnnb(): # # Check the ability to predict the learning set. 
# - clf = naive_bayes.MultinomialNB() + clf = naive_bayes.MultinomialNB() y_pred = clf.fit(X2, y2).predict(X2) assert_array_equal(y_pred, y2) - + # # Verify that np.log(clf.predict_proba(X)) gives the same results as # clf.predict_log_proba(X) @@ -59,7 +61,7 @@ def test_sparse_mnnb(): """ Multinomial Naive Bayes classification for sparse data. - This checks that sparse MultinomialNB implements fit and predict and returns + This checks that MultinomialNB implements fit and predict and returns correct values for a simple toy dataset. """ @@ -68,11 +70,11 @@ def test_sparse_mnnb(): # # Check the ability to predict the learning set. # - clf = naive_bayes_sparse.MultinomialNB() + clf = naive_bayes.MultinomialNB() y_pred = clf.fit(X2S, y2).predict(X2S) assert_array_equal(y_pred, y2) - + # # Verify that np.log(clf.predict_proba(X)) gives the same results as # clf.predict_log_proba(X) From 80da99a483b774f0ad08afb19883bc658a20d9be Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Fri, 20 May 2011 19:17:44 +0200 Subject: [PATCH 13/31] docs + cosmit in naive_bayes --- scikits/learn/naive_bayes.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 1f08afae6ef5d..a03207cd2ad8a 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -132,7 +132,7 @@ def predict(self, X): def _joint_log_likelihood(self, X): joint_log_likelihood = [] - for i in range(np.size(self.unique_y)): + for i in xrange(np.size(self.unique_y)): jointi = np.log(self.proba_y[i]) n_ij = - 0.5 * np.sum(np.log(np.pi * self.sigma[i, :])) n_ij -= 0.5 * np.sum(((X - self.theta[i, :]) ** 2) / \ @@ -200,7 +200,9 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): """ Naive Bayes classifier for multinomial models - The Multinomial Naive Bayes classifier is suitable for text classification. + The multinomial Naive Bayes classifier is suitable for text classification. + This class is designed to handle both dense and sparse data; it will enter + "sparse mode" if its training matrix (X) is a sparse matrix. Parameters ---------- @@ -266,7 +268,6 @@ def fit(self, X, y, theta=None): compute_priors = theta is None - # # N_c is the count of all words in all documents of label c. # N_c_i is the a count of word i in all documents of label c. # theta[c] is the prior empirical probability of a document of label c. @@ -290,13 +291,11 @@ def fit(self, X, y, theta=None): N_c_i = np.array(N_c_i_temp) N_c = np.sum(N_c_i, axis=1) - # # Smoothing coefficients # alpha_i = self.alpha alpha = alpha_i * X.shape[1] - # # Estimate the parameters of the distribution # self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) @@ -327,7 +326,7 @@ def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" joint_log_likelihood = [] - for i in range(self.unique_y.size): + for i in xrange(self.unique_y.size): if self.sparse: n_ij = np.log(self.theta_c_i[i]) * X.T else: @@ -337,9 +336,7 @@ def _joint_log_likelihood(self, X): n_ij += jointi joint_log_likelihood.append(n_ij) - joint_log_likelihood = np.array(joint_log_likelihood) - - return joint_log_likelihood + return np.array(joint_log_likelihood) def _mininf(self, X, axis=None): """Calculate the minimum of a matrix ignoring -inf values""" @@ -367,7 +364,6 @@ def predict_proba(self, X): joint_log_likelihood = self._joint_log_likelihood(X) - # # The _joint_log_likelihood has very low values that create underflow # in the computation of the exponent. 
Therefore I 'fix' it by adding # a minimal value. @@ -398,7 +394,6 @@ def predict_log_proba(self, X): joint_log_likelihood = self._joint_log_likelihood(X) - # # The _joint_log_likelihood has very low values that create underflow # in the computation of the exponent. Therefore I 'fix' it by adding # a minimal value. From bbbb022a824977b00a4d017cbdb8e96d6327c1fb Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Sun, 22 May 2011 11:03:46 +0200 Subject: [PATCH 14/31] naive bayes: handle 1-d input --- scikits/learn/naive_bayes.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index a03207cd2ad8a..44f917097ed86 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -195,6 +195,12 @@ def asanyarray_or_csr(X): else: return np.asanyarray(X), False +def atleast2d_or_csr(X): + if issparse(X): + return X.tocsr() + else: + return np.atleast_2d(X) + class MultinomialNB(BaseEstimator, ClassifierMixin): """ @@ -315,7 +321,7 @@ def predict(self, X): ------- C : array, shape = [n_samples] """ - X, _ = asanyarray_or_csr(X) + X = atleast2d_or_csr(X) joint_log_likelihood = self._joint_log_likelihood(X) y_pred = self.unique_y[np.argmax(joint_log_likelihood, axis=0)] @@ -360,7 +366,7 @@ def predict_proba(self, X): the model, where labels are ordered by arithmetical order. """ - X, _ = asanyarray_or_csr(X) + X = atleast2d_or_csr(X) joint_log_likelihood = self._joint_log_likelihood(X) @@ -390,7 +396,7 @@ def predict_log_proba(self, X): in the model, where labels are ordered by arithmetical order. """ - X, _ = asanyarray_or_csr(X) + X = atleast2d_or_csr(X) joint_log_likelihood = self._joint_log_likelihood(X) From e89a0240de1b55784aa265309781328f63eac4b3 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Sun, 22 May 2011 18:30:48 +0200 Subject: [PATCH 15/31] naive bayes: fix predict_proba bug and change priors behavior * predict_proba now gives true probabilities (by normalizing by the evidence) * if use_priors is False, use uniform priors instead of skipping computation --- scikits/learn/naive_bayes.py | 77 +++++++++---------------- scikits/learn/tests/test_naive_bayes.py | 9 +++ 2 files changed, 37 insertions(+), 49 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 44f917097ed86..3ee7d97931afa 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -3,7 +3,7 @@ ================== Naive Bayes algorithms are a set of supervised learning methods based on -applying Bayes' theorem with strong (naive) independence assumptions. +applying Bayes' theorem with strong (naive) independence assumptions. See http://scikit-learn.sourceforge.net/modules/naive_bayes.html for complete documentation. @@ -195,6 +195,7 @@ def asanyarray_or_csr(X): else: return np.asanyarray(X), False + def atleast2d_or_csr(X): if issparse(X): return X.tocsr() @@ -272,7 +273,16 @@ def fit(self, X, y, theta=None): X, self.sparse = asanyarray_or_csr(X) y = safe_asanyarray(y) - compute_priors = theta is None + self.unique_y = np.unique(y) + n_labels = self.unique_y.size + + self.theta = None + if not self.use_prior: + self.theta = np.ones(n_labels) / n_labels + if theta: + assert len(theta) == n_labels, \ + 'Number of priors must match number of labels' + self.theta = np.array(theta) # N_c is the count of all words in all documents of label c. # N_c_i is the a count of word i in all documents of label c. 
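
The first bullet in this patch's message, "normalizing by the evidence", is essentially
a log-sum-exp over the joint log likelihoods, done in log space so that the very
negative values never have to be exponentiated on their own. A minimal standalone
sketch of that computation (illustrative values, not code from the patch):

    import numpy as np

    # joint log likelihoods of one sample under three classes
    jll = np.array([-1000.0, -1002.0, -1005.0])

    # exponentiating directly underflows: np.exp(jll) == [0., 0., 0.]
    # subtracting the evidence log P(x) = logsumexp(jll) keeps things finite
    log_prob = jll - np.logaddexp.reduce(jll)
    prob = np.exp(log_prob)   # roughly [0.876, 0.118, 0.006], sums to 1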
@@ -281,9 +291,8 @@ def fit(self, X, y, theta=None): # given a document of label c. # N_c_i_temp = [] - if compute_priors: + if self.theta is None: theta = [] - self.unique_y = np.unique(y) for yi in self.unique_y: if self.sparse: @@ -291,7 +300,7 @@ def fit(self, X, y, theta=None): N_c_i_temp.append(np.array(X[row_ind, :].sum(axis=0)).ravel()) else: N_c_i_temp.append(np.sum(X[y == yi, :], 0)) - if compute_priors: + if self.theta is None: theta.append(np.float(np.sum(y == yi)) / y.size) N_c_i = np.array(N_c_i_temp) @@ -304,8 +313,10 @@ def fit(self, X, y, theta=None): # Estimate the parameters of the distribution # - self.theta_c_i = (N_c_i + alpha_i) / (N_c.reshape(-1, 1) + alpha) - self.theta = np.array(theta) + self.theta_c_i = (np.log(N_c_i + alpha_i) + - np.log(N_c.reshape(-1, 1) + alpha)) + if self.theta is None: + self.theta = np.array(theta) return self @@ -321,8 +332,6 @@ def predict(self, X): ------- C : array, shape = [n_samples] """ - X = atleast2d_or_csr(X) - joint_log_likelihood = self._joint_log_likelihood(X) y_pred = self.unique_y[np.argmax(joint_log_likelihood, axis=0)] @@ -331,26 +340,19 @@ def predict(self, X): def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" + X = atleast2d_or_csr(X) + joint_log_likelihood = [] for i in xrange(self.unique_y.size): if self.sparse: - n_ij = np.log(self.theta_c_i[i]) * X.T + n_ij = self.theta_c_i[i] * X.T else: - n_ij = np.dot(np.log(self.theta_c_i[i]), X.T) - if self.use_prior: - jointi = np.log(self.theta[i]) - n_ij += jointi + n_ij = np.dot(self.theta_c_i[i], X.T) + n_ij += np.log(self.theta[i]) joint_log_likelihood.append(n_ij) return np.array(joint_log_likelihood) - def _mininf(self, X, axis=None): - """Calculate the minimum of a matrix ignoring -inf values""" - - A = X.copy() - A[np.isinf(X)] = np.inf - return np.min(X, axis=axis) - def predict_proba(self, X): """ Return probability estimates for the test vector X. @@ -366,20 +368,7 @@ def predict_proba(self, X): the model, where labels are ordered by arithmetical order. """ - X = atleast2d_or_csr(X) - - joint_log_likelihood = self._joint_log_likelihood(X) - - # The _joint_log_likelihood has very low values that create underflow - # in the computation of the exponent. Therefore I 'fix' it by adding - # a minimal value. - # - fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] - loga_fix = joint_log_likelihood - fix - proba_fix = np.exp(loga_fix) - proba = proba_fix / np.sum(proba_fix, 1)[:, np.newaxis] - - return proba + return np.exp(self.predict_log_proba(X)) def predict_log_proba(self, X): """ @@ -396,17 +385,7 @@ def predict_log_proba(self, X): in the model, where labels are ordered by arithmetical order. """ - X = atleast2d_or_csr(X) - - joint_log_likelihood = self._joint_log_likelihood(X) - - # The _joint_log_likelihood has very low values that create underflow - # in the computation of the exponent. Therefore I 'fix' it by adding - # a minimal value. 
- # - fix = self._mininf(joint_log_likelihood, axis=1)[:, np.newaxis] - loga_fix = joint_log_likelihood - fix - proba_fix = np.exp(loga_fix) - log_proba = loga_fix - np.log(np.sum(proba_fix, axis=1))[:, np.newaxis] - - return log_proba + jll = self._joint_log_likelihood(X) + # normalize by P(x) = P(f_1, ..., f_n) + normalize = np.logaddexp.reduce(jll[:, np.newaxis]) + return jll - normalize diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 2a60abbf70ee0..93ec8a296e1c8 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -82,3 +82,12 @@ def test_sparse_mnnb(): y_pred_proba = clf.predict_proba(X2S) y_pred_log_proba = clf.predict_log_proba(X2S) assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) + + +def test_mnnb_predict_proba(): + '''Test multinomial NB's probability scores''' + + clf = naive_bayes.MultinomialNB().fit([[0,1], [0,1], [1,0]], [0,0,1]) + assert clf.predict([0,1]) == 0 + assert np.sum(clf.predict_proba([0,1])) == 1 + assert np.sum(clf.predict_proba([1,0])) == 1 From 6f4021cef9e096538102422885dc3e353ac47ac3 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Sun, 22 May 2011 19:23:24 +0200 Subject: [PATCH 16/31] fix naive bayes docs and example + credit mblondel + vanity --- doc/modules/naive_bayes.rst | 8 ++++---- examples/mlcomp_sparse_document_classification.py | 7 ++++--- scikits/learn/naive_bayes.py | 8 +++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index a4851262310a8..65d775d0b6a55 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -6,7 +6,7 @@ Naive Bayes **Naive Bayes** algorithms are a set of supervised learning methods -based on applying Baye's theorem with strong (naive) independence +based on applying Bayes' theorem with strong (naive) independence assumptions. Given a class variable :math:`c` and a dependent set of feature variables :math:`f_1` through :math:`f_n`, Bayes' theorem states the following relationship: @@ -27,10 +27,10 @@ Using the naive assumption this relationship is simplified: where we used the Maximum a Posteriori estimator. -The differnt Naive Bayes classifiers differ by the assumption on the -distrubtion of :math:`p(f_i \mid c)`: +The different naive Bayes classifiers differ by the assumption on the +distribution of :math:`p(f_i \mid c)`: -The advantage of Naive Bayes approaches are: +The advantages of naive Bayes approaches are: - It requires a small amount of training data to estimate the parameters necessary for classification. diff --git a/examples/mlcomp_sparse_document_classification.py b/examples/mlcomp_sparse_document_classification.py index 5f8f2444543e4..6080e0ba88308 100644 --- a/examples/mlcomp_sparse_document_classification.py +++ b/examples/mlcomp_sparse_document_classification.py @@ -12,7 +12,8 @@ http://mlcomp.org/datasets/379 -Once downloaded unzip the arhive somewhere on your filesystem. For instance in:: +Once downloaded unzip the archive somewhere on your filesystem. 
+For instance in:: % mkdir -p ~/data/mlcomp % cd ~/data/mlcomp @@ -49,7 +50,7 @@ from scikits.learn.linear_model.sparse import SGDClassifier from scikits.learn.metrics import confusion_matrix from scikits.learn.metrics import classification_report -from scikits.learn.naive_bayes.sparse import MultinomialNB +from scikits.learn.naive_bayes import MultinomialNB if 'MLCOMP_DATASETS_HOME' not in os.environ: @@ -133,7 +134,7 @@ def benchmark(clf_class, params, name): print "Testbenching a MultinomialNB classifier..." parameters = { - 'alpha_i': 0.01 + 'alpha': 0.01 } benchmark(MultinomialNB, parameters, 'MultinomialNB') diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 3ee7d97931afa..6aae407cfd2d8 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -12,8 +12,10 @@ # Author: Vincent Michel # Minor fixes by Fabian Pedregosa # MultinomialNB classifier by: -# Amit Aides & +# Amit Aides # Yehuda Finkelstein +# Lars Buitinck +# (parts based on earlier work by Mathieu Blondel) # # License: BSD Style. @@ -241,9 +243,9 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): >>> from scikits.learn.naive_bayes import MultinomialNB >>> clf = MultinomialNB() >>> clf.fit(X, Y) - MultinomialNB(alpha=1.0) + MultinomialNB(alpha=1.0, use_prior=True) >>> print clf.predict(X[2]) - 3 + [3] """ def __init__(self, alpha=1.0, use_prior=True): From 0c55ac8328ffbffc133dfca10590fbbcb2b10971 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 23 May 2011 12:00:18 +0200 Subject: [PATCH 17/31] naive bayes: test pickling --- scikits/learn/tests/test_naive_bayes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 93ec8a296e1c8..33ba8322a8338 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -1,3 +1,5 @@ +import cPickle as pickle +from cStringIO import StringIO import numpy as np import scipy.sparse from numpy.testing import assert_array_equal, assert_array_almost_equal @@ -84,6 +86,19 @@ def test_sparse_mnnb(): assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) +def test_mnnb_pickle(): + '''Test picklability of multinomial NB''' + + clf = naive_bayes.MultinomialNB(alpha=2, use_prior=False).fit(X, y) + y_pred = clf.predict(X) + + store = StringIO() + pickle.dump(clf, store) + clf = pickle.load(StringIO(store.getvalue())) + + assert_array_equal(y_pred, clf.predict(X)) + + def test_mnnb_predict_proba(): '''Test multinomial NB's probability scores''' From 4e928ae10627ab6d2be472901d11b89c3ff3ff2c Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 23 May 2011 14:33:00 +0200 Subject: [PATCH 18/31] naive bayes: safe_sparse_dot, doc and docstring updates --- doc/modules/naive_bayes.rst | 28 ++++++++++++---------------- scikits/learn/naive_bayes.py | 30 +++++++++++++----------------- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 65d775d0b6a55..34610d5369c56 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -6,10 +6,10 @@ Naive Bayes **Naive Bayes** algorithms are a set of supervised learning methods -based on applying Bayes' theorem with strong (naive) independence -assumptions. 
Given a class variable :math:`c` and a dependent set -of feature variables :math:`f_1` through :math:`f_n`, Bayes' theorem -states the following relationship: +based on applying Bayes' theorem with the "naive" assumption of independence +between every pair of features. Given a class variable :math:`c` and a +dependent set of feature variables :math:`f_1` through :math:`f_n`, Bayes' +theorem states the following relationship: .. math:: @@ -30,19 +30,15 @@ where we used the Maximum a Posteriori estimator. The different naive Bayes classifiers differ by the assumption on the distribution of :math:`p(f_i \mid c)`: -The advantages of naive Bayes approaches are: +In spite of their naive design and apparently over-simplified assumptions, +naive Bayes classifiers have worked quite well in many real-world situations, +famously document classification and spam filtering. They requires a small +amount of training data to estimate the necessary parameters. - - It requires a small amount of training data to estimate the - parameters necessary for classification. - - - In spite of their naive design and apparently over-simplified - assumptions, naive Bayes classifiers have worked quite well in - many complex real-world situations. - - - The decoupling of the class conditional feature distributions - means that each distribution can be independently estimated as a - one dimensional distribution. This in turn helps to alleviate - problems stemming from the curse of dimensionality. +The decoupling of the class conditional feature distributions means that each +distribution can be independently estimated as a one dimensional distribution. +This in turn helps to alleviate problems stemming from the curse of +dimensionality. Gaussian Naive Bayes diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 6aae407cfd2d8..cbe2195769d68 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -21,6 +21,7 @@ from .base import BaseEstimator, ClassifierMixin from .utils import safe_asanyarray +from .utils.extmath import safe_sparse_dot import numpy as np from scipy.sparse import issparse @@ -219,7 +220,7 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). use_prior: boolean - Whether to use label prior probabilities or not. + Whether to learn label prior probabilities or not. Methods ------- @@ -258,13 +259,13 @@ def fit(self, X, y, theta=None): Parameters ---------- X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples - and n_features is the number of features. + Training vectors, where n_samples is the number of samples and + n_features is the number of features. X may be a sparse matrix. y : array-like, shape = [n_samples] Target values. - theta : array, shape [n_labels * n_features] + theta : array, shape [n_classes] Prior probability per label. 
Returns @@ -276,13 +277,13 @@ def fit(self, X, y, theta=None): y = safe_asanyarray(y) self.unique_y = np.unique(y) - n_labels = self.unique_y.size + n_classes = self.unique_y.size self.theta = None if not self.use_prior: - self.theta = np.ones(n_labels) / n_labels + self.theta = np.ones(n_classes) / n_classes if theta: - assert len(theta) == n_labels, \ + assert len(theta) == n_classes, \ 'Number of priors must match number of labels' self.theta = np.array(theta) @@ -344,16 +345,11 @@ def _joint_log_likelihood(self, X): X = atleast2d_or_csr(X) - joint_log_likelihood = [] + jll = safe_sparse_dot(self.theta_c_i, X.T) for i in xrange(self.unique_y.size): - if self.sparse: - n_ij = self.theta_c_i[i] * X.T - else: - n_ij = np.dot(self.theta_c_i[i], X.T) - n_ij += np.log(self.theta[i]) - joint_log_likelihood.append(n_ij) + jll[i] += np.log(self.theta[i]) - return np.array(joint_log_likelihood) + return jll def predict_proba(self, X): """ @@ -365,7 +361,7 @@ def predict_proba(self, X): Returns ------- - C : array-like, shape = [n_samples, n_labels] + C : array-like, shape = [n_samples, n_classes] Returns the probability of the sample for each label in the model, where labels are ordered by arithmetical order. @@ -382,7 +378,7 @@ def predict_log_proba(self, X): Returns ------- - C : array-like, shape = [n_samples, n_labels] + C : array-like, shape = [n_samples, n_classes] Returns the log-probability of the sample for each label in the model, where labels are ordered by arithmetical order. From 943439d4b0b3e66159f264217efe8ee4e30748f9 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 23 May 2011 16:21:44 +0200 Subject: [PATCH 19/31] rename MultinomialNB params, rename GNB GaussianNB --- benchmarks/bench_sgd_covertype.py | 10 +-- doc/modules/classes.rst | 2 +- doc/modules/naive_bayes.rst | 4 +- ...naive_bayes.py => gaussian_naive_bayes.py} | 8 +- scikits/learn/naive_bayes.py | 74 +++++++++---------- scikits/learn/tests/test_naive_bayes.py | 4 +- 6 files changed, 49 insertions(+), 53 deletions(-) rename examples/{naive_bayes.py => gaussian_naive_bayes.py} (77%) diff --git a/benchmarks/bench_sgd_covertype.py b/benchmarks/bench_sgd_covertype.py index e08f11820fdba..e42f0cfdf8459 100644 --- a/benchmarks/bench_sgd_covertype.py +++ b/benchmarks/bench_sgd_covertype.py @@ -17,7 +17,7 @@ Classifier train-time test-time error-rate -------------------------------------------- Liblinear 9.4471s 0.0184s 0.2305 - GNB 2.5426s 0.1725s 0.3633 + GaussianNB 2.5426s 0.1725s 0.3633 SGD 0.2137s 0.0047s 0.2300 @@ -57,7 +57,7 @@ from scikits.learn.svm import LinearSVC from scikits.learn.linear_model import SGDClassifier -from scikits.learn.naive_bayes import GNB +from scikits.learn.naive_bayes import GaussianNB from scikits.learn import metrics ###################################################################### @@ -158,8 +158,8 @@ def benchmark(clf): liblinear_err, liblinear_train_time, liblinear_test_time = liblinear_res ###################################################################### -## Train GNB model -gnb_err, gnb_train_time, gnb_test_time = benchmark(GNB()) +## Train GaussianNB model +gnb_err, gnb_train_time, gnb_test_time = benchmark(GaussianNB()) ###################################################################### ## Train SGD model @@ -189,7 +189,7 @@ def print_row(clf_type, train_time, test_time, err): print("-" * 44) print_row("Liblinear", liblinear_train_time, liblinear_test_time, liblinear_err) -print_row("GNB", gnb_train_time, gnb_test_time, gnb_err) +print_row("GaussianNB", gnb_train_time, 
gnb_test_time, gnb_err) print_row("SGD", sgd_train_time, sgd_test_time, sgd_err) print("") print("") diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 1b471ee983d9d..fcf45fa016dda 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -148,7 +148,7 @@ Naive Bayes :toctree: generated/ :template: class.rst - naive_bayes.GNB + naive_bayes.GaussianNB naive_bayes.MultinomialNB diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 34610d5369c56..389ecf94cbe40 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -44,8 +44,8 @@ dimensionality. Gaussian Naive Bayes -------------------- -:class:`GNB` implements the Gaussian Naive Bayes algorithm for classification. -The likelihood of the features is assumed to be gaussian: +:class:`GaussianNB` implements the Gaussian Naive Bayes algorithm for +classification. The likelihood of the features is assumed to be gaussian: .. math:: diff --git a/examples/naive_bayes.py b/examples/gaussian_naive_bayes.py similarity index 77% rename from examples/naive_bayes.py rename to examples/gaussian_naive_bayes.py index 657d386bee037..7c61ae2c1bb3a 100644 --- a/examples/naive_bayes.py +++ b/examples/gaussian_naive_bayes.py @@ -3,7 +3,7 @@ Gaussian Naive Bayes ============================ -A classification example using Gaussian Naive Bayes (GNB). +A classification example using Gaussian Naive Bayes (GaussianNB). """ @@ -18,9 +18,9 @@ y = iris.target ################################################################################ -# GNB -from scikits.learn.naive_bayes import GNB -gnb = GNB() +# GaussianNB +from scikits.learn.naive_bayes import GaussianNB +gnb = GaussianNB() y_pred = gnb.fit(X, y).predict(X) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index cbe2195769d68..94a0295637eaf 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -26,9 +26,9 @@ from scipy.sparse import issparse -class GNB(BaseEstimator, ClassifierMixin): +class GaussianNB(BaseEstimator, ClassifierMixin): """ - Gaussian Naive Bayes (GNB) + Gaussian Naive Bayes (GaussianNB) Parameters ---------- @@ -41,7 +41,7 @@ class GNB(BaseEstimator, ClassifierMixin): Attributes ---------- - proba_y : array, shape = [n_classes] + class_prior : array, shape = [n_classes] probability of each class. 
theta : array, shape [n_classes * n_features] @@ -70,10 +70,10 @@ class GNB(BaseEstimator, ClassifierMixin): >>> import numpy as np >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) >>> Y = np.array([1, 1, 1, 2, 2, 2]) - >>> from scikits.learn.naive_bayes import GNB - >>> clf = GNB() + >>> from scikits.learn.naive_bayes import GaussianNB + >>> clf = GaussianNB() >>> clf.fit(X, Y) - GNB() + GaussianNB() >>> print clf.predict([[-0.8, -1]]) [1] @@ -105,15 +105,15 @@ def fit(self, X, y): theta = [] sigma = [] - proba_y = [] + class_prior = [] unique_y = np.unique(y) for yi in unique_y: theta.append(np.mean(X[y == yi, :], 0)) sigma.append(np.var(X[y == yi, :], 0)) - proba_y.append(np.float(np.sum(y == yi)) / np.size(y)) + class_prior.append(np.float(np.sum(y == yi)) / np.size(y)) self.theta = np.array(theta) self.sigma = np.array(sigma) - self.proba_y = np.array(proba_y) + self.class_prior = np.array(class_prior) self.unique_y = unique_y return self @@ -136,7 +136,7 @@ def predict(self, X): def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in xrange(np.size(self.unique_y)): - jointi = np.log(self.proba_y[i]) + jointi = np.log(self.class_prior[i]) n_ij = - 0.5 * np.sum(np.log(np.pi * self.sigma[i, :])) n_ij -= 0.5 * np.sum(((X - self.theta[i, :]) ** 2) / \ (self.sigma[i, :]), 1) @@ -253,7 +253,7 @@ def __init__(self, alpha=1.0, use_prior=True): self.alpha = alpha self.use_prior = use_prior - def fit(self, X, y, theta=None): + def fit(self, X, y, class_prior=None): """Fit Multinomial Naive Bayes according to X, y Parameters @@ -265,8 +265,8 @@ def fit(self, X, y, theta=None): y : array-like, shape = [n_samples] Target values. - theta : array, shape [n_classes] - Prior probability per label. + class_prior : array, shape [n_classes] + Prior probability per class. Returns ------- @@ -279,23 +279,23 @@ def fit(self, X, y, theta=None): self.unique_y = np.unique(y) n_classes = self.unique_y.size - self.theta = None + self.class_prior = None if not self.use_prior: - self.theta = np.ones(n_classes) / n_classes - if theta: - assert len(theta) == n_classes, \ + self.class_prior = np.ones(n_classes) / n_classes + if class_prior: + assert len(class_prior) == n_classes, \ 'Number of priors must match number of labels' - self.theta = np.array(theta) + self.class_prior = np.array(class_prior) - # N_c is the count of all words in all documents of label c. - # N_c_i is the a count of word i in all documents of label c. - # theta[c] is the prior empirical probability of a document of label c. - # theta_c_i is the (smoothed) empirical likelihood of word i - # given a document of label c. + # N_c is the count of all words in all documents of class c. + # N_c_i is the a count of word i in all documents of class c. + # class_prior[c] is the prior empirical probability of class c. + # _prob_c_i is the (smoothed) empirical likelihood of feature i + # given class c. 
# N_c_i_temp = [] - if self.theta is None: - theta = [] + if self.class_prior is None: + class_prior = [] for yi in self.unique_y: if self.sparse: @@ -303,23 +303,19 @@ def fit(self, X, y, theta=None): N_c_i_temp.append(np.array(X[row_ind, :].sum(axis=0)).ravel()) else: N_c_i_temp.append(np.sum(X[y == yi, :], 0)) - if self.theta is None: - theta.append(np.float(np.sum(y == yi)) / y.size) + if self.class_prior is None: + class_prior.append(np.float(np.sum(y == yi)) / y.size) N_c_i = np.array(N_c_i_temp) N_c = np.sum(N_c_i, axis=1) - # Smoothing coefficients + # Estimate (and smooth) the parameters of the distribution # - alpha_i = self.alpha - alpha = alpha_i * X.shape[1] - - # Estimate the parameters of the distribution - # - self.theta_c_i = (np.log(N_c_i + alpha_i) - - np.log(N_c.reshape(-1, 1) + alpha)) - if self.theta is None: - self.theta = np.array(theta) + self._prob_c_i = (np.log(N_c_i + self.alpha) + - np.log(N_c.reshape(-1, 1) + + self.alpha * X.shape[1])) + if self.class_prior is None: + self.class_prior = np.array(class_prior) return self @@ -345,9 +341,9 @@ def _joint_log_likelihood(self, X): X = atleast2d_or_csr(X) - jll = safe_sparse_dot(self.theta_c_i, X.T) + jll = safe_sparse_dot(self._prob_c_i, X.T) for i in xrange(self.unique_y.size): - jll[i] += np.log(self.theta[i]) + jll[i] += np.log(self.class_prior[i]) return jll diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 33ba8322a8338..81382a1ee144d 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -15,11 +15,11 @@ def test_gnb(): """ Gaussian Naive Bayes classification. - This checks that GNB implements fit and predict and returns + This checks that GaussianNB implements fit and predict and returns correct values for a simple toy dataset. """ - clf = naive_bayes.GNB() + clf = naive_bayes.GaussianNB() y_pred = clf.fit(X, y).predict(X) assert_array_equal(y_pred, y) From bd8c8febb7de5f264b52310712e58b3a20aa0573 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 23 May 2011 22:31:15 +0200 Subject: [PATCH 20/31] reformulate MultinomialNB as linear classifier --- .../document_classification_20newsgroups.py | 4 +- scikits/learn/naive_bayes.py | 61 +++++++++++-------- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/examples/document_classification_20newsgroups.py b/examples/document_classification_20newsgroups.py index 9a558cd0d75dc..ac2841261123b 100644 --- a/examples/document_classification_20newsgroups.py +++ b/examples/document_classification_20newsgroups.py @@ -45,7 +45,7 @@ from scikits.learn.linear_model import RidgeClassifier from scikits.learn.svm.sparse import LinearSVC from scikits.learn.linear_model.sparse import SGDClassifier -from scikits.learn.naive_bayes.sparse import MultinomialNB +from scikits.learn.naive_bayes import MultinomialNB from scikits.learn import metrics @@ -171,4 +171,4 @@ def benchmark(clf): # Train sparse MultinomialNB print 80 * '=' print "MultinomialNB penalty" -mnnb_results = benchmark(MultinomialNB(alpha_i=.01)) +mnnb_results = benchmark(MultinomialNB(alpha=.01)) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 94a0295637eaf..ae16d455bfe41 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -220,7 +220,7 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). use_prior: boolean - Whether to learn label prior probabilities or not. 
+ Whether to learn class prior probabilities or not. Methods ------- @@ -231,10 +231,18 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Predict using the model. predict_proba(X) : array - Predict the probability of each label using the model. + Predict the probability of each class using the model. predict_log_proba(X) : array - Predict the log probability of each label using the model. + Predict the log probability of each class using the model. + + Attributes + ---------- + intercept_ : array, shape = [n_classes] + Log probability of each class (smoothed). + + coef_ : array, shape = [n_classes, n_features] + Empirical log probability of features given a class, P(x_i|y). Examples -------- @@ -247,6 +255,12 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): MultinomialNB(alpha=1.0, use_prior=True) >>> print clf.predict(X[2]) [3] + + References + ---------- + For the rationale behind the names coef_ and intercept_, i.e. naive Bayes + as a linear classifier, see J. Rennie et al. (2003), Tackling the poor + assumptions of naive Bayes text classifiers, Proc. ICML. """ def __init__(self, alpha=1.0, use_prior=True): @@ -279,22 +293,22 @@ def fit(self, X, y, class_prior=None): self.unique_y = np.unique(y) n_classes = self.unique_y.size - self.class_prior = None + self.intercept_ = None if not self.use_prior: - self.class_prior = np.ones(n_classes) / n_classes + self.intercept_ = np.ones(n_classes) / n_classes if class_prior: assert len(class_prior) == n_classes, \ - 'Number of priors must match number of labels' - self.class_prior = np.array(class_prior) + 'Number of priors must match number of classs' + self.intercept_ = np.array(class_prior) # N_c is the count of all words in all documents of class c. # N_c_i is the a count of word i in all documents of class c. - # class_prior[c] is the prior empirical probability of class c. - # _prob_c_i is the (smoothed) empirical likelihood of feature i + # intercept_[c] is the prior empirical probability of class c. + # coef_ is the (smoothed) empirical likelihood of feature i # given class c. 
# N_c_i_temp = [] - if self.class_prior is None: + if self.intercept_ is None: class_prior = [] for yi in self.unique_y: @@ -303,7 +317,7 @@ def fit(self, X, y, class_prior=None): N_c_i_temp.append(np.array(X[row_ind, :].sum(axis=0)).ravel()) else: N_c_i_temp.append(np.sum(X[y == yi, :], 0)) - if self.class_prior is None: + if self.intercept_ is None: class_prior.append(np.float(np.sum(y == yi)) / y.size) N_c_i = np.array(N_c_i_temp) @@ -311,11 +325,11 @@ def fit(self, X, y, class_prior=None): # Estimate (and smooth) the parameters of the distribution # - self._prob_c_i = (np.log(N_c_i + self.alpha) + self.coef_ = (np.log(N_c_i + self.alpha) - np.log(N_c.reshape(-1, 1) + self.alpha * X.shape[1])) - if self.class_prior is None: - self.class_prior = np.array(class_prior) + if self.intercept_ is None: + self.intercept_ = np.log(class_prior) return self @@ -341,11 +355,8 @@ def _joint_log_likelihood(self, X): X = atleast2d_or_csr(X) - jll = safe_sparse_dot(self._prob_c_i, X.T) - for i in xrange(self.unique_y.size): - jll[i] += np.log(self.class_prior[i]) - - return jll + jll = safe_sparse_dot(self.coef_, X.T) + return jll + np.atleast_2d(self.intercept_).T def predict_proba(self, X): """ @@ -358,8 +369,8 @@ def predict_proba(self, X): Returns ------- C : array-like, shape = [n_samples, n_classes] - Returns the probability of the sample for each label in - the model, where labels are ordered by arithmetical + Returns the probability of the sample for each class in + the model, where classes are ordered by arithmetical order. """ return np.exp(self.predict_log_proba(X)) @@ -375,11 +386,11 @@ def predict_log_proba(self, X): Returns ------- C : array-like, shape = [n_samples, n_classes] - Returns the log-probability of the sample for each label - in the model, where labels are ordered by arithmetical + Returns the log-probability of the sample for each class + in the model, where classes are ordered by arithmetical order. """ jll = self._joint_log_likelihood(X) # normalize by P(x) = P(f_1, ..., f_n) - normalize = np.logaddexp.reduce(jll[:, np.newaxis]) - return jll - normalize + log_prob_x = np.logaddexp.reduce(jll[:, np.newaxis]) + return jll - log_prob_x From 57cf25f3e1555fb54796fce611cd37a542648937 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Tue, 24 May 2011 11:00:41 +0200 Subject: [PATCH 21/31] NB: add class_log_prior_ and feature_log_prob_ back as properties --- scikits/learn/naive_bayes.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index ae16d455bfe41..703c2283bd309 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -238,12 +238,15 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): Attributes ---------- - intercept_ : array, shape = [n_classes] + class_log_prior_, intercept_ : array, shape = [n_classes] Log probability of each class (smoothed). - coef_ : array, shape = [n_classes, n_features] + feature_log_prob_, coef_ : array, shape = [n_classes, n_features] Empirical log probability of features given a class, P(x_i|y). + (class_log_prior_ and feature_log_prob_ are properties referring to + intercept_ and feature_log_prob_, respectively.) + Examples -------- >>> import numpy as np @@ -333,6 +336,9 @@ def fit(self, X, y, class_prior=None): return self + class_log_prior_ = property(lambda self: self.intercept__) + feature_log_prob_ = property(lambda self: self.coef_) + def predict(self, X): """ Perform classification on an array of test vectors X. 
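
Patches 20 and 21 together recast the multinomial model as an ordinary linear
classifier: fitting reduces to computing smoothed log counts, and the joint log
likelihood is an affine function of the feature counts. A self-contained numpy sketch
of that view on toy data (names local to the snippet, not code from the patches):

    import numpy as np

    # toy term counts: three documents, four features, two classes
    X = np.array([[2, 1, 0, 0],
                  [1, 2, 0, 0],
                  [0, 0, 3, 1]])
    y = np.array([0, 0, 1])
    alpha = 1.0
    classes = np.unique(y)

    N_c_i = np.array([X[y == c].sum(axis=0) for c in classes])  # per-class feature counts
    N_c = N_c_i.sum(axis=1)

    # smoothed log P(x_i | c) and log P(c): the coef_ and intercept_ of the patch
    coef = np.log(N_c_i + alpha) - np.log(N_c.reshape(-1, 1) + alpha * X.shape[1])
    intercept = np.log([np.mean(y == c) for c in classes])

    # the decision function is now just a matrix product plus the priors
    jll = np.dot(coef, X.T) + intercept[:, np.newaxis]
    y_pred = classes[np.argmax(jll, axis=0)]                    # -> array([0, 0, 1])

In the estimator itself the same product goes through safe_sparse_dot, so dense and
CSR inputs follow the identical code path.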
From b94c8fcaaceaeaf388400050419b8215ef076118 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Tue, 24 May 2011 12:33:27 +0200 Subject: [PATCH 22/31] NB cosmit: *feature* independence --- scikits/learn/naive_bayes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 703c2283bd309..3b436528b91a1 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -3,7 +3,7 @@ ================== Naive Bayes algorithms are a set of supervised learning methods based on -applying Bayes' theorem with strong (naive) independence assumptions. +applying Bayes' theorem with strong (naive) feature independence assumptions. See http://scikit-learn.sourceforge.net/modules/naive_bayes.html for complete documentation. From 6731b648570c98fb4d79ef7e23f42f565ada1158 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Tue, 24 May 2011 12:44:07 +0200 Subject: [PATCH 23/31] cosmit: expand MultinomialNB docstring --- scikits/learn/naive_bayes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 3b436528b91a1..10902aa2d6902 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -210,7 +210,11 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): """ Naive Bayes classifier for multinomial models - The multinomial Naive Bayes classifier is suitable for text classification. + The multinomial Naive Bayes classifier is suitable for classification with + discrete features (e.g., word counts for text classification). The + multinomial distribution normally requires integer feature counts. However, + in practice, fractional counts such as tf-idf may also work. + This class is designed to handle both dense and sparse data; it will enter "sparse mode" if its training matrix (X) is a sparse matrix. @@ -304,12 +308,8 @@ def fit(self, X, y, class_prior=None): 'Number of priors must match number of classs' self.intercept_ = np.array(class_prior) - # N_c is the count of all words in all documents of class c. - # N_c_i is the a count of word i in all documents of class c. - # intercept_[c] is the prior empirical probability of class c. - # coef_ is the (smoothed) empirical likelihood of feature i - # given class c. - # + # N_c is the count of all features in all samples of class c. + # N_c_i is the a count of feature i in all samples of class c. 
N_c_i_temp = [] if self.intercept_ is None: class_prior = [] From 485428033ca1eb5edc3dcc88350ca7b2e157ccc7 Mon Sep 17 00:00:00 2001 From: Fabian Pedregosa Date: Wed, 25 May 2011 15:14:24 +0200 Subject: [PATCH 24/31] FIX: fix segfault in cases of infeasible nu (NuSVM) ____ __ __ _ ___ _ _____ _ _ _ ___ | __ ) __\ \ / / __ __/ \ ___ |_ _| |_ | ___| _| \ | | | |_ / _ \ | _ \ / _ \ V / \ \ /\ / / _ \ / __| | || __| | |_ | | | | \| | | __| | | | | |_) | (_) | | \ V V / ___ \\__ \ | || |_ | _|| |_| | |\ | | |_| |_| | |____/ \___/|_| \_/\_/_/ \_\___/ |___|\__| |_| \__,_|_| \_| \__|\___/ _ ____ ____ _ ____ __ __ _ _ _ | |_| _ \ __ _ / ___| | __ | _ \ __\ \ / / __ | | | | | __| |_) / _` | | | |/ / | | | |/ _ \ \ /\ / / '_ \ | | | | | |_| _ < (_| | |___| < | |_| | (_) \ V V /| | | | |_|_|_| \__|_| \_\__,_|\____|_|\_\ |____/ \___/ \_/\_/ |_| |_| (_|_|_) --- scikits/learn/svm/src/libsvm/svm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scikits/learn/svm/src/libsvm/svm.cpp b/scikits/learn/svm/src/libsvm/svm.cpp index b65de7203cf9e..7fb2109d1bb8a 100644 --- a/scikits/learn/svm/src/libsvm/svm.cpp +++ b/scikits/learn/svm/src/libsvm/svm.cpp @@ -3038,7 +3038,7 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param { max_nr_class *= 2; label = (int *)realloc(label,max_nr_class*sizeof(int)); - count = (double *)realloc(count,max_nr_class*sizeof(int)); + count = (double *)realloc(count,max_nr_class*sizeof(double)); } label[nr_class] = this_label; From e04d4f8e736e6b791c3ea8ddba0183b9bb75280c Mon Sep 17 00:00:00 2001 From: VirgileFritsch Date: Wed, 25 May 2011 17:29:03 +0200 Subject: [PATCH 25/31] Covariance errors computation API changes. Mean Squared Error, Sum of Squared Error are controversial notions in a covariances comparison context, so it is better not to use those terms and let the user decide which type of error he exactly wants. 
For this purpose, keywords have been introduced so that the user can choose:
- the type of norm used to compute the error,
- whether the error norm should be scaled by the number of features,
- whether to return the squared error norm or the plain error norm.
---
 examples/covariance/plot_lw_vs_oas.py         |  6 +--
 .../learn/covariance/empirical_covariance_.py | 48 ++++++++++++-------
 .../learn/covariance/tests/test_covariance.py | 12 ++---
 3 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/examples/covariance/plot_lw_vs_oas.py b/examples/covariance/plot_lw_vs_oas.py
index ddce24e75ce51..1e13c185bbe32 100644
--- a/examples/covariance/plot_lw_vs_oas.py
+++ b/examples/covariance/plot_lw_vs_oas.py
@@ -48,12 +48,12 @@
         lw = LedoitWolf(store_precision=False)
         lw.fit(X, assume_centered=True)
-        lw_mse[i,j] = lw.error(real_cov)
+        lw_mse[i,j] = lw.error_norm(real_cov, scaling=False)
         lw_shrinkage[i,j] = lw.shrinkage_

         oa = OAS(store_precision=False)
         oa.fit(X, assume_centered=True)
-        oa_mse[i,j] = oa.error(real_cov)
+        oa_mse[i,j] = oa.error_norm(real_cov, scaling=False)
         oa_shrinkage[i,j] = oa.shrinkage_

 # plot MSE
@@ -62,7 +62,7 @@
              label='Ledoit-Wolf', color='g')
 pl.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1),
             label='OAS', color='r')
-pl.ylabel("MSE")
+pl.ylabel("Squared error")
 pl.legend(loc="upper right")
 pl.title("Comparison of covariance estimators")
 pl.xlim(5, 31)
diff --git a/scikits/learn/covariance/empirical_covariance_.py b/scikits/learn/covariance/empirical_covariance_.py
index 62807ac9690a0..80860ef56150d 100644
--- a/scikits/learn/covariance/empirical_covariance_.py
+++ b/scikits/learn/covariance/empirical_covariance_.py
@@ -165,7 +165,8 @@ def score(self, X_test, assume_centered=False):

         return res

-    def error(self, comp_cov, error_type='mse'):
+    def error_norm(self, comp_cov, norm='frobenius', scaling=True,
+                   squared=True):
         """Computes the Mean Squared Error between two covariance estimators.
         (In the sense of the Frobenius norm)

@@ -173,11 +174,18 @@ def error(self, comp_cov, error_type='mse'):
         ----------
         comp_cov: array-like, shape = [n_features, n_features]
             The covariance which to be compared to.
-        error_type: str
-            The type of error. Available error types:
-            - 'mse': Mean Squared Error (default) = tr(A^t.A) / n_features
-            - 'rmse': Root Mean Squared Error = sqrt(tr(A^t.A) / n_features
-            - 'sse': Sum of Squared Errors = tr(A^t.A)
+        norm: str
+            The type of norm used to compute the error. Available error types:
+            - 'frobenius' (default): sqrt(tr(A^t.A))
+            - 'spectral': sqrt(max(eigenvalues(A^t.A)))
+            where A is the error (comp_cov - self.covariance_).
+        scaling: bool
+            If True (default), the squared error norm is divided by n_features
+            If False, the squared error norm is not rescaled
+        squared: bool
+            Whether to compute the squared error norm or the error norm.
+            If True (default), the squared error norm is returned.
+            If False, the error norm is returned.

         Returns
         -------
@@ -185,15 +193,23 @@ def error(self, comp_cov, error_type='mse'):
            `self` and `comp_cov` covariance estimators.
""" - diff = comp_cov - self.covariance_ - sse = np.sum(diff ** 2) - if error_type == 'mse': - error = sse / diff.shape[0] - elif error_type == 'rmse': - error = np.sqrt(sse / diff.shape[0]) - elif error_type == 'sse': - error = sse + # compute the error + error = comp_cov - self.covariance_ + # compute the error norm + if norm == "frobenius": + squared_norm = np.sum(error ** 2) + elif norm == "spectral": + squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error))) else: - raise Exception('Error type \"%s\" not implemented yet' %error_type) + raise NotImplementedError( + "Only spectral and frobenius norms are implemented") + # optionaly scale the error norm + if scaling: + squared_norm = squared_norm / error.shape[0] + # finally get either the squared norm or the norm + if squared: + result = squared_norm + else: + result = np.sqrt(squared_norm) - return error + return result diff --git a/scikits/learn/covariance/tests/test_covariance.py b/scikits/learn/covariance/tests/test_covariance.py index 09b475607084d..28463cab42d50 100644 --- a/scikits/learn/covariance/tests/test_covariance.py +++ b/scikits/learn/covariance/tests/test_covariance.py @@ -23,22 +23,18 @@ def test_covariance(): cov = EmpiricalCovariance() cov.fit(X) assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4) - assert_almost_equal(cov.error(empirical_covariance(X)), 0) + assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0) assert_almost_equal( - cov.error(empirical_covariance(X), error_type='rmse'), 0) - assert_almost_equal( - cov.error(empirical_covariance(X), error_type='sse'), 0) + cov.error_norm(empirical_covariance(X), norm='spectral'), 0) # test with n_features = 1 X_1d = X[:,0] cov = EmpiricalCovariance() cov.fit(X_1d) assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) - assert_almost_equal(cov.error(empirical_covariance(X_1d)), 0) - assert_almost_equal( - cov.error(empirical_covariance(X_1d), error_type='rmse'), 0) + assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0) assert_almost_equal( - cov.error(empirical_covariance(X_1d), error_type='sse'), 0) + cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0) # test integer type X_integer = np.asarray([[0,1],[1,0]]) From fa8ae78dd6df907f6745ab1def9e2d64cf66fc04 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Wed, 25 May 2011 23:59:56 +0200 Subject: [PATCH 26/31] rm references to naive_bayes.sparse in docs The module has been merged into naive_bayes. --- doc/modules/classes.rst | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index cbb577423b09a..65b258b871119 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -152,23 +152,6 @@ Naive Bayes naive_bayes.MultinomialNB -For sparse data ---------------- - -.. automodule:: scikits.learn.naive_bayes.sparse - :no-members: - :no-inherited-members: - -.. currentmodule:: scikits.learn - -.. 
autosummary:: - - :toctree: generated/ - :template: class.rst - - naive_bayes.sparse.MultinomialNB - - Nearest Neighbors ================= From 2dd421d4948e1c30a9cbc669090fafd8adbdc2c6 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Thu, 26 May 2011 09:46:18 +0200 Subject: [PATCH 27/31] NB: rename use_prior to fit_prior --- scikits/learn/naive_bayes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scikits/learn/naive_bayes.py b/scikits/learn/naive_bayes.py index 10902aa2d6902..78a55ff3916ca 100644 --- a/scikits/learn/naive_bayes.py +++ b/scikits/learn/naive_bayes.py @@ -223,7 +223,7 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): alpha: float, optional (default=1.0) Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). - use_prior: boolean + fit_prior: boolean Whether to learn class prior probabilities or not. Methods @@ -259,7 +259,7 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): >>> from scikits.learn.naive_bayes import MultinomialNB >>> clf = MultinomialNB() >>> clf.fit(X, Y) - MultinomialNB(alpha=1.0, use_prior=True) + MultinomialNB(alpha=1.0, fit_prior=True) >>> print clf.predict(X[2]) [3] @@ -270,9 +270,9 @@ class MultinomialNB(BaseEstimator, ClassifierMixin): assumptions of naive Bayes text classifiers, Proc. ICML. """ - def __init__(self, alpha=1.0, use_prior=True): + def __init__(self, alpha=1.0, fit_prior=True): self.alpha = alpha - self.use_prior = use_prior + self.fit_prior = fit_prior def fit(self, X, y, class_prior=None): """Fit Multinomial Naive Bayes according to X, y @@ -301,7 +301,7 @@ def fit(self, X, y, class_prior=None): n_classes = self.unique_y.size self.intercept_ = None - if not self.use_prior: + if not self.fit_prior: self.intercept_ = np.ones(n_classes) / n_classes if class_prior: assert len(class_prior) == n_classes, \ From b3804e95d8f77b9d584b7463949153fad39b61fd Mon Sep 17 00:00:00 2001 From: GaelVaroquaux Date: Thu, 26 May 2011 14:51:24 +0200 Subject: [PATCH 28/31] BUG: Minor bugs in cross_val Use pyflakes, folks! 
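The diff below touches three spots: an import that is no longer used, a reference to an undefined name `test` in `Bootstrap.__init__` (exactly the kind of mistake pyflakes reports statically as an undefined name), and a hard-coded `indices=True` in `cross_val_score` where the previously computed `indices` flag was meant. As a minimal standalone sketch, illustrative only and not the library code, the corrected test-size logic amounts to:

```python
from math import ceil

def resolve_n_test(n_test, n):
    """Interpret n_test as a fraction of n if it is a float, else as a count."""
    if isinstance(n_test, float) and 0.0 <= n_test <= 1.0:
        # the pre-fix line read `ceil(test * n)`: `test` does not exist,
        # so this branch raised NameError instead of returning a size
        return int(ceil(n_test * n))
    elif isinstance(n_test, int):
        return n_test
    raise ValueError("n_test must be an int or a float in [0, 1]")

assert resolve_n_test(0.25, 100) == 25
assert resolve_n_test(30, 100) == 30
```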
--- scikits/learn/cross_val.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scikits/learn/cross_val.py b/scikits/learn/cross_val.py index 90061ec7f289d..7bb0c5d58de85 100644 --- a/scikits/learn/cross_val.py +++ b/scikits/learn/cross_val.py @@ -12,7 +12,6 @@ from .utils.extmath import factorial, combinations from .utils.fixes import unique from .utils import check_arrays -from .utils import check_random_state from .externals.joblib import Parallel, delayed @@ -559,7 +558,7 @@ def __init__(self, n, n_bootstraps=3, n_train=0.5, n_test=None, (self.n_train, n)) if isinstance(n_test, float) and n_test >= 0.0 and n_test <= 1.0: - self.n_test = ceil(test * n) + self.n_test = ceil(n_test * n) elif isinstance(n_test, int): self.n_test = n_test elif n_test is None: @@ -652,9 +651,9 @@ def cross_val_score(estimator, X, y=None, score_func=None, cv=None, iid=False, if cv is None: indices = hasattr(X, 'tocsr') if y is not None and is_classifier(estimator): - cv = StratifiedKFold(y, k=3, indices=True) + cv = StratifiedKFold(y, k=3, indices=indices) else: - cv = KFold(n_samples, k=3, indices=True) + cv = KFold(n_samples, k=3, indices=indices) if score_func is None: assert hasattr(estimator, 'score'), ValueError( "If no score_func is specified, the estimator passed " From be2d75a1e86ba14f7b837e40563b1b8c9cc3109a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 May 2011 07:17:51 -0700 Subject: [PATCH 29/31] fix broken test in MultinomialNB --- scikits/learn/tests/test_naive_bayes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 81382a1ee144d..d22c7b70ca019 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -89,7 +89,7 @@ def test_sparse_mnnb(): def test_mnnb_pickle(): '''Test picklability of multinomial NB''' - clf = naive_bayes.MultinomialNB(alpha=2, use_prior=False).fit(X, y) + clf = naive_bayes.MultinomialNB(alpha=2, fit_prior=False).fit(X, y) y_pred = clf.predict(X) store = StringIO() From aecc0b44d487c4477ac287aac50a08fdc22ad583 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Fri, 27 May 2011 12:24:20 +0200 Subject: [PATCH 30/31] slightly improved logging in a few easy cases --- scikits/learn/datasets/lfw.py | 15 +++++++++------ scikits/learn/datasets/twenty_newsgroups.py | 7 +++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/scikits/learn/datasets/lfw.py b/scikits/learn/datasets/lfw.py index 4d3775c4607d1..546becd770463 100644 --- a/scikits/learn/datasets/lfw.py +++ b/scikits/learn/datasets/lfw.py @@ -44,6 +44,9 @@ from .base import Bunch +logger = logging.getLogger(__name__) + + BASE_URL = "http://vis-www.cs.umass.edu/lfw/" ARCHIVE_NAME = "lfw.tgz" FUNNELED_ARCHIVE_NAME = "lfw-funneled.tgz" @@ -89,7 +92,7 @@ def check_fetch_lfw(data_home=None, funneled=True, download_if_missing=True): if not exists(target_filepath): if download_if_missing: url = BASE_URL + target_filename - logging.warn("Downloading LFW metadata: %s", url) + logger.warn("Downloading LFW metadata: %s", url) downloader = urllib.urlopen(BASE_URL + target_filename) data = downloader.read() open(target_filepath, 'wb').write(data) @@ -100,7 +103,7 @@ def check_fetch_lfw(data_home=None, funneled=True, download_if_missing=True): if not exists(archive_path): if download_if_missing: - logging.warn("Downloading LFW data (~200MB): %s", archive_url) + logger.warn("Downloading LFW data (~200MB): %s", archive_url) downloader = 
urllib.urlopen(archive_url) data = downloader.read() # don't open file until download is complete @@ -109,7 +112,7 @@ def check_fetch_lfw(data_home=None, funneled=True, download_if_missing=True): raise IOError("%s is missing" % target_filepath) import tarfile - logging.info("Decompressing the data archive to %s", data_folder_path) + logger.info("Decompressing the data archive to %s", data_folder_path) tarfile.open(archive_path, "r:gz").extractall(path=lfw_home) remove(archive_path) @@ -148,7 +151,7 @@ def _load_imgs(file_paths, slice_, color, resize): # arrays for i, file_path in enumerate(file_paths): if i % 1000 == 0: - logging.info("Loading face #%05d / %05d", i + 1, n_faces) + logger.info("Loading face #%05d / %05d", i + 1, n_faces) face = np.asarray(imread(file_path)[slice_], dtype=np.float32) face /= 255.0 # scale uint8 coded colors to the [0.0, 1.0] floats if resize is not None: @@ -260,7 +263,7 @@ def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) - logging.info('Loading LFW people faces from %s', lfw_home) + logger.info('Loading LFW people faces from %s', lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage @@ -398,7 +401,7 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) - logging.info('Loading %s LFW pairs from %s', subset, lfw_home) + logger.info('Loading %s LFW pairs from %s', subset, lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage diff --git a/scikits/learn/datasets/twenty_newsgroups.py b/scikits/learn/datasets/twenty_newsgroups.py index 7dd3de4dacb91..7b92383df1743 100644 --- a/scikits/learn/datasets/twenty_newsgroups.py +++ b/scikits/learn/datasets/twenty_newsgroups.py @@ -44,6 +44,9 @@ from .base import load_filenames +logger = logging.getLogger(__name__) + + URL = ("http://people.csail.mit.edu/jrennie/" "20Newsgroups/20news-bydate.tar.gz") ARCHIVE_NAME = "20news-bydate.tar.gz" @@ -98,13 +101,13 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, if not os.path.exists(archive_path): if download_if_missing: - logging.warn("Downloading dataset from %s (14 MB)", URL) + logger.warn("Downloading dataset from %s (14 MB)", URL) opener = urllib.urlopen(URL) open(archive_path, 'wb').write(opener.read()) else: raise IOError("%s is missing" % archive_path) - logging.info("Decompressing %s", archive_path) + logger.info("Decompressing %s", archive_path) tarfile.open(archive_path, "r:gz").extractall(path=twenty_home) os.remove(archive_path) From d3bfaa5eafc2a584e04e50cc7b2e541d22f52758 Mon Sep 17 00:00:00 2001 From: Lars Buitinck Date: Mon, 30 May 2011 12:14:10 +0200 Subject: [PATCH 31/31] Improved error handling + reduce memory use * Reraise HTTPError without losing backtrace * Remove file if writing failed * Use shutil.copyfileobj instead of whole-file read, then write --- scikits/learn/datasets/mldata.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/scikits/learn/datasets/mldata.py b/scikits/learn/datasets/mldata.py index ef273d0689efb..ab07930513026 100755 --- a/scikits/learn/datasets/mldata.py +++ b/scikits/learn/datasets/mldata.py @@ -5,8 +5,9 @@ from scipy import io +import os from os.path 
import join, exists -from os import makedirs +from shutil import copyfileobj import urllib2 from .base import get_data_home, Bunch @@ -103,22 +104,27 @@ def fetch_mldata(dataname, target_name='label', data_name='data', data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'mldata') if not exists(data_home): - makedirs(data_home) + os.makedirs(data_home) matlab_name = dataname + '.mat' filename = join(data_home, matlab_name) # if the file does not exist, download it if not exists(filename): - urlname = MLDATA_BASE_URL % (dataname) + urlname = MLDATA_BASE_URL % urllib2.quote(dataname) try: mldata_url = urllib2.urlopen(urlname) - except urllib2.URLError: - msg = "Dataset '%s' not found on mldata.org." % dataname - raise IOError(msg) + except urllib2.HTTPError, e: + if e.code == 404: + e.msg = "Dataset '%s' not found on mldata.org." % dataname + raise # store Matlab file - with open(filename, 'w+b') as matlab_file: - matlab_file.write(mldata_url.read()) + try: + with open(filename, 'w+b') as matlab_file: + copyfileobj(mldata_url, matlab_file) + except: + os.remove(filename) + raise mldata_url.close() # load dataset matlab file
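The visible part of this last patch already shows the three changes named in its summary: re-raise `HTTPError` so the original traceback survives, remove the partial file if writing fails, and stream the response with `shutil.copyfileobj` instead of reading it whole. A self-contained sketch of that download pattern, with a hypothetical helper name and URL and written in the Python 2 idiom the file uses:

```python
import os
from shutil import copyfileobj
import urllib2

def download_to(urlname, filename):
    """Stream a remote file to disk without holding the whole payload in memory."""
    try:
        remote = urllib2.urlopen(urlname)
    except urllib2.HTTPError, e:
        if e.code == 404:
            e.msg = "Resource not found: %s" % urlname
        raise  # bare raise keeps the original traceback

    try:
        with open(filename, 'w+b') as local:
            copyfileobj(remote, local)  # copies in fixed-size chunks
    except:
        if os.path.exists(filename):
            os.remove(filename)         # never leave a truncated file behind
        raise
    finally:
        remote.close()

# usage (hypothetical URL):
# download_to('http://example.org/some-dataset.mat', '/tmp/some-dataset.mat')
```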
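Looking back at the `error_norm` API introduced in the covariance patch earlier in this series, the two norms it exposes are easy to sanity-check numerically: the squared Frobenius norm of the error matrix A is the sum of its squared entries, i.e. tr(A^t.A), while the squared spectral norm is the largest eigenvalue of A^t.A, which is what `np.amax(linalg.svdvals(np.dot(error.T, error)))` computes. A quick standalone check with an arbitrary matrix:

```python
import numpy as np
from scipy import linalg

# Arbitrary "error" matrix standing in for (comp_cov - covariance_).
A = np.array([[3.0, 0.0],
              [4.0, 5.0]])

frobenius_sq = np.sum(A ** 2)                          # tr(A^t.A) = 50
spectral_sq = np.amax(linalg.svdvals(np.dot(A.T, A)))  # largest eigenvalue of A^t.A = 45

assert np.allclose(np.sqrt(frobenius_sq), linalg.norm(A, 'fro'))
assert np.allclose(np.sqrt(spectral_sq), linalg.norm(A, 2))
```

With the defaults (`norm='frobenius'`, `scaling=True`, `squared=True`) the new method reproduces the old `error(..., error_type='mse')`, while `scaling=False` recovers the old `'sse'` and `squared=False` the old `'rmse'`.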