Merge remote-tracking branch 'upstream/master' into treeweights
ndawe committed Jan 21, 2013
2 parents a9e6d5c + 3e89aa5 commit f973c27
Showing 78 changed files with 4,497 additions and 1,489 deletions.
4 changes: 2 additions & 2 deletions README.rst
@@ -2,8 +2,8 @@
|Travis|_

- .. |Travis| image:: https://secure.travis-ci.org/scikit-learn/scikit-learn.png?branch=master
- .. _Travis: https://secure.travis-ci.org/scikit-learn/scikit-learn
+ .. |Travis| image:: https://api.travis-ci.org/scikit-learn/scikit-learn.png?branch=master
+ .. _Travis: https://travis-ci.org/scikit-learn/scikit-learn

scikit-learn
============
2 changes: 1 addition & 1 deletion benchmarks/bench_plot_lasso_path.py
@@ -88,7 +88,7 @@ def compute_bench(samples_range, features_range):
features_range = np.linspace(10, 2000, 5).astype(np.int)
results = compute_bench(samples_range, features_range)

- max_time = max(max(t) for t in results.itervalues())
+ max_time = max(max(t) for t in results.values())

fig = plt.figure()
i = 1
2 changes: 1 addition & 1 deletion benchmarks/bench_plot_omp_lars.py
@@ -101,7 +101,7 @@ def compute_bench(samples_range, features_range):
samples_range = np.linspace(1000, 5000, 5).astype(np.int)
features_range = np.linspace(1000, 5000, 5).astype(np.int)
results = compute_bench(samples_range, features_range)
- max_time = max(np.max(t) for t in results.itervalues())
+ max_time = max(np.max(t) for t in results.values())

import pylab as pl
fig = pl.figure()
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -121,7 +121,7 @@
# further. For a list of options available for each theme, see the
# documentation.
html_theme_options = {'oldversion': False, 'collapsiblesidebar': True,
- 'google_analytics': True}
+ 'google_analytics': True, 'surveybanner': True}

# Add any paths that contain custom themes here, relative to this directory.
html_theme_path = ['themes']
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -364,6 +364,7 @@ From text
:template: class.rst

feature_extraction.text.CountVectorizer
feature_extraction.text.HashingVectorizer
feature_extraction.text.TfidfTransformer
feature_extraction.text.TfidfVectorizer

60 changes: 56 additions & 4 deletions doc/modules/clustering.rst
@@ -132,6 +132,11 @@ and the new centroids is the inertia and the algorithm repeats these last two
steps until this value is less than a threshold. In other words, it repeats
until the centroids do not move significantly.

.. image:: ../auto_examples/cluster/images/plot_kmeans_digits_1.png
:target: ../auto_examples/cluster/plot_kmeans_digits.html
:align: right
:scale: 35
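
As a rough usage sketch (``X`` stands for any array of shape
``(n_samples, n_features)``; the parameter values are purely illustrative), the
estimator can be used as follows::

    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=8, n_init=10)   # place 8 centroids, 10 random restarts
    kmeans.fit(X)                              # alternate assignment and update steps
    labels = kmeans.labels_                    # cluster index of each sample
    centers = kmeans.cluster_centers_          # final centroid positions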

The algorithm can be identified through the concept of `Voronoi diagrams
<https://en.wikipedia.org/wiki/Voronoi_diagram>`_. First the Voronoi diagram
of the points is calculated using the current centroids. Each segment in the
@@ -198,6 +203,8 @@ the :class:`KMeans` algorithm.
* :ref:`example_document_clustering.py`: Document clustering using sparse
MiniBatchKMeans

* :ref:`example_cluster_plot_dict_face_patches.py`


.. topic:: References:

@@ -208,18 +215,37 @@

.. _affinity_propagation:

- Affinity propagation
+ Affinity Propagation
====================

- :class:`AffinityPropagation` clusters data by diffusion in the similarity
- matrix. This algorithm automatically sets its numbers of cluster. It
- will have difficulties scaling to thousands of samples.
+ :class:`AffinityPropagation` creates clusters by sending messages between
+ pairs of samples until convergence. A dataset is then described using a small
+ number of exemplars, which are identified as those most representative of other
+ samples. The messages sent between pairs represent the suitability for one
+ sample to be the exemplar of the other, which is updated in response to the
+ values from other pairs. This updating happens iteratively until convergence,
+ at which point the final exemplars are chosen, and hence the final clustering
+ is given.

.. figure:: ../auto_examples/cluster/images/plot_affinity_propagation_1.png
:target: ../auto_examples/cluster/plot_affinity_propagation.html
:align: center
:scale: 50


Affinity Propagation can be interesting as it chooses the number of
clusters based on the data provided. For this purpose, the two important
parameters are the `preference`, which controls how many exemplars are
used, and the `damping` factor.

The main drawback of Affinity Propagation is its complexity. The
algorithm has a time complexity of the order :math:`O(N^2 T)`, where `N`
is the number of samples and `T` is the number of iterations until
convergence. Further, the memory complexity is of the order
:math:`O(N^2)` if a dense similarity matrix is used, but reducible if a
sparse similarity matrix is used. This makes Affinity Propagation most
appropriate for small to medium sized datasets.
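
For illustration, a minimal fitting sketch (the data matrix ``X`` and the
``preference`` value here are assumed, not prescribed) could look like::

    from sklearn.cluster import AffinityPropagation

    af = AffinityPropagation(preference=-50, damping=0.5).fit(X)
    exemplars = af.cluster_centers_indices_   # indices of the chosen exemplars
    labels = af.labels_                       # cluster membership of each sample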

.. topic:: Examples:

* :ref:`example_cluster_plot_affinity_propagation.py`: Affinity
@@ -228,6 +254,32 @@ will have difficulties scaling to thousands of samples.
* :ref:`example_applications_plot_stock_market.py` Affinity Propagation on
Financial time series to find groups of companies

**Algorithm description:**
The messages sent between points belong to one of two categories. The first is
the responsibility `r(i, k)`, which is the accumulated evidence that sample `k`
should be the exemplar for sample `i`. The second is the availability `a(i, k)`
which is the accumulated evidence that sample `i` should choose sample `k` to
be its exemplar, taking into account the support from all other samples for
`k` being an exemplar. In this way, exemplars are chosen by samples if they are (1)
similar enough to many samples and (2) chosen by many samples to be
representative of themselves.

More formally, the responsibility of a sample `k` to be the exemplar of sample
`i` is given by:

.. math::

    r(i, k) \leftarrow s(i, k) - \max_{k' \neq k} \left[ a(i, k') + s(i, k') \right]

Where :math:`s(i, k)` is the similarity between samples `i` and `k`. The
availability of sample `k` to be the exemplar of sample `i` is given by:

.. math::

    a(i, k) \leftarrow \min \left[ 0, r(k, k) + \sum_{i' \notin \{i, k\}} \max \left( 0, r(i', k) \right) \right]

To begin with, all values for `r` and `a` are set to zero, and the calculation
of each iterates until convergence.
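
The following NumPy sketch of these two updates is schematic only (vectorized
over all pairs, no damping, and with the conventional un-clipped update for the
diagonal "self-availability" terms); it is not the implementation used by
:class:`AffinityPropagation`::

    import numpy as np

    def message_updates(S, A):
        """One undamped round of the responsibility/availability updates.

        S, A: (n, n) float arrays of similarities and current availabilities.
        """
        n = S.shape[0]
        rows = np.arange(n)

        # r(i, k) <- s(i, k) - max_{k' != k} [ a(i, k') + s(i, k') ]
        AS = (A + S).astype(float)
        best = AS.argmax(axis=1)              # column of the largest a + s per row
        first = AS[rows, best]
        AS[rows, best] = -np.inf
        second = AS.max(axis=1)               # runner-up, used where k == best
        max_other = np.repeat(first[:, None], n, axis=1)
        max_other[rows, best] = second
        R = S - max_other

        # a(i, k) <- min(0, r(k, k) + sum_{i' not in {i, k}} max(0, r(i', k)))
        Rp = np.maximum(R, 0)
        np.fill_diagonal(Rp, R.diagonal())    # keep r(k, k) itself un-clipped
        A_new = np.minimum(0, Rp.sum(axis=0) - Rp)
        # self-availability a(k, k): sum of positive responsibilities, not clipped
        np.fill_diagonal(A_new, Rp.sum(axis=0) - R.diagonal())
        return R, A_new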

.. _mean_shift:

19 changes: 19 additions & 0 deletions doc/modules/decomposition.rst
@@ -365,6 +365,25 @@ iterating only once over a mini-batch. This can be used for online learning
when the data is not readily available from the start, or for when the data
does not fit into the memory.

.. currentmodule:: sklearn.cluster

.. image:: ../auto_examples/cluster/images/plot_dict_face_patches_1.png
:target: ../auto_examples/cluster/plot_dict_face_patches.html
:scale: 50%
:align: right

.. topic:: **Clustering for dictionary learning**

Note that when using dictionary learning to extract a representation
(e.g. for sparse coding) clustering can be a good proxy to learn the
dictionary. For instance, the :class:`MiniBatchKMeans` estimator is
computationally efficient and implements on-line learning via a
`partial_fit` method.

Example: :ref:`example_cluster_plot_dict_face_patches.py`
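
A rough sketch of this on-line usage (``patch_batches`` is an assumed iterable
yielding 2D arrays with one flattened patch per row)::

    from sklearn.cluster import MiniBatchKMeans

    mbk = MiniBatchKMeans(n_clusters=100)   # number of dictionary atoms (illustrative)
    for batch in patch_batches:             # mini-batches arriving over time
        mbk.partial_fit(batch)              # update the centroids on-line
    dictionary = mbk.cluster_centers_       # rows are the learned atoms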

.. currentmodule:: sklearn.decomposition

.. _FA:

Factor Analysis
2 changes: 1 addition & 1 deletion doc/modules/ensemble.rst
@@ -212,7 +212,7 @@ the matching feature to the prediction function.
* :ref:`example_ensemble_plot_forest_importances_faces.py`
* :ref:`example_ensemble_plot_forest_importances.py`

- .. _random_hashing:
+ .. _random_trees_embedding:

Totally Random Trees Embedding
------------------------------
90 changes: 89 additions & 1 deletion doc/modules/feature_extraction.rst
@@ -555,8 +555,96 @@ into account. Many such models will thus be casted as "Structured output"
problems which are currently outside of the scope of scikit-learn.


.. _hashing_vectorizer:

Vectorizing a large text corpus with the hashing trick
------------------------------------------------------

The above vectorization scheme is simple but the fact that it holds an **in-
memory mapping from the string tokens to the integer feature indices** (the
``vocabulary_`` attribute) causes several **problems when dealing with large
datasets**:

- the larger the corpus, the larger the vocabulary will grow and hence the
memory use too,

- fitting requires the allocation of intermediate data structures
of size proportional to that of the original dataset.

- building the word-mapping requires a full pass over the dataset hence it is
not possible to fit text classifiers in a strictly online manner.

- pickling and un-pickling vectorizers with a large ``vocabulary_`` can be very
slow (typically much slower than pickling / un-pickling flat data structures
such as a NumPy array of the same size),

- it is not easily possible to split the vectorization work into concurrent
subtasks as the ``vocabulary_`` attribute would have to be a shared state with a
fine grained synchronization barrier: the mapping from token string to
feature index is dependent on ordering of the first occurrence of each token
hence would have to be shared, potentially harming the concurrent workers'
performance to the point of making them slower than the sequential variant.

It is possible to overcome those limitations by combining the "hashing trick"
(:ref:`Feature_hashing`) implemented by the
:class:`sklearn.feature_extraction.FeatureHasher` class and the text
preprocessing and tokenization features of the :class:`CountVectorizer`.

This combination is implemented in :class:`HashingVectorizer`,
a transformer class that is mostly API compatible with :class:`CountVectorizer`.
:class:`HashingVectorizer` is stateless,
meaning that you don't have to call ``fit`` on it::

>>> from sklearn.feature_extraction.text import HashingVectorizer
>>> hv = HashingVectorizer(n_features=10)
>>> hv.transform(corpus)
... # doctest: +NORMALIZE_WHITESPACE
<4x10 sparse matrix of type '<type 'numpy.float64'>'
with 16 stored elements in Compressed Sparse Row format>

You can see that 16 non-zero feature tokens were extracted in the vector
output: this is less than the 19 non-zeros extracted previously by the
:class:`CountVectorizer` on the same toy corpus. The discrepancy comes from
hash function collisions because of the low value of the ``n_features`` parameter.

In a real world setting, the ``n_features`` parameter can be left to its
default value of ``2 ** 20`` (roughly one million possible features). If memory
or downstream model size is an issue, selecting a lower value such as ``2 **
18`` might help without introducing too many additional collisions on typical
text classification tasks.

Note that the dimensionality does not affect the CPU training time of
algorithms which operate on CSR matrices (``LinearSVC(dual=True)``,
``Perceptron``, ``SGDClassifier``, ``PassiveAggressive``) but it does for
algorithms that work with CSC matrices (``LinearSVC(dual=False)``, ``Lasso()``,
etc).

Let's try again with the default setting::

>>> hv = HashingVectorizer()
>>> hv.transform(corpus)
... # doctest: +NORMALIZE_WHITESPACE
<4x1048576 sparse matrix of type '<type 'numpy.float64'>'
with 19 stored elements in Compressed Sparse Row format>

We no longer get the collisions, but this comes at the expense of a much larger
dimensionality of the output space.
Of course, other terms than the 19 used here
might still collide with each other.

The :class:`HashingVectorizer` also comes with the following limitations:

- it is not possible to invert the model (no ``inverse_transform`` method),
nor to access the original string representation of the features,
because of the one-way nature of the hash function that performs the mapping.

- it does not provide IDF weighting as that would introduce statefulness in the
model. A :class:`TfidfTransformer` can be appended to it in a pipeline if
required, as sketched below.
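
A rough sketch of such a pipeline, reusing the toy ``corpus`` from above (the
parameter values are only indicative)::

    from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
    from sklearn.pipeline import Pipeline

    hashing_tfidf = Pipeline([
        ('hash', HashingVectorizer(n_features=2 ** 18)),  # stateless hashed term counts
        ('tfidf', TfidfTransformer()),                    # IDF re-weighting (stateful)
    ])
    X = hashing_tfidf.fit_transform(corpus)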


Customizing the vectorizer classes
- -----------------------------------
+ ----------------------------------

It is possible to customize the behavior by passing a callable
to the vectorizer constructor::
2 changes: 2 additions & 0 deletions doc/modules/kernel_approximation.rst
@@ -28,6 +28,8 @@ is advisable to compare results against exact kernel methods when possible.

.. currentmodule:: sklearn.kernel_approximation

.. _nystroem_kernel_approx:

Nystroem Method for Kernel Approximation
----------------------------------------
The Nystroem method, as implemented in :class:`Nystroem` is a general method
2 changes: 2 additions & 0 deletions doc/modules/linear_model.rst
@@ -701,6 +701,8 @@ The last characteristic implies that the Perceptron is slightly faster to
train than SGD with the hinge loss and that the resulting models are
sparser.

.. _passive_aggressive:

Passive Aggressive Algorithms
=============================

3 changes: 2 additions & 1 deletion doc/modules/manifold.rst
@@ -297,6 +297,7 @@ The overall complexity of standard HLLE is
high-dimensional data" <http://www.pnas.org/content/100/10/5591>`_
Donoho, D. & Grimes, C. Proc Natl Acad Sci USA. 100:5591 (2003)

.. _spectral_embedding:

Spectral Embedding
====================
@@ -498,7 +499,7 @@ Tips on practical use

.. seealso::

- :ref:`random_hashing` can also be useful to derive non-linear
+ :ref:`random_trees_embedding` can also be useful to derive non-linear
representations of the feature space, although it does not perform
dimensionality reduction.

28 changes: 28 additions & 0 deletions doc/modules/metrics.rst
@@ -35,6 +35,34 @@ kernel:

.. currentmodule:: sklearn.metrics.pairwise

Cosine similarity
-----------------
:func:`cosine_similarity` computes the L2-normalized dot product of vectors.
That is, if :math:`x` and :math:`y` are row vectors,
their cosine similarity :math:`k` is defined as:

.. math::

    k(x, y) = \frac{x y^\top}{\|x\| \|y\|}

This is called cosine similarity, because Euclidean (L2) normalization
projects the vectors onto the unit sphere,
and their dot product is then the cosine of the angle between the points
denoted by the vectors.

This kernel is a popular choice for computing the similarity of documents
represented as tf-idf vectors.
:func:`cosine_similarity` accepts ``scipy.sparse`` matrices.
(Note that the tf-idf functionality in ``sklearn.feature_extraction.text``
can produce normalized vectors, in which case :func:`cosine_similarity`
is equivalent to :func:`linear_kernel`, only slower.)
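
For instance, a small illustrative sketch with made-up documents::

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    docs = ["the cat sat on the mat",
            "the dog sat on the mat",
            "an entirely unrelated sentence"]
    X = TfidfVectorizer().fit_transform(docs)  # sparse (3, n_features) tf-idf matrix
    S = cosine_similarity(X)                   # (3, 3) array of pairwise similarities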

.. topic:: References:

* C.D. Manning, P. Raghavan and H. Schütze (2008). Introduction to
Information Retrieval. Cambridge University Press.
http://nlp.stanford.edu/IR-book/html/htmledition/the-vector-space-model-for-scoring-1.html

Chi Squared Kernel
------------------
The chi squared kernel is a very popular choice for training non-linear SVMs in
1 change: 1 addition & 0 deletions doc/modules/model_evaluation.rst
@@ -762,6 +762,7 @@ Clustering metrics
The :mod:`sklearn.metrics` module implements several losses, scores and utility
functions; for more information see the :ref:`clustering_evaluation` section.

.. _dummy_estimators:

Dummy estimators
=================
2 changes: 2 additions & 0 deletions doc/modules/preprocessing.rst
@@ -306,6 +306,8 @@ to be used when the transformer API is not necessary.
representation upstream.


.. _preprocessing_categorical_features:

Encoding categorical features
=============================
Often features are not given as continuous values but categorical.
8 changes: 8 additions & 0 deletions doc/themes/scikit-learn/layout.html
@@ -51,6 +51,14 @@
Latest stable version</a></p></p>
</div>
{%- endif %}
{%- if theme_surveybanner == true %}
<div class="survey-wrapper">
<p>Please help us to improve <b>scikit-learn</b>
by completing this quick survey.
&mdash; <a href="https://docs.google.com/spreadsheet/viewform?formkey=dFdyeGNhMzlCRWZUdldpMEZlZ1B1YkE6MQ#gid=0">
Click here.</a></p>
</div>
{%- endif %}

<div class="header-wrapper">
<div class="header">