diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py index f731e2041c2b8..a3d725e4bc62b 100644 --- a/benchmarks/bench_plot_omp_lars.py +++ b/benchmarks/bench_plot_omp_lars.py @@ -98,8 +98,8 @@ def compute_bench(samples_range, features_range): if __name__ == '__main__': - samples_range = np.linspace(1000, 3000, 5).astype(np.int) - features_range = np.linspace(1000, 3000, 5).astype(np.int) + samples_range = np.linspace(1000, 5000, 5).astype(np.int) + features_range = np.linspace(1000, 5000, 5).astype(np.int) results = compute_bench(samples_range, features_range) max_time = max(np.max(t) for t in results.itervalues()) @@ -107,7 +107,8 @@ def compute_bench(samples_range, features_range): fig = pl.figure() for i, (label, timings) in enumerate(sorted(results.iteritems())): ax = fig.add_subplot(1, 2, i) - pl.matshow(timings, fignum=False, cmap='OrRd') + vmax = max(1 - timings.min(), -1 + timings.max()) + pl.matshow(timings, fignum=False, vmin=1-vmax, vmax=1+vmax) ax.set_xticklabels([''] + map(str, samples_range)) ax.set_yticklabels([''] + map(str, features_range)) pl.xlabel('n_samples') diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 8e2b3531db545..18c66b8975cbc 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -47,6 +47,31 @@ These datasets are useful to quickly illustrate the behavior of the various algorithms implemented in the scikit. They are however often too small to be representative of real world machine learning tasks. +Sample images +============= + +The scikit also embeds a couple of sample JPEG images published under a Creative +Commons license by their authors. Those images can be useful to test algorithms +and pipelines on 2D data. + +.. autosummary:: + + load_sample_images + load_sample_image + +.. note:: + + The default coding of images is based on the ``uint8`` dtype to + spare memory. Often machine learning algorithms work best if the + input is converted to a floating point representation first. Also, + if you plan to use ``pylab.imshow`` don't forget to scale to the range + 0 - 1 as done in the following example. + +.. topic:: Examples: + + * :ref:`example_cluster_plot_vq_china.py` + + Sample generators ================= diff --git a/doc/developers/index.rst b/doc/developers/index.rst index 23dd7dfff623c..c2fb3caceb7ee 100644 --- a/doc/developers/index.rst +++ b/doc/developers/index.rst @@ -328,6 +328,11 @@ classifier or a regressor. All estimators implement the fit method:: estimator.fit(X, y) +All built-in estimators also have a ``set_params`` method, which sets +data-independent parameters (overriding previous parameter values passed +to ``__init__``). This method is not required for an object to be an +estimator. + Instantiation ^^^^^^^^^^^^^ diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index bfd4c440fa48d..76b856de140af 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -561,6 +561,7 @@ Manifold learning :template: class.rst manifold.LocallyLinearEmbedding + manifold.Isomap ..
autosummary:: diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 0479a35820a20..418aaf6771bfa 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -406,7 +406,7 @@ homogeneous but not complete:: Mathematical formulation ------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~ Homogeneity and completeness scores are formally given by: diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e718a52165d03..3fab5008aa43c 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -466,7 +466,7 @@ By default :math:`\alpha_1 = \alpha_2 = \lambda_1 = \lambda_2 = 1.e^{-6}`, *i.e >>> clf = linear_model.BayesianRidge() >>> clf.fit (X, Y) BayesianRidge(n_iter=300, verbose=False, lambda_1=1e-06, lambda_2=1e-06, - fit_intercept=True, eps=0.001, alpha_2=1e-06, alpha_1=1e-06, + fit_intercept=True, alpha_2=1e-06, tol=0.001, alpha_1=1e-06, compute_score=False) After being fitted, the model can then be used to predict new values:: diff --git a/doc/modules/manifold.rst b/doc/modules/manifold.rst index 5949e420000d0..ecf22534a8133 100644 --- a/doc/modules/manifold.rst +++ b/doc/modules/manifold.rst @@ -20,52 +20,344 @@ Manifold learning +.. figure:: ../auto_examples/manifold/images/plot_compare_methods_1.png + :target: ../auto_examples/manifold/plot_compare_methods.html + :align: center + :scale: 60 + Manifold learning is an approach to nonlinear dimensionality reduction. Algorithms for this task are based on the idea that the dimensionality of many data sets is only artificially high. +Introduction +============ + +High-dimensional datasets can be very difficult to visualize. While data +in two or three dimensions can be plotted to show the inherent +structure of the data, equivalent high-dimensional plots are much less +intuitive. To aid visualization of the structure of a dataset, the +dimension must be reduced in some way. + +The simplest way to accomplish this dimensionality reduction is by taking +a random projection of the data. Though this allows some degree of +visualization of the data structure, the randomness of the choice leaves much +to be desired. In a random projection, it is likely that the more +interesting structure within the data will be lost. + + +.. |digits_img| image:: ../auto_examples/manifold/images/plot_lle_digits_1.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :scale: 50 + +.. |projected_img| image:: ../auto_examples/manifold/images/plot_lle_digits_2.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :scale: 50 + +.. centered:: |digits_img| |projected_img| + + +To address this concern, a number of supervised and unsupervised linear +dimensionality reduction frameworks have been designed, such as Principal +Component Analysis (PCA), Independent Component Analysis, Linear +Discriminant Analysis, and others. These algorithms define specific +rubrics to choose an "interesting" linear projection of the data. +These methods can be powerful, but often miss important nonlinear +structure in the data. + + +.. |PCA_img| image:: ../auto_examples/manifold/images/plot_lle_digits_3.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :scale: 50 + +.. |LDA_img| image:: ../auto_examples/manifold/images/plot_lle_digits_4.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :scale: 50 + +.. centered:: |PCA_img| |LDA_img| + +Manifold Learning can be thought of as an attempt to generalize linear +frameworks like PCA to be sensitive to nonlinear structure in data. 
Though +supervised variants exist, the typical manifold learning problem is +unsupervised: it learns the high-dimensional structure of the data +from the data itself, without the use of predetermined classifications. + + +.. topic:: Examples: + + * See :ref:`example_manifold_plot_lle_digits.py` for an example of + dimensionality reduction on handwritten digits. + + * See :ref:`example_manifold_plot_compare_methods.py` for an example of + dimensionality reduction on a toy "S-curve" dataset. + +The manifold learning implementations available in scikits-learn are +summarized below. + +Isomap +====== + +One of the earliest approaches to manifold learning is the Isomap +algorithm, short for Isometric Mapping. Isomap can be viewed as an +extension of Multi-dimensional Scaling (MDS) or Kernel PCA. +Isomap seeks a lower-dimensional embedding which maintains geodesic +distances between all points. Isomap can be performed with the object +:class:`Isomap`. + +.. figure:: ../auto_examples/manifold/images/plot_lle_digits_5.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :align: center + :scale: 50 + +Complexity +---------- +The Isomap algorithm comprises three stages: + +1. **Nearest neighbor search.** Isomap uses + :class:`scikits.learn.neighbors.BallTree` for efficient neighbor search. + The cost is approximately :math:`O[D \log(k) N \log(N)]`, for :math:`k` + nearest neighbors of :math:`N` points in :math:`D` dimensions. + +2. **Shortest-path graph search.** The most efficient known algorithms + for this are *Dijkstra's Algorithm*, which is approximately + :math:`O[N^2(k + \log(N))]`, or the *Floyd-Warshall algorithm*, which + is :math:`O[N^3]`. The algorithm can be selected by the user with + the ``path_method`` keyword of ``Isomap``. If unspecified, the code + attempts to choose the best algorithm for the input data. + +3. **Partial eigenvalue decomposition.** The embedding is encoded in the + eigenvectors corresponding to the :math:`d` largest eigenvalues of the + :math:`N \times N` isomap kernel. For a dense solver, the cost is + approximately :math:`O[d N^2]`. This cost can often be improved using + the ``ARPACK`` solver. The eigensolver can be specified by the user + with the ``eigen_solver`` keyword of ``Isomap``. If unspecified, the + code attempts to choose the best algorithm for the input data. + +The overall complexity of Isomap is +:math:`O[D \log(k) N \log(N)] + O[N^2(k + \log(N))] + O[d N^2]`. + +* :math:`N` : number of training data points +* :math:`D` : input dimension +* :math:`k` : number of nearest neighbors +* :math:`d` : output dimension + +.. topic:: References: + + * `"A global geometric framework for nonlinear dimensionality reduction" + `_ + Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. Science 290 (5500) + Locally Linear Embedding ------------------------- +======================== + +Locally linear embedding (LLE) seeks a lower-dimensional projection of the data +which preserves distances within local neighborhoods. It can be thought +of as a series of local Principal Component Analyses which are globally +compared to find the best nonlinear embedding. Locally linear embedding can be performed with function :func:`locally_linear_embedding` or its object-oriented counterpart :class:`LocallyLinearEmbedding`. -These take as input a set of points in a high-dimensional space and return -those points embedded in a manifold of dimension specified by parameter -``out_dim``. +..
figure:: ../auto_examples/manifold/images/plot_lle_digits_6.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :align: center + :scale: 50 + +Complexity +---------- + +The standard LLE algorithm comprises three stages: + +1. **Nearest Neighbors Search**. See discussion under Isomap above. + +2. **Weight Matrix Construction**. :math:`O[D N k^3]`. + The construction of the LLE weight matrix involves the solution of a + :math:`k \times k` linear equation for each of the :math:`N` local + neighborhoods -.. figure:: ../auto_examples/manifold/images/plot_lle_digits_4.png +3. **Partial Eigenvalue Decomposition**. See discussion under Isomap above. + +The overall complexity of standard LLE is +:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[d N^2]`. + +* :math:`N` : number of training data points +* :math:`D` : input dimension +* :math:`k` : number of nearest neighbors +* :math:`d` : output dimension + +.. topic:: References: + + * `"Nonlinear dimensionality reduction by locally linear embedding" + `_ + Roweis, S. & Saul, L. Science 290:2323 (2000) + + +Modified Locally Linear Embedding +================================= + +One well-known issue with LLE is the regularization problem. When the number +of neighbors is greater than the number of input dimensions, the matrix +defining each local neighborhood is rank-deficient. To address this, standard +LLE applies an arbitrary regularization parameter :math:`r`, which is chosen +relative to the trace of the local weight matrix. Though it can be shown +formally that as :math:`r \to 0`, the solution converges to the desired +embedding, there is no guarantee that the optimal solution will be found +for :math:`r > 0`. This problem manifests itself in embeddings which distort +the underlying geometry of the manifold. + +One method to address the regularization problem is to use multiple weight +vectors in each neighborhood. This is the essence of *modified locally +linear embedding* (MLLE). MLLE can be performed with function +:func:`locally_linear_embedding` or its object-oriented counterpart +:class:`LocallyLinearEmbedding`, with the keyword ``method = 'modified'``. +It requires ``n_neighbors > out_dim``. + +.. figure:: ../auto_examples/manifold/images/plot_lle_digits_7.png :target: ../auto_examples/manifold/plot_lle_digits.html :align: center + :scale: 50 + +Complexity +---------- + The MLLE algorithm comprises three stages: -.. topic:: Examples: +1. **Nearest Neighbors Search**. Same as standard LLE - * See :ref:`example_manifold_plot_lle_digits.py` for an example of - dimensionality reduction on handwritten digits. +2. **Weight Matrix Construction**. Approximately + :math:`O[D N k^3] + O[N (k-D) k^2]`. The first term is exactly equivalent + to that of standard LLE. The second term has to do with constructing the + weight matrix from multiple weights. In practice, the added cost of + constructing the MLLE weight matrix is relatively small compared to the + cost of steps 1 and 3. + +3. **Partial Eigenvalue Decomposition**. Same as standard LLE + +The overall complexity of MLLE is +:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[N (k-D) k^2] + O[d N^2]`. + +* :math:`N` : number of training data points +* :math:`D` : input dimension +* :math:`k` : number of nearest neighbors +* :math:`d` : output dimension + +.. topic:: References: + + * `"MLLE: Modified Locally Linear Embedding Using Multiple Weights" + `_ + Zhang, Z. & Wang, J. - * See :ref:`example_manifold_plot_swissroll.py` for an example of - locally linear embedding on the swiss roll.
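+
+For illustration only, a minimal MLLE usage sketch on the "S-curve" toy
+dataset (the parameter values are indicative rather than recommendations;
+the calls mirror those used in the examples referenced above)::
+
+    from scikits.learn import manifold, datasets
+
+    n_neighbors, out_dim = 10, 2
+    X, color = datasets.samples_generator.make_s_curve(1000)
+    mlle = manifold.LocallyLinearEmbedding(n_neighbors, out_dim=out_dim,
+                                           method='modified')
+    X_mlle = mlle.fit_transform(X)  # array of shape (1000, 2)
+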
+Hessian Eigenmapping +==================== +Hessian Eigenmapping (also known as Hessian-based LLE: HLLE) is another method +of solving the regularization problem of LLE. It revolves around a +hessian-based quadratic form at each neighborhood which is used to recover +the locally linear structure. Though other implementations note its poor +scaling with data size, ``scikits-learn`` implements some algorithmic +improvements which make its cost comparable to that of other LLE variants +for small output dimension. HLLE can be performed with function +:func:`locally_linear_embedding` or its object-oriented counterpart +:class:`LocallyLinearEmbedding`, with the keyword ``method = 'hessian'``. +It requires ``n_neighbors > out_dim * (out_dim + 3) / 2``. + +.. figure:: ../auto_examples/manifold/images/plot_lle_digits_8.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :align: center + :scale: 50 + Complexity ---------- -The complete algorithm scales using the `dense` eigensolver scales as -:math:`O(N log(N)) + O(D N K^3) + O(d N^2)`, where N is the number of samples, -D is the input dimension, d the output dimension and K the number of -neighbors. If the `lobcpg` solver is used, the last term can be reduced to -sub-quadratic in N. +The HLLE algorithm comprises three stages: + +1. **Nearest Neighbors Search**. Same as standard LLE + +2. **Weight Matrix Construction**. Approximately + :math:`O[D N k^3] + O[N d^6]`. The first term reflects a similar + cost to that of standard LLE. The second term comes from a QR + decomposition of the local hessian estimator. + +3. **Partial Eigenvalue Decomposition**. Same as standard LLE + +The overall complexity of standard HLLE is +:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[N d^6] + O[d N^2]`. + +* :math:`N` : number of training data points +* :math:`D` : input dimension +* :math:`k` : number of nearest neighbors +* :math:`d` : output dimension + +.. topic:: References: + + * `"Hessian Eigenmaps: Locally linear embedding techniques for + high-dimensional data" `_ + Donoho, D. & Grimes, C. Proc Natl Acad Sci USA. 100:5591 (2003) + + +Local Tangent Space Alignment +============================= + +Though not technically a variant of LLE, Local tangent space alignment (LTSA) +is algorithmically similar enough to LLE that it can be put in this category. +Rather than focusing on preserving neighborhood distances as in LLE, LTSA +seeks to characterize the local geometry at each neighborhood via its +tangent space, and performs a global optimization to align these local +tangent spaces to learn the embedding. LTSA can be performed with function +:func:`locally_linear_embedding` or its object-oriented counterpart +:class:`LocallyLinearEmbedding`, with the keyword ``method = 'ltsa'``. + +.. figure:: ../auto_examples/manifold/images/plot_lle_digits_9.png + :target: ../auto_examples/manifold/plot_lle_digits.html + :align: center + :scale: 50 + +Complexity +---------- + +The LTSA algorithm comprises three stages: + +1. **Nearest Neighbors Search**. Same as standard LLE + +2. **Weight Matrix Construction**. Approximately + :math:`O[D N k^3] + O[k^2 d]`. The first term reflects a similar + cost to that of standard LLE. + +3. **Partial Eigenvalue Decomposition**. Same as standard LLE + +The overall complexity of standard LTSA is +:math:`O[D \log(k) N \log(N)] + O[D N k^3] + O[k^2 d] + O[d N^2]`. + +* :math:`N` : number of training data points +* :math:`D` : input dimension +* :math:`k` : number of nearest neighbors +* :math:`d` : output dimension + +.. 
topic:: References: + + * `"Principal manifolds and nonlinear dimensionality reduction via + tangent space alignment" + `_ + Zhang, Z. & Zha, H. Journal of Shanghai Univ. 8:406 (2004) Tips on practical use ---------------------- +===================== + +* Make sure the same scale is used over all features. Because manifold + learning methods are based on a nearest-neighbor search, the algorithm + may perform poorly otherwise. See :ref:`Scaler ` + for convenient ways of scaling heterogeneous data. -* Make sure the same scale is used over all features. Being this a - nearest-neighbors method it will behave poorly otherwise. +* The reconstruction error computed by each routine can be used to choose + the optimal output dimension. For a :math:`d`-dimensional manifold embedded + in a :math:`D`-dimensional parameter space, the reconstruction error will + decrease as ``out_dim`` is increased until ``out_dim == d``. -* On certain problems, the `lobcpg` solver might converge slowly. Supply a - generous value for `max_iter` if big oscillations are detected between runs. +* Note that noisy data can "short-circuit" the manifold, in essence acting + as a bridge between parts of the manifold that would otherwise be + well-separated. Manifold learning on noisy and/or incomplete data is + an active area of research. \ No newline at end of file diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 8079bf6f21040..443a276acc817 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -10,6 +10,7 @@ The ``scikits.learn.preprocessing`` package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. +.. _preprocessing_scaler: Standardization or Mean Removal and Variance Scaling ==================================================== diff --git a/examples/cluster/plot_segmentation_toy.py b/examples/cluster/plot_segmentation_toy.py index 8ae07f9c9b312..e810c25b2bfd4 100644 --- a/examples/cluster/plot_segmentation_toy.py +++ b/examples/cluster/plot_segmentation_toy.py @@ -68,7 +68,9 @@ # dependant from the gradient the segmentation is close to a voronoi graph.data = np.exp(-graph.data/graph.data.std()) -labels = spectral_clustering(graph, k=4) +# Force the solver to be arpack, since amg is numerically +# unstable on this example +labels = spectral_clustering(graph, k=4, mode='arpack') label_im = -np.ones(mask.shape) label_im[mask] = labels @@ -86,7 +88,7 @@ graph = image.img_to_graph(img, mask=mask) graph.data = np.exp(-graph.data/graph.data.std()) -labels = spectral_clustering(graph, k=2) +labels = spectral_clustering(graph, k=2, mode='arpack') label_im = -np.ones(mask.shape) label_im[mask] = labels diff --git a/examples/cluster/plot_vq_china.py b/examples/cluster/plot_vq_china.py new file mode 100644 index 0000000000000..8518f1b1e623b --- /dev/null +++ b/examples/cluster/plot_vq_china.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +""" +============================================ +Vector Quantization of a photo using k-means +============================================ + +Performs a pixel-wise Vector Quantization of an image, reducing the number of +colors required to show the image while preserving the overall appearance. 
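+
+As a rough, illustrative calculation (not performed by this script): the
+sample photo is a 427 x 640 RGB image, i.e. 427 * 640 * 3 = 819,840 uint8
+values, whereas the quantized result only needs the small codebook of
+cluster centers plus one cluster label per pixel (427 * 640 = 273,280
+labels).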
+""" +print __doc__ +import numpy as np +import pylab as pl +from scikits.learn.cluster import KMeans +from scikits.learn.datasets import load_sample_image +from scikits.learn.utils import shuffle +from time import time + +# Load the Summer Palace photo +china = load_sample_image("china.jpg") + +# Convert to floats instead of the default 8-bit integer coding. Dividing by +# 255 is important so that pl.imshow works well on float data (it needs to be +# in the range [0-1]). +china = np.array(china, dtype=np.float64) / 255 + +# Load Image and transform to a 2D numpy array. +w, h, d = original_shape = tuple(china.shape) +assert d == 3 +image_array = np.reshape(china, (w * h, d)) + +print "Fitting estimator on a small sub-sample of the data" +t0 = time() +image_array_sample = shuffle(image_array, random_state=0)[:1000] +kmeans = KMeans(k=10, random_state=0).fit(image_array_sample) +print "done in %0.3fs." % (time() - t0) + +# Get labels for all points +print "Predicting labels on the full image" +t0 = time() +labels = kmeans.predict(image_array) +print "done in %0.3fs." % (time() - t0) + +def recreate_image(codebook, labels, w, h): + """Recreate the (compressed) image from the code book & labels""" + d = codebook.shape[1] + image = np.zeros((w, h, d)) + label_idx = 0 + for i in range(w): + for j in range(h): + image[i][j] = codebook[labels[label_idx]] + label_idx += 1 + return image + +# Display all results, alongside original image +pl.figure(1) +pl.clf() +ax = pl.axes([0, 0, 1, 1]) +pl.axis('off') +pl.imshow(china) + +pl.figure(2) +pl.clf() +ax = pl.axes([0, 0, 1, 1]) +pl.axis('off') +pl.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h)) +pl.show() diff --git a/examples/cluster/vq_china.py b/examples/cluster/vq_china.py deleted file mode 100644 index c0b048af1f932..0000000000000 --- a/examples/cluster/vq_china.py +++ /dev/null @@ -1,69 +0,0 @@ -# -*- coding: utf-8 -*- -""" -========================================= -Vector Quantization of Lena using k-means -========================================= - -Performs a Vector Quatization of an image, reducing the -number of colors required to show the image. -""" -print __doc__ -import os -import numpy as np -import pylab as pl -from scikits.learn.cluster import KMeans -from scikits.learn.datasets import load_sample_images - -# Get all sample images and obtain just china.jpg -sample_image_name = "china.jpg" -sample_images = load_sample_images() -index = None -for i, filename in enumerate(sample_images.filenames): - if filename.endswith(sample_image_name): - index = i - break -if index is None: - raise AttributeError("Cannot find sample image: %s" % sample_image_name) -image_data = sample_images.images[index] - -# Load Image and transform to a 2D numpy array. -w, h, d = original_shape = tuple(image_data.shape) -image_array = np.reshape(image_data, (w * h, 3)) - -# Take a sample of the data. -sample_indices = range(len(image_array)) -np.random.shuffle(sample_indices) -sample_indices = sample_indices[:int(len(image_array) * 0.2)] -sample_data = image_array[sample_indices] - -# Perform Vector Quantisation with 256 clusters. -k = 256 -kmeans = KMeans(k=k) -kmeans.fit(sample_data) -# Get labels for all points -labels = kmeans.predict(image_array) -# Save the reduced dataset. Only the centroids and labels need to be saved.
-reduced_image = (kmeans.cluster_centers_, labels) - -def recreate_image(centroids, labels, w, h): - # Recreates the (compressed) image from centroids, labels and dimensions - d = len(centroids[0]) - image = np.zeros((w, h, d)) - label_num = 0 - for i in range(w): - for j in range(h): - image[i][j] = centroids[labels[label_num]] - label_num += 1 - print np.histogram(labels) - print set(labels) - return image - -# Display all results, alongside original image -pl.figure() -ax = pl.axes([0,0,1,1], frameon=False) -ax.set_axis_off() -centroids, labels = reduced_image -im = pl.imshow(recreate_image(centroids, labels, w, h)) - -pl.show() - diff --git a/examples/manifold/plot_compare_methods.py b/examples/manifold/plot_compare_methods.py index e7944c63bd56c..d6416f51b259d 100644 --- a/examples/manifold/plot_compare_methods.py +++ b/examples/manifold/plot_compare_methods.py @@ -1,21 +1,10 @@ -r""" +""" ========================================= Comparison of Manifold Learning methods ========================================= An illustration of dimensionality reduction on the S-curve dataset -with various manifold learning methods. The methods are as follows: - -* LLE : Standard Locally Linear Embedding. - :func:`scikits.learn.manifold.locally_linear`, ``method = 'standard'`` -* LTSA : Local Tangent Space Alignment. - :func:`scikits.learn.manifold.locally_linear`, ``method = 'ltsa'`` -* Hessian LLE : Hessian Eigenmapping. - :func:`scikits.learn.manifold.locally_linear`, ``method = 'hessian`` -* Modified LLE : Modified Locally Linear Embedding with multiple weights. - :func:`scikits.learn.manifold.locally_linear`, ``method = 'modified'`` -* Isomap : Isometric Mapping. - :func:`scikits.learn.manifold.Isomap` +with various manifold learning methods. For a discussion and comparison of these algorithms, see the :ref:`manifold module page ` @@ -33,13 +22,11 @@ from scikits.learn import manifold, datasets -X, color = datasets.samples_generator.make_s_curve(1000) +n_points = 1000 +X, color = datasets.samples_generator.make_s_curve(n_points) n_neighbors = 10 out_dim = 2 -methods = ['standard', 'ltsa', 'hessian', 'modified'] -labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE'] - fig = pl.figure(figsize=(12, 8)) pl.suptitle("Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors), fontsize=14) @@ -53,7 +40,8 @@ ax = fig.add_subplot(231, projection='3d') pl.scatter(X[:, 0], X[:, 2], c=color, cmap=pl.cm.Spectral) -ax.set_title('Original Data') +methods = ['standard', 'ltsa', 'hessian', 'modified'] +labels = ['LLE', 'LTSA', 'Hessian LLE', 'Modified LLE'] for i, method in enumerate(methods): t0 = time() @@ -79,5 +67,6 @@ pl.title("Isomap (%.2g sec)" % (t1 - t0)) ax.xaxis.set_major_formatter(NullFormatter()) ax.yaxis.set_major_formatter(NullFormatter()) +pl.axis('tight') pl.show() diff --git a/examples/manifold/plot_lle_digits.py b/examples/manifold/plot_lle_digits.py index 760bb9b534079..03148070d2d77 100644 --- a/examples/manifold/plot_lle_digits.py +++ b/examples/manifold/plot_lle_digits.py @@ -26,6 +26,7 @@ n_samples, n_features = X.shape n_neighbors = 30 + #---------------------------------------------------------------------- # Scale and visualize the embedding vectors def plot_embedding(X, title=None): @@ -57,6 +58,21 @@ def plot_embedding(X, title=None): pl.title(title) +#---------------------------------------------------------------------- +# Plot images of the digits +N = 20 +img = np.zeros((10 * N, 10 * N)) +for i in range(N): + ix = 10*i + 1 + for j in range(N): + iy = 10 * j + 1 + 
img[ix:ix + 8, iy:iy + 8] = X[i*N+j].reshape((8,8)) +pl.imshow(img, cmap=pl.cm.binary) +pl.xticks([]) +pl.yticks([]) +pl.title('A selection from the 64-dimensional digits dataset') + + #---------------------------------------------------------------------- # Random 2D projection using a random unitary matrix print "Computing random projection" @@ -89,6 +105,17 @@ def plot_embedding(X, title=None): (time() - t0)) +#---------------------------------------------------------------------- +# Isomap projection of the digits dataset +print "Computing Isomap embedding" +t0 = time() +X_iso = manifold.Isomap(n_neighbors, out_dim=2).fit_transform(X) +print "Done." +plot_embedding(X_iso, + "Isomap projection of the digits (time %.2fs)" % + (time() - t0)) + + #---------------------------------------------------------------------- # Locally linear embedding of the digits dataset print "Computing LLE embedding" @@ -115,19 +142,6 @@ def plot_embedding(X, title=None): (time() - t0)) -#---------------------------------------------------------------------- -# LTSA embedding of the digits dataset -print "Computing LTSA embedding" -clf = manifold.LocallyLinearEmbedding(n_neighbors, out_dim=2, - method='ltsa') -t0 = time() -X_ltsa = clf.fit_transform(X) -print "Done. Reconstruction error: %g" % clf.reconstruction_error_ -plot_embedding(X_ltsa, - "Local Tangent Space Alignment of the digits (time %.2fs)" % - (time() - t0)) - - #---------------------------------------------------------------------- # HLLE embedding of the digits dataset print "Computing Hessian LLE embedding" @@ -142,15 +156,15 @@ def plot_embedding(X, title=None): #---------------------------------------------------------------------- -# Isomap projection of the digits dataset -print "Computing Isomap embedding" +# LTSA embedding of the digits dataset +print "Computing LTSA embedding" +clf = manifold.LocallyLinearEmbedding(n_neighbors, out_dim=2, + method='ltsa') t0 = time() -X_iso = manifold.Isomap(n_neighbors, out_dim=2).fit_transform(X) -print "Done." -plot_embedding(X_iso, - "Isomap projection of the digits (time %.2fs)" % +X_ltsa = clf.fit_transform(X) +print "Done. Reconstruction error: %g" % clf.reconstruction_error_ +plot_embedding(X_ltsa, + "Local Tangent Space Alignment of the digits (time %.2fs)" % (time() - t0)) - - pl.show() diff --git a/examples/svm/plot_svm_anova.py b/examples/svm/plot_svm_anova.py index 6187afe801218..100cd0593f67c 100644 --- a/examples/svm/plot_svm_anova.py +++ b/examples/svm/plot_svm_anova.py @@ -40,7 +40,7 @@ percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100) for percentile in percentiles: - clf._set_params(anova__percentile=percentile) + clf.set_params(anova__percentile=percentile) # Compute cross-validation score using all CPUs this_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1) score_means.append(this_scores.mean()) diff --git a/scikits/learn/base.py b/scikits/learn/base.py index 255b995240c75..944fa82209129 100644 --- a/scikits/learn/base.py +++ b/scikits/learn/base.py @@ -6,8 +6,10 @@ import inspect import numpy as np from scipy import sparse +import warnings from .metrics import r2_score +from .utils import deprecated ############################################################################### @@ -172,13 +174,17 @@ def _get_params(self, deep=True): out[key] = value return out - def _set_params(self, **params): + def set_params(self, **params): """ Set the parameters of the estimator. The method works on simple estimators as well as on nested objects (such as pipelines). 
The former have parameters of the - form __ so that the its possible to - update each component of the nested object. + form __ so that it's possible to update + each component of a nested object. + + Returns + ------- + self """ if not params: # Simple optimisation to gain speed (inspect is slow) @@ -198,7 +204,7 @@ def _set_params(self, **params): 'sub parameter %s' % (sub_name, self.__class__.__name__, sub_name) ) - sub_object._set_params(**{sub_name: value}) + sub_object.set_params(**{sub_name: value}) else: # simple objects case assert key in valid_params, ('Invalid parameter %s ' @@ -207,6 +213,13 @@ def _set_params(self, **params): setattr(self, key, value) return self + def _set_params(self, **params): + if params != {}: + warnings.warn("Passing estimator parameters to fit is deprecated;" + " use set_params instead", + category=DeprecationWarning) + return self.set_params(**params) + def __repr__(self): class_name = self.__class__.__name__ return '%s(%s)' % ( diff --git a/scikits/learn/cluster/affinity_propagation_.py b/scikits/learn/cluster/affinity_propagation_.py index 6db00c3cc07b1..a96d70d25eee9 100644 --- a/scikits/learn/cluster/affinity_propagation_.py +++ b/scikits/learn/cluster/affinity_propagation_.py @@ -210,7 +210,7 @@ def __init__(self, damping=.5, max_iter=200, convit=30, copy=True): self.convit = convit self.copy = copy - def fit(self, S, p=None, **params): + def fit(self, S, p=None): """compute MeanShift Parameters @@ -227,7 +227,6 @@ def fit(self, S, p=None, **params): algorithm, for memory efficiency """ - self._set_params(**params) self.cluster_centers_indices_, self.labels_ = affinity_propagation(S, p, max_iter=self.max_iter, convit=self.convit, damping=self.damping, diff --git a/scikits/learn/cluster/dbscan_.py b/scikits/learn/cluster/dbscan_.py index 7ba2c0cc680a3..00b68b2d63f2b 100644 --- a/scikits/learn/cluster/dbscan_.py +++ b/scikits/learn/cluster/dbscan_.py @@ -188,7 +188,7 @@ def fit(self, X, **params): Overwrite keywords from __init__. """ - self._set_params(**params) + self.set_params(**params) self.core_sample_indices_, self.labels_ = dbscan(X, **self._get_params()) self.components_ = X[self.core_sample_indices_].copy() diff --git a/scikits/learn/cluster/hierarchical.py b/scikits/learn/cluster/hierarchical.py index 5451fd49aff1f..ba98de6d262e8 100644 --- a/scikits/learn/cluster/hierarchical.py +++ b/scikits/learn/cluster/hierarchical.py @@ -292,7 +292,7 @@ def __init__(self, n_clusters=2, memory=Memory(cachedir=None, verbose=0), self.n_components = n_components self.connectivity = connectivity - def fit(self, X, **params): + def fit(self, X): """Fit the hierarchical clustering on the data Parameters @@ -304,8 +304,6 @@ def fit(self, X, **params): ------- self """ - self._set_params(**params) - memory = self.memory if isinstance(memory, basestring): memory = Memory(cachedir=memory) diff --git a/scikits/learn/cluster/k_means_.py b/scikits/learn/cluster/k_means_.py index f5056dbcfbcee..d709657caba5a 100644 --- a/scikits/learn/cluster/k_means_.py +++ b/scikits/learn/cluster/k_means_.py @@ -16,10 +16,11 @@ from ..base import BaseEstimator from ..metrics.pairwise import euclidean_distances -from ..utils import check_random_state from ..utils import check_arrays -from ..utils import shuffle +from ..utils import check_random_state from ..utils import gen_even_slices +from ..utils import shuffle +from ..utils import warn_if_not_float from . 
import _k_means @@ -229,7 +230,7 @@ def k_means(X, k, init='k-means++', n_init=10, max_iter=300, verbose=0, for i in range(max_iter): centers_old = centers.copy() labels, inertia = _e_step(X, centers, - x_squared_norms=x_squared_norms) + x_squared_norms=x_squared_norms) centers = _m_step(X, labels, k) if verbose: @@ -483,25 +484,22 @@ def __init__(self, k=8, init='k-means++', n_init=10, max_iter=300, self.random_state = random_state self.copy_x = copy_x - def _check_data(self, X, **params): - """ - Set parameters and check the sample given is larger than k - """ + def _check_data(self, X): + """Verify that the number of samples given is larger than k""" if sp.issparse(X): raise ValueError("K-Means does not support sparse input matrices.") X = np.asanyarray(X) if X.shape[0] < self.k: - raise ValueError("n_samples=%d should be larger than k=%d" % ( + raise ValueError("n_samples=%d should be >= k=%d" % ( X.shape[0], self.k)) - self._set_params(**params) return X - def fit(self, X, **params): - """Fit k-means""" - + def fit(self, X, y=None): + """Compute k-means""" self.random_state = check_random_state(self.random_state) - X = self._check_data(X, **params) + X = self._check_data(X) + warn_if_not_float(X, self) self.cluster_centers_, self.labels_, self.inertia_ = k_means( X, k=self.k, init=self.init, n_init=self.n_init, @@ -509,7 +507,7 @@ def fit(self, X, **params): tol=self.tol, random_state=self.random_state, copy_x=self.copy_x) return self - def transform(self, X): + def transform(self, X, y=None): """ Transform the data to a cluster-distance space In the new space, each dimension is the distance to the cluster centers. @@ -537,7 +535,7 @@ def transform(self, X): return euclidean_distances(X, self.cluster_centers_) def predict(self, X): - """ Predict the closest cluster each sample belongs to + """Predict the closest cluster each sample in X belongs to. In the vector quantization literature, `cluster_centers_` is called the code book and each value returned by `predict` is the index of @@ -550,17 +548,17 @@ def predict(self, X): Returns ------- - Y : array, shape [n_samples, ] + Y : array, shape [n_samples,] Index of the closest center each sample belongs to. """ if not hasattr(self, "cluster_centers_"): raise AttributeError("Model has not been trained yet. " "Fit k-means before using predict.") - cluster_shape = self.cluster_centers_.shape[1] - if not X.shape[1] == cluster_shape: + expected_n_features = self.cluster_centers_.shape[1] + if not X.shape[1] == expected_n_features: raise ValueError("Incorrect number of features. " - "Got %d features, expected %d" % (X.shape[1], - cluster_shape)) + "Got %d features, expected %d" % ( + X.shape[1], expected_n_features)) return _e_step(X, self.cluster_centers_)[0] @@ -633,8 +631,7 @@ def _mini_batch_step_sparse(X, batch_slice, centers, counts, x_squared_norms): class MiniBatchKMeans(KMeans): - """ - Mini-Batch K-Means clustering + """Mini-Batch K-Means clustering Parameters ---------- @@ -702,18 +699,17 @@ def __init__(self, k=8, init='random', max_iter=100, self.cluster_centers_ = None self.chunk_size = chunk_size - def fit(self, X, y=None, **params): - """ - Calculates the centroids on a batch X + def fit(self, X, y=None): + """Compute the centroids on X by chunking it into mini-batches. 
Parameters ---------- X: array-like, shape = [n_samples, n_features] Coordinates of the data points to cluster """ - self._set_params(**params) self.random_state = check_random_state(self.random_state) X = check_arrays(X, sparse_format="csr", copy=False)[0] + warn_if_not_float(X, self) n_samples, n_features = X.shape if n_samples < self.k: raise ValueError("Number of samples smaller than number "\ @@ -764,7 +760,7 @@ def fit(self, X, y=None, **params): return self - def partial_fit(self, X, y=None, **params): + def partial_fit(self, X, y=None): """Update k means estimate on a single mini-batch X. Parameters diff --git a/scikits/learn/cluster/mean_shift_.py b/scikits/learn/cluster/mean_shift_.py index 9f554487a795c..7e2edbd177d22 100644 --- a/scikits/learn/cluster/mean_shift_.py +++ b/scikits/learn/cluster/mean_shift_.py @@ -191,7 +191,7 @@ class MeanShift(BaseEstimator): def __init__(self, bandwidth=None): self.bandwidth = bandwidth - def fit(self, X, **params): + def fit(self, X): """ Compute MeanShift Parameters @@ -200,6 +200,5 @@ def fit(self, X, **params): Input points """ - self._set_params(**params) self.cluster_centers_, self.labels_ = mean_shift(X, self.bandwidth) return self diff --git a/scikits/learn/cluster/spectral.py b/scikits/learn/cluster/spectral.py index 82a4a01ca992b..20b75e8da6753 100644 --- a/scikits/learn/cluster/spectral.py +++ b/scikits/learn/cluster/spectral.py @@ -252,7 +252,7 @@ def __init__(self, k=8, mode=None, random_state=None): self.mode = mode self.random_state = random_state - def fit(self, X, **params): + def fit(self, X): """Compute the spectral clustering from the affinity matrix Parameters @@ -281,7 +281,6 @@ def fit(self, X, **params): speeds up computation. """ self.random_state = check_random_state(self.random_state) - self._set_params(**params) self.labels_ = spectral_clustering(X, k=self.k, mode=self.mode, random_state=self.random_state) return self diff --git a/scikits/learn/cluster/tests/test_dbscan.py b/scikits/learn/cluster/tests/test_dbscan.py index 8b05d98f3cd9b..7c36e710e7cd2 100644 --- a/scikits/learn/cluster/tests/test_dbscan.py +++ b/scikits/learn/cluster/tests/test_dbscan.py @@ -55,9 +55,8 @@ def test_dbscan_feature(): n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) - db = DBSCAN() - labels = db.fit(X, metric=metric, - eps=eps, min_samples=min_samples).labels_ + db = DBSCAN(metric=metric) + labels = db.fit(X, eps=eps, min_samples=min_samples).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters) @@ -80,9 +79,8 @@ def test_dbscan_callable(): n_clusters_1 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_1, n_clusters) - db = DBSCAN() - labels = db.fit(X, metric=metric, - eps=eps, min_samples=min_samples).labels_ + db = DBSCAN(metric=metric) + labels = db.fit(X, eps=eps, min_samples=min_samples).labels_ n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters) diff --git a/scikits/learn/cluster/tests/test_k_means.py b/scikits/learn/cluster/tests/test_k_means.py index e17c739340080..e7ba15410017e 100644 --- a/scikits/learn/cluster/tests/test_k_means.py +++ b/scikits/learn/cluster/tests/test_k_means.py @@ -14,12 +14,12 @@ n_clusters = 3 X = generate_clustered_data(n_clusters=n_clusters, std=.1) -S = sp.csr_matrix(X) +X_csr = sp.csr_matrix(X) def test_k_means_pp_init(): np.random.seed(1) - k_means = KMeans(init="k-means++").fit(X, k=n_clusters) + k_means = KMeans(init="k-means++", k=n_clusters).fit(X) centers 
= k_means.cluster_centers_ assert_equal(centers.shape, (n_clusters, 2)) @@ -31,7 +31,7 @@ def test_k_means_pp_init(): assert_equal(np.unique(labels[40:]).size, 1) # check error on dataset being too small - assert_raises(ValueError, k_means.fit, [[0., 1.]], k=n_clusters) + assert_raises(ValueError, k_means.fit, [[0., 1.]]) def test_mini_batch_k_means_pp_init(): @@ -49,20 +49,20 @@ def test_mini_batch_k_means_pp_init(): def test_sparse_mini_batch_k_means_pp_init(): np.random.seed(1) - sample = S[0:S.shape[0] / 2] + sample = X_csr[0:X_csr.shape[0] / 2] km = MiniBatchKMeans(init="random").partial_fit(sample) # Let's recalculate the inertia on the whole dataset - km.partial_fit(S) + km.partial_fit(X_csr) inertia = km.inertia_ - km.partial_fit(S[S.shape[0] / 2:]) + km.partial_fit(X_csr[X_csr.shape[0] / 2:]) # And again - km.partial_fit(S) + km.partial_fit(X_csr) assert(km.inertia_ < inertia) def test_k_means_pp_random_init(): np.random.seed(1) - k_means = KMeans(init="random").fit(X, k=n_clusters) + k_means = KMeans(init="random", k=n_clusters).fit(X) centers = k_means.cluster_centers_ assert_equal(centers.shape, (n_clusters, 2)) @@ -74,13 +74,13 @@ def test_k_means_pp_random_init(): assert_equal(np.unique(labels[40:]).size, 1) # check error on dataset being too small - assert_raises(ValueError, k_means.fit, [[0., 1.]], k=n_clusters) + assert_raises(ValueError, k_means.fit, [[0., 1.]]) def test_k_means_fixed_array_init(): np.random.seed(1) init_array = np.vstack([X[5], X[25], X[45]]) - k_means = KMeans(init=init_array, n_init=1).fit(X, k=n_clusters) + k_means = KMeans(init=init_array, n_init=1, k=n_clusters).fit(X) centers = k_means.cluster_centers_ assert_equal(centers.shape, (n_clusters, 2)) @@ -92,20 +92,20 @@ def test_k_means_fixed_array_init(): assert_equal(np.unique(labels[40:]).size, 1) # check error on dataset being too small - assert_raises(ValueError, k_means.fit, [[0., 1.]], k=n_clusters) + assert_raises(ValueError, k_means.fit, [[0., 1.]]) def test_k_means_invalid_init(): np.random.seed(1) - k_means = KMeans(init="invalid", n_init=1) - assert_raises(ValueError, k_means.fit, X, k=n_clusters) + k_means = KMeans(init="invalid", n_init=1, k=n_clusters) + assert_raises(ValueError, k_means.fit, X) def test_k_means_copyx(): """Check if copy_x=False returns nearly equal X after de-centering.""" np.random.seed(1) my_X = X.copy() - k_means = KMeans(copy_x=False).fit(my_X, k=n_clusters) + k_means = KMeans(copy_x=False, k=n_clusters).fit(my_X) centers = k_means.cluster_centers_ assert_equal(centers.shape, (n_clusters, 2)) @@ -124,7 +124,7 @@ def test_k_means_singleton(): np.random.seed(1) my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 0.9]]) array_init = np.array([[1.0, 1.0], [5.0, 5.0]]) - k_means = KMeans(init=array_init).fit(my_X, k=2) + k_means = KMeans(init=array_init, k=2).fit(my_X) # must be singleton clustering assert_equal(np.unique(k_means.labels_).size, 1) @@ -133,7 +133,7 @@ def test_k_means_singleton(): def test_mbk_means_fixed_array_init(): np.random.seed(1) init_array = np.vstack([X[5], X[25], X[45]]) - mbk_means = MiniBatchKMeans(init=init_array).fit(X) + mbk_means = MiniBatchKMeans(init=init_array, k=n_clusters).fit(X) centers = mbk_means.cluster_centers_ assert_equal(centers.shape, (n_clusters, 2)) @@ -141,13 +141,13 @@ def test_mbk_means_fixed_array_init(): labels = mbk_means.labels_ assert_equal(np.unique(labels).size, 3) - assert_raises(ValueError, mbk_means.fit, [[0., 1.]], k=n_clusters) + assert_raises(ValueError, mbk_means.fit, [[0., 1.]]) def 
test_sparse_mbk_means_fixed_array_init(): np.random.seed(1) init_array = np.vstack([X[5], X[25], X[45]]) - mbk_means = MiniBatchKMeans(init=init_array).fit(S) + mbk_means = MiniBatchKMeans(init=init_array).fit(X_csr) centers = mbk_means.cluster_centers_ assert_equal(centers.shape, (n_clusters, 2)) @@ -155,13 +155,13 @@ def test_sparse_mbk_means_fixed_array_init(): labels = mbk_means.labels_ assert_equal(np.unique(labels).size, 3) - assert_raises(ValueError, mbk_means.fit, [[0., 1.]], k=n_clusters) + assert_raises(ValueError, mbk_means.fit, [[0., 1.]]) def test_sparse_mbk_means_pp_init(): np.random.seed(1) - mbk_means = MiniBatchKMeans(init="k-means++") - assert_raises(ValueError, mbk_means.fit, S, k=n_clusters) + mbk_means = MiniBatchKMeans(init="k-means++", k=n_clusters) + assert_raises(ValueError, mbk_means.fit, X_csr) def test_sparse_mbk_means_callable_init(): @@ -169,7 +169,7 @@ def test_sparse_mbk_means_callable_init(): def test_init(Xbar, k, random_state): return np.vstack([X[5], X[25], X[45]]) - mbk_means = MiniBatchKMeans(init=test_init).fit(S) + mbk_means = MiniBatchKMeans(init=test_init).fit(X_csr) centers = mbk_means.cluster_centers_ assert_equal(centers.shape, (n_clusters, 2)) @@ -177,17 +177,18 @@ def test_init(Xbar, k, random_state): labels = mbk_means.labels_ assert_equal(np.unique(labels).size, 3) - assert_raises(ValueError, mbk_means.fit, [[0., 1.]], k=n_clusters) + assert_raises(ValueError, mbk_means.fit, [[0., 1.]]) def test_k_means_fixed_array_init_fit(): np.random.seed(1) init_array = np.vstack([X[5], X[25], X[45]]) - k_means = KMeans(init=init_array, n_init=1).fit(X, k=n_clusters) + k_means = KMeans(init=init_array, n_init=1, k=n_clusters).fit(X) another_init_array = np.vstack([X[1], X[30], X[50]]) - other_k_means = KMeans(init=init_array, n_init=1) - other_k_means.fit(X, init=another_init_array, k=n_clusters) + other_k_means = KMeans(init=init_array, n_init=1, k=n_clusters) + other_k_means.set_params(init=another_init_array) + other_k_means.fit(X) assert_true(not np.allclose(k_means.init, other_k_means.init), "init attributes must be different") @@ -195,11 +196,11 @@ def test_k_means_fixed_array_init_fit(): def test_mbkm_fixed_array_init_fit(): np.random.seed(1) init_array = np.vstack([X[5], X[25], X[45]]) - k_means = MiniBatchKMeans(init=init_array).fit(X, k=n_clusters) + k_means = MiniBatchKMeans(init=init_array, k=n_clusters).fit(X) another_init_array = np.vstack([X[1], X[30], X[50]]) - other_k_means = MiniBatchKMeans(init=init_array) - other_k_means.fit(X, init=another_init_array, k=n_clusters) + other_k_means = MiniBatchKMeans(init=another_init_array, k=n_clusters) + other_k_means.fit(X) assert_true(not np.allclose(k_means.init, other_k_means.init), "init attributes must be different") @@ -209,12 +210,13 @@ def test_mbk_means(): true_labels = np.zeros((n_samples,), dtype=np.int32) true_labels[20:40] = 1 true_labels[40:] = 2 - chunk_size = n_samples * 2 + chunk_size = n_samples / 10 # make sure init clusters are in different clusters init_array = np.vstack([X[5], X[25], X[45]]) # shuffle original data - X_shuffled, S_shuffled, true_labels = shuffle(X, S, true_labels, random_state=1) + X_shuffled, X_csr_shuffled, true_labels = shuffle(X, X_csr, true_labels, + random_state=1) mbk_means = MiniBatchKMeans(init=init_array, chunk_size=chunk_size, k=n_clusters, random_state=1) @@ -223,16 +225,50 @@ def test_mbk_means(): mbk_means = MiniBatchKMeans(init=init_array, chunk_size=chunk_size, k=n_clusters, random_state=1) - mbk_means.fit(S_shuffled) + 
mbk_means.fit(X_csr_shuffled) assert_equal(true_labels, mbk_means.labels_) def test_predict(): - k_means = KMeans(k=n_clusters) - k_means.fit(X) + k_means = KMeans(k=n_clusters).fit(X) + + # sanity check: predict centroid labels pred = k_means.predict(k_means.cluster_centers_) assert_array_equal(pred, np.arange(n_clusters)) + # sanity check: re-predict labeling for training set samples + pred = k_means.predict(X) + assert_array_equal(k_means.predict(X), k_means.labels_) + + +def test_predict_minibatch(): + mbk_means = MiniBatchKMeans(k=n_clusters).fit(X) + + # sanity check: predict centroid labels + pred = mbk_means.predict(mbk_means.cluster_centers_) + assert_array_equal(pred, np.arange(n_clusters)) + + # sanity check: re-predict labeling for training set samples + pred = mbk_means.predict(X) + assert_array_equal(mbk_means.predict(X), mbk_means.labels_) + + +def test_predict_minibatch_sparse_input(): + mbk_means = MiniBatchKMeans(k=n_clusters).fit(X_csr) + + # sanity check: predict centroid labels + pred = mbk_means.predict(mbk_means.cluster_centers_) + assert_array_equal(pred, np.arange(n_clusters)) + + # sanity check: re-predict labeling for training set samples + pred = mbk_means.predict(X_csr) + assert_array_equal(mbk_means.predict(X), mbk_means.labels_) + + # check that models trained on sparse input also works for dense input at + # predict time + pred = mbk_means.predict(X) + assert_array_equal(mbk_means.predict(X), mbk_means.labels_) + def test_transform(): k_means = KMeans(k=n_clusters) diff --git a/scikits/learn/cluster/tests/test_spectral.py b/scikits/learn/cluster/tests/test_spectral.py index 3846f53ad293b..a2663160c760d 100644 --- a/scikits/learn/cluster/tests/test_spectral.py +++ b/scikits/learn/cluster/tests/test_spectral.py @@ -21,7 +21,7 @@ def test_spectral_clustering(): ]) for mat in (S, sparse.csr_matrix(S)): - model = SpectralClustering(random_state=0).fit(mat, k=2) + model = SpectralClustering(random_state=0, k=2).fit(mat) labels = model.labels_ if labels[0] == 0: labels = 1 - labels diff --git a/scikits/learn/covariance/empirical_covariance_.py b/scikits/learn/covariance/empirical_covariance_.py index 80860ef56150d..ff488a9ce85fb 100644 --- a/scikits/learn/covariance/empirical_covariance_.py +++ b/scikits/learn/covariance/empirical_covariance_.py @@ -108,7 +108,7 @@ def _set_estimates(self, covariance): else: self.precision_ = None - def fit(self, X, assume_centered=False, **params): + def fit(self, X, assume_centered=False): """ Fits the Maximum Likelihood Estimator covariance model according to the given training data and parameters. @@ -130,7 +130,6 @@ def fit(self, X, assume_centered=False, **params): Returns self. """ - self._set_params(**params) covariance = empirical_covariance(X, assume_centered=assume_centered) self._set_estimates(covariance) diff --git a/scikits/learn/covariance/shrunk_covariance_.py b/scikits/learn/covariance/shrunk_covariance_.py index 60f83ab8e5aa4..bd083663bd5c6 100644 --- a/scikits/learn/covariance/shrunk_covariance_.py +++ b/scikits/learn/covariance/shrunk_covariance_.py @@ -99,7 +99,7 @@ def __init__(self, store_precision=True, shrinkage=0.1): self.store_precision = store_precision self.shrinkage = shrinkage - def fit(self, X, assume_centered=False, **params): + def fit(self, X, assume_centered=False): """ Fits the shrunk covariance model according to the given training data and parameters. @@ -121,7 +121,6 @@ def fit(self, X, assume_centered=False, **params): Returns self. 
""" - self._set_params(**params) empirical_cov = empirical_covariance(X, assume_centered=assume_centered) covariance = shrunk_covariance(empirical_cov, self.shrinkage) diff --git a/scikits/learn/datasets/__init__.py b/scikits/learn/datasets/__init__.py index 9d06df2a0c9ba..4fec421c4ea9d 100644 --- a/scikits/learn/datasets/__init__.py +++ b/scikits/learn/datasets/__init__.py @@ -7,6 +7,7 @@ from .base import get_data_home from .base import clear_data_home from .base import load_sample_images +from .base import load_sample_image from .mlcomp import load_mlcomp from .lfw import load_lfw_pairs from .lfw import load_lfw_people diff --git a/scikits/learn/datasets/base.py b/scikits/learn/datasets/base.py index 6077374cc0917..8cc60fd2184da 100644 --- a/scikits/learn/datasets/base.py +++ b/scikits/learn/datasets/base.py @@ -10,7 +10,6 @@ import os import csv import shutil -import textwrap from os import environ from os.path import dirname from os.path import join @@ -348,9 +347,8 @@ def load_boston(): feature_names=feature_names, DESCR=fdescr.read()) - def load_sample_images(): - """ Load sample images for image manipulation. + """Load sample images for image manipulation. Return ------ @@ -366,20 +364,27 @@ def load_sample_images(): To load the data and visualize the images:: >>> from scikits.learn.datasets import load_sample_images - >>> images = load_sample_images() + >>> dataset = load_sample_images() + >>> len(dataset.images) + 2 + >>> first_img_data = dataset.images[0] + >>> first_img_data.shape # height, width, channels + (427, 640, 3) + >>> first_img_data.dtype + dtype('uint8') >>> # import pylab as pl >>> # pl.gray() - >>> # pl.matshow(images.images[0]) # Visualize the first image + >>> # pl.matshow(dataset.images[0]) # Visualize the first image >>> # pl.show() """ - # Try to import Image and imresize from PIL. We do this here to prevent + # Try to import imread from scipy. We do this lazily here to prevent # this module from depending on PIL. try: try: - from scipy.misc import Image + from scipy.misc import imread except ImportError: - from scipy.misc.pilutil import Image + from scipy.misc.pilutil import imread except ImportError: raise ImportError("The Python Imaging Library (PIL)" "is required to load data from jpeg files") @@ -389,9 +394,33 @@ def load_sample_images(): for filename in os.listdir(module_path) if filename.endswith(".jpg")] # Load image data for each image in the source folder. 
- images = [np.asarray(Image.open(filename)) - for filename in filenames] + images = [imread(filename) for filename in filenames] + return Bunch(images=images, filenames=filenames, DESCR=descr) - + +def load_sample_image(image_name): + """Load the numpy array of a single sample image + + >>> china = load_sample_image('china.jpg') + >>> china.dtype + dtype('uint8') + >>> china.shape + (427, 640, 3) + + >>> flower = load_sample_image('flower.jpg') + >>> flower.dtype + dtype('uint8') + >>> flower.shape + (427, 640, 3) + """ + images = load_sample_images() + index = None + for i, filename in enumerate(images.filenames): + if filename.endswith(image_name): + index = i + break + if index is None: + raise AttributeError("Cannot find sample image: %s" % image_name) + return images.images[index] diff --git a/scikits/learn/datasets/samples_generator.py b/scikits/learn/datasets/samples_generator.py index cdd4ea6693085..8539543ebb009 100644 --- a/scikits/learn/datasets/samples_generator.py +++ b/scikits/learn/datasets/samples_generator.py @@ -55,7 +55,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, weights : list of floats or None (default=None) The proportions of samples assigned to each class. If None, then - classes are balanced. + classes are balanced. Note that if `len(weights) == n_classes - 1`, + then the last class weight is automatically inferred. flip_y : float, optional (default=0.01) The fraction of samples whose class are randomly exchanged. @@ -306,7 +307,7 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, bias=0.0, n_features=n_features, effective_rank=effective_rank, tail_strength=tail_strength, - seed=generator) + random_state=generator) # Generate a ground truth model with only n_informative features being non # zeros (the other features are not correlated to y and should be ignored @@ -352,7 +353,7 @@ def make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=1.0, n_features : int, optional (default=2) The number of features for each sample. - centers : int or array of shape [n_centers, n_features], optinal (default=3) + centers : int or array of shape [n_centers, n_features], optional (default=3) The number of centers to generate, or the fixed center locations. cluster_std: float or sequence of floats, optional (default=1.0) diff --git a/scikits/learn/datasets/tests/test_samples_generator.py b/scikits/learn/datasets/tests/test_samples_generator.py index a083a17b2fd7c..50e5202ff0fbd 100644 --- a/scikits/learn/datasets/tests/test_samples_generator.py +++ b/scikits/learn/datasets/tests/test_samples_generator.py @@ -1,5 +1,5 @@ import numpy as np -from numpy.testing import assert_equal, assert_almost_equal, \ +from numpy.testing import assert_equal, assert_approx_equal, \ assert_array_almost_equal, assert_array_less from .. 
import make_classification @@ -18,8 +18,10 @@ def test_make_classification(): X, y = make_classification(n_samples=100, n_features=20, n_informative=5, - n_classes=3, n_clusters_per_class=1, - weights=[0.1, 0.25, 0.65], random_state=0) + n_redundant=1, n_repeated=1, n_classes=3, + n_clusters_per_class=1, hypercube=False, + shift=None, scale=None, weights=[0.1, 0.25], + random_state=0) assert_equal(X.shape, (100, 20), "X shape mismatch") assert_equal(y.shape, (100,), "y shape mismatch") @@ -30,20 +32,25 @@ def test_make_classification(): def test_make_regression(): - X, y, c = make_regression(n_samples=50, n_features=10, n_informative=3, - coef=True, bias=0.0, random_state=0) + X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3, + effective_rank=5, coef=True, bias=0.0, + noise=1.0, random_state=0) - assert_equal(X.shape, (50, 10), "X shape mismatch") - assert_equal(y.shape, (50,), "y shape mismatch") + assert_equal(X.shape, (100, 10), "X shape mismatch") + assert_equal(y.shape, (100,), "y shape mismatch") assert_equal(c.shape, (10,), "coef shape mismatch") assert_equal(sum(c != 0.0), 3, "Unexpected number of informative features") - assert_array_almost_equal(y, np.dot(X, c)) + + # Test that y ~= np.dot(X, c) + bias + N(0, 1.0) + assert_approx_equal(np.std(y - np.dot(X, c)), 1.0, significant=2) def test_make_blobs(): - X, y = make_blobs(n_samples=50, n_features=5, centers=3, random_state=0) + X, y = make_blobs(n_samples=50, n_features=2, + centers=[[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]], + random_state=0) - assert_equal(X.shape, (50, 5), "X shape mismatch") + assert_equal(X.shape, (50, 2), "X shape mismatch") assert_equal(y.shape, (50,), "y shape mismatch") assert_equal(np.unique(y).shape, (3,), "Unexpected number of blobs") diff --git a/scikits/learn/decomposition/fastica_.py b/scikits/learn/decomposition/fastica_.py index 2c13cd33dbc92..cb8746cca14bb 100644 --- a/scikits/learn/decomposition/fastica_.py +++ b/scikits/learn/decomposition/fastica_.py @@ -356,8 +356,7 @@ def __init__(self, n_components=None, algorithm='parallel', whiten=True, self.tol = tol self.w_init = w_init - def fit(self, X, **params): - self._set_params(**params) + def fit(self, X): whitening_, unmixing_, sources_ = fastica(X, self.n_components, self.algorithm, self.whiten, self.fun, self.fun_prime, self.fun_args, self.max_iter, diff --git a/scikits/learn/decomposition/kernel_pca.py b/scikits/learn/decomposition/kernel_pca.py index 3e4de77191bcc..de5f8ba7f7274 100644 --- a/scikits/learn/decomposition/kernel_pca.py +++ b/scikits/learn/decomposition/kernel_pca.py @@ -174,7 +174,7 @@ def _fit_inverse_transform(self, X_transformed, X): self.dual_coef_ = linalg.solve(K, X, sym_pos=True, overwrite_a=True) self.X_transformed_fit_ = X_transformed - def fit(self, X, y=None, **params): + def fit(self, X, y=None): """Fit the model from data in X. Parameters @@ -188,7 +188,6 @@ def fit(self, X, y=None, **params): self : object Returns the instance itself. 
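
The samples_generator docstring above notes that when ``len(weights) == n_classes - 1`` the last class weight is inferred automatically. A short sketch of that behaviour, mirroring the updated test (the top-level import assumes the ``datasets`` package re-exports the generator, as the test's relative import suggests)::

    from scikits.learn.datasets import make_classification

    # Two weights for three classes: the third is inferred as 1 - 0.1 - 0.25 = 0.65.
    X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                               n_redundant=1, n_repeated=1, n_classes=3,
                               n_clusters_per_class=1, hypercube=False,
                               shift=None, scale=None, weights=[0.1, 0.25],
                               random_state=0)
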
""" - self._set_params(**params) self._fit_transform(X) if self.fit_inverse_transform: diff --git a/scikits/learn/decomposition/nmf.py b/scikits/learn/decomposition/nmf.py index 7ab69774ef69c..3db28b394263b 100644 --- a/scikits/learn/decomposition/nmf.py +++ b/scikits/learn/decomposition/nmf.py @@ -344,7 +344,7 @@ def __init__(self, n_components=None, init="nndsvdar", sparseness=None, self.max_iter = max_iter self.nls_max_iter = nls_max_iter - def fit_transform(self, X, y=None, **params): + def fit_transform(self, X, y=None): """Learn a NMF model for the data X and returns the transformed data. This is more efficient than calling fit followed by transform. @@ -360,7 +360,6 @@ def fit_transform(self, X, y=None, **params): data: array, [n_samples, n_components] Transformed data """ - self._set_params(**params) X = np.atleast_2d(X) if (X < 0).any(): raise ValueError("Negative data passed to NMF.fit.") diff --git a/scikits/learn/decomposition/pca.py b/scikits/learn/decomposition/pca.py index 81029e5e9c0ca..bd112fcde07d1 100644 --- a/scikits/learn/decomposition/pca.py +++ b/scikits/learn/decomposition/pca.py @@ -215,8 +215,7 @@ def fit_transform(self, X, y=None, **params): return U - def _fit(self, X, **params): - self._set_params(**params) + def _fit(self, X): X = np.atleast_2d(X) n_samples, n_features = X.shape if self.copy: @@ -426,7 +425,7 @@ def __init__(self, n_components, copy=True, iterated_power=3, self.whiten = whiten self.mean_ = None - def fit(self, X, y=None, **params): + def fit(self, X, y=None): """Fit the model to the data X. Parameters @@ -440,7 +439,6 @@ def fit(self, X, y=None, **params): self : object Returns the instance itself. """ - self._set_params(**params) if not hasattr(X, 'todense'): X = np.atleast_2d(X) diff --git a/scikits/learn/decomposition/sparse_pca.py b/scikits/learn/decomposition/sparse_pca.py index bad147cff0ad6..f6b02dc1d4ac7 100644 --- a/scikits/learn/decomposition/sparse_pca.py +++ b/scikits/learn/decomposition/sparse_pca.py @@ -77,13 +77,14 @@ def _update_code(dictionary, Y, alpha, code=None, Gram=None, method='lars', finally: np.seterr(**err_mgt) elif method == 'cd': - clf = Lasso(alpha=alpha, fit_intercept=False, precompute=Gram) + clf = Lasso(alpha=alpha, fit_intercept=False, precompute=Gram, + max_iter=1000, tol=tol) for k in range(n_features): # A huge amount of time is spent in this loop. It needs to be # tight. if code is not None: clf.coef_ = code[:, k] # Init with previous value of Vk - clf.fit(dictionary, Y[:, k], max_iter=1000, tol=tol) + clf.fit(dictionary, Y[:, k]) new_code[:, k] = clf.coef_ else: raise NotImplemented("Lasso method %s is not implemented." % method) @@ -601,7 +602,7 @@ def __init__(self, n_components, alpha=1, ridge_alpha=0.01, max_iter=1000, self.verbose = verbose self.random_state = random_state - def fit(self, X, y=None, **params): + def fit(self, X, y=None): """Fit the model from data in X. Parameters @@ -615,7 +616,6 @@ def fit(self, X, y=None, **params): self : object Returns the instance itself. """ - self._set_params(**params) self.random_state = check_random_state(self.random_state) X = np.asanyarray(X) code_init = self.V_init.T if self.V_init is not None else None @@ -726,7 +726,7 @@ def __init__(self, n_components, alpha=1, ridge_alpha=0.01, n_iter=100, self.method = method self.random_state = random_state - def fit(self, X, y=None, **params): + def fit(self, X, y=None): """Fit the model from data in X. Parameters @@ -740,7 +740,6 @@ def fit(self, X, y=None, **params): self : object Returns the instance itself. 
""" - self._set_params(**params) self.random_state = check_random_state(self.random_state) X = np.asanyarray(X) Vt, _ = dict_learning_online(X.T, self.n_components, alpha=self.alpha, diff --git a/scikits/learn/feature_extraction/image.py b/scikits/learn/feature_extraction/image.py index 56685530d9866..0a6525e116241 100644 --- a/scikits/learn/feature_extraction/image.py +++ b/scikits/learn/feature_extraction/image.py @@ -88,9 +88,6 @@ def _to_graph(n_x, n_y, n_z, mask=None, img=None, else: dtype = img.dtype - if dtype == np.bool: - dtype = np.int - if img is not None: img = np.atleast_3d(img) weights = _compute_gradient_3d(edges, img) @@ -247,7 +244,7 @@ def extract_patches_2d(image, patch_size, max_patches=None, random_state=None): if isinstance(max_patches, int) and max_patches < all_patches: n_patches = max_patches elif isinstance(max_patches, float) and 0 < max_patches < 1: - n_patches = max_patches * n_patches + n_patches = int(max_patches * all_patches) else: raise ValueError("Invalid value for max_patches: %r" % max_patches) diff --git a/scikits/learn/feature_extraction/tests/test_image.py b/scikits/learn/feature_extraction/tests/test_image.py index 5650b0b3d2f64..d5204ecdd9494 100644 --- a/scikits/learn/feature_extraction/tests/test_image.py +++ b/scikits/learn/feature_extraction/tests/test_image.py @@ -7,6 +7,7 @@ from scipy import ndimage from nose.tools import assert_equal +from numpy.testing import assert_raises from ..image import img_to_graph, grid_to_graph from ..image import extract_patches_2d, reconstruct_from_patches_2d, \ @@ -37,12 +38,21 @@ def test_grid_to_graph(): mask[-roi_size:, -roi_size:] = True mask = mask.reshape(size ** 2) A = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray) + assert(cs_graph_components(A)[0] == 2) # Checking that the function works whatever the type of mask is mask = np.ones((size, size), dtype=np.int16) A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask) assert(cs_graph_components(A)[0] == 1) + # Checking dtype of the graph + mask = np.ones((size, size)) + A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.bool) + assert A.dtype == np.bool + A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.int) + assert A.dtype == np.int + A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.float) + assert A.dtype == np.float def test_connect_regions(): lena = sp.lena() @@ -133,6 +143,17 @@ def test_extract_patches_max_patches(): patches = extract_patches_2d(lena, (p_h, p_w), max_patches=100) assert_equal(patches.shape, (100, p_h, p_w)) + expected_n_patches = int(0.5 * (i_h - p_h + 1) * (i_w - p_w + 1)) + patches = extract_patches_2d(lena, (p_h, p_w), max_patches=0.5) + assert_equal(patches.shape, (expected_n_patches, p_h, p_w)) + + assert_raises(ValueError, extract_patches_2d, lena, + (p_h, p_w), + max_patches=2.0) + assert_raises(ValueError, extract_patches_2d, lena, + (p_h, p_w), + max_patches=-1.0) + def test_reconstruct_patches_perfect(): lena = downsampled_lena @@ -152,6 +173,12 @@ def test_reconstruct_patches_perfect_color(): np.testing.assert_array_equal(lena, lena_reconstructed) +def test_patch_extractor_fit(): + lenas = lena_collection + extr = PatchExtractor(patch_size=(8, 8), max_patches=100, random_state=0) + assert extr == extr.fit(lenas) + + def test_patch_extractor_max_patches(): lenas = lena_collection extr = PatchExtractor(patch_size=(8, 8), max_patches=100, random_state=0) diff --git a/scikits/learn/feature_extraction/tests/test_text.py 
b/scikits/learn/feature_extraction/tests/test_text.py index de8e16de08903..429c54b283f0e 100644 --- a/scikits/learn/feature_extraction/tests/test_text.py +++ b/scikits/learn/feature_extraction/tests/test_text.py @@ -16,8 +16,10 @@ assert_false, assert_not_equal from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal +from numpy.testing import assert_raises import pickle +from StringIO import StringIO JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", @@ -100,6 +102,11 @@ def test_word_analyzer_unigrams(): u'yesterday'] assert_equal(wa.analyze(text), expected) + text = StringIO("This is a test with a file-like object!") + expected = [u'this', u'is', u'test', u'with', u'file', u'like', + u'object'] + assert_equal(wa.analyze(text), expected) + def test_word_analyzer_unigrams_and_bigrams(): wa = WordNGramAnalyzer(min_n=1, max_n=2, stop_words=None) @@ -127,6 +134,10 @@ def test_char_ngram_analyzer(): expected = [u' yeste', u'yester', u'esterd', u'sterda', u'terday'] assert_equal(cnga.analyze(text)[-5:], expected) + text = StringIO("This is a test with a file-like object!") + expected = [u'thi', u'his', u'is ', u's i', u' is'] + assert_equal(cnga.analyze(text)[:5], expected) + def test_countvectorizer_custom_vocabulary(): what_we_like = ["pizza", "beer"] @@ -224,6 +235,10 @@ def test_vectorizer(): tfidf_test2 = toarray(tv.transform(test_data)) assert_array_almost_equal(tfidf_test, tfidf_test2) + # test empty vocabulary + v3 = CountVectorizer(vocabulary=None) + assert_raises(ValueError, v3.transform, train_data) + def test_vectorizer_max_features(): vec_factories = ( diff --git a/scikits/learn/feature_extraction/text.py b/scikits/learn/feature_extraction/text.py index 5872e446680f6..1f0d8fd3c0e24 100644 --- a/scikits/learn/feature_extraction/text.py +++ b/scikits/learn/feature_extraction/text.py @@ -148,9 +148,7 @@ def analyze(self, text_document): original_tokens = tokens tokens = [] n_original_tokens = len(original_tokens) - for n in xrange(self.min_n, self.max_n + 1): - if n_original_tokens < n: - continue + for n in xrange(self.min_n, min(self.max_n + 1, n_original_tokens + 1)): for i in xrange(n_original_tokens - n + 1): tokens.append(u" ".join(original_tokens[i: i + n])) @@ -196,9 +194,7 @@ def analyze(self, text_document): text_len = len(text_document) ngrams = [] - for n in xrange(self.min_n, self.max_n + 1): - if text_len < n: - continue + for n in xrange(self.min_n, min(self.max_n + 1, text_len + 1)): for i in xrange(text_len - n + 1): ngrams.append(text_document[i: i + n]) return ngrams diff --git a/scikits/learn/feature_selection/univariate_selection.py b/scikits/learn/feature_selection/univariate_selection.py index 534e15398a85c..3350795ea2f90 100644 --- a/scikits/learn/feature_selection/univariate_selection.py +++ b/scikits/learn/feature_selection/univariate_selection.py @@ -228,11 +228,10 @@ def __init__(self, score_func): "was passed." 
% (score_func, type(score_func))) self.score_func = score_func - def fit(self, X, y, **params): + def fit(self, X, y): """ Evaluate the function """ - self._set_params(**params) _scores = self.score_func(X, y) self._scores = _scores[0] self._pvalues = _scores[1] @@ -245,18 +244,16 @@ def get_support(self, indices=False): mask = self._get_support_mask() return mask if not indices else np.where(mask)[0] - def transform(self, X, **params): + def transform(self, X): """ Transform a new matrix using the selected features """ - self._set_params(**params) return safe_asanyarray(X)[:, self.get_support(indices=issparse(X))] - def inverse_transform(self, X, **params): + def inverse_transform(self, X): """ Transform a new matrix using the selected features """ - self._set_params(**params) support_ = self.get_support() if X.ndim == 1: X = X[None, :] @@ -446,9 +443,9 @@ def _get_support_mask(self): selector = self._selection_modes[self.mode](lambda x: x) selector._pvalues = self._pvalues selector._scores = self._scores - # Now make some acrobaties to set the right named parameter in + # Now perform some acrobatics to set the right named parameter in # the selector possible_params = selector._get_param_names() possible_params.remove('score_func') - selector._set_params(**{possible_params[0]: self.param}) + selector.set_params(**{possible_params[0]: self.param}) return selector._get_support_mask() diff --git a/scikits/learn/grid_search.py b/scikits/learn/grid_search.py index 9056996b762e4..9c45af853b103 100644 --- a/scikits/learn/grid_search.py +++ b/scikits/learn/grid_search.py @@ -67,9 +67,11 @@ def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func, msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in clf_params.iteritems())) print "[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.') + # update parameters of the classifier after a copy of its base structure + # FIXME we should be doing a clone here clf = copy.deepcopy(base_clf) - clf._set_params(**clf_params) + clf.set_params(**clf_params) if isinstance(X, list) or isinstance(X, tuple): X_train = [X[i] for i, cond in enumerate(train) if cond] diff --git a/scikits/learn/lda.py b/scikits/learn/lda.py index 8d208164e62f2..54c5e19fa8cc9 100644 --- a/scikits/learn/lda.py +++ b/scikits/learn/lda.py @@ -66,7 +66,7 @@ def __init__(self, n_components=None, priors=None): print 'warning: the priors do not sum to 1. Renormalizing' self.priors = self.priors / self.priors.sum() - def fit(self, X, y, store_covariance=False, tol=1.0e-4, **params): + def fit(self, X, y, store_covariance=False, tol=1.0e-4): """ Fit the LDA model according to the given training data and parameters. @@ -81,7 +81,6 @@ def fit(self, X, y, store_covariance=False, tol=1.0e-4, **params): If True the covariance matrix (shared by all classes) is computed and stored in self.covariance_ attribute. """ - self._set_params(**params) X = np.asanyarray(X) y = np.asanyarray(y) if y.dtype.char.lower() not in ('b', 'h', 'i'): diff --git a/scikits/learn/linear_model/base.py b/scikits/learn/linear_model/base.py index d71701b2c122f..ddc5cd0a37e4f 100644 --- a/scikits/learn/linear_model/base.py +++ b/scikits/learn/linear_model/base.py @@ -12,11 +12,13 @@ # License: BSD Style. 
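
The grid_search hunk above now pushes each candidate parameter set through the public ``set_params`` on a deep copy of the base estimator (a proper ``clone`` is left as a FIXME in the patch). A hypothetical sketch of that flow; ``SVC`` is only an illustrative choice of estimator, not part of the change::

    import copy
    import numpy as np
    from scikits.learn.svm import SVC   # any estimator exposing set_params would do

    X_train = np.random.randn(20, 4)
    y_train = np.array([0, 1] * 10)

    base_clf = SVC(kernel='linear')
    clf = copy.deepcopy(base_clf)       # per-grid-point copy, as in fit_grid_point
    clf.set_params(C=10.0)              # replaces the old clf._set_params(**clf_params)
    clf.fit(X_train, y_train)
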
import numpy as np +import scipy.sparse as sp from ..base import BaseEstimator, RegressorMixin, ClassifierMixin from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from ..utils.extmath import safe_sparse_dot from ..utils import safe_asanyarray +from ..utils import as_float_array ### @@ -47,30 +49,42 @@ def predict(self, X): return safe_sparse_dot(X, self.coef_.T) + self.intercept_ @staticmethod - def _center_data(X, y, fit_intercept): + def _center_data(X, y, fit_intercept, normalize=False): """ Centers data to have mean zero along axis 0. This is here because nearly all linear models will want their data to be centered. + + WARNING : This function modifies X inplace : + Use scikits.learn.utils.as_float_array before to convert X to np.float. + You can specify an argument overwrite_X (default is False). """ - import scipy.sparse # importing scipy.sparse just for this is overkill if fit_intercept: - if scipy.sparse.issparse(X): - Xmean = np.zeros(X.shape[1]) + if sp.issparse(X): + X_mean = np.zeros(X.shape[1]) + X_std = np.ones(X.shape[1]) else: - Xmean = X.mean(axis=0) - X = X - Xmean - ymean = y.mean() - y = y - ymean + X_mean = X.mean(axis=0) + X -= X_mean + if normalize: + X_std = np.sqrt(np.sum(X ** 2, axis=0)) + X_std[X_std==0] = 1 + X /= X_std + else: + X_std = np.ones(X.shape[1]) + y_mean = y.mean() + y = y - y_mean else: - Xmean = np.zeros(X.shape[1]) - ymean = 0. - return X, y, Xmean, ymean + X_mean = np.zeros(X.shape[1]) + X_std = np.ones(X.shape[1]) + y_mean = 0. + return X, y, X_mean, y_mean, X_std - def _set_intercept(self, Xmean, ymean): + def _set_intercept(self, X_mean, y_mean, X_std): """Set the intercept_ """ if self.fit_intercept: - self.intercept_ = ymean - np.dot(Xmean, self.coef_.T) + self.coef_ = self.coef_ / X_std + self.intercept_ = y_mean - np.dot(X_mean, self.coef_.T) else: self.intercept_ = 0 @@ -94,10 +108,12 @@ class LinearRegression(LinearModel): """ - def __init__(self, fit_intercept=True): + def __init__(self, fit_intercept=True, normalize=False, overwrite_X=False): self.fit_intercept = fit_intercept + self.normalize = normalize + self.overwrite_X = overwrite_X - def fit(self, X, y, **params): + def fit(self, X, y): """ Fit linear model. @@ -111,21 +127,25 @@ def fit(self, X, y, **params): wether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). + normalize : boolean, optional + If True, the regressors X are normalized Returns ------- self : returns an instance of self. """ - self._set_params(**params) X = np.asanyarray(X) y = np.asanyarray(y) - X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept) + X = as_float_array(X, self.overwrite_X) + + X, y, X_mean, y_mean, X_std = self._center_data(X, y, + self.fit_intercept, self.normalize) self.coef_, self.residues_, self.rank_, self.singular_ = \ np.linalg.lstsq(X, y) - self._set_intercept(Xmean, ymean) + self._set_intercept(X_mean, y_mean, X_std) return self ## @@ -299,7 +319,7 @@ def _set_class_weight(self, class_weight, classes, y): self.class_weight = weight def fit(self, X, y, coef_init=None, intercept_init=None, - class_weight=None, sample_weight=None, **params): + class_weight=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. Parameters @@ -330,7 +350,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, ------- self : returns an instance of self. 
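
The reworked ``_center_data``/``_set_intercept`` pair above centers ``X`` and ``y``, optionally scales each column of ``X`` by its norm, and rescales the fitted coefficients back afterwards. A standalone sketch of that arithmetic, using the patch's ``X_mean``, ``X_std`` and ``y_mean`` names::

    import numpy as np

    X = np.random.randn(30, 4)
    y = np.random.randn(30)

    X_mean = X.mean(axis=0)
    Xc = X - X_mean                              # centering
    X_std = np.sqrt(np.sum(Xc ** 2, axis=0))     # column norms, as with normalize=True
    X_std[X_std == 0] = 1
    Xn = Xc / X_std
    y_mean = y.mean()

    coef_scaled = np.linalg.lstsq(Xn, y - y_mean)[0]
    coef = coef_scaled / X_std                   # undo the column scaling
    intercept = y_mean - np.dot(X_mean, coef)    # as done in _set_intercept
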
""" - self._set_params(**params) # check only y because X might be dense or sparse y = np.asanyarray(y, dtype=np.float64, order='C') @@ -443,7 +462,7 @@ def _set_loss_function(self, loss): raise ValueError("The loss %s is not supported. " % loss) def fit(self, X, y, coef_init=None, intercept_init=None, - sample_weight=None, **params): + sample_weight=None): """Fit linear model with Stochastic Gradient Descent. Parameters @@ -467,7 +486,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, ------- self : returns an instance of self. """ - self._set_params(**params) y = np.asanyarray(y, dtype=np.float64, order="C") # make sure X has shape diff --git a/scikits/learn/linear_model/bayes.py b/scikits/learn/linear_model/bayes.py index 4a263801c19e3..c9b8111b85d60 100644 --- a/scikits/learn/linear_model/bayes.py +++ b/scikits/learn/linear_model/bayes.py @@ -10,6 +10,7 @@ from scipy import linalg from .base import LinearModel +from ..utils import as_float_array from ..utils.extmath import fast_logdet @@ -33,7 +34,7 @@ class BayesianRidge(LinearModel): n_iter : int, optional Maximum number of iterations. Default is 300. - eps : float, optional + tol : float, optional Stop the algorithm if w has converged. Default is 1.e-3. alpha_1 : float, optional @@ -64,6 +65,18 @@ class BayesianRidge(LinearModel): (e.g. data is expected to be already centered). Default is True. + normalize : boolean, optional + If True, the regressors X are normalized + Default is False + + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + + verbose : boolean, optional + Verbose mode when fitting the model. Default is False. + + Attributes ---------- `coef_` : array, shape = (n_features) @@ -91,9 +104,9 @@ class BayesianRidge(LinearModel): >>> from scikits.learn import linear_model >>> clf = linear_model.BayesianRidge() >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) - BayesianRidge(n_iter=300, verbose=False, lambda_1=1e-06, lambda_2=1e-06, - fit_intercept=True, eps=0.001, alpha_2=1e-06, alpha_1=1e-06, - compute_score=False) + BayesianRidge(normalize=False, n_iter=300, verbose=False, lambda_1=1e-06, + lambda_2=1e-06, fit_intercept=True, alpha_2=1e-06, tol=0.001, + alpha_1=1e-06, overwrite_X=False, compute_score=False) >>> clf.predict([[1, 1]]) array([ 1.]) @@ -102,20 +115,23 @@ class BayesianRidge(LinearModel): See examples/linear_model/plot_bayesian_ridge.py for an example. """ - def __init__(self, n_iter=300, eps=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, - fit_intercept=True, verbose=False): + fit_intercept=True, normalize=False, + overwrite_X=False, verbose=False): self.n_iter = n_iter - self.eps = eps + self.tol = tol self.alpha_1 = alpha_1 self.alpha_2 = alpha_2 self.lambda_1 = lambda_1 self.lambda_2 = lambda_2 self.compute_score = compute_score self.fit_intercept = fit_intercept + self.normalize = normalize + self.overwrite_X = overwrite_X self.verbose = verbose - def fit(self, X, y, **params): + def fit(self, X, y): """Fit the model Parameters @@ -129,10 +145,11 @@ def fit(self, X, y, **params): ------- self : returns an instance of self. 
""" - self._set_params(**params) X = np.asanyarray(X, dtype=np.float) y = np.asanyarray(y, dtype=np.float) - X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept) + X = as_float_array(X, self.overwrite_X) + X, y, X_mean, y_mean, X_std = self._center_data(X, y, + self.fit_intercept, self.normalize) n_samples, n_features = X.shape ### Initialization of the values of the parameters @@ -196,7 +213,7 @@ def fit(self, X, y, **params): self.scores_.append(s) ### Check for convergence - if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.eps: + if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: if verbose: print "Convergence after ", str(iter_), " iterations" break @@ -206,7 +223,7 @@ def fit(self, X, y, **params): self.lambda_ = lambda_ self.coef_ = coef_ - self._set_intercept(Xmean, ymean) + self._set_intercept(X_mean, y_mean, X_std) return self @@ -234,7 +251,7 @@ class ARDRegression(LinearModel): n_iter : int, optional Maximum number of iterations. Default is 300 - eps : float, optional + tol : float, optional Stop the algorithm if w has converged. Default is 1.e-3. alpha_1 : float, optional @@ -267,6 +284,13 @@ class ARDRegression(LinearModel): (e.g. data is expected to be already centered). Default is True. + normalize : boolean, optional + If True, the regressors X are normalized + + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + verbose : boolean, optional Verbose mode when fitting the model. Default is False. @@ -300,9 +324,10 @@ class ARDRegression(LinearModel): >>> from scikits.learn import linear_model >>> clf = linear_model.ARDRegression() >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) - ARDRegression(n_iter=300, verbose=False, lambda_1=1e-06, lambda_2=1e-06, - fit_intercept=True, eps=0.001, threshold_lambda=10000.0, - alpha_2=1e-06, alpha_1=1e-06, compute_score=False) + ARDRegression(normalize=False, n_iter=300, verbose=False, lambda_1=1e-06, + lambda_2=1e-06, fit_intercept=True, threshold_lambda=10000.0, + alpha_2=1e-06, tol=0.001, alpha_1=1e-06, overwrite_X=False, + compute_score=False) >>> clf.predict([[1, 1]]) array([ 1.]) @@ -311,21 +336,24 @@ class ARDRegression(LinearModel): See examples/linear_model/plot_ard.py for an example. """ - def __init__(self, n_iter=300, eps=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, - threshold_lambda=1.e+4, fit_intercept=True, verbose=False): + threshold_lambda=1.e+4, fit_intercept=True, + normalize=False, overwrite_X=False, verbose=False): self.n_iter = n_iter - self.eps = eps + self.tol = tol self.fit_intercept = fit_intercept + self.normalize = normalize self.alpha_1 = alpha_1 self.alpha_2 = alpha_2 self.lambda_1 = lambda_1 self.lambda_2 = lambda_2 self.compute_score = compute_score self.threshold_lambda = threshold_lambda + self.overwrite_X = overwrite_X self.verbose = verbose - def fit(self, X, y, **params): + def fit(self, X, y): """Fit the ARDRegression model according to the given training data and parameters. @@ -343,7 +371,6 @@ def fit(self, X, y, **params): ------- self : returns an instance of self. 
""" - self._set_params(**params) X = np.asanyarray(X, dtype=np.float) y = np.asanyarray(y, dtype=np.float) @@ -351,7 +378,10 @@ def fit(self, X, y, **params): n_samples, n_features = X.shape coef_ = np.zeros(n_features) - X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept) + X = as_float_array(X, self.overwrite_X) + + X, y, X_mean, y_mean, X_std = self._center_data(X, y, self.fit_intercept, + self.normalize) ### Launch the convergence loop keep_lambda = np.ones(n_features, dtype=bool) @@ -407,7 +437,7 @@ def fit(self, X, y, **params): self.scores_.append(s) ### Check for convergence - if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.eps: + if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol: if verbose: print "Converged after %s iterations" % iter_ break @@ -417,5 +447,5 @@ def fit(self, X, y, **params): self.alpha_ = alpha_ self.sigma_ = sigma_ - self._set_intercept(Xmean, ymean) + self._set_intercept(X_mean, y_mean, X_std) return self diff --git a/scikits/learn/linear_model/coordinate_descent.py b/scikits/learn/linear_model/coordinate_descent.py index 55af2b422acd1..f11498a61b17a 100644 --- a/scikits/learn/linear_model/coordinate_descent.py +++ b/scikits/learn/linear_model/coordinate_descent.py @@ -8,6 +8,7 @@ import numpy as np from .base import LinearModel +from ..utils import as_float_array from ..cross_val import check_cv from . import cd_fast @@ -30,13 +31,16 @@ class ElasticNet(LinearModel): rho : float The ElasticNet mixing parameter, with 0 < rho <= 1. For rho = 0 - the penalty is an L1 penalty. For rho = 1 it is an L2 penalty. + the penalty is an L1 penalty. For rho = 1 it is an L2 penalty. For 0 < rho < 1, the penalty is a combination of L1 and L2 fit_intercept: bool Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. + normalize : boolean, optional + If True, the regressors X are normalized + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to 'auto' let us decide. The Gram @@ -45,6 +49,10 @@ class ElasticNet(LinearModel): max_iter: int, optional The maximum number of iterations + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + tol: float, optional The tolerance for the optimization: if the updates are smaller than 'tol', the optimization code checks the @@ -68,22 +76,24 @@ class ElasticNet(LinearModel): a*L1 + b*L2 for:: - + alpha = a + b and rho = a/(a+b) """ - def __init__(self, alpha=1.0, rho=0.5, fit_intercept=True, - precompute='auto', max_iter=1000, tol=1e-4): + normalize=False, precompute='auto', max_iter=1000, + overwrite_X=False, tol=1e-4): self.alpha = alpha self.rho = rho self.coef_ = None self.fit_intercept = fit_intercept + self.normalize = normalize self.precompute = precompute self.max_iter = max_iter + self.overwrite_X = overwrite_X self.tol = tol - def fit(self, X, y, Xy=None, coef_init=None, **params): + def fit(self, X, y, Xy=None, coef_init=None): """Fit Elastic Net model with coordinate descent Parameters @@ -108,18 +118,27 @@ def fit(self, X, y, Xy=None, coef_init=None, **params): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. 
""" - self._set_params(**params) X = np.asanyarray(X, dtype=np.float64) y = np.asanyarray(y, dtype=np.float64) + X = as_float_array(X, self.overwrite_X) + + n_samples, n_features = X.shape - X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept) + X_init = X + X, y, X_mean, y_mean, X_std = self._center_data(X, y, + self.fit_intercept, + self.normalize) + precompute = self.precompute + if X_init is not X and hasattr(precompute, '__array__'): + precompute = 'auto' # recompute Gram + if X_init is not X and Xy is not None: + Xy = None # recompute Xy if coef_init is None: - self.coef_ = np.zeros(X.shape[1], dtype=np.float64) + self.coef_ = np.zeros(n_features, dtype=np.float64) else: self.coef_ = coef_init - n_samples = X.shape[0] alpha = self.alpha * self.rho * n_samples beta = self.alpha * (1.0 - self.rho) * n_samples @@ -127,9 +146,9 @@ def fit(self, X, y, Xy=None, coef_init=None, **params): # precompute if n_samples > n_features if hasattr(self.precompute, '__array__'): - Gram = self.precompute - elif self.precompute == True or \ - (self.precompute == 'auto' and X.shape[0] > X.shape[1]): + Gram = precompute + elif precompute == True or \ + (precompute == 'auto' and n_samples > n_features): Gram = np.dot(X.T, X) else: Gram = None @@ -146,7 +165,7 @@ def fit(self, X, y, Xy=None, coef_init=None, **params): cd_fast.enet_coordinate_descent_gram(self.coef_, alpha, beta, Gram, Xy, y, self.max_iter, self.tol) - self._set_intercept(Xmean, ymean) + self._set_intercept(X_mean, y_mean, X_std) if self.dual_gap_ > self.eps_: warnings.warn('Objective did not converge, you might want' @@ -175,6 +194,13 @@ class Lasso(ElasticNet): to false, no intercept will be used in calculations (e.g. data is expected to be already centered). + normalize : boolean, optional + If True, the regressors X are normalized + + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to 'auto' let us decide. The Gram @@ -203,8 +229,8 @@ class Lasso(ElasticNet): >>> from scikits.learn import linear_model >>> clf = linear_model.Lasso(alpha=0.1) >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) - Lasso(precompute='auto', alpha=0.1, max_iter=1000, tol=0.0001, - fit_intercept=True) + Lasso(normalize=False, fit_intercept=True, max_iter=1000, precompute='auto', + tol=0.0001, alpha=0.1, overwrite_X=False) >>> print clf.coef_ [ 0.85 0. ] >>> print clf.intercept_ @@ -222,19 +248,22 @@ class Lasso(ElasticNet): should be directly passed as a fortran contiguous numpy array. 
""" - def __init__(self, alpha=1.0, fit_intercept=True, - precompute='auto', max_iter=1000, tol=1e-4): + def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + precompute='auto', overwrite_X=False, max_iter=1000, + tol=1e-4): super(Lasso, self).__init__(alpha=alpha, rho=1.0, - fit_intercept=fit_intercept, - precompute=precompute, max_iter=max_iter, - tol=tol) + fit_intercept=fit_intercept, normalize=normalize, + precompute=precompute, overwrite_X=overwrite_X, + max_iter=max_iter, tol=tol) ############################################################################### # Classes to store linear models along a regularization path -def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, - verbose=False, **fit_params): +def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, + precompute='auto', Xy=None, fit_intercept=True, + normalize=False, overwrite_X=False, verbose=False, + **params): """Compute Lasso path with coordinate descent Parameters @@ -257,8 +286,30 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, List of alphas where to compute the models. If None alphas are set automatically - fit_params : kwargs - keyword arguments passed to the Lasso fit method + precompute : True | False | 'auto' | array-like + Whether to use a precomputed Gram matrix to speed up + calculations. If set to 'auto' let us decide. The Gram + matrix can also be passed as argument. + + Xy : array-like, optional + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomuted. + + fit_intercept : bool + Fit or not an intercept + + normalize : boolean, optional + If True, the regressors X are normalized + + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + + verbose : bool + Verbose computation or not. + + params : kwargs + keyword arguments passed to the Lasso objects Returns ------- @@ -272,11 +323,15 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, should be directly passed as a fortran contiguous numpy array. """ return enet_path(X, y, rho=1., eps=eps, n_alphas=n_alphas, alphas=alphas, - fit_intercept=fit_intercept, verbose=verbose, **fit_params) + precompute='auto', Xy=None, + fit_intercept=fit_intercept, normalize=normalize, + overwrite_X=overwrite_X, verbose=verbose, **params) def enet_path(X, y, rho=0.5, eps=1e-3, n_alphas=100, alphas=None, - fit_intercept=True, verbose=False, **fit_params): + precompute='auto', Xy=None, fit_intercept=True, + normalize=False, overwrite_X=False, verbose=False, + **params): """Compute Elastic-Net path with coordinate descent Parameters @@ -303,8 +358,30 @@ def enet_path(X, y, rho=0.5, eps=1e-3, n_alphas=100, alphas=None, List of alphas where to compute the models. If None alphas are set automatically - fit_params : kwargs - keyword arguments passed to the Lasso fit method + precompute : True | False | 'auto' | array-like + Whether to use a precomputed Gram matrix to speed up + calculations. If set to 'auto' let us decide. The Gram + matrix can also be passed as argument. + + Xy : array-like, optional + Xy = np.dot(X.T, y) that can be precomputed. It is useful + only when the Gram matrix is precomuted. + + fit_intercept : bool + Fit or not an intercept + + normalize : boolean, optional + If True, the regressors X are normalized + + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + + verbose : bool + Verbose computation or not. 
+ + params : kwargs + keyword arguments passed to the Lasso objects Returns ------- @@ -314,12 +391,30 @@ def enet_path(X, y, rho=0.5, eps=1e-3, n_alphas=100, alphas=None, ----- See examples/plot_lasso_coordinate_descent_path.py for an example. """ - X, y, Xmean, ymean = LinearModel._center_data(X, y, fit_intercept) + X = as_float_array(X, overwrite_X) + + X_init = X + X, y, X_mean, y_mean, X_std = LinearModel._center_data(X, y, + fit_intercept, + normalize) X = np.asfortranarray(X) # make data contiguous in memory + n_samples, n_features = X.shape + + if X_init is not X and hasattr(precompute, '__array__'): + precompute = 'auto' + if X_init is not X and Xy is not None: + Xy = None + + if 'precompute' is True or \ + ((precompute == 'auto') and (n_samples > n_features)): + precompute = np.dot(X.T, X) + + if Xy is None: + Xy = np.dot(X.T, y) n_samples = X.shape[0] if alphas is None: - alpha_max = np.abs(np.dot(X.T, y)).max() / (n_samples * rho) + alpha_max = np.abs(Xy).max() / (n_samples * rho) alphas = np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max), num=n_alphas)[::-1] else: @@ -327,19 +422,14 @@ def enet_path(X, y, rho=0.5, eps=1e-3, n_alphas=100, alphas=None, coef_ = None # init coef_ models = [] - if not 'precompute' in fit_params \ - or fit_params['precompute'] is True \ - or (fit_intercept and hasattr(fit_params['precompute'], '__array__')): - fit_params['precompute'] = np.dot(X.T, X) - if not 'Xy' in fit_params or fit_params['Xy'] is None: - fit_params['Xy'] = np.dot(X.T, y) - for alpha in alphas: - model = ElasticNet(alpha=alpha, rho=rho, fit_intercept=False) - model.fit(X, y, coef_init=coef_, **fit_params) + model = ElasticNet(alpha=alpha, rho=rho, fit_intercept=False, + precompute=precompute) + model.set_params(**params) + model.fit(X, y, coef_init=coef_, Xy=Xy) if fit_intercept: model.fit_intercept = True - model._set_intercept(Xmean, ymean) + model._set_intercept(X_mean, y_mean, X_std) if verbose: print model coef_ = model.coef_.copy() @@ -350,19 +440,21 @@ def enet_path(X, y, rho=0.5, eps=1e-3, n_alphas=100, alphas=None, class LinearModelCV(LinearModel): """Base class for iterative model fitting along a regularization path""" - def __init__(self, eps=1e-3, n_alphas=100, alphas=None, - fit_intercept=True, precompute='auto', max_iter=1000, - tol=1e-4, cv=None): + def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, + normalize=False, precompute='auto', max_iter=1000, tol=1e-4, + overwrite_X=False, cv=None): self.eps = eps self.n_alphas = n_alphas self.alphas = alphas self.fit_intercept = fit_intercept + self.normalize = normalize self.precompute = precompute self.max_iter = max_iter self.tol = tol + self.overwrite_X = overwrite_X self.cv = cv - def fit(self, X, y, **fit_params): + def fit(self, X, y): """Fit linear model with coordinate descent along decreasing alphas using cross-validation @@ -380,7 +472,6 @@ def fit(self, X, y, **fit_params): keyword arguments passed to the Lasso fit method """ - self._set_params(**fit_params) X = np.asfortranarray(X, dtype=np.float64) y = np.asanyarray(y, dtype=np.float64) @@ -454,7 +545,7 @@ class LassoCV(LinearModelCV): cv : integer or crossvalidation generator, optional If an integer is passed, it is the number of fold (default 3). 
- Specific crossvalidation objects can be passed, see + Specific crossvalidation objects can be passed, see scikits.learn.cross_val module for the list of possible objects Notes @@ -465,7 +556,6 @@ class LassoCV(LinearModelCV): To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a fortran contiguous numpy array. """ - path = staticmethod(lasso_path) estimator = Lasso @@ -480,7 +570,7 @@ class ElasticNetCV(LinearModelCV): rho : float, optional float between 0 and 1 passed to ElasticNet (scaling between l1 and l2 penalties). For rho = 0 - the penalty is an L1 penalty. For rho = 1 it is an L2 penalty. + the penalty is an L1 penalty. For rho = 1 it is an L2 penalty. For 0 < rho < 1, the penalty is a combination of L1 and L2 eps : float, optional @@ -510,7 +600,7 @@ class ElasticNetCV(LinearModelCV): cv : integer or crossvalidation generator, optional If an integer is passed, it is the number of fold (default 3). - Specific crossvalidation objects can be passed, see + Specific crossvalidation objects can be passed, see scikits.learn.cross_val module for the list of possible objects @@ -534,23 +624,24 @@ class ElasticNetCV(LinearModelCV): a*L1 + b*L2 for:: - + alpha = a + b and rho = a/(a+b) """ - path = staticmethod(enet_path) estimator = ElasticNet def __init__(self, rho=0.5, eps=1e-3, n_alphas=100, alphas=None, - fit_intercept=True, precompute='auto', max_iter=1000, - tol=1e-4, cv=None): + fit_intercept=True, normalize=False, precompute='auto', + max_iter=1000, tol=1e-4, cv=None, overwrite_X=False): self.rho = rho self.eps = eps self.n_alphas = n_alphas self.alphas = alphas self.fit_intercept = fit_intercept + self.normalize = normalize self.precompute = precompute self.max_iter = max_iter self.tol = tol self.cv = cv + self.overwrite_X = overwrite_X diff --git a/scikits/learn/linear_model/least_angle.py b/scikits/learn/linear_model/least_angle.py index 098203a877ba7..2a27c1e69f29a 100644 --- a/scikits/learn/linear_model/least_angle.py +++ b/scikits/learn/linear_model/least_angle.py @@ -15,27 +15,16 @@ from scipy.linalg.lapack import get_lapack_funcs from .base import LinearModel -from ..utils import arrayfuncs +from ..utils import arrayfuncs, as_float_array from ..utils import deprecated from ..cross_val import check_cv from ..externals.joblib import Parallel, delayed -def _safe_normalize(X, overwrite_X): - """Normalize X avoiding divisions by zero""" - norms = np.sqrt(np.sum(X ** 2, axis=0)) - nonzeros = np.flatnonzero(norms) - if not overwrite_X: - X = X.copy() - overwrite_X = True - X[:, nonzeros] /= norms[nonzeros] - return X, norms - - def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, method='lar', overwrite_X=False, eps=np.finfo(np.float).eps, - overwrite_Gram=False, verbose=False, ): + overwrite_Gram=False, verbose=False): """Compute Least Angle Regression and LASSO path Parameters @@ -332,6 +321,9 @@ class Lars(LinearModel): calculations. If set to 'auto' let us decide. The Gram matrix can also be passed as argument. 
+ overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False eps: float, optional The machine-precision regularization in the computation of the @@ -353,11 +345,11 @@ class Lars(LinearModel): -------- >>> from scikits.learn import linear_model >>> clf = linear_model.Lars(n_nonzero_coefs=1) - >>> clf.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) # doctest: +ELLIPSIS + >>> clf.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111]) # doctest: +ELLIPSIS Lars(normalize=True, n_nonzero_coefs=1, verbose=False, fit_intercept=True, - eps=..., precompute='auto') - >>> print clf.coef_ - [ 0. -1.] + eps=2.2204460492503131e-16, precompute='auto', overwrite_X=False) + >>> print clf.coef_ # doctest: +ELLIPSIS + [ 0. -1.1111...] References ---------- @@ -369,7 +361,7 @@ class Lars(LinearModel): """ def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, - eps=np.finfo(np.float).eps): + eps=np.finfo(np.float).eps, overwrite_X=False): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -377,6 +369,7 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.precompute = precompute self.n_nonzero_coefs = n_nonzero_coefs self.eps = eps + self.overwrite_X = overwrite_X def _get_gram(self): # precompute if n_samples > n_features @@ -390,7 +383,7 @@ def _get_gram(self): Gram = None return Gram - def fit(self, X, y, overwrite_X=False, **params): + def fit(self, X, y, overwrite_X=False): """Fit the model using X, y as training data. parameters @@ -406,12 +399,15 @@ def fit(self, X, y, overwrite_X=False, **params): self : object returns an instance of self. """ - self._set_params(**params) X = np.atleast_2d(X) y = np.atleast_1d(y) - X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept) + X = as_float_array(X, self.overwrite_X) + + X, y, X_mean, y_mean, X_std = self._center_data(X, y, + self.fit_intercept, + self.normalize) alpha = getattr(self, 'alpha', 0.) if hasattr(self, 'n_nonzero_coefs'): alpha = 0. # n_nonzero_coefs parametrization takes priority @@ -419,22 +415,17 @@ def fit(self, X, y, overwrite_X=False, **params): else: max_iter = self.max_iter - if self.normalize: - X, norms = _safe_normalize(X, overwrite_X) - Gram = self._get_gram() self.alphas_, self.active_, self.coef_path_ = lars_path(X, y, - Gram=Gram, overwrite_X=overwrite_X, + Gram=Gram, overwrite_X=self.overwrite_X, overwrite_Gram=True, alpha_min=alpha, method=self.method, verbose=self.verbose, max_iter=max_iter, eps=self.eps) - if self.normalize: - self.coef_path_ /= norms[:, np.newaxis] self.coef_ = self.coef_path_[:, -1] - self._set_intercept(Xmean, ymean) + self._set_intercept(X_mean, y_mean, X_std) return self @@ -458,6 +449,10 @@ class LassoLars(Lars): normalize : boolean, optional If True, the regressors X are normalized + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to 'auto' let us decide. The Gram @@ -488,9 +483,10 @@ class LassoLars(Lars): >>> clf = linear_model.LassoLars(alpha=0.01) >>> clf.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) # doctest: +ELLIPSIS LassoLars(normalize=True, verbose=False, fit_intercept=True, max_iter=500, - eps=..., precompute='auto', alpha=0.01) - >>> print clf.coef_ - [ 0. 
-0.96325765] + eps=2.2204460492503131e-16, precompute='auto', alpha=0.01, + overwrite_X=False) + >>> print clf.coef_ # doctest: +ELLIPSIS + [ 0. -0.963257...] References ---------- @@ -503,7 +499,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, - eps=np.finfo(np.float).eps): + eps=np.finfo(np.float).eps, overwrite_X=False): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -511,6 +507,7 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.normalize = normalize self.method = 'lasso' self.precompute = precompute + self.overwrite_X = overwrite_X self.eps = eps @@ -632,6 +629,10 @@ class LarsCV(LARS): normalize : boolean, optional If True, the regressors X are normalized + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to 'auto' let us decide. The Gram @@ -674,17 +675,18 @@ class LarsCV(LARS): def __init__(self, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, n_jobs=1, - eps=np.finfo(np.float).eps): + eps=np.finfo(np.float).eps, overwrite_X=False): self.fit_intercept = fit_intercept self.max_iter = max_iter self.verbose = verbose self.normalize = normalize self.precompute = precompute + self.overwrite_X = overwrite_X self.cv = cv self.n_jobs = n_jobs self.eps = eps - def fit(self, X, y, **params): + def fit(self, X, y): """Fit the model using X, y as training data. Parameters @@ -700,7 +702,6 @@ def fit(self, X, y, **params): self : object returns an instance of self. """ - self._set_params(**params) X = np.asanyarray(X) n_samples, n_features = X.shape @@ -786,6 +787,10 @@ class LassoLarsCV(LarsCV): Cholesky diagonal factors. Increase this for very ill-conditioned systems. + overwrite_X : boolean, optionnal + If True, X will not be copied + Default is False + Attributes ---------- @@ -854,6 +859,10 @@ class LassoLarsIC(LassoLars): normalize : boolean, optional If True, the regressors X are normalized + overwrite_X : boolean, optionnal + Default is False. + If True, X will be overwritten + precompute : True | False | 'auto' | array-like Whether to use a precomputed Gram matrix to speed up calculations. If set to 'auto' let us decide. The Gram @@ -883,11 +892,12 @@ class LassoLarsIC(LassoLars): -------- >>> from scikits.learn import linear_model >>> clf = linear_model.LassoLarsIC(criterion='bic') - >>> clf.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1]) # doctest: +ELLIPSIS + >>> clf.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111]) # doctest: +ELLIPSIS LassoLarsIC(normalize=True, verbose=False, fit_intercept=True, max_iter=500, - eps=..., precompute='auto', criterion='bic') - >>> print clf.coef_ - [ 0. -0.81649658] + eps=... precompute='auto', criterion='bic', + overwrite_X=False) + >>> print clf.coef_ # doctest: +ELLIPSIS + [ 0. -1.1111...] 
References ---------- @@ -906,7 +916,7 @@ class LassoLarsIC(LassoLars): """ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, - eps=np.finfo(np.float).eps): + eps=np.finfo(np.float).eps, overwrite_X=False): if criterion not in ['aic', 'bic']: raise ValueError('criterion should be either bic or aic') self.criterion = criterion @@ -914,10 +924,11 @@ def __init__(self, criterion='aic', fit_intercept=True, verbose=False, self.max_iter = max_iter self.verbose = verbose self.normalize = normalize + self.overwrite_X = overwrite_X self.precompute = precompute self.eps = eps - def fit(self, X, y, overwrite_X=False, **params): + def fit(self, X, y, overwrite_X=False): """Fit the model using X, y as training data. parameters @@ -933,17 +944,15 @@ def fit(self, X, y, overwrite_X=False, **params): self : object returns an instance of self. """ - self._set_params(**params) - X = np.atleast_2d(X) y = np.atleast_1d(y) - X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept) + X = as_float_array(X, self.overwrite_X) + X, y, Xmean, ymean, Xstd = LinearModel._center_data(X, y, + self.fit_intercept, + normalize=self.normalize) max_iter = self.max_iter - if self.normalize: - X, norms = _safe_normalize(X, overwrite_X) - Gram = self._get_gram() alphas_, active_, coef_path_ = lars_path(X, y, @@ -980,5 +989,5 @@ def fit(self, X, y, overwrite_X=False, **params): self.alpha_ = alphas_[n_best] self.coef_ = coef_path_[:, n_best] - self._set_intercept(Xmean, ymean) + self._set_intercept(Xmean, ymean, Xstd) return self diff --git a/scikits/learn/linear_model/omp.py b/scikits/learn/linear_model/omp.py index a9afba929ed3e..3c6e7bda6d603 100644 --- a/scikits/learn/linear_model/omp.py +++ b/scikits/learn/linear_model/omp.py @@ -13,13 +13,14 @@ from .base import LinearModel from ..utils.arrayfuncs import solve_triangular +from ..utils import as_float_array premature = """ Orthogonal matching pursuit ended prematurely due to linear dependence in the dictionary. The requested precision might not have been met. """ -def _cholesky_omp(X, y, n_nonzero_coefs, eps=None, overwrite_x=False): +def _cholesky_omp(X, y, n_nonzero_coefs, eps=None, overwrite_X=False): """Orthogonal Matching Pursuit step using the Cholesky decomposition. Parameters: @@ -36,7 +37,7 @@ def _cholesky_omp(X, y, n_nonzero_coefs, eps=None, overwrite_x=False): eps: float Targeted squared error, if not None overrides n_nonzero_coefs. - overwrite_x: bool, + overwrite_X: bool, Whether the design matrix X can be overwritten by the algorithm. This is only helpful if X is already Fortran-ordered, otherwise a copy is made anyway. @@ -51,7 +52,7 @@ def _cholesky_omp(X, y, n_nonzero_coefs, eps=None, overwrite_x=False): vector """ - if not overwrite_x: + if not overwrite_X: X = X.copy('F') else: # even if we are allowed to overwrite, still copy it if bad order X = np.asfortranarray(X) @@ -102,7 +103,7 @@ def _cholesky_omp(X, y, n_nonzero_coefs, eps=None, overwrite_x=False): def _gram_omp(Gram, Xy, n_nonzero_coefs, eps_0=None, eps=None, - overwrite_gram=False, overwrite_xy=False): + overwrite_gram=False, overwrite_Xy=False): """Orthogonal Matching Pursuit step on a precomputed Gram matrix. This function uses the the Cholesky decomposition method. @@ -129,7 +130,7 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, eps_0=None, eps=None, is only helpful if it is already Fortran-ordered, otherwise a copy is made anyway. 
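
In the OMP helpers the lowercase ``overwrite_x``/``overwrite_xy`` keywords are renamed to ``overwrite_X``/``overwrite_Xy``. An illustrative call with the new spelling (assuming ``orthogonal_mp`` is exported from ``scikits.learn.linear_model``)::

    import numpy as np
    from scikits.learn.linear_model import orthogonal_mp

    X = np.asfortranarray(np.random.randn(50, 30))
    y = np.random.randn(50)

    coef = orthogonal_mp(X, y, n_nonzero_coefs=5, precompute_gram=False,
                         overwrite_X=False)
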
- overwrite_xy: bool, + overwrite_Xy: bool, Whether the covariance vector Xy can be overwritten by the algorithm. Returns: @@ -147,7 +148,7 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, eps_0=None, eps=None, else: Gram = np.asfortranarray(Gram) - if not overwrite_xy: + if not overwrite_Xy: Xy = Xy.copy() min_float = np.finfo(Gram.dtype).eps @@ -202,7 +203,7 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, eps_0=None, eps=None, def orthogonal_mp(X, y, n_nonzero_coefs=None, eps=None, precompute_gram=False, - overwrite_x=False): + overwrite_X=False): """Orthogonal Matching Pursuit (OMP) Solves n_targets Orthogonal Matching Pursuit problems. @@ -234,7 +235,7 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, eps=None, precompute_gram=False, Whether to perform precomputations. Improves performance when n_targets or n_samples is very large. - overwrite_x: bool, + overwrite_X: bool, Whether the design matrix X can be overwritten by the algorithm. This is only helpful if X is already Fortran-ordered, otherwise a copy is made anyway. @@ -266,13 +267,13 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, eps=None, precompute_gram=False, X, y = map(np.asanyarray, (X, y)) if y.ndim == 1: y = y[:, np.newaxis] - if not overwrite_x: + if not overwrite_X: X = X.copy('F') - overwrite_x = True + overwrite_X = True else: X = np.asfortranarray(X) if y.shape[1] > 1: # subsequent targets will be affected - overwrite_x = False + overwrite_X = False if n_nonzero_coefs == None and eps == None: n_nonzero_coefs = int(0.1 * X.shape[1]) if eps is not None and eps < 0: @@ -293,20 +294,20 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, eps=None, precompute_gram=False, else: norms_squared = None return orthogonal_mp_gram(G, Xy, n_nonzero_coefs, eps, norms_squared, - overwrite_gram=overwrite_x, - overwrite_xy=True) + overwrite_gram=overwrite_X, + overwrite_Xy=True) coef = np.zeros((X.shape[1], y.shape[1])) for k in xrange(y.shape[1]): x, idx = _cholesky_omp(X, y[:, k], n_nonzero_coefs, eps, - overwrite_x=overwrite_x) + overwrite_X=overwrite_X) coef[idx, k] = x return np.squeeze(coef) def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, eps=None, norms_squared=None, overwrite_gram=False, - overwrite_xy=False): + overwrite_Xy=False): """Gram Orthogonal Matching Pursuit (OMP) Solves n_targets Orthogonal Matching Pursuit problems using only @@ -335,7 +336,7 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, eps=None, is only helpful if it is already Fortran-ordered, otherwise a copy is made anyway. - overwrite_xy: bool, + overwrite_Xy: bool, Whether the covariance vector Xy can be overwritten by the algorithm. Returns @@ -385,7 +386,7 @@ def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, eps=None, x, idx = _gram_omp(Gram, Xy[:, k], n_nonzero_coefs, norms_squared[k] if eps is not None else None, eps, overwrite_gram=overwrite_gram, - overwrite_xy=overwrite_xy) + overwrite_Xy=overwrite_Xy) coef[idx, k] = x return np.squeeze(coef) @@ -416,6 +417,21 @@ class OrthogonalMatchingPursuit(LinearModel): very large. Note that if you already have such matrices, you can pass them directly to the fit method. + overwrite_X: bool, + Whether the design matrix X can be overwritten by the algorithm. + This is only helpful if X is already Fortran-ordered, otherwise a + copy is made anyway. + + overwrite_gram: bool, + Whether the gram matrix can be overwritten by the algorithm. This + is only helpful if it is already Fortran-ordered, otherwise a copy + is made anyway. 
+ + overwrite_Xy: bool, + Whether the covariance vector Xy can be overwritten by the + algorithm. + + Attributes ---------- coef_: array, shape = (n_features,) or (n_features, n_targets) @@ -445,17 +461,20 @@ class OrthogonalMatchingPursuit(LinearModel): LassoLars """ - - def __init__(self, n_nonzero_coefs=None, eps=None, fit_intercept=True, - normalize=True, precompute_gram=False): + def __init__(self, overwrite_X=False, overwrite_gram=False, + overwrite_Xy=False, n_nonzero_coefs=None, eps=None, + fit_intercept=True, normalize=True, precompute_gram=False): self.n_nonzero_coefs = n_nonzero_coefs self.eps = eps self.fit_intercept = fit_intercept self.normalize = normalize self.precompute_gram = precompute_gram + self.overwrite_gram = overwrite_gram + self.overwrite_Xy = overwrite_Xy + self.overwrite_X = overwrite_X + self.eps = eps - def fit(self, X, y, Gram=None, Xy=None, overwrite_x=False, - overwrite_gram=False, overwrite_xy=False, **params): + def fit(self, X, y, Gram=None, Xy=None): """Fit the model using X, y as training data. Parameters @@ -473,38 +492,20 @@ def fit(self, X, y, Gram=None, Xy=None, overwrite_x=False, (optional) Input targets multiplied by X: X.T * y - overwrite_x: bool, - Whether the design matrix X can be overwritten by the algorithm. - This is only helpful if X is already Fortran-ordered, otherwise a - copy is made anyway. - - overwrite_gram: bool, - Whether the gram matrix can be overwritten by the algorithm. This - is only helpful if it is already Fortran-ordered, otherwise a copy - is made anyway. - - overwrite_xy: bool, - Whether the covariance vector Xy can be overwritten by the - algorithm. Returns ------- self: object returns an instance of self. """ - self._set_params(**params) - X = np.atleast_2d(X) y = np.atleast_1d(y) + n_features = X.shape[1] + X = as_float_array(X, self.overwrite_X) - n_samples, n_features = X.shape - - X, y, Xmean, ymean = self._center_data(X, y, self.fit_intercept) - - if self.normalize: - norms = np.sqrt(np.sum(X ** 2, axis=0)) - nonzeros = np.flatnonzero(norms) - X[:, nonzeros] /= norms[nonzeros] + X, y, X_mean, y_mean, X_std = self._center_data(X, y, + self.fit_intercept, + self.normalize) if self.n_nonzero_coefs == None and self.eps is None: self.n_nonzero_coefs = int(0.1 * n_features) @@ -512,35 +513,32 @@ def fit(self, X, y, Gram=None, Xy=None, overwrite_x=False, if Gram is not None: Gram = np.atleast_2d(Gram) - if not overwrite_gram: + if not self.overwrite_gram: overwrite_gram = True Gram = Gram.copy('F') else: Gram = np.asfortranarray(Gram) + overwrite_gram = self.overwrite_gram if y.shape[1] > 1: # subsequent targets will be affected overwrite_gram = False if Xy is None: Xy = np.dot(X.T, y) else: - if not overwrite_xy: + if not self.overwrite_Xy: Xy = Xy.copy() if self.normalize: - if Xy.ndim > 1: - Xy /= norms.reshape((n_features, -1)) + if len(Xy.shape) == 1: + Xy /= X_std else: - Xy /= norms + Xy /= X_std[:, np.newaxis] if self.normalize: - Gram /= norms - Gram /= norms[:, np.newaxis] - - if self.eps is not None: - norms_sq = np.sum((y ** 2), axis=0) - else: - norms_sq = None + Gram /= X_std + Gram /= X_std[:, np.newaxis] + norms_sq = np.sum(y ** 2, axis=0) if self.eps is not None else None self.coef_ = orthogonal_mp_gram(Gram, Xy, self.n_nonzero_coefs, self.eps, norms_sq, overwrite_gram, True).T @@ -549,10 +547,8 @@ def fit(self, X, y, Gram=None, Xy=None, overwrite_x=False, if precompute_gram == 'auto': precompute_gram = X.shape[0] > X.shape[1] self.coef_ = orthogonal_mp(X, y, self.n_nonzero_coefs, self.eps, - 
precompute_gram=precompute_gram, - overwrite_x=overwrite_x).T + precompute_gram=self.precompute_gram, + overwrite_X=self.overwrite_X).T - if self.normalize: - self.coef_ /= norms - self._set_intercept(Xmean, ymean) + self._set_intercept(X_mean, y_mean, X_std) return self diff --git a/scikits/learn/linear_model/ridge.py b/scikits/learn/linear_model/ridge.py index d295a412b9c5a..e17b1d0b523f8 100644 --- a/scikits/learn/linear_model/ridge.py +++ b/scikits/learn/linear_model/ridge.py @@ -9,7 +9,7 @@ from .base import LinearModel from ..utils.extmath import safe_sparse_dot -from ..utils import safe_asanyarray +from ..utils import safe_asanyarray, as_float_array from ..preprocessing import LabelBinarizer from ..grid_search import GridSearchCV @@ -137,6 +144,13 @@ class Ridge(LinearModel): to false, no intercept will be used in calculations (e.g. data is expected to be already centered). + normalize : boolean, optional + If True, the regressors X are normalized + + overwrite_X : boolean, optional + If True, X will not be copied. + Default is False. + tol: float Precision of the solution. @@ -156,15 +163,19 @@ class Ridge(LinearModel): >>> X = np.random.randn(n_samples, n_features) >>> clf = Ridge(alpha=1.0) >>> clf.fit(X, y) - Ridge(alpha=1.0, tol=0.001, fit_intercept=True) + Ridge(normalize=False, alpha=1.0, overwrite_X=False, tol=0.001, + fit_intercept=True) """ - def __init__(self, alpha=1.0, fit_intercept=True, tol=1e-3): + def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + overwrite_X=False, tol=1e-3): self.alpha = alpha self.fit_intercept = fit_intercept + self.normalize = normalize + self.overwrite_X = overwrite_X self.tol = tol - def fit(self, X, y, sample_weight=1.0, solver='auto', **params): + def fit(self, X, y, sample_weight=1.0, solver='auto'): """ Fit Ridge regression model @@ -191,17 +202,17 @@ def fit(self, X, y, sample_weight=1.0, solver='auto', **params): ------- self : returns an instance of self. """ - self._set_params(**params) - X = safe_asanyarray(X, dtype=np.float) y = np.asanyarray(y, dtype=np.float) + X = as_float_array(X, self.overwrite_X) - X, y, X_mean, y_mean = \ - LinearModel._center_data(X, y, self.fit_intercept) + X, y, X_mean, y_mean, X_std = \ + self._center_data(X, y, self.fit_intercept, + self.normalize) self.coef_ = ridge_regression(X, y, self.alpha, sample_weight, solver, self.tol) - self._set_intercept(X_mean, y_mean) + self._set_intercept(X_mean, y_mean, X_std) return self @@ -221,6 +232,9 @@ class RidgeClassifier(Ridge): to false, no intercept will be used in calculations (e.g. data is expected to be already centered).
+ normalize : boolean, optional + If True, the regressors X are normalized + Attributes ---------- @@ -321,12 +335,14 @@ class _RidgeGCV(LinearModel): http://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ - def __init__(self, alphas=[0.1, 1.0, 10.0], fit_intercept=True, - score_func=None, loss_func=None): + def __init__(self, alphas=[0.1, 1.0, 10.0], fit_intercept=True, normalize=False, + score_func=None, loss_func=None, overwrite_X=False): self.alphas = np.asanyarray(alphas) self.fit_intercept = fit_intercept + self.normalize = normalize self.score_func = score_func self.loss_func = loss_func + self.overwrite_X = overwrite_X def _pre_compute(self, X, y): # even if X is very sparse, K is usually very dense @@ -384,8 +400,10 @@ def fit(self, X, y, sample_weight=1.0): y = np.asanyarray(y, dtype=np.float) n_samples = X.shape[0] + X = as_float_array(X, self.overwrite_X) - X, y, Xmean, ymean = LinearModel._center_data(X, y, self.fit_intercept) + X, y, X_mean, y_mean, X_std = LinearModel._center_data(X, y, + self.fit_intercept, self.normalize) K, v, Q = self._pre_compute(X, y) n_y = 1 if len(y.shape) == 1 else y.shape[1] @@ -413,7 +431,7 @@ def fit(self, X, y, sample_weight=1.0): self.dual_coef_ = C[best] self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) - self._set_intercept(Xmean, ymean) + self._set_intercept(X_mean, y_mean, X_std) return self @@ -440,6 +458,9 @@ class RidgeCV(LinearModel): to false, no intercept will be used in calculations (e.g. data is expected to be already centered). + normalize : boolean, optional + If True, the regressors X are normalized + loss_func: callable, optional function that takes 2 arguments and compares them in order to evaluate the performance of prediciton (small is good) @@ -456,14 +477,15 @@ class RidgeCV(LinearModel): """ def __init__(self, alphas=np.array([0.1, 1.0, 10.0]), fit_intercept=True, - score_func=None, loss_func=None, cv=None): + normalize=False, score_func=None, loss_func=None, cv=None): self.alphas = alphas self.fit_intercept = fit_intercept + self.normalize = normalize self.score_func = score_func self.loss_func = loss_func self.cv = cv - def fit(self, X, y, sample_weight=1.0, **params): + def fit(self, X, y, sample_weight=1.0): """Fit Ridge regression model Parameters @@ -485,8 +507,6 @@ def fit(self, X, y, sample_weight=1.0, **params): ------- self : Returns self. """ - self._set_params(**params) - if self.cv is None: estimator = _RidgeGCV(self.alphas, self.fit_intercept, self.score_func, self.loss_func) @@ -512,7 +532,7 @@ def fit(self, X, y, sample_weight=1.0, **params): class RidgeClassifierCV(RidgeCV): - def fit(self, X, y, sample_weight=1.0, class_weight=None, **params): + def fit(self, X, y, sample_weight=1.0, class_weight=None): """ Fit the ridge classifier. @@ -538,15 +558,13 @@ def fit(self, X, y, sample_weight=1.0, class_weight=None, **params): self : object Returns self. 
""" - self._set_params(**params) if class_weight is None: class_weight = {} sample_weight2 = np.array([class_weight.get(k, 1.0) for k in y]) self.label_binarizer = LabelBinarizer() Y = self.label_binarizer.fit_transform(y) RidgeCV.fit(self, X, Y, - sample_weight=sample_weight * sample_weight2, - cv=self.cv) + sample_weight=sample_weight * sample_weight2) return self def decision_function(self, X): diff --git a/scikits/learn/linear_model/sparse/coordinate_descent.py b/scikits/learn/linear_model/sparse/coordinate_descent.py index 38ef4245f6be7..80e150d54827d 100644 --- a/scikits/learn/linear_model/sparse/coordinate_descent.py +++ b/scikits/learn/linear_model/sparse/coordinate_descent.py @@ -7,7 +7,7 @@ import warnings import numpy as np -from scipy import sparse +import scipy.sparse as sp from ..base import LinearModel from . import cd_fast_sparse @@ -40,14 +40,17 @@ class ElasticNet(LinearModel): The parameter rho corresponds to alpha in the glmnet R package while alpha corresponds to the lambda parameter in glmnet. """ - - def __init__(self, alpha=1.0, rho=0.5, fit_intercept=False): + def __init__(self, alpha=1.0, rho=0.5, fit_intercept=False, + normalize=False, max_iter=1000, tol=1e-4): if fit_intercept: raise NotImplementedError("fit_intercept=True is not implemented") self.alpha = alpha self.rho = rho self.fit_intercept = fit_intercept + self.normalize = normalize self.intercept_ = 0.0 + self.max_iter = max_iter + self.tol = tol self._set_coef(None) def _set_coef(self, coef_): @@ -56,16 +59,15 @@ def _set_coef(self, coef_): self.sparse_coef_ = None else: # sparse representation of the fitted coef for the predict method - self.sparse_coef_ = sparse.csr_matrix(coef_) + self.sparse_coef_ = sp.csr_matrix(coef_) - def fit(self, X, y, max_iter=1000, tol=1e-4, **params): + def fit(self, X, y): """Fit current model with coordinate descent X is expected to be a sparse matrix. For maximum efficiency, use a sparse matrix in CSC format (scipy.sparse.csc_matrix) """ - self._set_params(**params) - X = sparse.csc_matrix(X) + X = sp.csc_matrix(X) y = np.asanyarray(y, dtype=np.float64) # NOTE: we are explicitly not centering the data the naive way to @@ -83,7 +85,7 @@ def fit(self, X, y, max_iter=1000, tol=1e-4, **params): coef_, self.dual_gap_, self.eps_ = \ cd_fast_sparse.enet_coordinate_descent( self.coef_, alpha, beta, X_data, X.indices, X.indptr, y, - max_iter, tol) + self.max_iter, self.tol) # update self.coef_ and self.sparse_coef_ consistently self._set_coef(coef_) @@ -109,8 +111,8 @@ def predict(self, X): array, shape = [n_samples] with the predicted real values """ # np.dot only works correctly if both arguments are sparse matrices - if not sparse.issparse(X): - X = sparse.csr_matrix(X) + if not sp.issparse(X): + X = sp.csr_matrix(X) return np.ravel(np.dot(self.sparse_coef_, X.T).todense() + self.intercept_) @@ -132,7 +134,8 @@ class Lasso(ElasticNet): data is assumed to be already centered. 
""" - - def __init__(self, alpha=1.0, fit_intercept=False): - super(Lasso, self).__init__(alpha=alpha, rho=1.0, - fit_intercept=fit_intercept) + def __init__(self, alpha=1.0, fit_intercept=False, normalize=False, + max_iter=1000, tol=1e-4): + super(Lasso, self).__init__( + alpha=alpha, rho=1.0, fit_intercept=fit_intercept, + normalize=normalize, max_iter=max_iter, tol=tol) diff --git a/scikits/learn/linear_model/sparse/logistic.py b/scikits/learn/linear_model/sparse/logistic.py index 7b41d2d6cf73e..6f01d7caca9bb 100644 --- a/scikits/learn/linear_model/sparse/logistic.py +++ b/scikits/learn/linear_model/sparse/logistic.py @@ -6,6 +6,7 @@ """ import numpy as np +import scipy.sparse as sp from ...base import ClassifierMixin from ...svm.sparse.base import SparseBaseLibLinear @@ -92,8 +93,7 @@ def predict_proba(self, X): The returned estimates for all classes are ordered by the label of classes. """ - import scipy.sparse - X = scipy.sparse.csr_matrix(X) + X = sp.csr_matrix(X) X.data = np.asanyarray(X.data, dtype=np.float64, order='C') probas = csr_predict_prob(X.shape[1], X.data, X.indices, X.indptr, self.raw_coef_, diff --git a/scikits/learn/linear_model/sparse/stochastic_gradient.py b/scikits/learn/linear_model/sparse/stochastic_gradient.py index fd3db80b2953e..10d8dccbeb605 100644 --- a/scikits/learn/linear_model/sparse/stochastic_gradient.py +++ b/scikits/learn/linear_model/sparse/stochastic_gradient.py @@ -4,7 +4,7 @@ """Implementation of Stochastic Gradient Descent (SGD) with sparse data.""" import numpy as np -from scipy import sparse +import scipy.sparse as sp from ...externals.joblib import Parallel, delayed from ..base import BaseSGDClassifier, BaseSGDRegressor @@ -128,13 +128,13 @@ def _set_coef(self, coef_): self.sparse_coef_ = None else: # sparse representation of the fitted coef for the predict method - self.sparse_coef_ = sparse.csr_matrix(coef_) + self.sparse_coef_ = sp.csr_matrix(coef_) def _fit_binary(self, X, y): """Fit a binary classifier. """ # interprete X as CSR matrix - X = sparse.csr_matrix(X) + X = sp.csr_matrix(X) # encode original class labels as 1 (classes[1]) or -1 (classes[0]). y_new = np.ones(y.shape, dtype=np.float64, order="C") * -1.0 @@ -174,7 +174,7 @@ def _fit_multiclass(self, X, y): all others (OVA: One Versus All). """ # interprete X as CSR matrix - X = sparse.csr_matrix(X) + X = sp.csr_matrix(X) # get sparse matrix datastructures X_data = np.array(X.data, dtype=np.float64, order="C") @@ -217,8 +217,8 @@ def decision_function(self, X): The signed 'distances' to the hyperplane(s). """ # np.dot only works correctly if both arguments are sparse matrices - if not sparse.issparse(X): - X = sparse.csr_matrix(X) + if not sp.issparse(X): + X = sp.csr_matrix(X) scores = np.asarray(np.dot(X, self.sparse_coef_.T).todense() + self.intercept_) if self.classes.shape[0] == 2: @@ -353,11 +353,11 @@ def _set_coef(self, coef_): self.sparse_coef_ = None else: # sparse representation of the fitted coef for the predict method - self.sparse_coef_ = sparse.csr_matrix(coef_) + self.sparse_coef_ = sp.csr_matrix(coef_) def _fit_regressor(self, X, y): # interprete X as CSR matrix - X = sparse.csr_matrix(X) + X = sp.csr_matrix(X) # get sparse matrix datastructures X_data = np.array(X.data, dtype=np.float64, order="C") @@ -400,8 +400,8 @@ def predict(self, X): Array containing the predicted class labels. 
""" # np.dot only works correctly if both arguments are sparse matrices - if not sparse.issparse(X): - X = sparse.csr_matrix(X) + if not sp.issparse(X): + X = sp.csr_matrix(X) scores = np.asarray(np.dot(X, self.sparse_coef_.T).todense() + self.intercept_).ravel() return scores diff --git a/scikits/learn/linear_model/sparse/tests/test_coordinate_descent.py b/scikits/learn/linear_model/sparse/tests/test_coordinate_descent.py index 3abecce590af6..90e5cee442698 100644 --- a/scikits/learn/linear_model/sparse/tests/test_coordinate_descent.py +++ b/scikits/learn/linear_model/sparse/tests/test_coordinate_descent.py @@ -1,5 +1,5 @@ import numpy as np -from scipy import sparse +import scipy.sparse as sp from numpy.testing import assert_array_almost_equal from numpy.testing import assert_almost_equal from numpy.testing import assert_equal @@ -12,7 +12,7 @@ def test_sparse_predict(): """Check that the predict method works with dense coef_ and sparse X""" - X = sparse.lil_matrix((3, 2)) + X = sp.lil_matrix((3, 2)) X[0, 0] = 1 X[0, 1] = 0.5 X[1, 0] = -1 @@ -25,7 +25,7 @@ def test_sparse_predict(): def test_lasso_zero(): """Check that the sparse lasso can handle zero data without crashing""" - X = sparse.csc_matrix((3, 1)) + X = sp.csc_matrix((3, 1)) y = [0, 0, 0] T = np.array([[1], [2], [3]]) clf = SparseLasso().fit(X, y) @@ -50,8 +50,8 @@ def test_enet_toy_list_input(): assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) - clf = SparseENet(alpha=0.5, rho=0.3) - clf.fit(X, Y, max_iter=1000) + clf = SparseENet(alpha=0.5, rho=0.3, max_iter=1000) + clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) @@ -69,14 +69,14 @@ def test_enet_toy_explicit_sparse_input(): """Test ElasticNet for various parameters of alpha and rho with sparse X""" # training samples - X = sparse.lil_matrix((3, 1)) + X = sp.lil_matrix((3, 1)) X[0, 0] = -1 # X[1, 0] = 0 X[2, 0] = 1 Y = [-1, 0, 1] # just a straight line (the identity function) # test samples - T = sparse.lil_matrix((3, 1)) + T = sp.lil_matrix((3, 1)) T[0, 0] = 2 T[1, 0] = 3 T[2, 0] = 4 @@ -89,8 +89,8 @@ def test_enet_toy_explicit_sparse_input(): assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) - clf = SparseENet(alpha=0.5, rho=0.3) - clf.fit(X, Y, max_iter=1000) + clf = SparseENet(alpha=0.5, rho=0.3, max_iter=1000) + clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) @@ -132,14 +132,16 @@ def test_sparse_enet_not_as_toy_dataset(): X_train, X_test = X[n_samples / 2:], X[:n_samples / 2] y_train, y_test = y[n_samples / 2:], y[:n_samples / 2] - s_clf = SparseENet(alpha=0.1, rho=0.8, fit_intercept=False) - s_clf.fit(X_train, y_train, max_iter=max_iter, tol=1e-7) + s_clf = SparseENet(alpha=0.1, rho=0.8, fit_intercept=False, + max_iter=max_iter, tol=1e-7) + s_clf.fit(X_train, y_train) assert_almost_equal(s_clf.dual_gap_, 0, 4) assert s_clf.score(X_test, y_test) > 0.85 # check the convergence is the same as the dense version - d_clf = DenseENet(alpha=0.1, rho=0.8, fit_intercept=False) - d_clf.fit(X_train, y_train, max_iter=max_iter, tol=1e-7) + d_clf = DenseENet(alpha=0.1, rho=0.8, fit_intercept=False, + max_iter=max_iter, tol=1e-7) + d_clf.fit(X_train, y_train) assert_almost_equal(d_clf.dual_gap_, 0, 4) assert d_clf.score(X_test, y_test) > 0.85 @@ -158,14 +160,15 @@ def 
test_sparse_lasso_not_as_toy_dataset(): X_train, X_test = X[n_samples / 2:], X[:n_samples / 2] y_train, y_test = y[n_samples / 2:], y[:n_samples / 2] - s_clf = SparseLasso(alpha=0.1, fit_intercept=False) - s_clf.fit(X_train, y_train, max_iter=max_iter, tol=1e-7) + s_clf = SparseLasso(alpha=0.1, fit_intercept=False, + max_iter=max_iter, tol=1e-7) + s_clf.fit(X_train, y_train) assert_almost_equal(s_clf.dual_gap_, 0, 4) assert s_clf.score(X_test, y_test) > 0.85 # check the convergence is the same as the dense version - d_clf = DenseLasso(alpha=0.1, fit_intercept=False) - d_clf.fit(X_train, y_train, max_iter=max_iter, tol=1e-7) + d_clf = DenseLasso(alpha=0.1, fit_intercept=False, max_iter=max_iter, tol=1e-7) + d_clf.fit(X_train, y_train) assert_almost_equal(d_clf.dual_gap_, 0, 4) assert d_clf.score(X_test, y_test) > 0.85 diff --git a/scikits/learn/linear_model/tests/test_coordinate_descent.py b/scikits/learn/linear_model/tests/test_coordinate_descent.py index 34c92fc4e7d78..01ab83a6c48dc 100644 --- a/scikits/learn/linear_model/tests/test_coordinate_descent.py +++ b/scikits/learn/linear_model/tests/test_coordinate_descent.py @@ -83,20 +83,23 @@ def test_enet_toy(): assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) - clf = ElasticNet(alpha=0.5, rho=0.3) - clf.fit(X, Y, max_iter=1000, precompute=False) + clf = ElasticNet(alpha=0.5, rho=0.3, max_iter=1000, + precompute=False) + clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) - clf.fit(X, Y, max_iter=1000, precompute=True) # with Gram + clf.set_params(max_iter=1000, precompute=True) + clf.fit(X, Y) # with Gram pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) - clf.fit(X, Y, max_iter=1000, precompute=np.dot(X.T, X)) # with Gram + clf.set_params(max_iter=1000, precompute=np.dot(X.T, X)) + clf.fit(X, Y) # with Gram pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) @@ -121,11 +124,11 @@ def test_lasso_path(): X = random_state.randn(n_samples, n_features) y = np.dot(X, w) - clf = LassoCV(n_alphas=100, eps=1e-3).fit(X, y, max_iter=max_iter) + clf = LassoCV(n_alphas=100, eps=1e-3, max_iter=max_iter).fit(X, y) assert_almost_equal(clf.alpha, 0.011, 2) - clf = LassoCV(n_alphas=100, eps=1e-3) - clf.fit(X, y, max_iter=max_iter, precompute=True) + clf = LassoCV(n_alphas=100, eps=1e-3, max_iter=max_iter, precompute=True) + clf.fit(X, y) assert_almost_equal(clf.alpha, 0.011, 2) # test set @@ -145,12 +148,13 @@ def test_enet_path(): X = random_state.randn(n_samples, n_features) y = np.dot(X, w) - clf = ElasticNetCV(n_alphas=100, eps=1e-3, rho=0.95, cv=5) - clf.fit(X, y, max_iter=max_iter) + clf = ElasticNetCV(n_alphas=100, eps=1e-3, rho=0.95, cv=5, max_iter=max_iter) + clf.fit(X, y) assert_almost_equal(clf.alpha, 0.00779, 2) - clf = ElasticNetCV(n_alphas=100, eps=1e-3, rho=0.95, cv=5) - clf.fit(X, y, max_iter=max_iter, precompute=True) + clf = ElasticNetCV(n_alphas=100, eps=1e-3, rho=0.95, cv=5, + max_iter=max_iter, precompute=True) + clf.fit(X, y) assert_almost_equal(clf.alpha, 0.00779, 2) # test set @@ -170,8 +174,9 @@ def test_path_parameters(): X = random_state.randn(n_samples, n_features) y = np.dot(X, w) - clf = ElasticNetCV(n_alphas=100, 
eps=1e-3, rho=0.95) - clf.fit(X, y, max_iter=max_iter, rho=0.5, n_alphas=50) # new params + clf = ElasticNetCV(n_alphas=50, eps=1e-3, max_iter=max_iter, + rho=0.5) + clf.fit(X, y) # new params assert_almost_equal(0.5, clf.rho) assert_equal(50, clf.n_alphas) assert_equal(50, len(clf.alphas)) diff --git a/scikits/learn/linear_model/tests/test_least_angle.py b/scikits/learn/linear_model/tests/test_least_angle.py index 7e84ce37ea56f..ceadcb33cf65c 100644 --- a/scikits/learn/linear_model/tests/test_least_angle.py +++ b/scikits/learn/linear_model/tests/test_least_angle.py @@ -104,20 +104,32 @@ def test_lasso_lars_vs_lasso_cd(verbose=False): X = 3 * diabetes.data alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso') - lasso_cd = linear_model.Lasso(fit_intercept=False) - for (c, a) in zip(lasso_path.T, alphas): + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) + for c, a in zip(lasso_path.T, alphas): lasso_cd.alpha = a - lasso_cd.fit(X, y, tol=1e-8) + lasso_cd.fit(X, y) error = np.linalg.norm(c - lasso_cd.coef_) assert error < 0.01 # similar test, with the classifiers for alpha in np.linspace(1e-2, 1 - 1e-2): clf1 = linear_model.LassoLars(alpha=alpha, normalize=False).fit(X, y) - clf2 = linear_model.Lasso(alpha=alpha).fit(X, y, tol=1e-8) + clf2 = linear_model.Lasso(alpha=alpha, tol=1e-8, + normalize=False).fit(X, y) err = np.linalg.norm(clf1.coef_ - clf2.coef_) assert err < 1e-3 + # same test, with normalized data + X = diabetes.data + alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso') + lasso_cd = linear_model.Lasso(fit_intercept=False, normalize=True, + tol=1e-8) + for c, a in zip(lasso_path.T, alphas): + lasso_cd.alpha = a + lasso_cd.fit(X, y) + error = np.linalg.norm(c - lasso_cd.coef_) + assert error < 0.01 + def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False): """ @@ -129,12 +141,25 @@ def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False): for alphas_min in alphas_min: alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso', alpha_min=0.9) - lasso_cd = linear_model.Lasso(fit_intercept=False) + lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) lasso_cd.alpha = alphas[-1] - lasso_cd.fit(X, y, tol=1e-8) + lasso_cd.fit(X, y) error = np.linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) assert error < 0.01 + alphas_min = [10, 0.9, 1e-4] + # same test, with normalization + for alphas_min in alphas_min: + alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso', + alpha_min=0.9) + lasso_cd = linear_model.Lasso(fit_intercept=True, normalize=True, + tol=1e-8) + lasso_cd.alpha = alphas[-1] + lasso_cd.fit(X, y) + error = np.linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) + assert error < 0.01 + + def test_lars_add_features(verbose=False): """ diff --git a/scikits/learn/linear_model/tests/test_ridge.py b/scikits/learn/linear_model/tests/test_ridge.py index 1c229308cc04d..d5ca7fb31b849 100644 --- a/scikits/learn/linear_model/tests/test_ridge.py +++ b/scikits/learn/linear_model/tests/test_ridge.py @@ -101,15 +101,15 @@ def test_ridge_vs_lstsq(): y = np.random.randn(n_samples) X = np.random.randn(n_samples, n_features) - ridge = Ridge(alpha=0.) 
- ols = LinearRegression() + ridge = Ridge(alpha=0., fit_intercept=False) + ols = LinearRegression(fit_intercept=False) ridge.fit(X, y) ols.fit (X, y) assert_almost_equal(ridge.coef_, ols.coef_) - ridge.fit(X, y, fit_intercept=False) - ols.fit (X, y, fit_intercept=False) + ridge.fit(X, y) + ols.fit (X, y) assert_almost_equal(ridge.coef_, ols.coef_) def _test_ridge_loo(filter_): @@ -182,7 +182,8 @@ def _test_ridge_cv(filter_): assert_equal(type(ridge_cv.intercept_), np.float64) cv = KFold(n_samples, 5) - ridge_cv.fit(filter_(X_diabetes), y_diabetes, cv=cv) + ridge_cv.set_params(cv=cv) + ridge_cv.fit(filter_(X_diabetes), y_diabetes) ridge_cv.predict(filter_(X_diabetes)) assert_equal(len(ridge_cv.coef_.shape), 1) @@ -216,10 +217,10 @@ def _test_ridge_classifiers(filter_): y_pred = clf.predict(filter_(X_iris)) assert np.mean(y_iris == y_pred) >= 0.8 - clf = RidgeClassifierCV() n_samples = X_iris.shape[0] cv = KFold(n_samples, 5) - clf.fit(filter_(X_iris), y_iris, cv=cv) + clf = RidgeClassifierCV(cv=cv) + clf.fit(filter_(X_iris), y_iris) y_pred = clf.predict(filter_(X_iris)) assert np.mean(y_iris == y_pred) >= 0.8 diff --git a/scikits/learn/manifold/isomap.py b/scikits/learn/manifold/isomap.py index df7b298b4eb5b..c78fed1707512 100644 --- a/scikits/learn/manifold/isomap.py +++ b/scikits/learn/manifold/isomap.py @@ -122,7 +122,7 @@ def reconstruction_error(self): evals = self.kernel_pca_.lambdas_ return np.sqrt(np.sum(G_center ** 2) - np.sum(evals ** 2)) / G.shape[0] - def fit(self, X, y=None, **params): + def fit(self, X, y=None): """Compute the embedding vectors for data X Parameters @@ -134,11 +134,10 @@ def fit(self, X, y=None, **params): ------- self : returns an instance of self. """ - self._set_params(**params) self._fit_transform(X) return self - def fit_transform(self, X, y=None, **params): + def fit_transform(self, X, y=None): """Fit the model from data in X and transform X. Parameters @@ -151,7 +150,6 @@ def fit_transform(self, X, y=None, **params): ------- X_new: array-like, shape (n_samples, out_dim) """ - self._set_params(**params) self._fit_transform(X) return self.embedding_ diff --git a/scikits/learn/manifold/locally_linear.py b/scikits/learn/manifold/locally_linear.py index c12e7ed49c190..8daa86a1786a2 100644 --- a/scikits/learn/manifold/locally_linear.py +++ b/scikits/learn/manifold/locally_linear.py @@ -458,7 +458,7 @@ def _fit_transform(self, X): max_iter=self.max_iter, method=self.method, hessian_tol=self.hessian_tol, modified_tol=self.modified_tol) - def fit(self, X, y=None, **params): + def fit(self, X, y=None): """Compute the embedding vectors for data X Parameters @@ -470,11 +470,10 @@ def fit(self, X, y=None, **params): ------- self : returns an instance of self. """ - self._set_params(**params) self._fit_transform(X) return self - def fit_transform(self, X, y=None, **params): + def fit_transform(self, X, y=None): """Compute the embedding vectors for data X and transform X. Parameters @@ -486,11 +485,10 @@ def fit_transform(self, X, y=None, **params): ------- X_new: array-like, shape (n_samples, out_dim) """ - self._set_params(**params) self._fit_transform(X) return self.embedding_ - def transform(self, X, **params): + def transform(self, X): """ Transform new points into embedding space. 
@@ -507,7 +505,6 @@ def transform(self, X, **params): Because of scaling performed by this method, it is discouraged to use it together with methods that are not scale-invariant (like SVMs) """ - self._set_params(**params) X = np.atleast_2d(X) if not hasattr(self, 'ball_tree_'): raise ValueError('The model is not fitted') diff --git a/scikits/learn/manifold/tests/test_locally_linear.py b/scikits/learn/manifold/tests/test_locally_linear.py index 25bff361dad3e..01007484d72e8 100644 --- a/scikits/learn/manifold/tests/test_locally_linear.py +++ b/scikits/learn/manifold/tests/test_locally_linear.py @@ -31,7 +31,8 @@ def test_lle_simple_grid(): assert_lower(reconstruction_error, tol) for solver in eigen_solvers: - clf.fit(X, eigen_solver=solver) + clf.set_params(eigen_solver=solver) + clf.fit(X) assert clf.embedding_.shape[1] == out_dim reconstruction_error = np.linalg.norm( np.dot(N, clf.embedding_) - clf.embedding_, 'fro') ** 2 @@ -53,7 +54,7 @@ def test_lle_manifold(): X = np.c_[X, X[:, 0] ** 2 / 20] out_dim = 2 clf = manifold.LocallyLinearEmbedding(n_neighbors=5, out_dim=out_dim) - tol = .5 + tol = 1.5 N = neighbors.kneighbors_graph(X, clf.n_neighbors, mode='barycenter').toarray() @@ -61,7 +62,8 @@ def test_lle_manifold(): assert_lower(reconstruction_error, tol) for solver in eigen_solvers: - clf.fit(X, eigen_solver=solver) + clf.set_params(eigen_solver=solver) + clf.fit(X) assert clf.embedding_.shape[1] == out_dim reconstruction_error = np.linalg.norm( np.dot(N, clf.embedding_) - clf.embedding_, 'fro') ** 2 diff --git a/scikits/learn/neighbors.py b/scikits/learn/neighbors.py index 04361fbb98e01..616acacf18e8a 100644 --- a/scikits/learn/neighbors.py +++ b/scikits/learn/neighbors.py @@ -111,7 +111,7 @@ def __init__(self, n_neighbors=5, algorithm='auto', leaf_size=20): self.leaf_size = leaf_size self.algorithm = algorithm - def fit(self, X, y, **params): + def fit(self, X, y): """Fit the model using X, y as training data Parameters @@ -122,14 +122,11 @@ def fit(self, X, y, **params): y : {array-like, sparse matrix}, shape = [n_samples] Target values, array of integer values. - params : list of keyword, optional - Overwrite keywords from __init__ """ X = safe_asanyarray(X) if y is None: raise ValueError("y must not be None") self._y = np.asanyarray(y) - self._set_params(**params) if issparse(X): self.ball_tree = None @@ -142,7 +139,7 @@ def fit(self, X, y, **params): self._fit_X = X return self - def kneighbors(self, X, return_distance=True, **params): + def kneighbors(self, X, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns distance @@ -193,21 +190,22 @@ class from an array representing our data set and ask who's [2]]...) """ - self._set_params(**params) X = atleast2d_or_csr(X) + if n_neighbors is None: + n_neighbors = self.n_neighbors if self.ball_tree is None: dist = euclidean_distances(X, self._fit_X, squared=True) # XXX: should be implemented with a partial sort - neigh_ind = dist.argsort(axis=1)[:, :self.n_neighbors] + neigh_ind = dist.argsort(axis=1)[:, :n_neighbors] if not return_distance: return neigh_ind else: return dist.T[neigh_ind], neigh_ind else: - return self.ball_tree.query(X, self.n_neighbors, + return self.ball_tree.query(X, n_neighbors, return_distance=return_distance) - def predict(self, X, **params): + def predict(self, X): """Predict the class labels for the provided data Parameters @@ -225,7 +223,6 @@ def predict(self, X, **params): List of class labels (one for each data sample). 
""" X = atleast2d_or_csr(X) - self._set_params(**params) # get neighbors neigh_ind = self.kneighbors(X, return_distance=False) @@ -295,7 +292,7 @@ def __init__(self, n_neighbors=5, mode='mean', algorithm='auto', self.mode = mode self.algorithm = algorithm - def predict(self, X, **params): + def predict(self, X): """Predict the target for the provided data Parameters @@ -313,7 +310,6 @@ def predict(self, X, **params): List of target values (one for each data sample). """ X = atleast2d_or_csr(X) - self._set_params(**params) # compute nearest neighbors neigh_ind = self.kneighbors(X, return_distance=False) diff --git a/scikits/learn/pls.py b/scikits/learn/pls.py index 126918ed1b8ae..ab7602ed5a062 100644 --- a/scikits/learn/pls.py +++ b/scikits/learn/pls.py @@ -201,8 +201,7 @@ def __init__(self, n_components=2, deflation_mode="canonical", mode="A", self.tol = tol self.copy = copy - def fit(self, X, Y, **params): - self._set_params(**params) + def fit(self, X, Y): # copy since this will contains the residuals (deflated) matrices if self.copy: X = np.asanyarray(X, dtype=np.float).copy() @@ -472,8 +471,8 @@ class PLSRegression(_PLS): >>> from scikits.learn.pls import PLSCanonical, PLSRegression, CCA >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]] >>> Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] - >>> pls2 = PLSRegression() - >>> pls2.fit(X, Y, n_components=2) + >>> pls2 = PLSRegression(n_components=2) + >>> pls2.fit(X, Y) PLSRegression(scale=True, algorithm='nipals', max_iter=500, n_components=2, tol=1e-06, copy=True) >>> Y_pred = pls2.predict(X) @@ -577,8 +576,8 @@ class PLSCanonical(_PLS): >>> from scikits.learn.pls import PLSCanonical, PLSRegression, CCA >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]] >>> Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] - >>> plsca = PLSCanonical() - >>> plsca.fit(X, Y, n_components=2) + >>> plsca = PLSCanonical(n_components=2) + >>> plsca.fit(X, Y) PLSCanonical(scale=True, algorithm='nipals', max_iter=500, n_components=2, tol=1e-06, copy=True) >>> X_c, Y_c = plsca.transform(X, Y) @@ -685,8 +684,8 @@ class CCA(_PLS): >>> from scikits.learn.pls import PLSCanonical, PLSRegression, CCA >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]] >>> Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]] - >>> cca = CCA() - >>> cca.fit(X, Y, n_components=1) + >>> cca = CCA(n_components=1) + >>> cca.fit(X, Y) CCA(scale=True, algorithm='nipals', max_iter=500, n_components=1, tol=1e-06, copy=True) >>> X_c, Y_c = cca.transform(X, Y) @@ -762,8 +761,7 @@ def __init__(self, n_components=2, scale=True, copy=True): self.n_components = n_components self.scale = scale - def fit(self, X, Y, **params): - self._set_params(**params) + def fit(self, X, Y): # copy since this will contains the centered data if self.copy: X = np.asanyarray(X).copy() diff --git a/scikits/learn/preprocessing/__init__.py b/scikits/learn/preprocessing/__init__.py index 63606d8067bbe..7b5191b690416 100644 --- a/scikits/learn/preprocessing/__init__.py +++ b/scikits/learn/preprocessing/__init__.py @@ -19,7 +19,7 @@ def _mean_and_std(X, axis=0, with_mean=True, with_std=True): """Compute mean and std dev for centering, scaling - Zero valued std components are reseted to 1.0 to avoid NaNs when scaling. + Zero valued std components are reset to 1.0 to avoid NaNs when scaling. 
""" X = np.asanyarray(X) Xr = np.rollaxis(X, axis) @@ -178,13 +178,33 @@ def transform(self, X, y=None, copy=None): X = np.asanyarray(X) if copy: X = X.copy() - # We are taking a view of the X array and modifying it if self.with_mean: X -= self.mean_ if self.with_std: X /= self.std_ return X + def inverse_transform(self, X, copy=None): + """Scale back the data to the original representation + + Parameters + ---------- + X : array-like with shape [n_samples, n_features] + The data used to scale along the features axis. + """ + copy = copy if copy is not None else self.copy + if sp.issparse(X): + raise NotImplementedError( + "Scaling is not yet implement for sparse matrices") + X = np.asanyarray(X) + if copy: + X = X.copy() + if self.with_std: + X *= self.std_ + if self.with_mean: + X += self.mean_ + return X + def normalize(X, norm='l2', axis=1, copy=True): """Normalize a dataset along any axis diff --git a/scikits/learn/preprocessing/tests/test_preprocessing.py b/scikits/learn/preprocessing/tests/test_preprocessing.py index 6cba3d33cd285..446a1c63adf54 100644 --- a/scikits/learn/preprocessing/tests/test_preprocessing.py +++ b/scikits/learn/preprocessing/tests/test_preprocessing.py @@ -35,12 +35,17 @@ def test_scaler(): """Test scaling of dataset along all axis""" # First test with 1D data X = np.random.randn(5) + X_orig_copy = X.copy() scaler = Scaler() X_scaled = scaler.fit(X).transform(X, copy=False) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 1.0) + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert_array_almost_equal(X_scaled_back, X_orig_copy) + # Test with 1D list X = [0., 1., 2, 0.4, 1.] scaler = Scaler() @@ -65,6 +70,12 @@ def test_scaler(): # Check that X has not been copied assert X_scaled is not X + # check inverse transform + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + X_scaled = scale(X, axis=1, with_std=False) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) @@ -108,6 +119,11 @@ def test_scaler_without_centering(): # Check that X has not been copied assert X_scaled is not X + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + X_scaled = scale(X, with_mean=False) assert not np.any(np.isnan(X_scaled)) @@ -117,6 +133,11 @@ def test_scaler_without_centering(): # Check that X has not been copied assert X_scaled is not X + X_scaled_back = scaler.inverse_transform(X_scaled) + assert X_scaled_back is not X + assert X_scaled_back is not X_scaled + assert_array_almost_equal(X_scaled_back, X) + def test_normalizer_l1(): rng = np.random.RandomState(0) diff --git a/scikits/learn/qda.py b/scikits/learn/qda.py index e709d3628b8bd..b275410dc50a0 100644 --- a/scikits/learn/qda.py +++ b/scikits/learn/qda.py @@ -63,7 +63,7 @@ class QDA(BaseEstimator, ClassifierMixin): def __init__(self, priors=None): self.priors = np.asarray(priors) if priors is not None else None - def fit(self, X, y, store_covariances=False, tol=1.0e-4, **params): + def fit(self, X, y, store_covariances=False, tol=1.0e-4): """ Fit the QDA model according to the given training data and parameters. 
@@ -78,7 +78,6 @@ def fit(self, X, y, store_covariances=False, tol=1.0e-4, **params): If True the covariance matrices are computed and stored in the self.covariances_ attribute. """ - self._set_params(**params) X = np.asanyarray(X) y = np.asanyarray(y) if X.ndim != 2: diff --git a/scikits/learn/svm/base.py b/scikits/learn/svm/base.py index 67883ac94000a..ed9a1d259194b 100644 --- a/scikits/learn/svm/base.py +++ b/scikits/learn/svm/base.py @@ -71,8 +71,7 @@ def _compute_kernel(self, X): dtype=np.float64, order='C') return X - def fit(self, X, y, class_weight=None, sample_weight=None, cache_size=100., - **params): + def fit(self, X, y, class_weight=None, sample_weight=None, cache_size=100.): """ Fit the SVM model according to the given training data and parameters. @@ -111,7 +110,6 @@ class frequencies. copied. """ - self._set_params(**params) X = np.asanyarray(X, dtype=np.float64, order='C') y = np.asanyarray(y, dtype=np.float64, order='C') @@ -349,7 +347,7 @@ def _get_solver_type(self): + solver_type) return self._solver_type_dict[solver_type] - def fit(self, X, y, class_weight=None, **params): + def fit(self, X, y, class_weight=None): """ Fit the model according to the given training data and parameters. @@ -372,7 +370,6 @@ def fit(self, X, y, class_weight=None, **params): self : object Returns self. """ - self._set_params(**params) self.class_weight, self.class_weight_label = \ _get_class_weight(class_weight, y) diff --git a/scikits/learn/svm/classes.py b/scikits/learn/svm/classes.py index c2db5a0e2ff01..a62de3a6716b8 100644 --- a/scikits/learn/svm/classes.py +++ b/scikits/learn/svm/classes.py @@ -275,7 +275,7 @@ class SVR(BaseLibSVM, RegressorMixin): penalty parameter C of the error term. epsilon : float, optional (default=0.1) - epsilon in the epsilon-SVR model. It specifies the espilon-tube + epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value. @@ -455,9 +455,9 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False, tol=1e-3): - BaseLibSVM.__init__(self, 'epsilon_svr', kernel, degree, + BaseLibSVM.__init__(self, 'nu_svr', kernel, degree, gamma, coef0, tol, C, nu, - 0.0, shrinking, probability) + None, shrinking, probability) def fit(self, X, y, sample_weight=None, **params): """ diff --git a/scikits/learn/svm/sparse/base.py b/scikits/learn/svm/sparse/base.py index 6bc983719a2cd..df8a822863d73 100644 --- a/scikits/learn/svm/sparse/base.py +++ b/scikits/learn/svm/sparse/base.py @@ -47,8 +47,7 @@ def __init__(self, impl, kernel, degree, gamma, coef0, # only used in classification self.n_support_ = np.empty(0, dtype=np.int32, order='C') - def fit(self, X, y, class_weight=None, sample_weight=[], cache_size=100., - **params): + def fit(self, X, y, class_weight=None, sample_weight=[], cache_size=100.): """ Fit the SVM model according to the given training data and parameters. @@ -84,7 +83,6 @@ def fit(self, X, y, class_weight=None, sample_weight=[], cache_size=100., For maximum effiency, use a sparse matrix in csr format (scipy.sparse.csr_matrix) """ - self._set_params(**params) import scipy.sparse X = scipy.sparse.csr_matrix(X) @@ -222,7 +220,7 @@ def predict_proba(self, X): class SparseBaseLibLinear(BaseLibLinear): - def fit(self, X, y, class_weight=None, **params): + def fit(self, X, y, class_weight=None): """ Fit the model using X, y as training data. 
@@ -239,7 +237,6 @@ def fit(self, X, y, class_weight=None, **params): self : object Returns an instance of self. """ - self._set_params(**params) import scipy.sparse X = scipy.sparse.csr_matrix(X) diff --git a/scikits/learn/svm/sparse/classes.py b/scikits/learn/svm/sparse/classes.py index 004b65430d196..cbd5697316848 100644 --- a/scikits/learn/svm/sparse/classes.py +++ b/scikits/learn/svm/sparse/classes.py @@ -142,7 +142,7 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, epsilon=0.1, probability=False, tol=1e-3): - SparseBaseLibSVM.__init__(self, 'epsilon_svr', kernel, + SparseBaseLibSVM.__init__(self, 'nu_svr', kernel, degree, gamma, coef0, tol, C, nu, epsilon, shrinking, probability) diff --git a/scikits/learn/svm/tests/test_svm.py b/scikits/learn/svm/tests/test_svm.py index ded6936b71f97..6f200757c3c89 100644 --- a/scikits/learn/svm/tests/test_svm.py +++ b/scikits/learn/svm/tests/test_svm.py @@ -140,29 +140,13 @@ def test_SVR(): Test Support Vector Regression """ - clf = svm.SVR(kernel='linear') - clf.fit(X, Y) - pred = clf.predict(T) - - assert_array_almost_equal(clf.dual_coef_, [[-0.1, 0.1]]) - assert_array_almost_equal(clf.coef_, [[0.2, 0.2]]) - assert_array_almost_equal(clf.support_vectors_, [[-1, -1], [1, 1]]) - assert_array_equal(clf.support_, [1, 3]) - assert_array_almost_equal(clf.intercept_, [1.5]) - assert_array_almost_equal(pred, [1.1, 2.3, 2.5]) - - # the same with kernel='rbf' - clf = svm.SVR(kernel='rbf') - clf.fit(X, Y) - pred = clf.predict(T) - - assert_array_almost_equal(clf.dual_coef_, - [[-0.014, -0.515, -0.013, 0.515, 0.013, 0.013]], - decimal=3) - assert_raises(NotImplementedError, lambda: clf.coef_) - assert_array_almost_equal(clf.support_vectors_, X) - assert_array_almost_equal(clf.intercept_, [1.49997261]) - assert_array_almost_equal(pred, [1.10001274, 1.86682485, 1.73300377]) + diabetes = datasets.load_diabetes() + for clf in (svm.NuSVR(kernel='linear', nu=.4), + svm.SVR(kernel='linear', C=10.), + svm.sparse.NuSVR(kernel='linear', nu=.4), + svm.sparse.SVR(kernel='linear', C=10.)): + clf.fit(diabetes.data, diabetes.target) + assert clf.score(diabetes.data, diabetes.target) > 0.02 def test_oneclass(): diff --git a/scikits/learn/tests/test_base.py b/scikits/learn/tests/test_base.py index 53a422f5cbc0c..4d1fb4ab572b3 100644 --- a/scikits/learn/tests/test_base.py +++ b/scikits/learn/tests/test_base.py @@ -93,9 +93,9 @@ def test_get_params(): assert_true('a__d' in test._get_params(deep=True)) assert_true('a__d' not in test._get_params(deep=False)) - test._set_params(a__d=2) + test.set_params(a__d=2) assert test.a.d == 2 - assert_raises(AssertionError, test._set_params, a__a=2) + assert_raises(AssertionError, test.set_params, a__a=2) def test_is_classifier(): diff --git a/scikits/learn/tests/test_cross_val.py b/scikits/learn/tests/test_cross_val.py index c849ae1328748..c130d4178825d 100644 --- a/scikits/learn/tests/test_cross_val.py +++ b/scikits/learn/tests/test_cross_val.py @@ -23,8 +23,7 @@ class MockClassifier(BaseEstimator): def __init__(self, a=0): self.a = a - def fit(self, X, Y, **params): - self._set_params(**params) + def fit(self, X, Y): return self def predict(self, T): diff --git a/scikits/learn/tests/test_grid_search.py b/scikits/learn/tests/test_grid_search.py index e52e06ba2a082..83e18ab46766d 100644 --- a/scikits/learn/tests/test_grid_search.py +++ b/scikits/learn/tests/test_grid_search.py @@ -21,8 +21,7 @@ class MockClassifier(BaseEstimator): def __init__(self, foo_param=0): self.foo_param = foo_param 
- def fit(self, X, Y, **params): - self._set_params(**params) + def fit(self, X, Y): return self def predict(self, T): @@ -88,7 +87,7 @@ def test_grid_search_sparse_score_func(): clf = LinearSVC() cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score) # XXX: set refit to False due to a random bug when True (default) - cv.fit(X_[:180], y_[:180], refit=False) + cv.set_params(refit=False).fit(X_[:180], y_[:180]) y_pred = cv.predict(X_[180:]) C = cv.best_estimator.C @@ -96,7 +95,7 @@ def test_grid_search_sparse_score_func(): clf = SparseLinearSVC() cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score) # XXX: set refit to False due to a random bug when True (default) - cv.fit(X_[:180], y_[:180], refit=False) + cv.set_params(refit=False).fit(X_[:180], y_[:180]) y_pred2 = cv.predict(X_[180:]) C2 = cv.best_estimator.C diff --git a/scikits/learn/tests/test_naive_bayes.py b/scikits/learn/tests/test_naive_bayes.py index 81321931d1122..730b7e5866ab2 100644 --- a/scikits/learn/tests/test_naive_bayes.py +++ b/scikits/learn/tests/test_naive_bayes.py @@ -123,7 +123,8 @@ def test_discretenb_uniform_prior(): when fit_prior=False and class_prior=None""" for cls in [BernoulliNB, MultinomialNB]: - clf = cls(fit_prior=False) + clf = cls() + clf.set_params(fit_prior=False) clf.fit([[0], [0], [1]], [0, 0, 1]) prior = np.exp(clf.class_log_prior_) assert prior[0] == prior[1] diff --git a/scikits/learn/tests/test_neighbors.py b/scikits/learn/tests/test_neighbors.py index 14567e4dac5ad..66a459ab95757 100644 --- a/scikits/learn/tests/test_neighbors.py +++ b/scikits/learn/tests/test_neighbors.py @@ -101,16 +101,17 @@ def test_neighbors_iris(): """ for s in ('auto', 'ball_tree', 'brute', 'inplace'): - clf = neighbors.NeighborsClassifier() - clf.fit(iris.data, iris.target, n_neighbors=1, algorithm=s) + clf = neighbors.NeighborsClassifier(n_neighbors=1, algorithm=s) + clf.fit(iris.data, iris.target) assert_array_equal(clf.predict(iris.data), iris.target) - clf.fit(iris.data, iris.target, n_neighbors=9, algorithm=s) + clf.set_params(n_neighbors=9, algorithm=s) + clf.fit(iris.data, iris.target) assert np.mean(clf.predict(iris.data) == iris.target) > 0.95 for m in ('barycenter', 'mean'): - rgs = neighbors.NeighborsRegressor() - rgs.fit(iris.data, iris.target, mode=m, algorithm=s) + rgs = neighbors.NeighborsRegressor(mode=m) + rgs.fit(iris.data, iris.target) assert np.mean( rgs.predict(iris.data).round() == iris.target) > 0.95 @@ -125,8 +126,8 @@ def _test_neighbors_sparse_regression(): for sparse1 in SPARSE_TYPES: for m in ('barycenter', 'mean'): - rgs = neighbors.NeighborsRegressor() - rgs.fit(sparse1(iris.data), iris.target, mode=m) + rgs = neighbors.NeighborsRegressor(mode=m) + rgs.fit(sparse1(iris.data), iris.target) for sparse2 in SPARSE_OR_DENSE: data2 = sparse2(iris.data) assert np.mean(rgs.predict(data2).round() == iris.target) > 0.95 diff --git a/scikits/learn/tests/test_pipeline.py b/scikits/learn/tests/test_pipeline.py index 98911a44c0a6b..42e1f7f78fa1e 100644 --- a/scikits/learn/tests/test_pipeline.py +++ b/scikits/learn/tests/test_pipeline.py @@ -44,7 +44,7 @@ def test_pipeline_init(): dict(svc__a=None, svc__b=None, svc=clf)) # Check that params are set - pipe._set_params(svc__a=0.1) + pipe.set_params(svc__a=0.1) assert_equal(clf.a, 0.1) # Smoke test the repr: repr(pipe) @@ -55,13 +55,13 @@ def test_pipeline_init(): pipe = Pipeline([('anova', filter1), ('svc', clf)]) # Check that params are set - pipe._set_params(svc__C=0.1) + pipe.set_params(svc__C=0.1) assert_equal(clf.C, 0.1) # Smoke test the 
repr: repr(pipe) # Check that params are not set when naming them wrong - assert_raises(AssertionError, pipe._set_params, anova__C=0.1) + assert_raises(AssertionError, pipe.set_params, anova__C=0.1) # Test clone pipe2 = clone(pipe) diff --git a/scikits/learn/tests/test_pls.py b/scikits/learn/tests/test_pls.py index 75f9c4e506877..e66ea38dac226 100644 --- a/scikits/learn/tests/test_pls.py +++ b/scikits/learn/tests/test_pls.py @@ -14,10 +14,10 @@ def test_pls(): # =========================================================== # Compare 2 algo.: nipals vs. svd # ------------------------------ - pls_bynipals = pls.PLSCanonical() - pls_bynipals.fit(X, Y, n_components=n_components) - pls_bysvd = pls.PLSCanonical(algorithm="svd") - pls_bysvd.fit(X, Y, n_components=n_components) + pls_bynipals = pls.PLSCanonical(n_components=n_components) + pls_bynipals.fit(X, Y) + pls_bysvd = pls.PLSCanonical(algorithm="svd", n_components=n_components) + pls_bysvd.fit(X, Y) # check that the loading vectors are highly correlated assert_array_almost_equal( [np.abs(np.corrcoef(pls_bynipals.x_loadings_[:, k], @@ -35,8 +35,8 @@ def test_pls(): # Check PLS properties (with n_components=X.shape[1]) # --------------------------------------------------- - plsca = pls.PLSCanonical() - plsca.fit(X, Y, n_components=X.shape[1]) + plsca = pls.PLSCanonical(n_components=X.shape[1]) + plsca.fit(X, Y) T = plsca.x_scores_ P = plsca.x_loadings_ Wx = plsca.x_weights_ @@ -81,8 +81,8 @@ def check_ortho(M, err_msg): # "Non regression test" on canonical PLS # -------------------------------------- - pls_bynipals = pls.PLSCanonical() - pls_bynipals.fit(X, Y, n_components=n_components) + pls_bynipals = pls.PLSCanonical(n_components=n_components) + pls_bynipals.fit(X, Y) pls_ca = pls_bynipals x_loadings = np.array( @@ -111,8 +111,8 @@ def check_ortho(M, err_msg): # 2) Regression PLS (PLS2): "Non regression test" # =============================================== - pls2 = pls.PLSRegression() - pls2.fit(X, Y, n_components=n_components) + pls2 = pls.PLSRegression(n_components=n_components) + pls2.fit(X, Y) x_loadings = np.array( [[-0.66591531, -0.01976472], diff --git a/scikits/learn/utils/__init__.py b/scikits/learn/utils/__init__.py index ec31dc4e61da8..4921a34e80175 100644 --- a/scikits/learn/utils/__init__.py +++ b/scikits/learn/utils/__init__.py @@ -23,6 +23,41 @@ def safe_asanyarray(X, dtype=None, order=None): return X +def as_float_array(X, overwrite_X=False): + """ + Converts a numpy array to type np.float + + The new dtype will be float32 or np.float64, depending on the original type. 
+ The function can create a copy or modify the argument depending + on the argument overwrite_X + + WARNING: If X is not of type float, then a copy of X with the right type + will be returned + + Parameters + ---------- + X : array + + overwrite_X : bool, optional + If False, a copy of X will be created + + Returns + ------- + X : array + An array of type np.float + """ + if X.dtype in [np.float32, np.float64]: + if overwrite_X: + return X + else: + return X.copy() + if X.dtype == np.int32: + X = X.astype(np.float32) + else: + X = X.astype(np.float64) + return X + + def atleast2d_or_csr(X): """Like numpy.atleast_2d, but converts sparse matrices to CSR format""" X = X.tocsr() if sp.issparse(X) else np.atleast_2d(X) @@ -112,6 +147,15 @@ def check_arrays(*arrays, **options): return checked_arrays +def warn_if_not_float(X, estimator='This algorithm'): + """Warning utility function to check that data type is floating point""" + if not isinstance(estimator, basestring): + estimator = estimator.__class__.__name__ + if X.dtype.kind != 'f': + warnings.warn("%s assumes floating point values as input, " + "got %s" % (estimator, X.dtype)) + + class deprecated(object): """Decorator to mark a function or class as deprecated. diff --git a/scikits/learn/utils/tests/test___init__.py b/scikits/learn/utils/tests/test___init__.py new file mode 100644 index 0000000000000..fc0368ece2374 --- /dev/null +++ b/scikits/learn/utils/tests/test___init__.py @@ -0,0 +1,23 @@ +import numpy as np +from .. import as_float_array + + +def test_as_float_array(): + """ + Test function for as_float_array + """ + X = np.ones((3, 10), dtype=np.int32) + X = X + np.arange(10, dtype=np.int32) + # Checks that the return type is ok + X2 = as_float_array(X, overwrite_X=True) + np.testing.assert_equal(X2.dtype, np.float32) + # Another test + X = X.astype(np.int64) + X2 = as_float_array(X, overwrite_X=False) + # Checking that the array wasn't overwritten + assert as_float_array(X, False) is not X + # Checking that the new type is ok + np.testing.assert_equal(X2.dtype, np.float64) + # Here, X is of the right type, it shouldn't be modified + X = np.ones((3, 2), dtype=np.float32) + assert as_float_array(X, overwrite_X=True) is X
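A minimal sketch of the calling convention these hunks converge on, assuming the patched scikits.learn tree above: data-independent parameters are given to the constructor and adjusted later through set_params(), while fit() only receives data. Estimator and parameter names are taken from the ridge.py hunks; the toy arrays are purely illustrative:

    import numpy as np
    from scikits.learn.linear_model import Ridge

    X = np.random.randn(20, 5)
    y = np.random.randn(20)

    # parameters now live on the constructor instead of being passed as fit(**params)
    clf = Ridge(alpha=1.0, normalize=False, overwrite_X=False, tol=1e-3)
    clf.fit(X, y)

    # set_params() overrides them afterwards and returns the estimator,
    # so it chains with fit() the way the updated tests do
    clf.set_params(alpha=0.1, normalize=True).fit(X, y)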
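In the same spirit, a short sketch of the two helpers introduced above, Scaler.inverse_transform and utils.as_float_array, again assuming the patched tree; shapes and values are arbitrary:

    import numpy as np
    from scikits.learn.preprocessing import Scaler
    from scikits.learn.utils import as_float_array

    X = np.random.randn(4, 3)

    # inverse_transform undoes the centering and scaling learned by fit()
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    X_back = scaler.inverse_transform(X_scaled)
    np.testing.assert_array_almost_equal(X_back, X)

    # as_float_array upgrades integer input to a floating point dtype
    # (int32 becomes float32, other non-float dtypes become float64);
    # overwrite_X=True skips the copy when X is already a float array
    X_int = np.arange(12, dtype=np.int32).reshape(4, 3)
    assert as_float_array(X_int).dtype == np.float32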