diff --git a/.gitignore b/.gitignore index 9b158da07a2ec..b8ee8d20322c3 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ doc/samples *.prof .tox/ .coverage +pip-wheel-metadata lfw_preprocessed/ nips2010_pdf/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000..aa8df3c3cbc87 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.8 + hooks: + - id: flake8 + types: [file, python] + # only check for unused imports for now, as long as + # the code is not fully PEP8 compatible + args: [--select=F401] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.730 + hooks: + - id: mypy + args: + - --ignore-missing-imports + files: sklearn/ diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index d1849a940d96c..f30db7f0ae08a 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -98,9 +98,6 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install -U pip python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist - # TODO: Remove pin when https://github.com/python-pillow/Pillow/issues/4518 gets fixed - python -m pip install "pillow>=4.3.0,!=7.1.0,!=7.1.1" - python -m pip install pandas matplotlib pyamg scikit-image # do not install dependencies for lightgbm since it requires scikit-learn python -m pip install lightgbm --no-deps diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py index 81e99856c6890..eaad1df75475c 100644 --- a/build_tools/generate_authors_table.py +++ b/build_tools/generate_authors_table.py @@ -11,6 +11,7 @@ import getpass import time from pathlib import Path +from os import path print("user:", file=sys.stderr) user = input() @@ -18,7 +19,7 @@ auth = (user, passwd) LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4' -REPO_FOLDER = Path(__file__).parent.parent +REPO_FOLDER = Path(path.abspath(__file__)).parent.parent def get(url): @@ -100,7 +101,6 @@ def get_profile(login): 'Duchesnay': 'Edouard Duchesnay', 'Lars': 'Lars Buitinck', 'MechCoder': 'Manoj Kumar', - 'jeremiedbb': 'Jérémie Du Boisberranger', } if profile["name"] in missing_names: profile["name"] = missing_names[profile["name"]] diff --git a/conftest.py b/conftest.py index 2b9e87bf9f292..874931341e195 100644 --- a/conftest.py +++ b/conftest.py @@ -99,16 +99,6 @@ def pytest_unconfigure(config): del sys._is_pytest_session -def pytest_runtest_setup(item): - if isinstance(item, DoctestItem): - set_config(print_changed_only=True) - - -def pytest_runtest_teardown(item, nextitem): - if isinstance(item, DoctestItem): - set_config(print_changed_only=False) - - # TODO: Remove when modules are deprecated in 0.24 # Configures pytest to ignore deprecated modules. collect_ignore_glob = [ diff --git a/doc/about.rst b/doc/about.rst index a6cdd54eb9201..814a4724d9579 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -271,14 +271,18 @@ July 2017. -............ +Past Sponsors +............. .. raw:: html
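A note on the `build_tools/generate_authors_table.py` hunk above: wrapping ``__file__`` in ``path.abspath`` matters when the script is launched through a relative path, since ``.parent.parent`` of a relative path collapses to ``.``. A minimal sketch of the difference (the relative path below is only an illustration)::

    from os import path
    from pathlib import Path

    # If the script is run as `python build_tools/generate_authors_table.py`,
    # __file__ may be this relative path rather than an absolute one.
    relative_file = "build_tools/generate_authors_table.py"

    print(Path(relative_file).parent.parent)                # "." -- not the repository root
    print(Path(path.abspath(relative_file)).parent.parent)  # absolute path to the repository root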
-`Anaconda, Inc `_ funds Adrin Jalali since 2019. +`INRIA `_ actively supports this project. It has +provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler +(2012-2013) and Olivier Grisel (2013-2017) to work on this project +full-time. It also hosts coding sprints and other events. .. raw:: html @@ -286,67 +290,63 @@ July 2017.
-.. image:: images/anaconda.png +.. image:: images/inria-logo.jpg :width: 100pt :align: center - :target: https://sydney.edu.au/ + :target: https://www.inria.fr .. raw:: html
-Past Sponsors -............. +..................... .. raw:: html
-`INRIA `_ actively supports this project. It has -provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler -(2012-2013) and Olivier Grisel (2013-2017) to work on this project -full-time. It also hosts coding sprints and other events. +`Paris-Saclay Center for Data Science +`_ +funded one year for a developer to work on the project full-time +(2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the +time of Joris van den Bossche (2017-2018). .. raw:: html
-
-.. image:: images/inria-logo.jpg +.. image:: images/cds-logo.png :width: 100pt :align: center - :target: https://www.inria.fr + :target: https://www.datascience-paris-saclay.fr/ .. raw:: html
-..................... +............ .. raw:: html
-`Paris-Saclay Center for Data Science -`_ -funded one year for a developer to work on the project full-time -(2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the -time of Joris van den Bossche (2017-2018). +`Anaconda, Inc `_ funded Adrin Jalali in 2019. .. raw:: html
+
-.. image:: images/cds-logo.png +.. image:: images/anaconda.png :width: 100pt :align: center - :target: https://www.datascience-paris-saclay.fr/ + :target: https://www.anaconda.com/ .. raw:: html diff --git a/doc/authors.rst b/doc/authors.rst index 6a03871d67e90..7b5426fe3128d 100644 --- a/doc/authors.rst +++ b/doc/authors.rst @@ -7,7 +7,7 @@

-      Jérémie Du Boisberranger
+      Jérémie du Boisberranger


diff --git a/doc/conf.py b/doc/conf.py index c3ab17d3e73af..74e37d01307be 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -17,6 +17,7 @@ import warnings import re from packaging.version import parse +from pathlib import Path # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory @@ -208,6 +209,23 @@ # If true, the reST sources are included in the HTML build as _sources/name. html_copy_source = True +# Adds variables into templates +html_context = {} +# finds latest release highlights and places it into HTML context for +# index.html +release_highlights_dir = Path("..") / "examples" / "release_highlights" +# Finds the highlight with the latest version number +latest_highlights = sorted(release_highlights_dir.glob( + "plot_release_highlights_*.py"))[-1] +latest_highlights = latest_highlights.with_suffix('').name +html_context["release_highlights"] = \ + f"auto_examples/release_highlights/{latest_highlights}" + +# get version from higlight name assuming highlights have the form +# plot_release_highlights_0_22_0 +highlight_version = ".".join(latest_highlights.split("_")[-3:-1]) +html_context["release_highlights_version"] = highlight_version + # -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). @@ -281,6 +299,11 @@ def __repr__(self): def __call__(self, directory): src_path = os.path.normpath(os.path.join(self.src_dir, directory)) + + # Forces Release Highlights to the top + if os.path.basename(src_path) == "release_highlights": + return "0" + readme = os.path.join(src_path, "README.txt") try: @@ -314,6 +337,7 @@ def __call__(self, directory): }, # avoid generating too many cross links 'inspect_global_variables': False, + 'remove_config_comments': True, } @@ -386,6 +410,3 @@ def setup(app): warnings.filterwarnings("ignore", category=UserWarning, message='Matplotlib is currently using agg, which is a' ' non-GUI backend, so cannot show the figure.') - -# Reduces the output of estimators -sklearn.set_config(print_changed_only=True) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 99c59ec3392c6..e13b6850d50eb 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -248,19 +248,28 @@ modifying code and submitting a PR: and start making changes. Always use a feature branch. It's good practice to never work on the ``master`` branch! -9. Develop the feature on your feature branch on your computer, using Git to - do the version control. When you're done editing, add changed files using - ``git add`` and then ``git commit``:: +9. (**Optional**) Install `pre-commit `_ to + run code style checks before each commit:: - $ git add modified_files - $ git commit + $ pip install pre-commit + $ pre-commit install - to record your changes in Git, then push the changes to your GitHub - account with:: + pre-commit checks can be disabled for a particular commit with + `git commit -n`. + +10. Develop the feature on your feature branch on your computer, using Git to + do the version control. When you're done editing, add changed files using + ``git add`` and then ``git commit``:: + + $ git add modified_files + $ git commit + + to record your changes in Git, then push the changes to your GitHub + account with:: $ git push -u origin my_feature -10. Follow `these +11. Follow `these `_ instructions to create a pull request from your fork. This will send an email to the committers. 
You may want to consider sending an email to the @@ -422,9 +431,12 @@ You can check for common programming errors with the following tools: mypy --ignore-missing-import sklearn - must not produce new errors in your pull request. Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, - - when importing C or Cython modules - - on properties with decorators + must not produce new errors in your pull request. Using `# type: ignore` + annotation can be a workaround for a few cases that are not supported by + mypy, in particular, + + - when importing C or Cython modules + - on properties with decorators Bonus points for contributions that include a performance analysis with a benchmark script and profiling output (please report on the mailing diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 96aa942fb9238..13d2010ca7319 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -246,40 +246,19 @@ whether it is just for you or for contributing it to scikit-learn, there are several internals of scikit-learn that you should be aware of in addition to the scikit-learn API outlined above. You can check whether your estimator adheres to the scikit-learn interface and standards by running -:func:`utils.estimator_checks.check_estimator` on the class:: +:func:`~sklearn.utils.estimator_checks.check_estimator` on an instance. The +:func:`~sklearn.utils.parametrize_with_checks` pytest decorator can also be +used (see its docstring for details and possible interactions with `pytest`):: >>> from sklearn.utils.estimator_checks import check_estimator >>> from sklearn.svm import LinearSVC - >>> check_estimator(LinearSVC) # passes + >>> check_estimator(LinearSVC()) # passes The main motivation to make a class compatible to the scikit-learn estimator interface might be that you want to use it together with model evaluation and selection tools such as :class:`model_selection.GridSearchCV` and :class:`pipeline.Pipeline`. -Setting `generate_only=True` returns a generator that yields (estimator, check) -tuples where the check can be called independently from each other, i.e. -`check(estimator)`. This allows all checks to be run independently and report -the checks that are failing. scikit-learn provides a pytest specific decorator, -:func:`~sklearn.utils.parametrize_with_checks`, making it easier to test -multiple estimators:: - - from sklearn.utils.estimator_checks import parametrize_with_checks - from sklearn.linear_model import LogisticRegression - from sklearn.tree import DecisionTreeRegressor - - @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) - def test_sklearn_compatible_estimator(estimator, check): - check(estimator) - -This decorator sets the `id` keyword in `pytest.mark.parameterize` exposing -the name of the underlying estimator and check in the test name. This allows -`pytest -k` to be used to specify which tests to run. - -.. code-block: bash - - pytest test_check_estimators.py -k check_estimators_fit_returns_self - Before detailing the required interface below, we describe two ways to achieve the correct interface more easily. @@ -531,6 +510,11 @@ requires_fit (default=True) requires_positive_X (default=False) whether the estimator requires positive X. +requires_y (default=False) + whether the estimator requires y to be passed to `fit`, `fit_predict` or + `fit_transform` methods. 
The tag is True for estimators inheriting from + `~sklearn.base.RegressorMixin` and `~sklearn.base.ClassifierMixin`. + requires_positive_y (default=False) whether the estimator requires a positive y (only applicable for regression). @@ -538,10 +522,17 @@ _skip_test (default=False) whether to skip common tests entirely. Don't use this unless you have a *very good* reason. -_xfail_test (default=False) - dictionary ``{check_name : reason}`` of common checks to mark as a - known failure, with the associated reason. Don't use this unless you have a - *very good* reason. +_xfail_checks (default=False) + dictionary ``{check_name: reason}`` of common checks that will be marked + as `XFAIL` for pytest, when using + :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`. This tag + currently has no effect on + :func:`~sklearn.utils.estimator_checks.check_estimator`. + Don't use this unless there is a *very good* reason for your estimator + not to pass the check. + Also note that the usage of this tag is highly subject to change because + we are trying to make it more flexible: be prepared for breaking changes + in the future. stateless (default=False) whether the estimator needs access to data for fitting. Even though an diff --git a/doc/developers/plotting.rst b/doc/developers/plotting.rst index 98af195b56453..7a2f6ebf69415 100644 --- a/doc/developers/plotting.rst +++ b/doc/developers/plotting.rst @@ -50,7 +50,7 @@ attributes:: estimator.__class__.__name__) return viz.plot(ax=ax, name=name, **kwargs) -Read more in :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` +Read more in :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` and the :ref:`User Guide `. Plotting with Multiple Axes diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index b26d68ecfbe02..4c11c24684352 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -248,8 +248,8 @@ code. Follow these steps: $> valgrind -v --suppressions=valgrind-python.supp python my_test_script.py .. _valgrind: http://valgrind.org -.. _`README.valgrind`: https://svn.python.org/projects/python/trunk/Misc/README.valgrind -.. _`valgrind-python.supp`: https://svn.python.org/projects/python/trunk/Misc/valgrind-python.supp +.. _`README.valgrind`: https://github.com/python/cpython/blob/master/Misc/README.valgrind +.. 
_`valgrind-python.supp`: https://github.com/python/cpython/blob/master/Misc/valgrind-python.supp The result will be a list of all the memory-related errors, which reference diff --git a/doc/images/anaconda-small.png b/doc/images/anaconda-small.png deleted file mode 100644 index ccb8bb8b707de..0000000000000 Binary files a/doc/images/anaconda-small.png and /dev/null differ diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3d9924638b69b..2489eaf55bac7 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1569,6 +1569,7 @@ Plotting utils.deprecated utils.estimator_checks.check_estimator utils.estimator_checks.parametrize_with_checks + utils.estimator_html_repr utils.extmath.safe_sparse_dot utils.extmath.randomized_range_finder utils.extmath.randomized_svd diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 51a933dcbee47..e7dac0dadc630 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -144,7 +144,7 @@ or by name:: * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` * :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` * :ref:`sphx_glr_auto_examples_compose_plot_digits_pipe.py` - * :ref:`sphx_glr_auto_examples_plot_kernel_approximation.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py` * :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` @@ -528,6 +528,31 @@ above example would be:: ('countvectorizer', CountVectorizer(), 'title')]) +.. _visualizing_composite_estimators: + +Visualizing Composite Estimators +================================ + +Estimators can be displayed with a HTML representation when shown in a +jupyter notebook. This can be useful to diagnose or visualize a Pipeline with +many estimators. This visualization is activated by setting the +`display` option in :func:`sklearn.set_config`:: + + >>> from sklearn import set_config + >>> set_config(display='diagram') # doctest: +SKIP + >>> # diplays HTML representation in a jupyter context + >>> column_trans # doctest: +SKIP + +An example of the HTML output can be seen in the +**HTML representation of Pipeline** section of +:ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. +As an alternative, the HTML can be written to a file using +:func:`~sklearn.utils.estimator_html_repr`:: + + >>> from sklearn.utils import estimator_html_repr + >>> with open('my_estimator.html', 'w') as f: # doctest: +SKIP + ... f.write(estimator_html_repr(clf)) + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index e2de690658a25..ed014cea6f2ff 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -323,11 +323,11 @@ The following cross-validators can be used in such cases. While i.i.d. data is a common assumption in machine learning theory, it rarely holds in practice. If one knows that the samples have been generated using a -time-dependent process, it's safer to -use a :ref:`time-series aware cross-validation scheme ` -Similarly if we know that the generative process has a group structure -(samples from collected from different subjects, experiments, measurement -devices) it safer to use :ref:`group-wise cross-validation `. +time-dependent process, it is safer to +use a :ref:`time-series aware cross-validation scheme `. 
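The new "Visualizing Composite Estimators" section added to `doc/modules/compose.rst` above can be exercised end to end with only the calls it introduces; a small self-contained sketch (assuming scikit-learn 0.23 or later, with a pipeline chosen here purely for illustration)::

    from sklearn import set_config
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.utils import estimator_html_repr

    clf = make_pipeline(StandardScaler(), LogisticRegression())

    # In a Jupyter notebook, this switches the default estimator repr
    # to the interactive HTML diagram.
    set_config(display='diagram')
    clf  # rendered as a diagram when evaluated in a notebook cell

    # Outside a notebook, the same HTML can be written to a file
    # and opened in a browser.
    with open('my_estimator.html', 'w') as f:
        f.write(estimator_html_repr(clf))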
+Similarly, if we know that the generative process has a group structure +(samples collected from different subjects, experiments, measurement +devices), it is safer to use :ref:`group-wise cross-validation `. K-fold @@ -535,14 +535,14 @@ folds: each set contains approximately the same percentage of samples of each target class as the complete set. Here is an example of stratified 3-fold cross-validation on a dataset with 50 samples from -two unbalanced classes. We show the number of samples in each class and compare with +two unbalanced classes. We show the number of samples in each class and compare with :class:`KFold`. >>> from sklearn.model_selection import StratifiedKFold, KFold >>> import numpy as np >>> X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5)) - >>> skf = StratifiedKFold(n_splits=3) - >>> for train, test in skf.split(X, y): + >>> skf = StratifiedKFold(n_splits=3) + >>> for train, test in skf.split(X, y): ... print('train - {} | test - {}'.format( ... np.bincount(y[train]), np.bincount(y[test]))) train - [30 3] | test - [15 2] @@ -556,7 +556,7 @@ two unbalanced classes. We show the number of samples in each class and compare train - [28 5] | test - [17] train - [34] | test - [11 5] -We can see that :class:`StratifiedKFold` preserves the class ratios +We can see that :class:`StratifiedKFold` preserves the class ratios (approximately 1 / 10) in both train and test dataset. Here is a visualization of the cross-validation behavior. diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 1416b9d3a6045..434cf146c2d4e 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -246,7 +246,7 @@ amount of time (e.g., on large datasets). * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_iris.py` * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances_faces.py` - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` .. topic:: References @@ -952,8 +952,9 @@ controls the number of iterations of the boosting process:: >>> clf.score(X_test, y_test) 0.8965 -Available losses for regression are 'least_squares' and -'least_absolute_deviation', which is less sensitive to outliers. For +Available losses for regression are 'least_squares', +'least_absolute_deviation', which is less sensitive to outliers, and +'poisson', which is well suited to model counts and frequencies. For classification, 'binary_crossentropy' is used for binary classification and 'categorical_crossentropy' is used for multiclass classification. By default the loss is 'auto' and will select the appropriate loss depending on @@ -1017,6 +1018,8 @@ If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. +.. 
_sw_hgbdt: + Sample weight support --------------------- @@ -1436,7 +1439,7 @@ any other regressor or classifier, exposing a `predict`, `predict_proba`, and >>> print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred))) R2 score: 0.81 -Note that it is also possible to get the output of the stacked outputs of the +Note that it is also possible to get the output of the stacked `estimators` using the `transform` method:: >>> reg.transform(X_test[:5]) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index 084e110f5c702..cedc43c23c16c 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -1019,7 +1019,7 @@ The :class:`PatchExtractor` class works in the same way as implemented as an estimator, so it can be used in pipelines. See:: >>> five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3) - >>> patches = image.PatchExtractor((2, 2)).transform(five_images) + >>> patches = image.PatchExtractor(patch_size=(2, 2)).transform(five_images) >>> patches.shape (45, 2, 2, 3) diff --git a/doc/modules/isotonic.rst b/doc/modules/isotonic.rst index 1f54dcfa50bad..8967ef18afcb3 100644 --- a/doc/modules/isotonic.rst +++ b/doc/modules/isotonic.rst @@ -28,6 +28,6 @@ correlation coefficient for predicting to unseen data. The predictions of :class:`IsotonicRegression` thus form a function that is piecewise linear: -.. figure:: ../auto_examples/images/sphx_glr_plot_isotonic_regression_001.png - :target: ../auto_examples/plot_isotonic_regression.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_isotonic_regression_001.png + :target: ../auto_examples/miscellaneous/plot_isotonic_regression.html :align: center diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index 77354d5afaf1d..fb3843c6bc045 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -84,8 +84,8 @@ For a given value of ``n_components`` :class:`RBFSampler` is often less accurate as :class:`Nystroem`. :class:`RBFSampler` is cheaper to compute, though, making use of larger feature spaces more efficient. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_approximation_002.png - :target: ../auto_examples/plot_kernel_approximation.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_approximation_002.png + :target: ../auto_examples/miscellaneous/plot_kernel_approximation.html :scale: 50% :align: center @@ -93,7 +93,7 @@ use of larger feature spaces more efficient. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_kernel_approximation.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` .. _additive_chi_kernel_approx: diff --git a/doc/modules/kernel_ridge.rst b/doc/modules/kernel_ridge.rst index a67733b1ca5a5..286e9d4ac5322 100644 --- a/doc/modules/kernel_ridge.rst +++ b/doc/modules/kernel_ridge.rst @@ -35,8 +35,8 @@ However, prediction of 100000 target values is more than three times faster with :class:`~sklearn.svm.SVR` since it has learned a sparse model using only approximately 1/3 of the 100 training datapoints as support vectors. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_001.png - :target: ../auto_examples/plot_kernel_ridge_regression.html +.. 
figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_ridge_regression_001.png + :target: ../auto_examples/miscellaneous/plot_kernel_ridge_regression.html :align: center The next figure compares the time for fitting and prediction of @@ -51,8 +51,8 @@ prediction time depends on the parameters :math:`\epsilon` and :math:`C` of the :class:`~sklearn.svm.SVR`; :math:`\epsilon = 0` would correspond to a dense model. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_002.png - :target: ../auto_examples/plot_kernel_ridge_regression.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_ridge_regression_002.png + :target: ../auto_examples/miscellaneous/plot_kernel_ridge_regression.html :align: center diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index e1dfb0c03ea4b..c3ac94dedefa9 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -7,9 +7,9 @@ Linear and Quadratic Discriminant Analysis .. currentmodule:: sklearn Linear Discriminant Analysis -(:class:`discriminant_analysis.LinearDiscriminantAnalysis`) and Quadratic +(:class:`~discriminant_analysis.LinearDiscriminantAnalysis`) and Quadratic Discriminant Analysis -(:class:`discriminant_analysis.QuadraticDiscriminantAnalysis`) are two classic +(:class:`~discriminant_analysis.QuadraticDiscriminantAnalysis`) are two classic classifiers, with, as their names suggest, a linear and a quadratic decision surface, respectively. @@ -37,68 +37,59 @@ flexible. Dimensionality reduction using Linear Discriminant Analysis =========================================================== -:class:`discriminant_analysis.LinearDiscriminantAnalysis` can be used to +:class:`~discriminant_analysis.LinearDiscriminantAnalysis` can be used to perform supervised dimensionality reduction, by projecting the input data to a linear subspace consisting of the directions which maximize the separation between classes (in a precise sense discussed in the mathematics section below). The dimension of the output is necessarily less than the number of -classes, so this is, in general, a rather strong dimensionality reduction, and +classes, so this is in general a rather strong dimensionality reduction, and only makes sense in a multiclass setting. -This is implemented in -:func:`discriminant_analysis.LinearDiscriminantAnalysis.transform`. The desired -dimensionality can be set using the ``n_components`` constructor parameter. -This parameter has no influence on -:func:`discriminant_analysis.LinearDiscriminantAnalysis.fit` or -:func:`discriminant_analysis.LinearDiscriminantAnalysis.predict`. +This is implemented in the `transform` method. The desired dimensionality can +be set using the ``n_components`` parameter. This parameter has no influence +on the `fit` and `predict` methods. .. topic:: Examples: :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py`: Comparison of LDA and PCA for dimensionality reduction of the Iris dataset +.. _lda_qda_math: + Mathematical formulation of the LDA and QDA classifiers ======================================================= Both LDA and QDA can be derived from simple probabilistic models which model the class conditional distribution of the data :math:`P(X|y=k)` for each class -:math:`k`. Predictions can then be obtained by using Bayes' rule: +:math:`k`. Predictions can then be obtained by using Bayes' rule, for each +training sample :math:`x \in \mathcal{R}^d`: .. 
math:: - P(y=k | X) = \frac{P(X | y=k) P(y=k)}{P(X)} = \frac{P(X | y=k) P(y = k)}{ \sum_{l} P(X | y=l) \cdot P(y=l)} + P(y=k | x) = \frac{P(x | y=k) P(y=k)}{P(x)} = \frac{P(x | y=k) P(y = k)}{ \sum_{l} P(x | y=l) \cdot P(y=l)} -and we select the class :math:`k` which maximizes this conditional probability. +and we select the class :math:`k` which maximizes this posterior probability. More specifically, for linear and quadratic discriminant analysis, -:math:`P(X|y)` is modeled as a multivariate Gaussian distribution with +:math:`P(x|y)` is modeled as a multivariate Gaussian distribution with density: -.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) +.. math:: P(x | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (x-\mu_k)^t \Sigma_k^{-1} (x-\mu_k)\right) where :math:`d` is the number of features. -To use this model as a classifier, we just need to estimate from the training -data the class priors :math:`P(y=k)` (by the proportion of instances of class -:math:`k`), the class means :math:`\mu_k` (by the empirical sample class means) -and the covariance matrices (either by the empirical sample class covariance -matrices, or by a regularized estimator: see the section on shrinkage below). +QDA +--- -In the case of LDA, the Gaussians for each class are assumed to share the same -covariance matrix: :math:`\Sigma_k = \Sigma` for all :math:`k`. This leads to -linear decision surfaces, which can be seen by comparing the -log-probability ratios :math:`\log[P(y=k | X) / P(y=l | X)]`: +According to the model above, the log of the posterior is: .. math:: - \log\left(\frac{P(y=k|X)}{P(y=l|X)}\right)= - \log\left(\frac{P(X|y=k)P(y=k)}{P(X|y=l)P(y=l)}\right)=0 \Leftrightarrow - (\mu_k-\mu_l)^t\Sigma^{-1} X = - \frac{1}{2} (\mu_k^t \Sigma^{-1} \mu_k - \mu_l^t \Sigma^{-1} \mu_l) - - \log\frac{P(y=k)}{P(y=l)} + \log P(y=k | x) &= \log P(x | y=k) + \log P(y = k) + Cst \\ + &= -\frac{1}{2} \log |\Sigma_k| -\frac{1}{2} (x-\mu_k)^t \Sigma_k^{-1} (x-\mu_k) + \log P(y = k) + Cst, -In the case of QDA, there are no assumptions on the covariance matrices -:math:`\Sigma_k` of the Gaussians, leading to quadratic decision surfaces. See -[#1]_ for more details. +where the constant term :math:`Cst` corresponds to the denominator +:math:`P(x)`, in addition to other constant terms from the Gaussian. The +predicted class is the one that maximises this log-posterior. .. note:: **Relation with Gaussian Naive Bayes** @@ -107,22 +98,60 @@ In the case of QDA, there are no assumptions on the covariance matrices and the resulting classifier is equivalent to the Gaussian Naive Bayes classifier :class:`naive_bayes.GaussianNB`. -Mathematical formulation of LDA dimensionality reduction -======================================================== +LDA +--- + +LDA is a special case of QDA, where the Gaussians for each class are assumed +to share the same covariance matrix: :math:`\Sigma_k = \Sigma` for all +:math:`k`. This reduces the log posterior to: -To understand the use of LDA in dimensionality reduction, it is useful to start -with a geometric reformulation of the LDA classification rule explained above. -We write :math:`K` for the total number of target classes. Since in LDA we -assume that all classes have the same estimated covariance :math:`\Sigma`, we -can rescale the data so that this covariance is the identity: +.. math:: \log P(y=k | x) = -\frac{1}{2} (x-\mu_k)^t \Sigma^{-1} (x-\mu_k) + \log P(y = k) + Cst. 
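A minimal NumPy sketch of this LDA decision rule on toy data (an illustrative re-implementation, not scikit-learn's solver code; the class-independent constant :math:`Cst` is dropped since it does not affect the argmax)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(60, 2) + np.repeat([[0, 0], [3, 3], [0, 3]], 20, axis=0)
    y = np.repeat([0, 1, 2], 20)

    priors = np.bincount(y) / y.size
    means = np.array([X[y == k].mean(axis=0) for k in range(3)])
    # Pooled within-class covariance: the single Sigma shared by all classes in LDA.
    pooled_cov = sum(np.cov(X[y == k].T) * (np.sum(y == k) - 1)
                     for k in range(3)) / (y.size - 3)
    inv_cov = np.linalg.inv(pooled_cov)

    # log P(y=k | x) up to Cst:  -0.5 (x - mu_k)^t Sigma^{-1} (x - mu_k) + log P(y=k)
    diffs = X[:, None, :] - means[None, :, :]    # shape (n_samples, n_classes, n_features)
    mahal = np.einsum('nkd,de,nke->nk', diffs, inv_cov, diffs)
    y_pred = np.argmax(-0.5 * mahal + np.log(priors), axis=1)
    print("training accuracy of the hand-rolled rule:", (y_pred == y).mean())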
+ +The term :math:`(x-\mu_k)^t \Sigma^{-1} (x-\mu_k)` corresponds to the +`Mahalanobis Distance `_ +between the sample :math:`x` and the mean :math:`\mu_k`. The Mahalanobis +distance tells how close :math:`x` is from :math:`\mu_k`, while also +accounting for the variance of each feature. We can thus interpret LDA as +assigning :math:`x` to the class whose mean is the closest in terms of +Mahalanobis distance, while also accounting for the class prior +probabilities. + +The log-posterior of LDA can also be written [3]_ as: + +.. math:: -.. math:: X^* = D^{-1/2}U^t X\text{ with }\Sigma = UDU^t + \log P(y=k | x) = \omega_k^t x + \omega_{k0} + Cst. -Then one can show that to classify a data point after scaling is equivalent to -finding the estimated class mean :math:`\mu^*_k` which is closest to the data -point in the Euclidean distance. But this can be done just as well after -projecting on the :math:`K-1` affine subspace :math:`H_K` generated by all the -:math:`\mu^*_k` for all classes. This shows that, implicit in the LDA +where :math:`\omega_k = \Sigma^{-1} \mu_k` and :math:`\omega_{k0} = +-\frac{1}{2} \mu_k^t\Sigma^{-1}\mu_k + \log P (y = k)`. These quantities +correspond to the `coef_` and `intercept_` attributes, respectively. + +From the above formula, it is clear that LDA has a linear decision surface. +In the case of QDA, there are no assumptions on the covariance matrices +:math:`\Sigma_k` of the Gaussians, leading to quadratic decision surfaces. +See [1]_ for more details. + +Mathematical formulation of LDA dimensionality reduction +======================================================== + +First note that the K means :math:`\mu_k` are vectors in +:math:`\mathcal{R}^d`, and they lie in an affine subspace :math:`H` of +dimension at least :math:`K - 1` (2 points lie on a line, 3 points lie on a +plane, etc). + +As mentioned above, we can interpret LDA as assigning :math:`x` to the class +whose mean :math:`\mu_k` is the closest in terms of Mahalanobis distance, +while also accounting for the class prior probabilities. Alternatively, LDA +is equivalent to first *sphering* the data so that the covariance matrix is +the identity, and then assigning :math:`x` to the closest mean in terms of +Euclidean distance (still accounting for the class priors). + +Computing Euclidean distances in this d-dimensional space is equivalent to +first projecting the data points into :math:`H`, and computing the distances +there (since the other dimensions will contribute equally to each class in +terms of distance). In other words, if :math:`x` is closest to :math:`\mu_k` +in the original space, it will also be the case in :math:`H`. +This shows that, implicit in the LDA classifier, there is a dimensionality reduction by linear projection onto a :math:`K-1` dimensional space. @@ -131,19 +160,22 @@ onto the linear subspace :math:`H_L` which maximizes the variance of the :math:`\mu^*_k` after projection (in effect, we are doing a form of PCA for the transformed class means :math:`\mu^*_k`). This :math:`L` corresponds to the ``n_components`` parameter used in the -:func:`discriminant_analysis.LinearDiscriminantAnalysis.transform` method. See -[#1]_ for more details. +:func:`~discriminant_analysis.LinearDiscriminantAnalysis.transform` method. See +[1]_ for more details. Shrinkage ========= -Shrinkage is a tool to improve estimation of covariance matrices in situations -where the number of training samples is small compared to the number of -features. 
In this scenario, the empirical sample covariance is a poor -estimator. Shrinkage LDA can be used by setting the ``shrinkage`` parameter of -the :class:`discriminant_analysis.LinearDiscriminantAnalysis` class to 'auto'. +Shrinkage is a form of regularization used to improve the estimation of +covariance matrices in situations where the number of training samples is +small compared to the number of features. +In this scenario, the empirical sample covariance is a poor +estimator, and shrinkage helps improving the generalization performance of +the classifier. +Shrinkage LDA can be used by setting the ``shrinkage`` parameter of +the :class:`~discriminant_analysis.LinearDiscriminantAnalysis` class to 'auto'. This automatically determines the optimal shrinkage parameter in an analytic -way following the lemma introduced by Ledoit and Wolf [#2]_. Note that +way following the lemma introduced by Ledoit and Wolf [2]_. Note that currently shrinkage only works when setting the ``solver`` parameter to 'lsqr' or 'eigen'. @@ -165,13 +197,33 @@ matrix. Estimation algorithms ===================== -The default solver is 'svd'. It can perform both classification and transform, -and it does not rely on the calculation of the covariance matrix. This can be -an advantage in situations where the number of features is large. However, the -'svd' solver cannot be used with shrinkage. - -The 'lsqr' solver is an efficient algorithm that only works for classification. -It supports shrinkage. +Using LDA and QDA requires computing the log-posterior which depends on the +class priors :math:`P(y=k)`, the class means :math:`\mu_k`, and the +covariance matrices. + +The 'svd' solver is the default solver used for +:class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`, and it is +the only available solver for +:class:`~sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis`. +It can perform both classification and transform (for LDA). +As it does not rely on the calculation of the covariance matrix, the 'svd' +solver may be preferable in situations where the number of features is large. +The 'svd' solver cannot be used with shrinkage. +For QDA, the use of the SVD solver relies on the fact that the covariance +matrix :math:`\Sigma_k` is, by definition, equal to :math:`\frac{1}{n - 1} +X_k^tX_k = V S^2 V^t` where :math:`V` comes from the SVD of the (centered) +matrix: :math:`X_k = U S V^t`. It turns out that we can compute the +log-posterior above without having to explictly compute :math:`\Sigma`: +computing :math:`S` and :math:`V` via the SVD of :math:`X` is enough. For +LDA, two SVDs are computed: the SVD of the centered input matrix :math:`X` +and the SVD of the class-wise mean vectors. + +The 'lsqr' solver is an efficient algorithm that only works for +classification. It needs to explicitly compute the covariance matrix +:math:`\Sigma`, and supports shrinkage. This solver computes the coefficients +:math:`\omega_k = \Sigma^{-1}\mu_k` by solving for :math:`\Sigma \omega = +\mu_k`, thus avoiding the explicit computation of the inverse +:math:`\Sigma^{-1}`. The 'eigen' solver is based on the optimization of the between class scatter to within class scatter ratio. It can be used for both classification and @@ -186,8 +238,11 @@ a high number of features. .. topic:: References: - .. [#1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., + .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., Section 4.3, p.106-119, 2008. - .. [#2] Ledoit O, Wolf M. 
Honey, I Shrunk the Sample Covariance Matrix. + .. [2] Ledoit O, Wolf M. Honey, I Shrunk the Sample Covariance Matrix. The Journal of Portfolio Management 30(4), 110-119, 2004. + + .. [3] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification + (Second Edition), section 2.6.2. diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 606b4246a0b88..1f6556bfa54f3 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -311,15 +311,15 @@ To use this feature, feed the classifier an indicator matrix, in which cell [i, j] indicates the presence of label j in sample i. -.. figure:: ../auto_examples/images/sphx_glr_plot_multilabel_001.png - :target: ../auto_examples/plot_multilabel.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multilabel_001.png + :target: ../auto_examples/miscellaneous/plot_multilabel.html :align: center :scale: 75% .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_multilabel.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multilabel.py` .. _ovo_classification: diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 9aa27a53501b8..397fdd1dd9e90 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -230,12 +230,12 @@ which will be used to compute the weights. :scale: 75 The use of multi-output nearest neighbors for regression is demonstrated in -:ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`. In this example, the inputs +:ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. In this example, the inputs X are the pixels of the upper half of faces and the outputs Y are the pixels of the lower half of those faces. -.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png - :target: ../auto_examples/plot_multioutput_face_completion.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png + :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html :scale: 75 :align: center @@ -245,7 +245,7 @@ the lower half of those faces. * :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`: an example of regression using nearest neighbors. - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`: an example of + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`: an example of multi-output regression using nearest neighbors. diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index c061feb0b1d7c..76bd85f3bb1c8 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -98,8 +98,8 @@ Outlier Factor (LOF) does not show a decision boundary in black as it has no predict method to be applied on new data when it is used for outlier detection. -.. figure:: ../auto_examples/images/sphx_glr_plot_anomaly_comparison_001.png - :target: ../auto_examples/plot_anomaly_comparison.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_anomaly_comparison_001.png + :target: ../auto_examples/miscellaneous/plot_anomaly_comparison.html :align: center :scale: 50 @@ -109,12 +109,12 @@ The :class:`svm.OneClassSVM` is known to be sensitive to outliers and thus does not perform very well for outlier detection. Finally, :class:`covariance.EllipticEnvelope` assumes the data is Gaussian and learns an ellipse. 
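A small sketch of the comparison discussed above, restricted to two of the estimators and run on toy data chosen here only for illustration (the full comparison lives in the referenced example)::

    import numpy as np
    from sklearn.covariance import EllipticEnvelope
    from sklearn.ensemble import IsolationForest

    rng = np.random.RandomState(42)
    X = np.vstack([0.3 * rng.randn(100, 2),                      # inliers
                   rng.uniform(low=-4, high=4, size=(20, 2))])   # outliers

    for est in (IsolationForest(random_state=0),
                EllipticEnvelope(contamination=0.15)):
        labels = est.fit_predict(X)   # +1 for inliers, -1 for outliers
        print(est.__class__.__name__, (labels == -1).sum(), "points flagged as outliers")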
For more details on the different estimators refer to the example -:ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` and the sections -hereunder. +:ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` and the +sections hereunder. .. topic:: Examples: - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` for a comparison of the :class:`svm.OneClassSVM`, the :class:`ensemble.IsolationForest`, the :class:`neighbors.LocalOutlierFactor` and @@ -270,8 +270,8 @@ allows you to add more trees to an already fitted model:: * See :ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py` for an illustration of the use of IsolationForest. - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` for a - comparison of :class:`ensemble.IsolationForest` with + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison of :class:`ensemble.IsolationForest` with :class:`neighbors.LocalOutlierFactor`, :class:`svm.OneClassSVM` (tuned to perform like an outlier detection method) and a covariance-based outlier detection with @@ -339,8 +339,8 @@ This strategy is illustrated below. * See :ref:`sphx_glr_auto_examples_neighbors_plot_lof_outlier_detection.py` for an illustration of the use of :class:`neighbors.LocalOutlierFactor`. - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` for a - comparison with other anomaly detection methods. + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison with other anomaly detection methods. .. topic:: References: diff --git a/doc/modules/random_projection.rst b/doc/modules/random_projection.rst index eb8d6de984985..cd3c129cfad45 100644 --- a/doc/modules/random_projection.rst +++ b/doc/modules/random_projection.rst @@ -64,19 +64,19 @@ bounded distortion introduced by the random projection:: >>> johnson_lindenstrauss_min_dim(n_samples=[1e4, 1e5, 1e6], eps=0.1) array([ 7894, 9868, 11841]) -.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_001.png - :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_johnson_lindenstrauss_bound_001.png + :target: ../auto_examples/miscellaneous/plot_johnson_lindenstrauss_bound.html :scale: 75 :align: center -.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_002.png - :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_johnson_lindenstrauss_bound_002.png + :target: ../auto_examples/miscellaneous/plot_johnson_lindenstrauss_bound.html :scale: 75 :align: center .. topic:: Example: - * See :ref:`sphx_glr_auto_examples_plot_johnson_lindenstrauss_bound.py` + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` for a theoretical explication on the Johnson-Lindenstrauss lemma and an empirical validation using sparse random matrices. diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 23dc7fbf67b65..8acebc79e412e 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -90,7 +90,7 @@ After being fitted, the model can then be used to predict new values:: SVMs decision function (detailed in the :ref:`svm_mathematical_formulation`) depends on some subset of the training data, called the support vectors. 
Some properties of these support vectors can be found in attributes -``support_vectors_``, ``support_`` and ``n_support``:: +``support_vectors_``, ``support_`` and ``n_support_``:: >>> # get support vectors >>> clf.support_vectors_ diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index ecd037d0631ac..e12b63adb48c4 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -56,9 +56,9 @@ The disadvantages of decision trees include: - Decision-tree learners can create over-complex trees that do not generalise the data well. This is called overfitting. Mechanisms - such as pruning (not currently supported), setting the minimum - number of samples required at a leaf node or setting the maximum - depth of the tree are necessary to avoid this problem. + such as pruning, setting the minimum number of samples required + at a leaf node or setting the maximum depth of the tree are + necessary to avoid this problem. - Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. @@ -124,10 +124,10 @@ Using the Iris dataset, we can construct a tree as follows:: >>> clf = tree.DecisionTreeClassifier() >>> clf = clf.fit(X, y) -Once trained, you can plot the tree with the plot_tree function:: +Once trained, you can plot the tree with the :func:`plot_tree` function:: - >>> tree.plot_tree(clf.fit(iris.data, iris.target)) # doctest: +SKIP + >>> tree.plot_tree(clf) # doctest: +SKIP .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png :target: ../auto_examples/tree/plot_iris_dtc.html @@ -137,10 +137,7 @@ Once trained, you can plot the tree with the plot_tree function:: We can also export the tree in `Graphviz `_ format using the :func:`export_graphviz` exporter. If you use the `conda `_ package manager, the graphviz binaries - -and the python package can be installed with - - conda install python-graphviz +and the python package can be installed with `conda install python-graphviz`. Alternatively binaries for graphviz can be downloaded from the graphviz project homepage, and the Python wrapper installed from pypi with `pip install graphviz`. @@ -188,7 +185,7 @@ of external libraries and is more compact: >>> from sklearn.datasets import load_iris >>> from sklearn.tree import DecisionTreeClassifier - >>> from sklearn.tree.export import export_text + >>> from sklearn.tree import export_text >>> iris = load_iris() >>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) >>> decision_tree = decision_tree.fit(iris.data, iris.target) @@ -283,19 +280,19 @@ X is a single real value and the outputs Y are the sine and cosine of X. :align: center The use of multi-output trees for classification is demonstrated in -:ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`. In this example, the inputs +:ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. In this example, the inputs X are the pixels of the upper half of faces and the outputs Y are the pixels of the lower half of those faces. -.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png - :target: ../auto_examples/plot_multioutput_face_completion.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png + :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html :scale: 75 :align: center .. 
topic:: Examples: * :ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py` - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` .. topic:: References: diff --git a/doc/modules/unsupervised_reduction.rst b/doc/modules/unsupervised_reduction.rst index 3a85b8e53b553..6e16886064cfc 100644 --- a/doc/modules/unsupervised_reduction.rst +++ b/doc/modules/unsupervised_reduction.rst @@ -37,7 +37,7 @@ documentation: :ref:`random_projection`. .. topic:: **Examples** - * :ref:`sphx_glr_auto_examples_plot_johnson_lindenstrauss_bound.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` Feature agglomeration ------------------------ diff --git a/doc/related_projects.rst b/doc/related_projects.rst index 89079971ca29a..6c8fb57c34aa7 100644 --- a/doc/related_projects.rst +++ b/doc/related_projects.rst @@ -21,19 +21,17 @@ enhance the functionality of scikit-learn's estimators. **Data formats** +- `Fast svmlight / libsvm file loader `_ + Fast and memory-efficient svmlight / libsvm file loader for Python. + - `sklearn_pandas `_ bridge for scikit-learn pipelines and pandas data frame with dedicated transformers. - + - `sklearn_xarray `_ provides compatibility of scikit-learn estimators with xarray data structures. **Auto-ML** -- `auto_ml `_ - Automated machine learning for production and analytics, built on scikit-learn - and related projects. Trains a pipeline wth all the standard machine learning - steps. Tuned for prediction speed and ease of transfer to production environments. - - `auto-sklearn `_ An automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator @@ -44,35 +42,21 @@ enhance the functionality of scikit-learn's estimators. preprocessors as well as the estimators. Works as a drop-in replacement for a scikit-learn estimator. -- `scikit-optimize `_ - A library to minimize (very) expensive and noisy black-box functions. It - implements several methods for sequential model-based optimization, and - includes a replacement for ``GridSearchCV`` or ``RandomizedSearchCV`` to do - cross-validated parameter search using any of these strategies. - **Experimentation frameworks** - `REP `_ Environment for conducting data-driven research in a consistent and reproducible way -- `ML Frontend `_ provides - dataset management and SVM fitting/prediction through - `web-based `_ - and `programmatic `_ - interfaces. - - `Scikit-Learn Laboratory `_ A command-line wrapper around scikit-learn that makes it easy to run machine learning experiments with multiple learners and large feature sets. -- `Xcessiv `_ is a notebook-like - application for quick, scalable, and automated hyperparameter tuning - and stacked ensembling. Provides a framework for keeping track of - model-hyperparameter combinations. - **Model inspection and visualisation** +- `dtreeviz `_ A python library for + decision tree visualization and model interpretation. + - `eli5 `_ A library for debugging/inspecting machine learning models and explaining their predictions. @@ -80,13 +64,20 @@ enhance the functionality of scikit-learn's estimators. - `mlxtend `_ Includes model visualization utilities. -- `scikit-plot `_ A visualization library - for quick and easy generation of common plots in data analysis and machine learning. 
- - `yellowbrick `_ A suite of custom matplotlib visualizers for scikit-learn estimators to support visual feature analysis, model selection, evaluation, and diagnostics. +**Model selection** + +- `scikit-optimize `_ + A library to minimize (very) expensive and noisy black-box functions. It + implements several methods for sequential model-based optimization, and + includes a replacement for ``GridSearchCV`` or ``RandomizedSearchCV`` to do + cross-validated parameter search using any of these strategies. + +- `sklearn-deap `_ Use evolutionary + algorithms instead of gridsearch in scikit-learn. **Model export for production** @@ -102,11 +93,6 @@ enhance the functionality of scikit-learn's estimators. - `sklearn-porter `_ Transpile trained scikit-learn models to C, Java, Javascript and others. -- `sklearn-compiledtrees `_ - Generate a C++ implementation of the predict function for decision trees (and - ensembles) trained by sklearn. Useful for latency-sensitive production - environments. - Other estimators and tasks -------------------------- @@ -118,10 +104,10 @@ and tasks. **Structured learning** -- `sktime `_ A scikit-learn compatible toolbox for machine learning with time series including time series classification/regression and (supervised/panel) forecasting. +- `tslearn `_ A machine learning library for time series + that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression. -- `Seqlearn `_ Sequence classification - using HMMs or structured perceptron. +- `sktime `_ A scikit-learn compatible toolbox for machine learning with time series including time series classification/regression and (supervised/panel) forecasting. - `HMMLearn `_ Implementation of hidden markov models that was previously part of scikit-learn. @@ -139,12 +125,6 @@ and tasks. **Deep neural networks etc.** -- `pylearn2 `_ A deep learning and - neural network library build on theano with scikit-learn like interface. - -- `sklearn_theano `_ scikit-learn compatible - estimators, transformers, and datasets which use Theano internally - - `nolearn `_ A number of wrappers and abstractions around existing neural network libraries @@ -153,8 +133,8 @@ and tasks. - `lasagne `_ A lightweight library to build and train neural networks in Theano. - -- `skorch `_ A scikit-learn compatible + +- `skorch `_ A scikit-learn compatible neural network library that wraps PyTorch. **Broad scope** @@ -162,9 +142,6 @@ and tasks. - `mlxtend `_ Includes a number of additional estimators as well as model visualization utilities. -- `sparkit-learn `_ Scikit-learn - API and functionality for PySpark's distributed modelling. - **Other regression and classification** - `xgboost `_ Optimised gradient boosted decision @@ -187,18 +164,20 @@ and tasks. - `gplearn `_ Genetic Programming for symbolic regression tasks. -- `multiisotonic `_ Isotonic - regression on multidimensional features. - -- `scikit-multilearn `_ Multi-label classification with - focus on label space manipulation. +- `scikit-multilearn `_ + Multi-label classification with focus on label space manipulation. -- `seglearn `_ Time series and sequence +- `seglearn `_ Time series and sequence learning using sliding window segmentation. 
+- `libOPF `_ Optimal path forest classifier + +- `fastFM `_ Fast factorization machine + implementation compatible with scikit-learn + **Decomposition and clustering** -- `lda `_: Fast implementation of latent +- `lda `_: Fast implementation of latent Dirichlet allocation in Cython which uses `Gibbs sampling `_ to sample from the true posterior distribution. (scikit-learn's @@ -207,9 +186,6 @@ and tasks. `_ to sample from a tractable approximation of a topic model's posterior distribution.) -- `Sparse Filtering `_ - Unsupervised feature learning based on sparse-filtering - - `kmodes `_ k-modes clustering algorithm for categorical data, and several of its variations. @@ -237,9 +213,6 @@ Other packages useful for data analysis and machine learning. - `Pandas `_ Tools for working with heterogeneous and columnar data, relational queries, time series and basic statistics. -- `theano `_ A CPU/GPU array - processing framework geared towards deep learning research. - - `statsmodels `_ Estimating and analysing statistical models. More focused on statistical tests and less on prediction than scikit-learn. @@ -253,17 +226,9 @@ Other packages useful for data analysis and machine learning. - `Seaborn `_ Visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. -- `Deep Learning `_ A curated list of deep learning - software libraries. - Recommendation Engine packages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - `GraphLab - `_ - Implementation of classical recommendation techniques (in C++, with - Python bindings). - - `implicit `_, Library for implicit feedback datasets. @@ -297,11 +262,3 @@ Domain specific packages - `MSMBuilder `_ Machine learning for protein conformational dynamics time series. - -- `scikit-surprise `_ A scikit for building and - evaluating recommender systems. - -Snippets and tidbits ---------------------- - -The `wiki `_ has more! diff --git a/doc/templates/index.html b/doc/templates/index.html index e17111fb48eef..f49fbc2f4c540 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -8,7 +8,7 @@

scikit-learn

Machine Learning in Python

Getting Started - What's New in {{ release }} + Release Highlights for {{ release_highlights_version }} GitHub
@@ -155,12 +155,13 @@

News

  • On-going development: What's new (Changelog) +
  • May 2020. scikit-learn 0.23.0 is available for download (Changelog).
  • Scikit-learn from 0.23 requires Python 3.6 or greater.
  • March 2020. scikit-learn 0.22.2 is available for download (Changelog).
  • January 2020. scikit-learn 0.22.1 is available for download (Changelog). -
  • December 2019. scikit-learn 0.22 is available for download (Changelog). +
  • December 2019. scikit-learn 0.22 is available for download (Changelog and Release Highlights).
  • Scikit-learn from 0.21 requires Python 3.5 or greater.
  • @@ -168,14 +169,6 @@

    News

  • May 2019. scikit-learn 0.21.0 to 0.21.2 are available for download (Changelog).
  • -
  • March 2019. scikit-learn 0.20.3 is available for download (Changelog). -
  • -
  • September 2018. scikit-learn 0.20.0 is available for download (Changelog). -
  • -
  • July 2018. scikit-learn 0.19.2 is available for download (Changelog). -
  • -
  • July 2017. scikit-learn 0.19.0 is available for download (Changelog). -
@@ -252,7 +245,6 @@

Who uses scikit-learn?

-
diff --git a/doc/themes/scikit-learn-modern/nav.html b/doc/themes/scikit-learn-modern/nav.html index 57c631f6cbee7..4fbd22f48a4dd 100644 --- a/doc/themes/scikit-learn-modern/nav.html +++ b/doc/themes/scikit-learn-modern/nav.html @@ -9,6 +9,7 @@ {%- set drop_down_navigation = [ ('Getting Started', pathto('getting_started')), ('Tutorial', pathto('tutorial/index')), + ("What's new", 'whats_new/v' + version + '.html'), ('Glossary', pathto('glossary')), ('Development', pathto('developers/index')), ('FAQ', pathto('faq')), diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css index 2b80d6fe2b762..ceda27c6de093 100644 --- a/doc/themes/scikit-learn-modern/static/css/theme.css +++ b/doc/themes/scikit-learn-modern/static/css/theme.css @@ -90,7 +90,7 @@ div.highlight { div.highlight pre { margin-bottom: 0; - line-height: 1rem; + line-height: 1.2rem; } div.highlight a { diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst index 082c5ffa3aa79..28e965bd925a5 100644 --- a/doc/tutorial/basic/tutorial.rst +++ b/doc/tutorial/basic/tutorial.rst @@ -77,8 +77,8 @@ Loading an example dataset `scikit-learn` comes with a few standard datasets, for instance the `iris `_ and `digits `_ -datasets for classification and the `boston house prices dataset -`_ for regression. +datasets for classification and the `diabetes dataset +`_ for regression. In the following, we start a Python interpreter from our shell and then load the ``iris`` and ``digits`` datasets. Our notational convention is that diff --git a/doc/visualizations.rst b/doc/visualizations.rst index 47d826602b62f..ebb98700d9e08 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -24,8 +24,8 @@ ROC curve for a fitted support vector machine: svc_disp = plot_roc_curve(svc, X_test, y_test) -.. figure:: auto_examples/images/sphx_glr_plot_roc_curve_visualization_api_001.png - :target: auto_examples/plot_roc_curve_visualization_api.html +.. figure:: auto_examples/miscellaneous/images/sphx_glr_plot_roc_curve_visualization_api_001.png + :target: auto_examples/miscellaneous/plot_roc_curve_visualization_api.html :align: center :scale: 75% @@ -48,8 +48,8 @@ method of the `Display` object. rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=ax, alpha=0.8) svc_disp.plot(ax=ax, alpha=0.8) -.. figure:: auto_examples/images/sphx_glr_plot_roc_curve_visualization_api_002.png - :target: auto_examples/plot_roc_curve_visualization_api.html +.. figure:: auto_examples/miscellaneous/images/sphx_glr_plot_roc_curve_visualization_api_002.png + :target: auto_examples/miscellaneous/plot_roc_curve_visualization_api.html :align: center :scale: 75% @@ -58,8 +58,9 @@ values of the curves. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` - * :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_display_object_visualization.py` Available Plotting Utilities ============================ diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 7b84374bd5146..66f2a3818cec8 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -12,6 +12,7 @@ on libraries.io to be notified when new versions are released. .. 
toctree:: :maxdepth: 1 + Version 0.24 Version 0.23 Version 0.22 Version 0.21 diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4c489c1887815..4ce884d2e8f87 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -7,13 +7,31 @@ Version 0.23.0 ============== -**In Development** +**May 12 2020** + +For a short description of the main highlights of the release, please +refer to +:ref:`sphx_glr_auto_examples_release_highlights_plot_release_highlights_0_23_0.py`. .. include:: changelog_legend.inc Put the changes in their relevant module. +Enforcing keyword-only arguments +-------------------------------- + +In an effort to promote clear and non-ambiguous use of the library, most +constructor and function parameters are now expected to be passed as keyword +arguments (i.e. using the `param=value` syntax) instead of positional. To +ease the transition, a `FutureWarning` is raised if a keyword-only parameter +is used as positional. In version 0.25, these parameters will be strictly +keyword-only, and a `TypeError` will be raised. +:issue:`15005` by `Joel Nothman`_, `Adrin Jalali`_, `Thomas Fan`_, and +`Nicolas Hug`_. See `SLEP009 +`_ +for more details. + Changed models -------------- @@ -22,14 +40,44 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, - and :class:`ensemble.IsolationForest`. |Fix| - -- Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, +- |Fix| :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, + and :class:`ensemble.IsolationForest`. +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` and + ``algorithm="full"``. +- |Fix| :class:`cluster.Birch` +- |Fix| :func:`compose.ColumnTransformer.get_feature_names` +- |Fix| :func:`compose.ColumnTransformer.fit` +- |Fix| :func:`datasets.make_multilabel_classification` +- |Fix| :class:`decomposition.PCA` with `n_components='mle'` +- |Enhancement| :class:`decomposition.NMF` and + :func:`decomposition.non_negative_factorization` with float32 dtype input. +- |Fix| :func:`decomposition.KernelPCA.inverse_transform` +- |API| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` +- |Fix| ``estimator_samples_`` in :class:`ensemble.BaggingClassifier`, + :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` +- |Fix| :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` with `sample_weight` +- |Fix| :class:`gaussian_process.GaussianProcessRegressor` +- |Fix| :class:`linear_model.RANSACRegressor` with ``sample_weight``. +- |Fix| :class:`linear_model.RidgeClassifierCV` +- |Fix| :func:`metrics.mean_squared_error` with `squared` and + `multioutput='raw_values'`. +- |Fix| :func:`metrics.mutual_info_score` with negative scores. +- |Fix| :func:`metrics.confusion_matrix` with zero length `y_true` and `y_pred` +- |Fix| :class:`neural_network.MLPClassifier` +- |Fix| :class:`preprocessing.StandardScaler` with `partial_fit` and sparse + input.
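The keyword-only deprecation described in the hunk above can be illustrated with a small sketch (not part of the diff; ``SVC`` is only an assumed example of an estimator whose parameters are now keyword-only)::

    from sklearn.svm import SVC

    SVC(C=1.0, kernel="rbf")   # keyword arguments: unchanged behaviour

    # Passing the same values positionally emits a FutureWarning in 0.23 and,
    # per the plan above, is expected to raise a TypeError in 0.25.
    SVC(1.0, "rbf")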
+- |Fix| :class:`preprocessing.Normalizer` with norm='max' +- |Fix| Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, :class:`svm.SVC`, :class:`svm.SVR`, :class:`linear_model.LogisticRegression`. - |Efficiency| |Fix| +- |Fix| :class:`tree.DecisionTreeClassifier`, :class:`tree.ExtraTreeClassifier` and + :class:`ensemble.GradientBoostingClassifier` as well as ``predict`` method of + :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeRegressor`, and + :class:`ensemble.GradientBoostingRegressor` and read-only float32 input in + ``predict``, ``decision_path`` and ``predict_proba``. Details are listed in the changelog below. @@ -53,23 +101,37 @@ Changelog :mod:`sklearn.cluster` ...................... -- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more - more memory efficient implementation of single linkage clustering. - :pr:`11514` by :user:`Leland McInnes `. -- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now converges with - ``tol=0`` as with the default ``algorithm="full"``. :pr:`16075` by - :user:`Erich Schubert `. - - |Efficiency| :class:`cluster.Birch` implementation of the predict method avoids high memory footprint by calculating the distances matrix using a chunked scheme. :pr:`16149` by :user:`Jeremie du Boisberranger ` and :user:`Alex Shacked `. +- |Efficiency| |MajorFeature| The critical parts of :class:`cluster.KMeans` + have a more optimized implementation. Parallelism is now over the data + instead of over initializations allowing better scalability. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Enhancement| :class:`cluster.KMeans` now supports sparse data when + `solver = "elkan"`. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more + memory efficient implementation of single linkage clustering. + :pr:`11514` by :user:`Leland McInnes `. + +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now converges with + ``tol=0`` as with the default ``algorithm="full"``. :pr:`16075` by + :user:`Erich Schubert `. + - |Fix| Fixed a bug in :class:`cluster.Birch` where the `n_clusters` parameter could not have a `np.int64` type. :pr:`16484` by :user:`Jeremie du Boisberranger `. +- |Fix| :class:`cluster.AgglomerativeClustering` add specific error when + distance matrix is not square and `affinity=precomputed`. + :pr:`16257` by :user:`Simona Maggio `. + - |API| The ``n_jobs`` parameter of :class:`cluster.KMeans`, :class:`cluster.SpectralCoclustering` and :class:`cluster.SpectralBiclustering` is deprecated. They now use OpenMP @@ -81,26 +143,26 @@ deprecated. It has no effect. :pr:`11950` by :user:`Jeremie du Boisberranger `. -- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more - optimized implementation. Parallelism is now over the data instead of over - initializations allowing better scalability. :pr:`11950` by - :user:`Jeremie du Boisberranger `. - -- |Enhancement| :class:`cluster.KMeans` now supports sparse data when - `solver = "elkan"`. :pr:`11950` by - :user:`Jeremie du Boisberranger `. +- |API| The ``random_state`` parameter has been added to + :class:`cluster.AffinityPropagation`. :pr:`16801` by :user:`rcwoolston` + and :user:`Chiara Marmo `. :mod:`sklearn.compose` ......................
-- |Fix| :class:`compose.ColumnTransformer` method ``get_feature_names`` now - returns correct results when one of the transformer steps applies on an - empty list of columns :pr:`15963` by `Roman Yurchak`_. - - |Efficiency| :class:`compose.ColumnTransformer` is now faster when working with dataframes and strings are used to specific subsets of data for transformers. :pr:`16431` by `Thomas Fan`_. +- |Enhancement| :class:`compose.ColumnTransformer` method ``get_feature_names`` + now supports `'passthrough'` columns, with the feature name being either + the column name for a dataframe, or `'xi'` for column index `i`. + :pr:`14048` by :user:`Lewis Ball `. + +- |Fix| :class:`compose.ColumnTransformer` method ``get_feature_names`` now + returns correct results when one of the transformer steps applies on an + empty list of columns :pr:`15963` by `Roman Yurchak`_. + - |Fix| :func:`compose.ColumnTransformer.fit` will error when selecting a column name that is not unique in the dataframe. :pr:`16431` by `Thomas Fan`_. @@ -108,15 +170,9 @@ Changelog :mod:`sklearn.datasets` ....................... -- |Enhancement| Added ``return_centers`` parameter in - :func:`datasets.make_blobs`, which can be used to return - centers for each cluster. - :pr:`15709` by :user:`` and - :user:`Venkatachalam N `. - -- |Enhancement| Functions :func:`datasets.make_circles` and - :func:`datasets.make_moons` now accept two-element tuple. - :pr:`15707` by :user:`Maciej J Mikulski `. +- |Efficiency| :func:`datasets.fetch_openml` has reduced memory usage because + it no longer stores the full dataset text stream in memory. :pr:`16084` by + `Joel Nothman`_. - |Feature| :func:`datasets.fetch_california_housing` now supports heterogeneous data using pandas by setting `as_frame=True`. :pr:`15950` @@ -129,27 +185,46 @@ Changelog ``DataFrame`` by setting `as_frame=True`. :pr:`15980` by :user:`wconnell` and :user:`Reshama Shaikh `. +- |Enhancement| Added ``return_centers`` parameter in + :func:`datasets.make_blobs`, which can be used to return + centers for each cluster. + :pr:`15709` by :user:`shivamgargsya` and + :user:`Venkatachalam N `. + +- |Enhancement| Functions :func:`datasets.make_circles` and + :func:`datasets.make_moons` now accept two-element tuple. + :pr:`15707` by :user:`Maciej J Mikulski `. + - |Fix| :func:`datasets.make_multilabel_classification` now generates `ValueError` for arguments `n_classes < 1` OR `length < 1`. :pr:`16006` by :user:`Rushabh Vasani `. +- |API| The `StreamHandler` was removed from `sklearn.logger` to avoid + double logging of messages in common cases where a hander is attached + to the root logger, and to follow the Python logging documentation + recommendation for libraries to leave the log message handling to + users and application code. :pr:`16451` by :user:`Christoph Deil `. + :mod:`sklearn.decomposition` ............................ +- |Enhancement| :class:`decomposition.NMF` and + :func:`decomposition.non_negative_factorization` now preserves float32 dtype. + :pr:`16280` by :user:`Jeremie du Boisberranger `. + +- |Enhancement| :func:`TruncatedSVD.transform` is now faster on given sparse + ``csc`` matrices. :pr:`16837` by :user:`wornbb`. + - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will - exclusively choose the components that explain the variance greater than - `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` + exclusively choose the components that explain the variance greater than + `n_components`. 
:pr:`15669` by :user:`Krishna Chaitanya ` - |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly handles small eigenvalues, and does not infer 0 as the correct number of - components. :pr: `4441` by :user:`Lisa Schwetlick `, and + components. :pr:`16224` by :user:`Lisa Schwetlick `, and :user:`Gelavizh Ahmadi ` and :user:`Marija Vlajic Wheeler ` and :pr:`16841` by `Nicolas Hug`_. -- |Enhancement| :class:`decomposition.NMF` and - :func:`decomposition.non_negative_factorization` now preserves float32 dtype. - :pr:`16280` by :user:`Jeremie du Boisberranger `. - - |Fix| :class:`decomposition.KernelPCA` method ``inverse_transform`` now applies the correct inverse transform to the transformed data. :pr:`16655` by :user:`Lewis Ball `. @@ -158,6 +233,10 @@ Changelog raise `invalid value encountered in multiply` during `fit`. :pr:`16718` by :user:`Gui Miotto `. +- |Feature| Added `n_components_` attribute to :class:`decomposition.SparsePCA` + and :class:`decomposition.MiniBatchSparsePCA`. :pr:`16981` by + :user:`Mateusz Górski `. + :mod:`sklearn.ensemble` ....................... @@ -165,9 +244,22 @@ Changelog :class:`ensemble.HistGradientBoostingRegressor` now support :term:`sample_weight`. :pr:`14696` by `Adrin Jalali`_ and `Nicolas Hug`_. +- |Feature| Early stopping in + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` is now determined with a + new `early_stopping` parameter instead of `n_iter_no_change`. Default value + is 'auto', which enables early stopping if there are at least 10,000 + samples in the training set. :pr:`14516` by :user:`Johann Faouzi + `. + +- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now support monotonic + constraints, useful when features are supposed to have a positive/negative + effect on the target. :pr:`15582` by `Nicolas Hug`_. + - |API| Added boolean `verbose` flag to classes: :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. - :pr:`15991` by :user:`Sam Bail `, + :pr:`16069` by :user:`Sam Bail `, :user:`Hanna Bruce MacDonald `, :user:`Reshama Shaikh `, and :user:`Chiara Marmo `. @@ -182,20 +274,7 @@ Changelog :class:`ensemble.HistGradientBoostingRegressor`. The depth now corresponds to the number of edges to go from the root to the deepest leaf. Stumps (trees with one split) are now allowed. - :pr: `16182` by :user:`Santhosh B ` - -- |Feature| Early stopping in - :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` is now determined with a - new `early_stopping` parameter instead of `n_iter_no_change`. Default value - is 'auto', which enables early stopping if there are at least 10,000 - samples in the training set. :pr:`14516` by :user:`Johann Faouzi - `. - -- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` now support monotonic - constraints, useful when features are supposed to have a positive/negative - effect on the target. :pr:`15582` by `Nicolas Hug`_. + :pr:`16182` by :user:`Santhosh B ` - |Fix| Fixed a bug in :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` @@ -210,6 +289,16 @@ Changelog to obtain the input to the meta estimator. :pr:`16539` by :user:`Bill DeRose `. 
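The early-stopping and monotonic-constraint entries above translate into roughly the following usage; this is only a sketch, assuming ``monotonic_cst`` as the parameter name for the constraints and recalling that these estimators are still experimental in 0.23::

    import numpy as np
    # Still experimental in 0.23: must be enabled before importing.
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    rng = np.random.RandomState(0)
    X = rng.normal(size=(20_000, 3))
    y = X[:, 0] - X[:, 1] + rng.normal(scale=0.1, size=20_000)

    # early_stopping='auto' enables early stopping here (>= 10,000 samples);
    # monotonic_cst constrains each feature's effect on the prediction
    # (+1 increasing, -1 decreasing, 0 unconstrained).
    model = HistGradientBoostingRegressor(early_stopping="auto",
                                          monotonic_cst=[1, -1, 0])
    model.fit(X, y)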
+- |Feature| Added additional option `loss="poisson"` to + :class:`ensemble.HistGradientBoostingRegressor`, which adds Poisson deviance + with log-link useful for modeling count data. + :pr:`16692` by :user:`Christian Lorentzen ` + +- |Fix| Fixed a bug where :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` would fail with multiple + calls to fit when `warm_start=True`, `early_stopping=True`, and there is no + validation set. :pr:`16663` by `Thomas Fan`_. + :mod:`sklearn.feature_extraction` ................................. @@ -218,16 +307,21 @@ Changelog for datasets with large vocabularies combined with ``min_df`` or ``max_df``. :pr:`15834` by :user:`Santiago M. Mola `. +:mod:`sklearn.feature_selection` +................................ - |Enhancement| Added support for multioutput data in :class:`feature_selection.RFE` and :class:`feature_selection.RFECV`. :pr:`16103` by :user:`Divyaprabha M `. +- |API| Adds :class:`feature_selection.SelectorMixin` back to public API. + :pr:`16132` by :user:`trimeta`. + :mod:`sklearn.gaussian_process` ............................... - |Enhancement| :func:`gaussian_process.kernels.Matern` returns the RBF kernel when ``nu=np.inf``. - :pr:`15503` by :user:`Sam Dixon` . + :pr:`15503` by :user:`Sam Dixon `. - |Fix| Fixed bug in :class:`gaussian_process.GaussianProcessRegressor` that caused predicted standard deviations to only be between 0 and 1 when @@ -241,6 +335,10 @@ Changelog ``max_value`` and ``min_value``. Array-like inputs allow a different max and min to be specified for each feature. :pr:`16403` by :user:`Narendra Mukherjee `. +- |Enhancement| :class:`impute.SimpleImputer`, :class:`impute.KNNImputer`, and + :class:`impute.SimpleImputer` accepts pandas' nullable integer dtype with + missing values. :pr:`16508` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -260,14 +358,10 @@ Changelog :pr:`14300` by :user:`Christian Lorentzen `, `Roman Yurchak`_, and `Olivier Grisel`_. -- |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and - :class:`linear_model:Lasso` for dense feature matrix `X`. - :pr:`15436` by :user:`Christian Lorentzen `. - -- |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit - method of :class:`linear_model.RANSACRegressor`, it would not be passed to - the wrapped `base_estimator` during the fitting of the final model. - :pr:`15573` by :user:`Jeremy Alexandre `. +- |MajorFeature| Support of `sample_weight` in + :class:`linear_model.ElasticNet` and :class:`linear_model.Lasso` for dense + feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen + `. - |Efficiency| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` now does not allocate a @@ -276,6 +370,16 @@ Changelog `store_cv_values` is `True`. :pr:`15652` by :user:`Jérôme Dockès `. +- |Enhancement| :class:`linear_model.LassoLars` and + :class:`linear_model.Lars` now support a `jitter` parameter that adds + random noise to the target. This might help with stability in some edge + cases. :pr:`15179` by :user:`angelaambroz`. + +- |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit + method of :class:`linear_model.RANSACRegressor`, it would not be passed to + the wrapped `base_estimator` during the fitting of the final model. + :pr:`15773` by :user:`Jeremy Alexandre `. + - |Fix| add `best_score_` attribute to :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV`. 
:pr:`15653` by :user:`Jérôme Dockès `. @@ -285,6 +389,11 @@ Changelog instead of predictions. :pr:`14848` by :user:`Venkatachalam N `. +- |Fix| :class:`linear_model.LogisticRegression` will now avoid an unnecessary + iteration when `solver='newton-cg'` by checking for inferior or equal instead + of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. + :pr:`16266` by :user:`Rushabh Vasani `. + - |API| Deprecated public attributes `standard_coef_`, `standard_intercept_`, `average_coef_`, and `average_intercept_` in :class:`linear_model.SGDClassifier`, @@ -293,20 +402,28 @@ Changelog :class:`linear_model.PassiveAggressiveRegressor`. :pr:`16261` by :user:`Carlos Brandt `. -- |Fix| :class:`linear_model.LogisticRegression` will now avoid an unnecessary - iteration when `solver='newton-cg'` by checking for inferior or equal instead - of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. - :pr:`16266` by :user:`Rushabh Vasani `. +- |Fix| |Efficiency| :class:`linear_model.ARDRegression` is more stable and + much faster when `n_samples > n_features`. It can now scale to hundreds of + thousands of samples. The stability fix might imply changes in the number + of non-zero coefficients and in the predicted output. :pr:`16849` by + `Nicolas Hug`_. + +- |Fix| Fixed a bug in :class:`linear_model.ElasticNetCV`, + :class:`linear_model.MultitaskElasticNetCV`, :class:`linear_model.LassoCV` + and :class:`linear_model.MultitaskLassoCV` where fitting would fail when + using joblib loky backend. :pr:`14264` by + :user:`Jérémie du Boisberranger `. + +- |Efficiency| Speed up :class:`linear_model.MultiTaskLasso`, + :class:`linear_model.MultiTaskLassoCV`, :class:`linear_model.MultiTaskElasticNet`, + :class:`linear_model.MultiTaskElasticNetCV` by avoiding slower + BLAS Level 2 calls on small arrays + :pr:`17021` by :user:`Alex Gramfort ` and + :user:`Mathurin Massias `. :mod:`sklearn.metrics` ...................... -- |API| Changed the formatting of values in - :meth:`metrics.ConfusionMatrixDisplay.plot` and - :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' - or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and - `Thomas Fan`_. - - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows its ``reduce_func`` to not have a return value, enabling in-place operations. :pr:`16397` by `Joel Nothman`_. @@ -324,6 +441,18 @@ Changelog the `labels` parameter. :pr:`16442` by `Kyle Parsons `. +- |API| Changed the formatting of values in + :meth:`metrics.ConfusionMatrixDisplay.plot` and + :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' + or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and + `Thomas Fan`_. + +- |API| From version 0.25, :func:`metrics.pairwise.pairwise_distances` will no + longer automatically compute the ``VI`` parameter for Mahalanobis distance + and the ``V`` parameter for seuclidean distance if ``Y`` is passed. The user + will be expected to compute this parameter on the training data of their + choice and pass it to `pairwise_distances`. :pr:`16993` by `Joel Nothman`_. + :mod:`sklearn.model_selection` .............................. @@ -333,9 +462,9 @@ Changelog type and details. :pr:`15622` by :user:`Gregory Morse `. -- |Fix| :func: `cross_val_predict` supports `method="predict_proba"` - when `y=None`. - :pr:`15918` by :user:`Luca Kubin `. 
+- |Fix| :func:`model_selection.cross_val_predict` supports + `method="predict_proba"` when `y=None`.:pr:`15918` by + :user:`Luca Kubin `. - |Fix| :func:`model_selection.fit_grid_point` is deprecated in 0.23 and will be removed in 0.25. :pr:`16401` by @@ -359,21 +488,44 @@ Changelog :mod:`sklearn.neural_network` ............................. +- |Efficiency| :class:`neural_network.MLPClassifier` and + :class:`neural_network.MLPRegressor` has reduced memory footprint when using + stochastic solvers, `'sgd'` or `'adam'`, and `shuffle=True`. :pr:`14075` by + :user:`meyer89`. + - |Fix| Increases the numerical stability of the logistic loss function in :class:`neural_network.MLPClassifier` by clipping the probabilities. :pr:`16117` by `Thomas Fan`_. +:mod:`sklearn.inspection` +......................... + +- |Enhancement| :class:`inspection.PartialDependenceDisplay` now exposes the + deciles lines as attributes so they can be hidden or customized. :pr:`15785` + by `Nicolas Hug`_ + :mod:`sklearn.preprocessing` ............................ -- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at - transforming. :pr:`15762` by `Thomas Fan`_. - - |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder` will now accept value 'if_binary' and will drop the first category of each feature with two categories. :pr:`16245` by :user:`Rushabh Vasani `. +- |Enhancement| :class:`preprocessing.OneHotEncoder`'s `drop_idx_` ndarray + can now contain `None`, where `drop_idx_[i] = None` means that no category + is dropped for index `i`. :pr:`16585` by :user:`Chiara Marmo `. + +- |Enhancement| :class:`preprocessing.MaxAbsScaler`, + :class:`preprocessing.MinMaxScaler`, :class:`preprocessing.StandardScaler`, + :class:`preprocessing.PowerTransformer`, + :class:`preprocessing.QuantileTransformer`, + :class:`preprocessing.RobustScaler` now supports pandas' nullable integer + dtype with missing values. :pr:`16508` by `Thomas Fan`_. + +- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at + transforming. :pr:`15762` by `Thomas Fan`_. + - |Fix| Fix a bug in :class:`preprocessing.StandardScaler` which was incorrectly computing statistics when calling `partial_fit` on sparse inputs. :pr:`16466` by :user:`Guillaume Lemaitre `. @@ -383,6 +535,13 @@ Changelog normalizing the vectors. :pr:`16632` by :user:`Maura Pintor ` and :user:`Battista Biggio `. +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| :class:`semi_supervised.LabelSpreading` and + :class:`semi_supervised.LabelPropagation` avoids divide by zero warnings + when normalizing `label_distributions_`. :pr:`15946` by :user:`ngshya`. + :mod:`sklearn.svm` .................. @@ -390,14 +549,14 @@ Changelog generators used to randomly select coordinates in the coordinate descent algorithms. Platform-dependent C ``rand()`` was used, which is only able to generate numbers up to ``32767`` on windows platform (see this `blog - post `) and also has poor + post `_) and also has poor randomization power as suggested by `this presentation - `. + `_. It was replaced with C++11 ``mt19937``, a Mersenne Twister that correctly generates 31bits/63bits random numbers on all platforms. In addition, the crude "modulo" postprocessor used to get a random number in a bounded interval was replaced by the tweaked Lemire method as suggested by `this blog - post `. + post `_. 
Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, @@ -406,16 +565,16 @@ number of samples (LibSVM) or the number of features (LibLinear) is large. :pr:`13511` by :user:`Sylvain Marié `. -- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and - `probB_`, are now deprecated as they were not useful. :pr:`15558` by - `Thomas Fan`_. - - |Fix| Fix use of custom kernel not taking float entries such as string kernels in :class:`svm.SVC` and :class:`svm.SVR`. Note that custom kernels are now expected to validate their input where they previously received valid numeric arrays. :pr:`11296` by `Alexandre Gramfort`_ and :user:`Georgi Peev `. +- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and + `probB_`, are now deprecated as they were not useful. :pr:`15558` by + `Thomas Fan`_. + :mod:`sklearn.tree` ................... @@ -434,29 +593,113 @@ :mod:`sklearn.utils` .................... +- |MajorFeature| Estimators can now be displayed with a rich html + representation. This can be enabled in Jupyter notebooks by setting + `display='diagram'` in :func:`~sklearn.set_config`. The raw html can be + returned by using :func:`utils.estimator_html_repr`. + :pr:`14180` by `Thomas Fan`_. + - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`. :pr:`15926` by :user:`Loïc Estève `. -- |Enhancement| add warning in :func:`utils.validation.check_array` for +- |Enhancement| add warning in :func:`utils.check_array` for pandas sparse DataFrame. :pr:`16021` by :user:`Rushabh Vasani `. -- |Enhancement| :func:`utils.validation.check_array` now constructs a sparse - matrix from a pandas DataFrame that contains only `SparseArray`s. +- |Enhancement| :func:`utils.check_array` now constructs a sparse + matrix from a pandas DataFrame that contains only `SparseArray` columns. :pr:`16728` by `Thomas Fan`_. -:mod:`sklearn.cluster` -...................... +- |Enhancement| :func:`utils.validation.check_array` supports pandas' + nullable integer dtype with missing values when `force_all_finite` is set to + `False` or `'allow-nan'` in which case the data is converted to floating + point values where `pd.NA` values are replaced by `np.nan`. As a consequence, + all :mod:`sklearn.preprocessing` transformers that accept numeric inputs with + missing values represented as `np.nan` now also accept being directly fed + pandas dataframes with `pd.Int*` or `pd.UInt*` typed columns that use `pd.NA` + as a missing value marker. :pr:`16508` by `Thomas Fan`_. -- |Fix| :class:`cluster.AgglomerativeClustering` add specific error when - distance matrix is not square and `affinity=precomputed`. - :pr:`16257` by :user:`Simona Maggio `. +- |API| Passing classes to :func:`utils.estimator_checks.check_estimator` and + :func:`utils.estimator_checks.parametrize_with_checks` is now deprecated, + and support for classes will be removed in 0.24. Pass instances instead. + :pr:`17032` by `Nicolas Hug`_. + +- |FIX| :func:`utils.all_estimators` now only returns public estimators. + :pr:`15380` by `Thomas Fan`_. Miscellaneous ............. +- |MajorFeature| Adds an HTML representation of estimators to be shown in + a jupyter notebook or lab. This visualization is activated by setting the + `display` option in :func:`sklearn.set_config`. :pr:`14180` by + `Thomas Fan`_.
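Taken together, the HTML-representation entries above (in ``sklearn.utils`` and under Miscellaneous) amount to roughly the following usage; a minimal sketch, with the pipeline chosen arbitrarily and the diagram only rendered inside a Jupyter notebook::

    from sklearn import set_config
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.utils import estimator_html_repr

    pipe = make_pipeline(StandardScaler(), LogisticRegression())

    set_config(display="diagram")
    pipe  # in a notebook cell, this is now shown as an HTML diagram

    # The raw HTML can also be produced directly, e.g. for embedding elsewhere:
    html = estimator_html_repr(pipe)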
+ +- |Enhancement| ``scikit-learn`` now works with ``mypy`` without errors. + :pr:`16726` by `Roman Yurchak`_. + - |API| Most estimators now expose a `n_features_in_` attribute. This attribute is equal to the number of features passed to the `fit` method. See `SLEP010 `_ for details. :pr:`16112` by `Nicolas Hug`_. + +- |API| Estimators now have a `requires_y` tags which is False by default + except for estimators that inherit from `~sklearn.base.RegressorMixin` or + `~sklearn.base.ClassifierMixin`. This tag is used to ensure that a proper + error message is raised when y was expected but None was passed. + :pr:`16622` by `Nicolas Hug`_. + +- |API| The default setting `print_changed_only` has been changed from False + to True. This means that the `repr` of estimators is now more concise and + only shows the parameters whose default value has been changed when + printing an estimator. You can restore the previous behaviour by using + `sklearn.set_config(print_changed_only=False)`. Also, note that it is + always possible to quickly inspect the parameters of any estimator using + `est.get_params(deep=False)`. :pr:`17061` by `Nicolas Hug`_. + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.20, including: + +Abbie Popa, Adrin Jalali, Aleksandra Kocot, Alexandre Batisse, Alexandre +Gramfort, Alex Henrie, Alex Itkes, Alex Liang, alexshacked, Alonso Silva +Allende, Ana Casado, Andreas Mueller, Angela Ambroz, Ankit810, Arie Pratama +Sutiono, Arunav Konwar, Baptiste Maingret, Benjamin Beier Liu, bernie gray, +Bharathi Srinivasan, Bharat Raghunathan, Bibhash Chandra Mitra, Brian Wignall, +brigi, Brigitta Sipőcz, Carlos H Brandt, CastaChick, castor, cgsavard, Chiara +Marmo, Chris Gregory, Christian Kastner, Christian Lorentzen, Corrie +Bartelheimer, Daniël van Gelder, Daphne, David Breuer, david-cortes, dbauer9, +Divyaprabha M, Edward Qian, Ekaterina Borovikova, ELNS, Emily Taylor, Erich +Schubert, Eric Leung, Evgeni Chasnovski, Fabiana, Facundo Ferrín, Fan, +Franziska Boenisch, Gael Varoquaux, Gaurav Sharma, Geoffrey Bolmier, Georgi +Peev, gholdman1, Gonthier Nicolas, Gregory Morse, Gregory R. 
Lee, Guillaume +Lemaitre, Gui Miotto, Hailey Nguyen, Hanmin Qin, Hao Chun Chang, HaoYin, Hélion +du Mas des Bourboux, Himanshu Garg, Hirofumi Suzuki, huangk10, Hugo van +Kemenade, Hye Sung Jung, indecisiveuser, inderjeet, J-A16, Jérémie du +Boisberranger, Jin-Hwan CHO, JJmistry, Joel Nothman, Johann Faouzi, Jon Haitz +Legarreta Gorroño, Juan Carlos Alfaro Jiménez, judithabk6, jumon, Kathryn +Poole, Katrina Ni, Kesshi Jordan, Kevin Loftis, Kevin Markham, +krishnachaitanya9, Lam Gia Thuan, Leland McInnes, Lisa Schwetlick, lkubin, Loic +Esteve, lopusz, lrjball, lucgiffon, lucyleeow, Lucy Liu, Lukas Kemkes, Maciej J +Mikulski, Madhura Jayaratne, Magda Zielinska, maikia, Mandy Gu, Manimaran, +Manish Aradwad, Maren Westermann, Maria, Mariana Meireles, Marie Douriez, +Marielle, Mateusz Górski, mathurinm, Matt Hall, Maura Pintor, mc4229, meyer89, +m.fab, Michael Shoemaker, Michał Słapek, Mina Naghshhnejad, mo, Mohamed +Maskani, Mojca Bertoncelj, narendramukherjee, ngshya, Nicholas Won, Nicolas +Hug, nicolasservel, Niklas, @nkish, Noa Tamir, Oleksandr Pavlyk, olicairns, +Oliver Urs Lenz, Olivier Grisel, parsons-kyle-89, Paula, Pete Green, Pierre +Delanoue, pspachtholz, Pulkit Mehta, Qizhi Jiang, Quang Nguyen, rachelcjordan, +raduspaimoc, Reshama Shaikh, Riccardo Folloni, Rick Mackenbach, Ritchie Ng, +Roman Feldbauer, Roman Yurchak, Rory Hartong-Redden, Rüdiger Busche, Rushabh +Vasani, Sambhav Kothari, Samesh Lakhotia, Samuel Duan, SanthoshBala18, Santiago +M. Mola, Sarat Addepalli, scibol, Sebastian Kießling, SergioDSR, Sergul Aydore, +Shiki-H, shivamgargsya, SHUBH CHATTERJEE, Siddharth Gupta, simonamaggio, +smarie, Snowhite, stareh, Stephen Blystone, Stephen Marsh, Sunmi Yoon, +SylvainLan, talgatomarov, tamirlan1, th0rwas, theoptips, Thomas J Fan, Thomas +Li, Thomas Schmitt, Tim Nonner, Tim Vink, Tiphaine Viard, Tirth Patel, Titus +Christian, Tom Dupré la Tour, trimeta, Vachan D A, Vandana Iyer, Venkatachalam +N, waelbenamara, wconnell, wderose, wenliwyan, Windber, wornbb, Yu-Hang "Maxin" +Tang diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst new file mode 100644 index 0000000000000..dd4ab30a7f2ff --- /dev/null +++ b/doc/whats_new/v0.24.rst @@ -0,0 +1,55 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_24: + +Version 0.24.0 +============== + +**In Development** + + +.. include:: changelog_legend.inc + +Put the changes in their relevant module. + +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- items +- items + +Details are listed in the changelog below. + +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + +Changelog +--------- + +.. + Entries should be grouped by module (in alphabetic order) and prefixed with + one of the labels: |MajorFeature|, |Feature|, |Efficiency|, |Enhancement|, + |Fix| or |API| (see whats_new.rst for descriptions). + Entries should be ordered by those labels (e.g. |Fix| after |Efficiency|). + Changes not specific to a module should be listed under *Multiple Modules* + or *Miscellaneous*. + Entries should end with: + :pr:`123456` by :user:`Joe Bloggs `. + where 123456 is the *pull request* number, not the issue number. + +:mod:`sklearn.module` +..................... 
+ + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.20, including: diff --git a/examples/README.txt b/examples/README.txt index 4ee6efc46d1dd..958de667a5c69 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -2,8 +2,3 @@ Examples ======== - -Miscellaneous examples ----------------------- - -Miscellaneous and introductory examples for scikit-learn. diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 1c79c4bb1d607..24fc4d69e35d0 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -87,6 +87,15 @@ clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) +############################################################################## +# HTML representation of ``Pipeline`` +############################################################################### +# When the ``Pipeline`` is printed out in a jupyter notebook an HTML +# representation of the estimator is displayed as follows: +from sklearn import set_config +set_config(display='diagram') +clf + ############################################################################### # Use ``ColumnTransformer`` by selecting column by data types ############################################################################### diff --git a/examples/datasets/plot_random_dataset.py b/examples/datasets/plot_random_dataset.py index 8de51a124f950..2f8d4be8ac383 100644 --- a/examples/datasets/plot_random_dataset.py +++ b/examples/datasets/plot_random_dataset.py @@ -3,14 +3,15 @@ Plot randomly generated classification dataset ============================================== -Plot several randomly generated 2D classification datasets. -This example illustrates the :func:`datasets.make_classification` -:func:`datasets.make_blobs` and :func:`datasets.make_gaussian_quantiles` -functions. +This example plots several randomly generated classification datasets. +For easy visualization, all datasets have 2 features, plotted on the x and y +axis. The color of each point represents its class label. -For ``make_classification``, three binary and two multi-class classification -datasets are generated, with different numbers of informative features and -clusters per class. """ +The first 4 plots use the :func:`~sklearn.datasets.make_classification` with +different numbers of informative features, clusters per class and classes. +The final 2 plots use :func:`~sklearn.datasets.make_blobs` and +:func:`~sklearn.datasets.make_gaussian_quantiles`. 
+""" print(__doc__) diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py index 58494f7ef816d..f932d698adc8b 100644 --- a/examples/decomposition/plot_pca_3d.py +++ b/examples/decomposition/plot_pca_3d.py @@ -66,14 +66,13 @@ def plot_figs(fig_num, elev, azim): Y = np.c_[a, b, c] # Using SciPy's SVD, this would be: - # _, pca_score, V = scipy.linalg.svd(Y, full_matrices=False) + # _, pca_score, Vt = scipy.linalg.svd(Y, full_matrices=False) pca = PCA(n_components=3) pca.fit(Y) - pca_score = pca.explained_variance_ratio_ - V = pca.components_ + V = pca.components_.T - x_pca_axis, y_pca_axis, z_pca_axis = 3 * V.T + x_pca_axis, y_pca_axis, z_pca_axis = 3 * V x_pca_plane = np.r_[x_pca_axis[:2], - x_pca_axis[1::-1]] y_pca_plane = np.r_[y_pca_axis[:2], - y_pca_axis[1::-1]] z_pca_plane = np.r_[z_pca_axis[:2], - z_pca_axis[1::-1]] diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index 3dbe7dbaac296..860bb14687534 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -11,7 +11,7 @@ and 500 regression trees of depth 4. Note: For larger datasets (n_samples >= 10000), please refer to -:class:`sklearn.ensemble.HistGradientBoostingRegressor` +:class:`sklearn.ensemble.HistGradientBoostingRegressor`. """ print(__doc__) @@ -32,8 +32,7 @@ # Load the data # ------------------------------------- # -# First we need to load the data. We set random state to be consistent with the -# result. +# First we need to load the data. diabetes = datasets.load_diabetes() X, y = diabetes.data, diabetes.target @@ -43,13 +42,11 @@ # ------------------------------------- # # Next, we will split our dataset to use 90% for training and leave the rest -# for testing. We will also prepare the parameters we want to use to fit our -# regression model. You can play with those parameters to see how the -# results change: +# for testing. We will also set the regression model parameters. You can play +# with these parameters to see how the results change. # -# n_estimators : the number of boosting stages which will be performed. -# Later, we will plot and see how the deviance changes with those boosting -# operations. +# n_estimators : the number of boosting stages that will be performed. +# Later, we will plot deviance against boosting iterations. # # max_depth : limits the number of nodes in the tree. # The best value depends on the interaction of the input variables. @@ -57,12 +54,11 @@ # min_samples_split : the minimum number of samples required to split an # internal node. # -# learning_rate : how much the contribution of each tree will shrink +# learning_rate : how much the contribution of each tree will shrink. # -# loss : here, we decided to use least squeares as a loss function. -# However there are many other options (check -# :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are -# other possibilities) +# loss : loss function to optimize. The least squares function is used in this +# case however, there are many other options (see +# :class:`~sklearn.ensemble.GradientBoostingRegressor` ). X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=13) @@ -80,10 +76,10 @@ # Now we will initiate the gradient boosting regressors and fit it with our # training data. Let's also look and the mean squared error on the test data. 
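The hunk that follows fits ``GradientBoostingRegressor(**params)``; the ``params`` dict itself is defined earlier in the example and does not appear in this diff. A sketch of what it could contain, with values assumed purely to match the parameters discussed above::

    params = {'n_estimators': 500,      # number of boosting stages
              'max_depth': 4,           # limits the number of nodes per tree
              'min_samples_split': 5,
              'learning_rate': 0.01,    # shrinkage of each tree's contribution
              'loss': 'ls'}             # least squares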
-clf = ensemble.GradientBoostingRegressor(**params) -clf.fit(X_train, y_train) +reg = ensemble.GradientBoostingRegressor(**params) +reg.fit(X_train, y_train) -mse = mean_squared_error(y_test, clf.predict(X_test)) +mse = mean_squared_error(y_test, reg.predict(X_test)) print("The mean squared error (MSE) on test set: {:.4f}".format(mse)) ############################################################################## @@ -91,16 +87,16 @@ # ------------------------------------- # # Finally, we will visualize the results. To do that we will first compute the -# test set deviance and then plot it. +# test set deviance and then plot it against boosting iterations. test_score = np.zeros((params['n_estimators'],), dtype=np.float64) -for i, y_pred in enumerate(clf.staged_predict(X_test)): - test_score[i] = clf.loss_(y_test, y_pred) +for i, y_pred in enumerate(reg.staged_predict(X_test)): + test_score[i] = reg.loss_(y_test, y_pred) fig = plt.figure(figsize=(6, 6)) plt.subplot(1, 1, 1) plt.title('Deviance') -plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-', +plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-', label='Training Set Deviance') plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-', label='Test Set Deviance') @@ -116,16 +112,16 @@ # # Careful, impurity-based feature importances can be misleading for # high cardinality features (many unique values). As an alternative, -# the permutation importances of ``clf`` are computed on a +# the permutation importances of ``reg`` can be computed on a # held out test set. See :ref:`permutation_importance` for more details. # -# In this case, the two methods agree to identify the same top 2 features -# as strongly predictive features but not in the same order. The third most +# For this example, the impurity-based and permutation methods identify the +# same 2 strongly predictive features but not in the same order. The third most # predictive feature, "bp", is also the same for the 2 methods. The remaining # features are less predictive and the error bars of the permutation plot # show that they overlap with 0. -feature_importance = clf.feature_importances_ +feature_importance = reg.feature_importances_ sorted_idx = np.argsort(feature_importance) pos = np.arange(sorted_idx.shape[0]) + .5 fig = plt.figure(figsize=(12, 6)) @@ -134,7 +130,7 @@ plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx]) plt.title('Feature Importance (MDI)') -result = permutation_importance(clf, X_test, y_test, n_repeats=10, +result = permutation_importance(reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) sorted_idx = result.importances_mean.argsort() plt.subplot(1, 2, 2) diff --git a/examples/ensemble/plot_voting_regressor.py b/examples/ensemble/plot_voting_regressor.py index 6fd629bb9c083..2587dee4352e9 100644 --- a/examples/ensemble/plot_voting_regressor.py +++ b/examples/ensemble/plot_voting_regressor.py @@ -5,28 +5,28 @@ .. currentmodule:: sklearn -A voting regressor is an ensemble meta-estimator that fits base regressors each -on the whole dataset. It, then, averages the individual predictions to form a -final prediction. +A voting regressor is an ensemble meta-estimator that fits several base +regressors, each on the whole dataset. Then it averages the individual +predictions to form a final prediction. We will use three different regressors to predict the data: :class:`~ensemble.GradientBoostingRegressor`, :class:`~ensemble.RandomForestRegressor`, and :class:`~linear_model.LinearRegression`). 
-Then, using them we will make voting regressor +Then the above 3 regressors will be used for the :class:`~ensemble.VotingRegressor`. -Finally, we will plot all of them for comparison. +Finally, we will plot the predictions made by all models for comparison. -We will work with the diabetes dataset which consists of the 10 features -collected from a cohort of diabetes patients. The target is the disease -progression after one year from the baseline. +We will work with the diabetes dataset which consists of 10 features +collected from a cohort of diabetes patients. The target is a quantitative +measure of disease progression one year after baseline. """ print(__doc__) import matplotlib.pyplot as plt -from sklearn import datasets +from sklearn.datasets import load_diabetes from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression @@ -36,11 +36,11 @@ # Training classifiers # -------------------------------- # -# First, we are going to load diabetes dataset and initiate gradient boosting -# regressor, random forest regressor and linear regression. Next, we are going -# to use each of them to build the voting regressor: +# First, we will load the diabetes dataset and initiate a gradient boosting +# regressor, a random forest regressor and a linear regression. Next, we will +# use the 3 regressors to build the voting regressor: -X, y = datasets.load_diabetes(return_X_y=True) +X, y = load_diabetes(return_X_y=True) # Train classifiers reg1 = GradientBoostingRegressor(random_state=1) @@ -58,8 +58,7 @@ # Making predictions # -------------------------------- # -# Now we will use each of the regressors to make 20 first predictions about the -# diabetes dataset. +# Now we will use each of the regressors to make the 20 first predictions. xt = X[:20] @@ -73,7 +72,7 @@ # -------------------------------- # # Finally, we will visualize the 20 predictions. The red stars show the average -# prediction +# prediction made by :class:`~ensemble.VotingRegressor`. plt.figure() plt.plot(pred1, 'gd', label='GradientBoostingRegressor') diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 5186cf0ba3bac..2ba7dc05d16b6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -5,122 +5,255 @@ Missing values can be replaced by the mean, the median or the most frequent value using the basic :class:`sklearn.impute.SimpleImputer`. -The median is a more robust estimator for data with high magnitude variables -which could dominate results (otherwise known as a 'long tail'). -With ``KNNImputer``, missing values can be imputed using the weighted -or unweighted mean of the desired number of nearest neighbors. +In this example we will investigate different imputation techniques: -Another option is the :class:`sklearn.impute.IterativeImputer`. This uses -round-robin linear regression, treating every variable as an output in -turn. The version implemented assumes Gaussian (output) variables. If your -features are obviously non-Normal, consider transforming them to look more -Normal so as to potentially improve performance. 
+- imputation by the constant value 0 +- imputation by the mean value of each feature combined with a missing-ness + indicator auxiliary variable +- k nearest neighbor imputation +- iterative imputation + +We will use two datasets: Diabetes dataset which consists of 10 feature +variables collected from diabetes patients with an aim to predict disease +progression and California Housing dataset for which the target is the median +house value for California districts. + +As neither of these datasets have missing values, we will remove some +values to create new versions with artificially missing data. The performance +of +:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset +is then compared the performance on the altered datasets with the artificially +missing values imputed using different techniques. -In addition of using an imputing method, we can also keep an indication of the -missing information using :func:`sklearn.impute.MissingIndicator` which might -carry some information. """ print(__doc__) +# Authors: Maria Telenczuk +# License: BSD 3 clause + +############################################################################### +# Download the data and make missing values sets +################################################ +# +# First we download the two datasets. Diabetes dataset is shipped with +# scikit-learn. It has 442 entries, each with 10 features. California Housing +# dataset is much larger with 20640 entries and 8 features. It needs to be +# downloaded. We will only use the first 400 entries for the sake of speeding +# up the calculations but feel free to use the whole dataset. +# + import numpy as np -import matplotlib.pyplot as plt -# To use the experimental IterativeImputer, we need to explicitly ask for it: -from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.datasets import fetch_california_housing from sklearn.datasets import load_diabetes -from sklearn.datasets import load_boston + + +rng = np.random.RandomState(42) + +X_diabetes, y_diabetes = load_diabetes(return_X_y=True) +X_california, y_california = fetch_california_housing(return_X_y=True) +X_california = X_california[:400] +y_california = y_california[:400] + + +def add_missing_values(X_full, y_full): + n_samples, n_features = X_full.shape + + # Add missing values in 75% of the lines + missing_rate = 0.75 + n_missing_samples = int(n_samples * missing_rate) + + missing_samples = np.zeros(n_samples, dtype=np.bool) + missing_samples[: n_missing_samples] = True + + rng.shuffle(missing_samples) + missing_features = rng.randint(0, n_features, n_missing_samples) + X_missing = X_full.copy() + X_missing[missing_samples, missing_features] = np.nan + y_missing = y_full.copy() + + return X_missing, y_missing + + +X_miss_california, y_miss_california = add_missing_values( + X_california, y_california) + +X_miss_diabetes, y_miss_diabetes = add_missing_values( + X_diabetes, y_diabetes) + + +############################################################################### +# Impute the missing data and score +# ################################# +# Now we will write a function which will score the results on the differently +# imputed data. 
Let's look at each imputer separately: +# + +rng = np.random.RandomState(0) + from sklearn.ensemble import RandomForestRegressor -from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import ( - SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator) + +# To use the experimental IterativeImputer, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer from sklearn.model_selection import cross_val_score +from sklearn.pipeline import make_pipeline -rng = np.random.RandomState(0) N_SPLITS = 5 -REGRESSOR = RandomForestRegressor(random_state=0) +regressor = RandomForestRegressor(random_state=0) + +############################################################################### +# Missing information +# ------------------- +# In addition to imputing the missing values, the imputers have an +# `add_indicator` parameter that marks the values that were missing, which +# might carry some information. +# def get_scores_for_imputer(imputer, X_missing, y_missing): - estimator = make_pipeline( - make_union(imputer, MissingIndicator(missing_values=0)), - REGRESSOR) + estimator = make_pipeline(imputer, regressor) impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS) return impute_scores -def get_results(dataset): - X_full, y_full = dataset.data, dataset.target - n_samples = X_full.shape[0] - n_features = X_full.shape[1] +x_labels = ['Full data', + 'Zero imputation', + 'Mean Imputation', + 'KNN Imputation', + 'Iterative Imputation'] + +mses_california = np.zeros(5) +stds_california = np.zeros(5) +mses_diabetes = np.zeros(5) +stds_diabetes = np.zeros(5) + +############################################################################### +# Estimate the score +# ------------------ +# First, we want to estimate the score on the original data: +# - # Estimate the score on the entire dataset, with no missing values - full_scores = cross_val_score(REGRESSOR, X_full, y_full, + +def get_full_score(X_full, y_full): + full_scores = cross_val_score(regressor, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) + return full_scores.mean(), full_scores.std() - # Add missing values in 75% of the lines - missing_rate = 0.75 - n_missing_samples = int(np.floor(n_samples * missing_rate)) - missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, - dtype=np.bool), - np.ones(n_missing_samples, - dtype=np.bool))) - rng.shuffle(missing_samples) - missing_features = rng.randint(0, n_features, n_missing_samples) - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - # Estimate the score after replacing missing values by 0 - imputer = SimpleImputer(missing_values=0, - strategy='constant', - fill_value=0) +mses_california[0], stds_california[0] = get_full_score(X_california, + y_california) +mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) + + +############################################################################### +# Replace missing values by 0 +# --------------------------- +# +# Now we will estimate the score on the data where the missing values are +# replaced by 0: +# + + +def get_impute_zero_score(X_missing, y_missing): + + imputer = SimpleImputer(missing_values=np.nan, add_indicator=True, + strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + 
return zero_impute_scores.mean(), zero_impute_scores.std() - # Estimate the score after imputation (mean strategy) of the missing values - imputer = SimpleImputer(missing_values=0, strategy="mean") - mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - # Estimate the score after kNN-imputation of the missing values - imputer = KNNImputer(missing_values=0) +mses_california[1], stds_california[1] = get_impute_zero_score( + X_miss_california, y_miss_california) +mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# kNN-imputation of the missing values +# ------------------------------------ +# +# :class:`sklearn.impute.KNNImputer` imputes missing values using the weighted +# or unweighted mean of the desired number of nearest neighbors. + +def get_impute_knn_score(X_missing, y_missing): + imputer = KNNImputer(missing_values=np.nan, add_indicator=True) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return knn_impute_scores.mean(), knn_impute_scores.std() - # Estimate the score after iterative imputation of the missing values - imputer = IterativeImputer(missing_values=0, - random_state=0, - n_nearest_features=5, + +mses_california[2], stds_california[2] = get_impute_knn_score( + X_miss_california, y_miss_california) +mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# Impute missing values with mean +# ------------------------------- +# + +def get_impute_mean(X_missing, y_missing): + imputer = SimpleImputer(missing_values=np.nan, strategy="mean", + add_indicator=True) + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return mean_impute_scores.mean(), mean_impute_scores.std() + + +mses_california[3], stds_california[3] = get_impute_mean(X_miss_california, + y_miss_california) +mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# Iterative imputation of the missing values +# ------------------------------------------ +# +# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +# round-robin linear regression, modeling each feature with missing values as a +# function of other features, in turn. +# The version implemented assumes Gaussian (output) variables. If your features +# are obviously non-normal, consider transforming them to look more normal +# to potentially improve performance. 
+# + +def get_impute_iterative(X_missing, y_missing): + imputer = IterativeImputer(missing_values=np.nan, add_indicator=True, + random_state=0, n_nearest_features=5, sample_posterior=True) iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return iterative_impute_scores.mean(), iterative_impute_scores.std() - return ((full_scores.mean(), full_scores.std()), - (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std()), - (knn_impute_scores.mean(), knn_impute_scores.std()), - (iterative_impute_scores.mean(), iterative_impute_scores.std())) +mses_california[4], stds_california[4] = get_impute_iterative( + X_miss_california, y_miss_california) +mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, + y_miss_diabetes) -results_diabetes = np.array(get_results(load_diabetes())) -mses_diabetes = results_diabetes[:, 0] * -1 -stds_diabetes = results_diabetes[:, 1] +mses_diabetes = mses_diabetes * -1 +mses_california = mses_california * -1 + +############################################################################### +# Plot the results +# ################ +# +# Finally we are going to visualize the score: +# + +import matplotlib.pyplot as plt -results_boston = np.array(get_results(load_boston())) -mses_boston = results_boston[:, 0] * -1 -stds_boston = results_boston[:, 1] n_bars = len(mses_diabetes) xval = np.arange(n_bars) -x_labels = ['Full data', - 'Zero imputation', - 'Mean Imputation', - 'KNN Imputation', - 'Iterative Imputation'] colors = ['r', 'g', 'b', 'orange', 'black'] # plot diabetes results @@ -138,16 +271,20 @@ def get_results(dataset): ax1.invert_yaxis() ax1.set_yticklabels(x_labels) -# plot boston results +# plot california dataset results ax2 = plt.subplot(122) for j in xval: - ax2.barh(j, mses_boston[j], xerr=stds_boston[j], + ax2.barh(j, mses_california[j], xerr=stds_california[j], color=colors[j], alpha=0.6, align='center') -ax2.set_title('Imputation Techniques with Boston Data') +ax2.set_title('Imputation Techniques with California Data') ax2.set_yticks(xval) ax2.set_xlabel('MSE') ax2.invert_yaxis() ax2.set_yticklabels([''] * n_bars) plt.show() + +# You can also try different techniques. For instance, the median is a more +# robust estimator for data with high magnitude variables which could dominate +# results (otherwise known as a 'long tail'). 
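For readers who want to try the median strategy suggested in the closing note above, here is a minimal, self-contained sketch. It is illustrative only: the toy data, variable names and settings below are invented for this sketch and are not part of the patch.

# Standalone sketch: mean vs. median imputation on a toy long-tailed dataset,
# scored with cross-validated MSE as in the example above.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
X = rng.lognormal(size=(300, 5))        # long-tailed, high-magnitude features
y = X[:, 0] + rng.normal(size=300)
X[rng.rand(*X.shape) < 0.3] = np.nan    # drop ~30% of the entries at random

for strategy in ("mean", "median"):
    pipe = make_pipeline(
        SimpleImputer(strategy=strategy, add_indicator=True),
        RandomForestRegressor(random_state=0))
    scores = cross_val_score(pipe, X, y,
                             scoring="neg_mean_squared_error", cv=5)
    print(strategy, -scores.mean())

On such long-tailed features the median typically gives a more robust fill value than the mean, though the cross-validated scores should be checked rather than assumed.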
diff --git a/examples/linear_model/plot_bayesian_ridge_curvefit.py b/examples/linear_model/plot_bayesian_ridge_curvefit.py old mode 100755 new mode 100644 diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 88e83d434a3c6..73fc94fb94600 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -80,14 +80,12 @@ def plot_ic_criterion(model, name, color): - alpha_ = model.alpha_ + EPSILON - alphas_ = model.alphas_ + EPSILON criterion_ = model.criterion_ - plt.plot(-np.log10(alphas_), criterion_, '--', color=color, - linewidth=3, label='%s criterion' % name) - plt.axvline(-np.log10(alpha_), color=color, linewidth=3, + plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color, + linewidth=3, label='%s criterion' % name) + plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3, label='alpha: %s estimate' % name) - plt.xlabel('-log(alpha)') + plt.xlabel(r'$\alpha$') plt.ylabel('criterion') @@ -108,19 +106,17 @@ def plot_ic_criterion(model, name, color): t_lasso_cv = time.time() - t1 # Display results -m_log_alphas = -np.log10(model.alphas_ + EPSILON) - plt.figure() ymin, ymax = 2300, 3800 -plt.plot(m_log_alphas, model.mse_path_, ':') -plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', +plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':') +plt.plot(model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) -plt.axvline(-np.log10(model.alpha_ + EPSILON), linestyle='--', color='k', +plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k', label='alpha: CV estimate') plt.legend() -plt.xlabel('-log(alpha)') +plt.xlabel(r'$\alpha$') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv) @@ -137,17 +133,15 @@ def plot_ic_criterion(model, name, color): t_lasso_lars_cv = time.time() - t1 # Display results -m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON) - plt.figure() -plt.plot(m_log_alphas, model.mse_path_, ':') -plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', - label='Average across the folds', linewidth=2) -plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', +plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':') +plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', + label='Average across the folds', linewidth=2) +plt.axvline(model.alpha_, linestyle='--', color='k', label='alpha CV') plt.legend() -plt.xlabel('-log(alpha)') +plt.xlabel(r'$\alpha$') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: Lars (train time: %.2fs)' % t_lasso_lars_cv) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4b0386edfcdf6..59c07580b81ba 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,87 +3,87 @@ Poisson regression and non-normal loss ====================================== -This example illustrates the use of log-linear Poisson regression -on the `French Motor Third-Party Liability Claims dataset -`_ from [1]_ and compares -it with models learned with least squared error. In this dataset, each sample -corresponds to an insurance policy, i.e. a contract within an insurance -company and an individual (policiholder). 
Available features include driver -age, vehicle age, vehicle power, etc. - -A few definitions: a *claim* is the request made by a policyholder to the -insurer to compensate for a loss covered by the insurance. The *exposure* is -the duration of the insurance coverage of a given policy, in years. - -Our goal is to predict the expected number of insurance claims (or frequency) -following car accidents for a policyholder given the historical data over a -population of policyholders. +This example illustrates the use of log-linear Poisson regression on the +`French Motor Third-Party Liability Claims dataset +`_ from [1]_ and compares it with a linear +model fitted with the usual least squared error and a non-linear GBRT model +fitted with the Poisson loss (and a log-link). + +A few definitions: + +- A **policy** is a contract between an insurance company and an individual: + the **policyholder**, that is, the vehicle driver in this case. + +- A **claim** is the request made by a policyholder to the insurer to + compensate for a loss covered by the insurance. + +- The **exposure** is the duration of the insurance coverage of a given policy, + in years. + +- The claim **frequency** is the number of claims divided by the exposure, + typically measured in number of claims per year. + +In this dataset, each sample corresponds to an insurance policy. Available +features include driver age, vehicle age, vehicle power, etc. + +Our goal is to predict the expected frequency of claims following car accidents +for a new policyholder given the historical data over a population of +policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor - Third-Party Liability Claims (November 8, 2018). - `doi:10.2139/ssrn.3164764 `_ + Third-Party Liability Claims (November 8, 2018). `doi:10.2139/ssrn.3164764 + `_ """ print(__doc__) - # Authors: Christian Lorentzen # Roman Yurchak +# Olivier Grisel # License: BSD 3 clause -import warnings - import numpy as np import matplotlib.pyplot as plt import pandas as pd -from sklearn.datasets import fetch_openml -from sklearn.dummy import DummyRegressor -from sklearn.compose import ColumnTransformer -from sklearn.linear_model import Ridge, PoissonRegressor -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import FunctionTransformer, OneHotEncoder -from sklearn.preprocessing import OrdinalEncoder -from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.ensemble import RandomForestRegressor -from sklearn.utils import gen_even_slices -from sklearn.metrics import auc - -from sklearn.metrics import mean_squared_error, mean_absolute_error -from sklearn.metrics import mean_poisson_deviance - -def load_mtpl2(n_samples=100000): - """Fetch the French Motor Third-Party Liability Claims dataset. - - Parameters - ---------- - n_samples: int or None, default=100000 - Number of samples to select (for faster run time). If None, the full - dataset with 678013 samples is returned. 
- """ +############################################################################## +# The French Motor Third-Party Liability Claims dataset +# ----------------------------------------------------- +# +# Let's load the motor claim dataset from OpenML: +# https://www.openml.org/d/41214 - # freMTPL2freq dataset from https://www.openml.org/d/41214 - df = fetch_openml(data_id=41214, as_frame=True)['data'] +from sklearn.datasets import fetch_openml - # unquote string fields - for column_name in df.columns[df.dtypes.values == np.object]: - df[column_name] = df[column_name].str.strip("'") - if n_samples is not None: - return df.iloc[:n_samples] - return df +df = fetch_openml(data_id=41214, as_frame=True).frame +df ############################################################################## -# Let's load the motor claim dataset. We ignore the severity data for this -# study for the sake of simplicitly. +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval (``Exposure``, +# in units of years). # -# We also subsample the data for the sake of computational cost and running -# time. Using the full dataset would lead to similar conclusions. +# Here we want to model the frequency ``y = ClaimNb / Exposure`` conditionally +# on ``X`` via a (scaled) Poisson distribution, and use ``Exposure`` as +# ``sample_weight``. + +df["Frequency"] = df["ClaimNb"] / df["Exposure"] -df = load_mtpl2(n_samples=300000) +print("Average Frequency = {}" + .format(np.average(df["Frequency"], weights=df["Exposure"]))) -# Correct for unreasonable observations (that might be data error) -df["Exposure"] = df["Exposure"].clip(upper=1) +print("Fraction of exposure with zero claims = {0:.1%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) + +fig, (ax0, ax1, ax2) = plt.subplots(ncols=3, figsize=(16, 4)) +ax0.set_title("Number of claims") +_ = df["ClaimNb"].hist(bins=30, log=True, ax=ax0) +ax1.set_title("Exposure in years") +_ = df["Exposure"].hist(bins=30, log=True, ax=ax1) +ax2.set_title("Frequency (number of claims per year)") +_ = df["Frequency"].hist(bins=30, log=True, ax=ax2) ############################################################################## # The remaining columns can be used to predict the frequency of claim events. @@ -93,6 +93,12 @@ def load_mtpl2(n_samples=100000): # In order to fit linear models with those predictors it is therefore # necessary to perform standard feature transformations as follows: +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.compose import ColumnTransformer + + log_scale_transformer = make_pipeline( FunctionTransformer(np.log, validate=False), StandardScaler() @@ -112,123 +118,153 @@ def load_mtpl2(n_samples=100000): remainder="drop", ) -############################################################################## -# The number of claims (``ClaimNb``) is a positive integer that can be modeled -# as a Poisson distribution. It is then assumed to be the number of discrete -# events occurring with a constant rate in a given time interval -# (``Exposure``, in units of years). Here we model the frequency -# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, -# and use ``Exposure`` as ``sample_weight``. 
- -df["Frequency"] = df["ClaimNb"] / df["Exposure"] - -print( - pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() -) - -print("Average Frequency = {}" - .format(np.average(df["Frequency"], weights=df["Exposure"]))) - -print("Percentage of zero claims = {0:%}" - .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / - df["Exposure"].sum())) ############################################################################## -# It is worth noting that 92 % of policyholders have zero claims, and if we -# were to convert this problem into a binary classification task, it would be -# significantly imbalanced. +# A constant prediction baseline +# ------------------------------ +# +# It is worth noting that more than 93% of policyholders have zero claims. If +# we were to convert this problem into a binary classification task, it would +# be significantly imbalanced, and even a simplistic model that would only +# predict mean can achieve an accuracy of 93%. # # To evaluate the pertinence of the used metrics, we will consider as a # baseline a "dummy" estimator that constantly predicts the mean frequency of # the training sample. -df_train, df_test = train_test_split(df, random_state=0) +from sklearn.dummy import DummyRegressor +from sklearn.pipeline import Pipeline +from sklearn.model_selection import train_test_split -dummy = make_pipeline( - linear_model_preprocessor, - DummyRegressor(strategy='mean') -) -dummy.fit(df_train, df_train["Frequency"], - dummyregressor__sample_weight=df_train["Exposure"]) +df_train, df_test = train_test_split(df, test_size=0.33, random_state=0) + +dummy = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", DummyRegressor(strategy='mean')), +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) + + +############################################################################## +# Let's compute the performance of this constant prediction baseline with 3 +# different regression metrics: + +from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_poisson_deviance def score_estimator(estimator, df_test): """Score an estimator on the test set.""" - y_pred = estimator.predict(df_test) print("MSE: %.3f" % mean_squared_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) print("MAE: %.3f" % mean_absolute_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) - # ignore non-positive predictions, as they are invalid for - # the Poisson deviance + # Ignore non-positive predictions, as they are invalid for + # the Poisson deviance. mask = y_pred > 0 if (~mask).any(): - warnings.warn("Estimator yields non-positive predictions for {} " - "samples out of {}. These will be ignored while " - "computing the Poisson deviance" - .format((~mask).sum(), mask.shape[0])) + n_masked, n_samples = (~mask).sum(), mask.shape[0] + print(f"WARNING: Estimator yields invalid, non-positive predictions " + f" for {n_masked} samples out of {n_samples}. 
These predictions " + f"are ignored when computing the Poisson deviance.") print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"][mask], y_pred[mask], - df_test["Exposure"][mask])) + sample_weight=df_test["Exposure"][mask])) print("Constant mean frequency evaluation:") score_estimator(dummy, df_test) ############################################################################## -# We start by modeling the target variable with the least squares linear -# regression model, +# (Generalized) linear models +# --------------------------- +# +# We start by modeling the target variable with the (l2 penalized) least +# squares linear regression model, more comonly known as Ridge regression. We +# use a low penalization `alpha`, as we expect such a linear model to under-fit +# on such a large dataset. + +from sklearn.linear_model import Ridge -ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) -ridge.fit(df_train, df_train["Frequency"], - ridge__sample_weight=df_train["Exposure"]) + +ridge_glm = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", Ridge(alpha=1e-6)), +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) ############################################################################## # The Poisson deviance cannot be computed on non-positive values predicted by -# the model. For models that do return a few non-positive predictions -# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, +# the model. For models that do return a few non-positive predictions (e.g. +# :class:`~sklearn.linear_model.Ridge`) we ignore the corresponding samples, # meaning that the obtained Poisson deviance is approximate. An alternative -# approach could be to use :class:`compose.TransformedTargetRegressor` +# approach could be to use :class:`~sklearn.compose.TransformedTargetRegressor` # meta-estimator to map ``y_pred`` to a strictly positive domain. print("Ridge evaluation:") -score_estimator(ridge, df_test) +score_estimator(ridge_glm, df_test) ############################################################################## # Next we fit the Poisson regressor on the target variable. We set the -# regularization strength ``alpha`` to 1 over number of samples in oder to -# mimic the Ridge regressor whose L2 penalty term scales differently with the -# number of samples. +# regularization strength ``alpha`` to approximately 1e-6 over number of +# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty +# term scales differently with the number of samples. +# +# Since the Poisson regressor internally models the log of the expected target +# value instead of the expected value directly (log vs identity link function), +# the relationship between X and y is not exactly linear anymore. Therefore the +# Poisson regressor is called a Generalized Linear Model (GLM) rather than a +# vanilla linear model as is the case for Ridge regression. 
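To make the log-link remark above concrete, the following small self-contained check (an illustrative sketch with made-up toy data, not taken from the patch) verifies that `PoissonRegressor` predictions are the exponential of a linear function of the features:

# Sketch: PoissonRegressor models log(E[y|X]) linearly, so its predictions
# equal exp(X @ coef_ + intercept_).
import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
y = rng.poisson(lam=np.exp(X[:, 0]))   # counts with a log-linear rate

glm = PoissonRegressor(alpha=1e-4).fit(X, y)
print(np.allclose(glm.predict(X), np.exp(X @ glm.coef_ + glm.intercept_)))
# expected: True -- the model is linear on the log scale, not on the original scale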
-poisson = make_pipeline( - linear_model_preprocessor, - PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) -) -poisson.fit(df_train, df_train["Frequency"], - poissonregressor__sample_weight=df_train["Exposure"]) +from sklearn.linear_model import PoissonRegressor + +n_samples = df_train.shape[0] + +poisson_glm = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)) +]) +poisson_glm.fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) print("PoissonRegressor evaluation:") -score_estimator(poisson, df_test) +score_estimator(poisson_glm, df_test) ############################################################################## -# Finally, we will consider a non-linear model, namely a random forest. Random -# forests do not require the categorical data to be one-hot encoded: instead, -# we can encode each category label with an arbitrary integer using -# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will -# treat the categorical features as ordered features, which might not be always -# a desired behavior. However this effect is limited for deep enough trees -# which are able to recover the categorical nature of the features. The main -# advantage of the :class:`preprocessing.OrdinalEncoder` over the -# :class:`preprocessing.OneHotEncoder` is that it will make training faster. - -rf_preprocessor = ColumnTransformer( +# Gradient Boosting Regression Trees for Poisson regression +# --------------------------------------------------------- +# +# Finally, we will consider a non-linear model, namely Gradient Boosting +# Regression Trees. Tree-based models do not require the categorical data to be +# one-hot encoded: instead, we can encode each category label with an arbitrary +# integer using :class:`~sklearn.preprocessing.OrdinalEncoder`. With this +# encoding, the trees will treat the categorical features as ordered features, +# which might not be always a desired behavior. However this effect is limited +# for deep enough trees which are able to recover the categorical nature of the +# features. The main advantage of the +# :class:`~sklearn.preprocessing.OrdinalEncoder` over the +# :class:`~sklearn.preprocessing.OneHotEncoder` is that it will make training +# faster. +# +# Gradient Boosting also gives the possibility to fit the trees with a Poisson +# loss (with an implicit log-link function) instead of the default +# least-squares loss. Here we only fit trees with the Poisson loss to keep this +# example concise. 
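As a side illustration of the encoding trade-off described above, this small sketch (with toy category values invented here, not part of the patch) contrasts the two encoders: `OrdinalEncoder` yields a single integer-coded column, while `OneHotEncoder` yields one indicator column per category:

# Sketch: the same categorical column encoded two ways.
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

brands = np.array([["B1"], ["B12"], ["B3"], ["B1"]])   # toy vehicle brands

print(OrdinalEncoder().fit_transform(brands).ravel())
# one column of arbitrary integer codes, e.g. [0. 1. 2. 0.]

print(OneHotEncoder().fit_transform(brands).toarray())
# a 4 x 3 indicator matrix, one column per distinct brand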
+ +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.preprocessing import OrdinalEncoder + + +tree_preprocessor = ColumnTransformer( [ ("categorical", OrdinalEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), @@ -237,23 +273,22 @@ def score_estimator(estimator, df_test): ], remainder="drop", ) -rf = make_pipeline( - rf_preprocessor, - RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) -) -rf.fit(df_train, df_train["Frequency"].values, - randomforestregressor__sample_weight=df_train["Exposure"].values) +poisson_gbrt = Pipeline([ + ("preprocessor", tree_preprocessor), + ("regressor", HistGradientBoostingRegressor(loss="poisson", + max_leaf_nodes=128)), +]) +poisson_gbrt.fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) - -print("RandomForestRegressor evaluation:") -score_estimator(rf, df_test) +print("Poisson Gradient Boosted Trees evaluation:") +score_estimator(poisson_gbrt, df_test) ############################################################################## -# Like the Ridge regression above, the random forest model minimizes the -# conditional squared error, too. However, because of a higher predictive -# power, it also results in a smaller Poisson deviance than the Poisson -# regression model. +# Like the Poisson GLM above, the gradient boosted trees model minimizes +# the Poisson deviance. However, because of a higher predictive power, +# it reaches lower values of Poisson deviance. # # Evaluating models with a single train / test split is prone to random # fluctuations. If computing resources allow, it should be verified that @@ -263,7 +298,7 @@ def score_estimator(estimator, df_test): # comparing the histogram of observed target values with that of predicted # values: -fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) +fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 for row_idx, label, df in zip(range(2), @@ -278,7 +313,7 @@ def score_estimator(estimator, df_test): axes[row_idx, 0].set_ylim([1e1, 5e5]) axes[row_idx, 0].set_ylabel(label + " samples") - for idx, model in enumerate([ridge, poisson, rf]): + for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]): y_pred = model.predict(df) pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), @@ -292,21 +327,42 @@ def score_estimator(estimator, df_test): ############################################################################## # The experimental data presents a long tail distribution for ``y``. In all -# models we predict a mean expected value, so we will have necessarily fewer -# extreme values. Additionally, the normal distribution used in ``Ridge`` and -# ``RandomForestRegressor`` has a constant variance, while for the Poisson -# distribution used in ``PoissonRegressor``, the variance is proportional to -# the mean predicted value. +# models, we predict the expected frequency of a random variable, so we will +# have necessarily fewer extreme values than for the observed realizations of +# that random variable. This explains that the mode of the histograms of model +# predictions doesn't necessarily correspond to the smallest value. 
+# Additionally, the normal distribution used in ``Ridge`` has a constant +# variance, while for the Poisson distribution used in ``PoissonRegressor`` and +# ``HistGradientBoostingRegressor``, the variance is proportional to the +# predicted expected value. +# +# Thus, among the considered estimators, ``PoissonRegressor`` and +# ``HistGradientBoostingRegressor`` are a-priori better suited for modeling the +# long tail distribution of the non-negative data as compared to the ``Ridge`` +# model which makes a wrong assumption on the distribution of the target +# variable. # -# Thus, among the considered estimators, ``PoissonRegressor`` is better suited -# for modeling the long tail distribution of the data as compared to the -# ``Ridge`` and ``RandomForestRegressor`` estimators. +# The ``HistGradientBoostingRegressor`` estimator has the most flexibility and +# is able to predict higher expected values. +# +# Note that we could have used the least squares loss for the +# ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal +# distributed response variable as does the `Ridge` model, and possibly +# also lead to slightly negative predictions. However the gradient boosted +# trees would still perform relatively well and in particular better than +# ``PoissonRegressor`` thanks to the flexibility of the trees combined with the +# large number of training samples. +# +# Evaluation of the calibration of predictions +# -------------------------------------------- # # To ensure that estimators yield reasonable predictions for different # policyholder types, we can bin test samples according to ``y_pred`` returned # by each model. Then for each bin, we compare the mean predicted ``y_pred``, # with the mean observed target: +from sklearn.utils import gen_even_slices + def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): @@ -352,104 +408,169 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) +print(f"Actual number of claims: {df_test['ClaimNb'].sum()}") +fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax, [ridge, poisson, rf]): +for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, + dummy]): y_pred = model.predict(df_test) - + y_true = df_test["Frequency"].values + exposure = df_test["Exposure"].values q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( - df_test["Frequency"].values, - y_pred, - sample_weight=df_test["Exposure"].values, - n_bins=10) + y_true, y_pred, sample_weight=exposure, n_bins=10) - axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") - axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") + # Name of the model after the estimator used in the last step of the + # pipeline. 
+    print(f"Predicted number of claims by {model[-1]}: "
+          f"{np.sum(y_pred * exposure):.1f}")
+
+    axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions")
+    axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations")
     axi.set_xlim(0, 1.0)
-    axi.set_ylim(0, 0.6)
+    axi.set_ylim(0, 0.5)
     axi.set(
-        title=model[-1].__class__.__name__,
+        title=model[-1],
         xlabel='Fraction of samples sorted by y_pred',
         ylabel='Mean Frequency (y_pred)'
     )
     axi.legend()
 plt.tight_layout()
 
-##############################################################################
-# The ``Ridge`` regression model can predict very low expected frequencies
-# that do not match the data. It can therefore severly under-estimate the risk
-# for some policyholders.
-#
-# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency
-# between predicted and observed targets, especially for low predicted target
-# values.
-#
-# However, for some business applications, we are not necessarily interested
-# in the ability of the model to predict the expected frequency value, but
-# instead to predict which policyholder groups are the riskiest and which are
-# the safest. In this case, the model evaluation would cast the problem as a
-# ranking problem rather than a regression problem.
-#
-# To compare the 3 models within this perspective, one can plot the fraction of
-# the number of claims vs the fraction of exposure for test samples ordered by
-# the model predictions, from safest to riskiest according to each model:
-
-
-def _cumulated_claims(y_true, y_pred, exposure):
-    idx_sort = np.argsort(y_pred)  # from safest to riskiest
-    sorted_exposure = exposure[idx_sort]
-    sorted_frequencies = y_true[idx_sort]
-    cumulated_exposure = np.cumsum(sorted_exposure)
-    cumulated_exposure /= cumulated_exposure[-1]
-    cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)
+###############################################################################
+# The dummy regression model predicts a constant frequency. This model does not
+# attribute the same tied rank to all samples but is nonetheless globally
+# well calibrated (to estimate the mean frequency of the entire population).
+#
+# The ``Ridge`` regression model can predict very low expected frequencies that
+# do not match the data. It can therefore severely under-estimate the risk for
+# some policyholders.
+#
+# ``PoissonRegressor`` and ``HistGradientBoostingRegressor`` show better
+# consistency between predicted and observed targets, especially for low
+# predicted target values.
+#
+# The sum of all predictions also confirms the calibration issue of the
+# ``Ridge`` model: it under-estimates the total number of claims in the test
+# set by more than 3%, while the other three models can approximately recover
+# the total number of claims of the test portfolio.
+#
+# Evaluation of the ranking power
+# -------------------------------
+#
+# For some business applications, we are interested in the ability of the model
+# to rank the riskiest from the safest policyholders, irrespective of the
+# absolute value of the prediction. In this case, the model evaluation would
+# cast the problem as a ranking problem rather than a regression problem.
+#
+# To compare the 3 models from this perspective, one can plot the cumulative
+# proportion of claims vs the cumulative proportion of exposure for the test
+# samples ordered by the model predictions, from safest to riskiest according
+# to each model.
+# +# This plot is called a Lorenz curve and can be summarized by the Gini index: + +from sklearn.metrics import auc + + +def lorenz_curve(y_true, y_pred, exposure): + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + exposure = np.asarray(exposure) + + # order samples by increasing predicted risk: + ranking = np.argsort(y_pred) + ranked_exposure = exposure[ranking] + ranked_frequencies = y_true[ranking] + ranked_exposure = exposure[ranking] + cumulated_claims = np.cumsum(ranked_frequencies * ranked_exposure) cumulated_claims /= cumulated_claims[-1] + cumulated_exposure = np.cumsum(ranked_exposure) + cumulated_exposure /= cumulated_exposure[-1] return cumulated_exposure, cumulated_claims fig, ax = plt.subplots(figsize=(8, 8)) -for model in [ridge, poisson, rf]: +for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]: y_pred = model.predict(df_test) - cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - y_pred, - df_test["Exposure"].values) - area = auc(cum_exposure, cum_claims) - label = "{} (area under curve: {:.3f})".format( - model[-1].__class__.__name__, area) + cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred, + df_test["Exposure"]) + gini = 1 - 2 * auc(cum_exposure, cum_claims) + label = "{} (Gini: {:.2f})".format(model[-1], gini) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - df_test["Frequency"].values, - df_test["Exposure"].values) -area = auc(cum_exposure, cum_claims) -label = "Oracle (area under curve: {:.3f})".format(area) +cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], + df_test["Frequency"], + df_test["Exposure"]) +gini = 1 - 2 * auc(cum_exposure, cum_claims) +label = "Oracle (Gini: {:.2f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") ax.set( - title="Cumulated number of claims by model", - xlabel='Fraction of exposure (from safest to riskiest)', - ylabel='Fraction of number of claims' + title="Lorenz curves by model", + xlabel='Cumulative proportion of exposure (from safest to riskiest)', + ylabel='Cumulative proportion of claims' ) ax.legend(loc="upper left") ############################################################################## -# This plot reveals that the random forest model is slightly better at ranking -# policyholders by risk profiles even if the absolute value of the predicted -# expected frequencies are less well calibrated than for the linear Poisson -# model. +# As expected, the dummy regressor is unable to correctly rank the samples and +# therefore performs the worst on this plot. +# +# The tree-based model is significantly better at ranking policyholders by risk +# while the two linear models perform similarly. # # All three models are significantly better than chance but also very far from # making perfect predictions. # # This last point is expected due to the nature of the problem: the occurrence # of accidents is mostly dominated by circumstantial causes that are not -# captured in the columns of the dataset or that are indeed random. +# captured in the columns of the dataset and can indeed be considered as purely +# random. +# +# The linear models assume no interactions between the input variables which +# likely causes under-fitting. 
Inserting a polynomial feature extractor +# (:func:`~sklearn.preprocessing.PolynomialFeatures`) indeed increases their +# discrimative power by 2 points of Gini index. In particular it improves the +# ability of the models to identify the top 5% riskiest profiles. +# +# Main takeaways +# -------------- +# +# - The performance of the models can be evaluated by their ability to yield +# well-calibrated predictions and a good ranking. +# +# - The calibration of the model can be assessed by plotting the mean observed +# value vs the mean predicted value on groups of test samples binned by +# predicted risk. +# +# - The least squares loss (along with the implicit use of the identity link +# function) of the Ridge regression model seems to cause this model to be +# badly calibrated. In particular, it tends to underestimate the risk and can +# even predict invalid negative frequencies. +# +# - Using the Poisson loss with a log-link can correct these problems and lead +# to a well-calibrated linear model. +# +# - The Gini index reflects the ability of a model to rank predictions +# irrespective of their absolute values, and therefore only assess their +# ranking power. +# +# - Despite the improvement in calibration, the ranking power of both linear +# models are comparable and well below the ranking power of the Gradient +# Boosting Regression Trees. +# +# - The Poisson deviance computed as an evaluation metric reflects both the +# calibration and the ranking power of the model. It also makes a linear +# assumption on the ideal relationship between the expected value and the +# variance of the response variable. For the sake of conciseness we did not +# check whether this assumption holds. +# +# - Traditional regression metrics such as Mean Squared Error and Mean Absolute +# Error are hard to meaningfully interpret on count values with many zeros. plt.show() diff --git a/examples/miscellaneous/README.txt b/examples/miscellaneous/README.txt new file mode 100644 index 0000000000000..4e44ceee95809 --- /dev/null +++ b/examples/miscellaneous/README.txt @@ -0,0 +1,7 @@ +.. _miscellaneous_examples: + +Miscellaneous +------------- + +Miscellaneous and introductory examples for scikit-learn. + diff --git a/examples/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py similarity index 100% rename from examples/plot_anomaly_comparison.py rename to examples/miscellaneous/plot_anomaly_comparison.py diff --git a/examples/plot_changed_only_pprint_parameter.py b/examples/miscellaneous/plot_changed_only_pprint_parameter.py similarity index 100% rename from examples/plot_changed_only_pprint_parameter.py rename to examples/miscellaneous/plot_changed_only_pprint_parameter.py diff --git a/examples/miscellaneous/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py new file mode 100644 index 0000000000000..32ea3ef2d1120 --- /dev/null +++ b/examples/miscellaneous/plot_display_object_visualization.py @@ -0,0 +1,92 @@ +""" +=================================== +Visualizations with Display Objects +=================================== + +.. currentmodule:: sklearn.metrics + +In this example, we will construct display objects, +:class:`ConfusionMatrixDisplay`, :class:`RocCurveDisplay`, and +:class:`PrecisionRecallDisplay` directly from their respective metrics. This +is an alternative to using their corresponding plot functions when +a model's predictions are already computed or expensive to compute. 
Note that
+this is advanced usage, and in general we recommend using their respective
+plot functions.
+"""
+print(__doc__)
+
+##############################################################################
+# Load Data and train model
+# -------------------------
+# For this example, we load a blood transfusion service center data set from
+# `OpenML <https://www.openml.org/d/1464>`_. This is a binary classification
+# problem where the target is whether an individual donated blood. Then the
+# data is split into a train and test dataset and a logistic regression is
+# fitted with the train dataset.
+from sklearn.datasets import fetch_openml
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import make_pipeline
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+X, y = fetch_openml(data_id=1464, return_X_y=True)
+X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
+
+clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
+clf.fit(X_train, y_train)
+
+##############################################################################
+# Create :class:`ConfusionMatrixDisplay`
+##############################################################################
+# With the fitted model, we compute the predictions of the model on the test
+# dataset. These predictions are used to compute the confusion matrix which
+# is plotted with the :class:`ConfusionMatrixDisplay`.
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import ConfusionMatrixDisplay
+
+y_pred = clf.predict(X_test)
+cm = confusion_matrix(y_test, y_pred)
+
+cm_display = ConfusionMatrixDisplay(cm).plot()
+
+
+##############################################################################
+# Create :class:`RocCurveDisplay`
+##############################################################################
+# The roc curve requires either the probabilities or the non-thresholded
+# decision values from the estimator. Since the logistic regression provides
+# a decision function, we will use it to plot the roc curve:
+from sklearn.metrics import roc_curve
+from sklearn.metrics import RocCurveDisplay
+y_score = clf.decision_function(X_test)
+
+fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1])
+roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
+
+##############################################################################
+# Create :class:`PrecisionRecallDisplay`
+##############################################################################
+# Similarly, the precision recall curve can be plotted using `y_score` from
+# the previous section.
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import PrecisionRecallDisplay
+
+prec, recall, _ = precision_recall_curve(y_test, y_score,
+                                         pos_label=clf.classes_[1])
+pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot()
+
+##############################################################################
+# Combining the display objects into a single plot
+##############################################################################
+# The display objects store the computed values that were passed as arguments.
+# This allows for the visualizations to be easily combined using matplotlib's
+# API. In the following example, we place the displays next to each other in a
+# row.
+ +# sphinx_gallery_thumbnail_number = 4 +import matplotlib.pyplot as plt +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) + +roc_display.plot(ax=ax1) +pr_display.plot(ax=ax2) +plt.show() diff --git a/examples/plot_isotonic_regression.py b/examples/miscellaneous/plot_isotonic_regression.py similarity index 100% rename from examples/plot_isotonic_regression.py rename to examples/miscellaneous/plot_isotonic_regression.py diff --git a/examples/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py similarity index 99% rename from examples/plot_johnson_lindenstrauss_bound.py rename to examples/miscellaneous/plot_johnson_lindenstrauss_bound.py index b981c14fbf132..9d369c6c6d46d 100644 --- a/examples/plot_johnson_lindenstrauss_bound.py +++ b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py @@ -8,7 +8,8 @@ dataset can be randomly projected into a lower dimensional Euclidean space while controlling the distortion in the pairwise distances. -.. _`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma +.. _`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/\ + Johnson%E2%80%93Lindenstrauss_lemma """ print(__doc__) diff --git a/examples/plot_kernel_approximation.py b/examples/miscellaneous/plot_kernel_approximation.py similarity index 100% rename from examples/plot_kernel_approximation.py rename to examples/miscellaneous/plot_kernel_approximation.py diff --git a/examples/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py similarity index 100% rename from examples/plot_kernel_ridge_regression.py rename to examples/miscellaneous/plot_kernel_ridge_regression.py diff --git a/examples/plot_multilabel.py b/examples/miscellaneous/plot_multilabel.py similarity index 100% rename from examples/plot_multilabel.py rename to examples/miscellaneous/plot_multilabel.py diff --git a/examples/plot_multioutput_face_completion.py b/examples/miscellaneous/plot_multioutput_face_completion.py similarity index 100% rename from examples/plot_multioutput_face_completion.py rename to examples/miscellaneous/plot_multioutput_face_completion.py diff --git a/examples/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py similarity index 97% rename from examples/plot_partial_dependence_visualization_api.py rename to examples/miscellaneous/plot_partial_dependence_visualization_api.py index 8ccb225afc2d0..cbfa2c5e8ab64 100644 --- a/examples/plot_partial_dependence_visualization_api.py +++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py @@ -10,9 +10,9 @@ .. note:: - See also :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` + See also :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` -""" +""" # noqa print(__doc__) import pandas as pd @@ -98,7 +98,6 @@ # which will plot the partial dependence curves of each model on the same axes. # The length of the axes list must be equal to the number of plots drawn. 
-# Sets this image as the thumbnail for sphinx gallery # sphinx_gallery_thumbnail_number = 4 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6)) tree_disp.plot(ax=[ax1, ax2], line_kw={"label": "Decision Tree"}) diff --git a/examples/plot_roc_curve_visualization_api.py b/examples/miscellaneous/plot_roc_curve_visualization_api.py similarity index 100% rename from examples/plot_roc_curve_visualization_api.py rename to examples/miscellaneous/plot_roc_curve_visualization_api.py diff --git a/examples/model_selection/plot_learning_curve.py b/examples/model_selection/plot_learning_curve.py index c3ee182ad3061..7be082de5e005 100644 --- a/examples/model_selection/plot_learning_curve.py +++ b/examples/model_selection/plot_learning_curve.py @@ -57,6 +57,7 @@ def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: + - None, to use the default 5-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter`, diff --git a/examples/model_selection/plot_roc_crossval.py b/examples/model_selection/plot_roc_crossval.py index e468357da9dec..24633f10b93b3 100644 --- a/examples/model_selection/plot_roc_crossval.py +++ b/examples/model_selection/plot_roc_crossval.py @@ -32,7 +32,6 @@ print(__doc__) import numpy as np -from scipy import interp import matplotlib.pyplot as plt from sklearn import svm, datasets @@ -72,7 +71,7 @@ viz = plot_roc_curve(classifier, X[test], y[test], name='ROC fold {}'.format(i), alpha=0.3, lw=1, ax=ax) - interp_tpr = interp(mean_fpr, viz.fpr, viz.tpr) + interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr) interp_tpr[0] = 0.0 tprs.append(interp_tpr) aucs.append(viz.roc_auc) diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py index 57314a218f6ee..33f421a226c33 100644 --- a/examples/neural_networks/plot_mnist_filters.py +++ b/examples/neural_networks/plot_mnist_filters.py @@ -18,10 +18,16 @@ To make the example run faster, we use very few hidden units, and train only for a very short time. Training longer would result in weights with a much -smoother spatial appearance. +smoother spatial appearance. The example will throw a warning because it +doesn't converge, in this case this is what we want because of CI's time +constraints. 
""" + +import warnings + import matplotlib.pyplot as plt from sklearn.datasets import fetch_openml +from sklearn.exceptions import ConvergenceWarning from sklearn.neural_network import MLPClassifier print(__doc__) @@ -38,7 +44,13 @@ solver='sgd', verbose=10, random_state=1, learning_rate_init=.1) -mlp.fit(X_train, y_train) +# this example won't converge because of CI's time constraints, so we catch the +# warning and are ignore it here +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning, + module="sklearn") + mlp.fit(X_train, y_train) + print("Training set score: %f" % mlp.score(X_train, y_train)) print("Test set score: %f" % mlp.score(X_test, y_test)) diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py new file mode 100644 index 0000000000000..5ad3fb35bd708 --- /dev/null +++ b/examples/release_highlights/plot_release_highlights_0_23_0.py @@ -0,0 +1,168 @@ +# flake8: noqa +""" +======================================== +Release Highlights for scikit-learn 0.23 +======================================== + +.. currentmodule:: sklearn + +We are pleased to announce the release of scikit-learn 0.23! Many bug fixes +and improvements were added, as well as some new key features. We detail +below a few of the major features of this release. **For an exhaustive list of +all the changes**, please refer to the :ref:`release notes `. + +To install the latest version (with pip):: + + pip install --upgrade scikit-learn + +or with conda:: + + conda install scikit-learn +""" + +############################################################################## +# Generalized Linear Models, and Poisson loss for gradient boosting +# ----------------------------------------------------------------- +# Long-awaited Generalized Linear Models with non-normal loss functions are now +# available. In particular, three new regressors were implemented: +# :class:`~sklearn.linear_model.PoissonRegressor`, +# :class:`~sklearn.linear_model.GammaRegressor`, and +# :class:`~sklearn.linear_model.TweedieRegressor`. The Poisson regressor can be +# used to model positive integer counts, or relative frequencies. Read more in +# the :ref:`User Guide `. Additionally, +# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` supports a new +# 'poisson' loss as well. + +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.linear_model import PoissonRegressor +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor + +n_samples, n_features = 1000, 20 +rng = np.random.RandomState(0) +X = rng.randn(n_samples, n_features) +# positive integer target correlated with X[:, 5] with many zeros: +y = rng.poisson(lam=np.exp(X[:, 5]) / 2) +X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) +glm = PoissonRegressor() +gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01) +glm.fit(X_train, y_train) +gbdt.fit(X_train, y_train) +print(glm.score(X_test, y_test)) +print(gbdt.score(X_test, y_test)) + +############################################################################## +# Rich visual representation of estimators +# ----------------------------------------- +# Estimators can now be visualized in notebooks by enabling the +# `display='diagram'` option. 
This is particularly useful to summarise the
+# structure of pipelines and other composite estimators, with interactivity to
+# provide detail. Click on the example image below to expand Pipeline
+# elements. See :ref:`visualizing_composite_estimators` for how you can use
+# this feature.
+
+from sklearn import set_config
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.impute import SimpleImputer
+from sklearn.compose import make_column_transformer
+from sklearn.linear_model import LogisticRegression
+set_config(display='diagram')
+
+num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
+
+cat_proc = make_pipeline(
+    SimpleImputer(strategy='constant', fill_value='missing'),
+    OneHotEncoder(handle_unknown='ignore'))
+
+preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
+                                       (cat_proc, ('feat0', 'feat2')))
+
+clf = make_pipeline(preprocessor, LogisticRegression())
+clf
+
+##############################################################################
+# Scalability and stability improvements to KMeans
+# ------------------------------------------------
+# The :class:`~sklearn.cluster.KMeans` estimator was entirely re-worked, and it
+# is now significantly faster and more stable. In addition, the Elkan algorithm
+# is now compatible with sparse matrices. The estimator uses OpenMP based
+# parallelism instead of relying on joblib, so the `n_jobs` parameter has no
+# effect anymore. For more details on how to control the number of threads,
+# please refer to our :ref:`parallelism` notes.
+import scipy
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+from sklearn.metrics import completeness_score
+
+rng = np.random.RandomState(0)
+X, y = make_blobs(random_state=rng)
+X = scipy.sparse.csr_matrix(X)
+X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
+kmeans = KMeans(algorithm='elkan').fit(X_train)
+print(completeness_score(kmeans.predict(X_test), y_test))
+
+##############################################################################
+# Improvements to the histogram-based Gradient Boosting estimators
+# ----------------------------------------------------------------
+# Various improvements were made to
+# :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
+# :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. On top of the
+# Poisson loss mentioned above, these estimators now support :ref:`sample
+# weights `. Also, an automatic early-stopping criterion was added:
+# early-stopping is enabled by default when the number of samples exceeds 10k.
+# Finally, users can now define :ref:`monotonic constraints
+# ` to constrain the predictions based on the variations of
+# specific features. In the following example, we construct a target that is
+# generally positively correlated with the first feature, with some noise.
+# Applying monotonic constraints allows the prediction to capture the global
+# effect of the first feature, instead of fitting the noise.
+import numpy as np +from matplotlib import pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.inspection import plot_partial_dependence +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor + +n_samples = 500 +rng = np.random.RandomState(0) +X = rng.randn(n_samples, 2) +noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) +y = (5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise) + +gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y) +gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y) + +disp = plot_partial_dependence( + gbdt_no_cst, X, features=[0], feature_names=['feature 0'], + line_kw={'linewidth': 4, 'label': 'unconstrained'}) +plot_partial_dependence(gbdt_cst, X, features=[0], + line_kw={'linewidth': 4, 'label': 'constrained'}, ax=disp.axes_) +disp.axes_[0, 0].plot(X[:, 0], y, 'o', alpha=.5, zorder=-1, label='samples') +disp.axes_[0, 0].set_ylim(-3, 3); disp.axes_[0, 0].set_xlim(-1, 1) +plt.legend() +plt.show() + +############################################################################## +# Sample-weight support for Lasso and ElasticNet +# ---------------------------------------------- +# The two linear regressors :class:`~sklearn.linear_model.Lasso` and +# :class:`~sklearn.linear_model.ElasticNet` now support sample weights. + +from sklearn.model_selection import train_test_split +from sklearn.datasets import make_regression +from sklearn.linear_model import Lasso +import numpy as np + +n_samples, n_features = 1000, 20 +rng = np.random.RandomState(0) +X, y = make_regression(n_samples, n_features, random_state=rng) +sample_weight = rng.rand(n_samples) +X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split( + X, y, sample_weight, random_state=rng) +reg = Lasso() +reg.fit(X_train, y_train, sample_weight=sw_train) +print(reg.score(X_test, y_test, sw_test)) diff --git a/maint_tools/whats_missing.sh b/maint_tools/whats_missing.sh index 54ce06f8bbcf5..5b2d6b8fd8a01 100755 --- a/maint_tools/whats_missing.sh +++ b/maint_tools/whats_missing.sh @@ -19,7 +19,7 @@ logged_prs() { mentioned_issues() { cat doc/whats_new/v$to_file.rst | - grep -o 'issue:`[0-9]\+`' | + grep -o 'issue:`[0-9]\+`\|pr:`[0-9]\+`' | grep -o '[0-9]\+' } diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 7f203a079f22b..870d0d9a93f0d 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -19,8 +19,6 @@ from ._config import get_config, set_config, config_context logger = logging.getLogger(__name__) -logger.addHandler(logging.StreamHandler()) -logger.setLevel(logging.INFO) # PEP0440 compatible formatted version, see: @@ -39,7 +37,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
# 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.23.dev0' +__version__ = '0.24.dev0' # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded diff --git a/sklearn/_config.py b/sklearn/_config.py index c7f3934ee1cb3..f183203e13228 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -6,7 +6,8 @@ _global_config = { 'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)), 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)), - 'print_changed_only': False, + 'print_changed_only': True, + 'display': 'text', } @@ -27,7 +28,7 @@ def get_config(): def set_config(assume_finite=None, working_memory=None, - print_changed_only=None): + print_changed_only=None, display=None): """Set global scikit-learn configuration .. versionadded:: 0.19 @@ -59,6 +60,13 @@ def set_config(assume_finite=None, working_memory=None, .. versionadded:: 0.21 + display : {'text', 'diagram'}, optional + If 'diagram', estimators will be displayed as a diagram in a Jupyter + lab or notebook context. If 'text', estimators will be displayed as + text. Default is 'text'. + + .. versionadded:: 0.23 + See Also -------- config_context: Context manager for global scikit-learn configuration @@ -70,6 +78,8 @@ def set_config(assume_finite=None, working_memory=None, _global_config['working_memory'] = working_memory if print_changed_only is not None: _global_config['print_changed_only'] = print_changed_only + if display is not None: + _global_config['display'] = display @contextmanager @@ -93,9 +103,19 @@ def config_context(**new_config): print_changed_only : bool, optional If True, only the parameters that were set to non-default values will be printed when printing an estimator. For example, - ``print(SVC())`` while True will only print 'SVC()' while the default - behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with - all the non-changed parameters. + ``print(SVC())`` when True will only print 'SVC()', but would print + 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters + when False. Default is True. + + .. versionchanged:: 0.23 + Default changed from False to True. + + display : {'text', 'diagram'}, optional + If 'diagram', estimators will be displayed as a diagram in a Jupyter + lab or notebook context. If 'text', estimators will be displayed as + text. Default is 'text'. + + .. versionadded:: 0.23 Notes ----- diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx index c7486097df854..75c4bbef11379 100644 --- a/sklearn/_isotonic.pyx +++ b/sklearn/_isotonic.pyx @@ -11,6 +11,8 @@ cimport numpy as np cimport cython from cython cimport floating +np.import_array() + def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w): cdef: diff --git a/sklearn/base.py b/sklearn/base.py index 70dec8c030418..77c3223ed75e1 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -17,9 +17,12 @@ import numpy as np from .
import __version__ +from ._config import get_config from .utils import _IS_32BIT from .utils.validation import check_X_y from .utils.validation import check_array +from .utils._estimator_html_repr import estimator_html_repr +from .utils.validation import _deprecate_positional_args _DEFAULT_TAGS = { 'non_deterministic': False, @@ -33,13 +36,16 @@ 'stateless': False, 'multilabel': False, '_skip_test': False, - '_xfail_test': False, + '_xfail_checks': False, 'multioutput_only': False, 'binary_only': False, - 'requires_fit': True} + 'requires_fit': True, + 'requires_y': False, + } -def clone(estimator, safe=True): +@_deprecate_positional_args +def clone(estimator, *, safe=True): """Constructs a new estimator with the same parameters. Clone does a deep copy of the model in an estimator @@ -374,7 +380,8 @@ def _check_n_features(self, X, reset): self.n_features_in_) ) - def _validate_data(self, X, y=None, reset=True, **check_params): + def _validate_data(self, X, y=None, reset=True, + validate_separately=False, **check_params): """Validate input data and set or check the `n_features_in_` attribute. Parameters @@ -389,9 +396,14 @@ def _validate_data(self, X, y=None, reset=True, **check_params): Whether to reset the `n_features_in_` attribute. If False, the input will be checked for consistency with data provided when reset was last True. + validate_separately : False or tuple of dicts, default=False + Only used if y is not None. + If False, call check_X_y(). Else, it must be a tuple of kwargs + to be used for calling check_array() on X and y respectively. **check_params : kwargs Parameters passed to :func:`sklearn.utils.check_array` or - :func:`sklearn.utils.check_X_y`. + :func:`sklearn.utils.check_X_y`. Ignored if validate_separately + is not False. Returns ------- @@ -400,10 +412,24 @@ def _validate_data(self, X, y=None, reset=True, **check_params): """ if y is None: + if self._get_tags()['requires_y']: + raise ValueError( + f"This {self.__class__.__name__} estimator " + f"requires y to be passed, but the target y is None." + ) X = check_array(X, **check_params) out = X else: - X, y = check_X_y(X, y, **check_params) + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling check_array() + # on X and y isn't equivalent to just calling check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = check_array(X, **check_X_params) + y = check_array(y, **check_y_params) + else: + X, y = check_X_y(X, y, **check_params) out = X, y if check_params.get('ensure_2d', True): @@ -411,6 +437,34 @@ def _validate_data(self, X, y=None, reset=True, **check_params): return out + @property + def _repr_html_(self): + """HTML representation of estimator. + + This is redundant with the logic of `_repr_mimebundle_`. The latter + should be favored in the long term, `_repr_html_` is only + implemented for consumers who do not interpret `_repr_mimebundle_`. + """ + if get_config()["display"] != 'diagram': + raise AttributeError("_repr_html_ is only defined when the " + "'display' configuration option is set to " + "'diagram'") + return self._repr_html_inner + + def _repr_html_inner(self): + """This function is returned by the @property `_repr_html_` to make + `hasattr(estimator, "_repr_html_")` return `True` or `False` depending + on `get_config()["display"]`.
+ """ + return estimator_html_repr(self) + + def _repr_mimebundle_(self, **kwargs): + """Mime bundle used by jupyter kernels to display estimator""" + output = {"text/plain": repr(self)} + if get_config()["display"] == 'diagram': + output["text/html"] = estimator_html_repr(self) + return output + class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" @@ -444,6 +498,9 @@ def score(self, X, y, sample_weight=None): from .metrics import accuracy_score return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + def _more_tags(self): + return {'requires_y': True} + class RegressorMixin: """Mixin class for all regression estimators in scikit-learn.""" @@ -494,6 +551,9 @@ def score(self, X, y, sample_weight=None): y_pred = self.predict(X) return r2_score(y, y_pred, sample_weight=sample_weight) + def _more_tags(self): + return {'requires_y': True} + class ClusterMixin: """Mixin class for all cluster estimators in scikit-learn.""" diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 8a719d49bd6de..31df362ddb009 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -329,7 +329,7 @@ def fit(self, X, y, sample_weight=None): self.label_encoder_.fit(self.classes) self.classes_ = self.label_encoder_.classes_ - Y = label_binarize(y, self.classes_) + Y = label_binarize(y, classes=self.classes_) df, idx_pos_class = self._preproc(X) self.calibrators_ = [] @@ -574,7 +574,7 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5, if len(labels) > 2: raise ValueError("Only binary classification is supported. " "Provided labels %s." % labels) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = label_binarize(y_true, classes=labels)[:, 0] if strategy == 'quantile': # Determine bin edges by distribution of data quantiles = np.linspace(0, 1, n_bins + 1) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 9516c8e4bdd05..f670c621a94ee 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -10,7 +10,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin -from ..utils import as_float_array, check_array +from ..utils import as_float_array, check_array, check_random_state from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..metrics import euclidean_distances from ..metrics import pairwise_distances_argmin @@ -32,7 +32,7 @@ def all_equal_similarities(): def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, - return_n_iter=False): + return_n_iter=False, random_state='warn'): """Perform Affinity Propagation Clustering of data Read more in the :ref:`User Guide `. @@ -72,6 +72,14 @@ def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, return_n_iter : bool, default False Whether or not to return the number of iterations. + random_state : int or np.random.RandomState instance, default: 0 + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0.
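A minimal sketch, not taken from the diff, of the `random_state` parameter documented above: passing an explicit value to `affinity_propagation` avoids the FutureWarning introduced further down in this file and keeps results reproducible across calls. The toy similarity matrix is invented for illustration.

import numpy as np
from sklearn.cluster import affinity_propagation
from sklearn.metrics import euclidean_distances

X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]], dtype=float)
# affinity_propagation expects a precomputed similarity matrix.
S = -euclidean_distances(X, squared=True)
cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
print(labels)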
+ Returns ------- @@ -133,7 +141,16 @@ def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, if return_n_iter else (np.array([0]), np.array([0] * n_samples))) - random_state = np.random.RandomState(0) + if random_state == 'warn': + warnings.warn(("'random_state' has been introduced in 0.23. " + "It will be set to None starting from 0.25 which " + "means that results will differ at every function " + "call. Set 'random_state' to None to silence this " + "warning, or to 0 to keep the behavior of versions " + "<0.23."), + FutureWarning) + random_state = 0 + random_state = check_random_state(random_state) # Place preference on the diagonal of S S.flat[::(n_samples + 1)] = preference @@ -274,6 +291,13 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): verbose : bool, default=False Whether to be verbose. + random_state : int or np.random.RandomState instance, default: 0 + Pseudo-random number generator to control the starting state. + Use an int for reproducible results across function calls. + See the :term:`Glossary `. + + .. versionadded:: 0.23 + this parameter was previously hardcoded as 0. Attributes ---------- @@ -292,23 +316,6 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): n_iter_ : int Number of iterations taken to converge. - Examples - -------- - >>> from sklearn.cluster import AffinityPropagation - >>> import numpy as np - >>> X = np.array([[1, 2], [1, 4], [1, 0], - ... [4, 2], [4, 4], [4, 0]]) - >>> clustering = AffinityPropagation().fit(X) - >>> clustering - AffinityPropagation() - >>> clustering.labels_ - array([0, 0, 0, 1, 1, 1]) - >>> clustering.predict([[0, 0], [4, 4]]) - array([0, 1]) - >>> clustering.cluster_centers_ - array([[1, 2], - [4, 2]]) - Notes ----- For an example, see :ref:`examples/cluster/plot_affinity_propagation.py @@ -333,11 +340,28 @@ class AffinityPropagation(ClusterMixin, BaseEstimator): Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages Between Data Points", Science Feb. 2007 + + Examples + -------- + >>> from sklearn.cluster import AffinityPropagation + >>> import numpy as np + >>> X = np.array([[1, 2], [1, 4], [1, 0], + ...
[4, 2], [4, 4], [4, 0]]) + >>> clustering = AffinityPropagation(random_state=5).fit(X) + >>> clustering + AffinityPropagation(random_state=5) + >>> clustering.labels_ + array([0, 0, 0, 1, 1, 1]) + >>> clustering.predict([[0, 0], [4, 4]]) + array([0, 1]) + >>> clustering.cluster_centers_ + array([[1, 2], + [4, 2]]) """ @_deprecate_positional_args def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', - verbose=False): + verbose=False, random_state='warn'): self.damping = damping self.max_iter = max_iter @@ -346,6 +370,7 @@ def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15, self.verbose = verbose self.preference = preference self.affinity = affinity + self.random_state = random_state @property def _pairwise(self): @@ -388,7 +413,8 @@ def fit(self, X, y=None): affinity_propagation( self.affinity_matrix_, self.preference, max_iter=self.max_iter, convergence_iter=self.convergence_iter, damping=self.damping, - copy=self.copy, verbose=self.verbose, return_n_iter=True) + copy=self.copy, verbose=self.verbose, return_n_iter=True, + random_state=self.random_state) if self.affinity != "precomputed": self.cluster_centers_ = X[self.cluster_centers_indices_].copy() diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 182ae4b481116..92246141d6fe8 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -736,6 +736,9 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): - single uses the minimum of the distances between all observations of the two sets. + .. versionadded:: 0.20 + Added the 'single' option + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and diff --git a/sklearn/cluster/_dbscan_inner.pyx b/sklearn/cluster/_dbscan_inner.pyx index a348bf59d6717..b9a80686a76f8 100644 --- a/sklearn/cluster/_dbscan_inner.pyx +++ b/sklearn/cluster/_dbscan_inner.pyx @@ -9,6 +9,8 @@ from libcpp.vector cimport vector cimport numpy as np import numpy as np +np.import_array() + # Work around Cython bug: C++ exceptions are not caught unless thrown within # a cdef function with an "except +" declaration. diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index 65c8871fbb456..70c4abb0c4ac7 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -322,6 +322,9 @@ def _elkan_iter_chunked_dense( for k in range(n_features): centers_new[j, k] += centers_new_chunk[j * n_features + k] + free(centers_new_chunk) + free(weight_in_clusters_chunk) + if update_centers: _relocate_empty_clusters_dense(X, sample_weight, centers_old, centers_new, weight_in_clusters, labels) @@ -553,6 +556,9 @@ def _elkan_iter_chunked_sparse( for k in range(n_features): centers_new[j, k] += centers_new_chunk[j * n_features + k] + free(centers_new_chunk) + free(weight_in_clusters_chunk) + if update_centers: _relocate_empty_clusters_sparse( X_data, X_indices, X_indptr, sample_weight, diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b185983c4b0f9..8d24ed497aef3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -834,6 +834,9 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): For now "auto" (kept for backward compatibiliy) chooses "elkan" but it might change in the future for a better heuristic. + .. 
versionchanged:: 0.18 + Added Elkan algorithm + Attributes ---------- cluster_centers_ : ndarray of shape (n_clusters, n_features) @@ -946,6 +949,8 @@ def fit(self, X, y=None, sample_weight=None): The weights for each observation in X. If None, all observations are assigned equal weight. + .. versionadded:: 0.20 + Returns ------- self @@ -1587,6 +1592,8 @@ def fit(self, X, y=None, sample_weight=None): The weights for each observation in X. If None, all observations are assigned equal weight (default: None). + .. versionadded:: 0.20 + Returns ------- self diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index c8ca3ec569a88..7f54a318d3d49 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -529,7 +529,7 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_, # in the dict params _params['p'] = p dists = pairwise_distances(P, np.take(X, unproc, axis=0), - metric, n_jobs=None, + metric=metric, n_jobs=None, **_params).ravel() rdists = np.maximum(dists, core_distances_[point_index]) diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index 6484d36d443d1..826878061291b 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -33,16 +33,18 @@ def test_affinity_propagation(): preference = np.median(S) * 10 # Compute Affinity Propagation cluster_centers_indices, labels = affinity_propagation( - S, preference=preference) + S, preference=preference, random_state=39) n_clusters_ = len(cluster_centers_indices) assert n_clusters == n_clusters_ - af = AffinityPropagation(preference=preference, affinity="precomputed") + af = AffinityPropagation(preference=preference, affinity="precomputed", + random_state=28) labels_precomputed = af.fit(S).labels_ - af = AffinityPropagation(preference=preference, verbose=True) + af = AffinityPropagation(preference=preference, verbose=True, + random_state=37) labels = af.fit(X).labels_ assert_array_equal(labels, labels_precomputed) @@ -55,7 +57,7 @@ def test_affinity_propagation(): # Test also with no copy _, labels_no_copy = affinity_propagation(S, preference=preference, - copy=False) + copy=False, random_state=74) assert_array_equal(labels, labels_no_copy) # Test input validation @@ -63,16 +65,16 @@ def test_affinity_propagation(): affinity_propagation(S[:, :-1]) with pytest.raises(ValueError): affinity_propagation(S, damping=0) - af = AffinityPropagation(affinity="unknown") + af = AffinityPropagation(affinity="unknown", random_state=78) with pytest.raises(ValueError): af.fit(X) - af_2 = AffinityPropagation(affinity='precomputed') + af_2 = AffinityPropagation(affinity='precomputed', random_state=21) with pytest.raises(TypeError): af_2.fit(csr_matrix((3, 3))) def test_affinity_propagation_predict(): # Test AffinityPropagation.predict - af = AffinityPropagation(affinity="euclidean") + af = AffinityPropagation(affinity="euclidean", random_state=63) labels = af.fit_predict(X) labels2 = af.predict(X) assert_array_equal(labels, labels2) @@ -87,7 +89,7 @@ def test_affinity_propagation_predict_error(): # Predict not supported when affinity="precomputed". 
S = np.dot(X, X.T) - af = AffinityPropagation(affinity="precomputed") + af = AffinityPropagation(affinity="precomputed", random_state=57) af.fit(S) with pytest.raises(ValueError): af.predict(X) @@ -100,7 +102,7 @@ def test_affinity_propagation_fit_non_convergence(): X = np.array([[0, 0], [1, 1], [-2, -2]]) # Force non-convergence by allowing only a single iteration - af = AffinityPropagation(preference=-10, max_iter=1) + af = AffinityPropagation(preference=-10, max_iter=1, random_state=82) assert_warns(ConvergenceWarning, af.fit, X) assert_array_equal(np.empty((0, 2)), af.cluster_centers_) @@ -129,7 +131,7 @@ def test_affinity_propagation_equal_mutual_similarities(): # setting different preferences cluster_center_indices, labels = assert_no_warnings( - affinity_propagation, S, preference=[-20, -10]) + affinity_propagation, S, preference=[-20, -10], random_state=37) # expect one cluster, with highest-preference sample as exemplar assert_array_equal([1], cluster_center_indices) @@ -143,7 +145,8 @@ def test_affinity_propagation_predict_non_convergence(): # Force non-convergence by allowing only a single iteration af = assert_warns(ConvergenceWarning, - AffinityPropagation(preference=-10, max_iter=1).fit, X) + AffinityPropagation(preference=-10, + max_iter=1, random_state=75).fit, X) # At prediction time, consider new samples as noise since there are no # clusters @@ -156,7 +159,8 @@ def test_affinity_propagation_non_convergence_regressiontest(): X = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0], [0, 0, 1, 0, 0, 1]]) - af = AffinityPropagation(affinity='euclidean', max_iter=2).fit(X) + af = AffinityPropagation(affinity='euclidean', + max_iter=2, random_state=34).fit(X) assert_array_equal(np.array([-1, -1, -1]), af.labels_) @@ -181,6 +185,38 @@ def test_equal_similarities_and_preferences(): assert _equal_similarities_and_preferences(S, np.array(0)) +def test_affinity_propagation_random_state(): + # Significance of random_state parameter + # Generate sample data + centers = [[1, 1], [-1, -1], [1, -1]] + X, labels_true = make_blobs(n_samples=300, centers=centers, + cluster_std=0.5, random_state=0) + # random_state = 0 + ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=0) + ap.fit(X) + centers0 = ap.cluster_centers_ + + # random_state = 76 + ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=76) + ap.fit(X) + centers76 = ap.cluster_centers_ + + assert np.mean((centers0 - centers76) ** 2) > 1 + + +# FIXME: to be removed in 0.25 +def test_affinity_propagation_random_state_warning(): + # test that a warning is raised when random_state is not defined. + X = np.array([[0, 0], [1, 1], [-2, -2]]) + match = ("'random_state' has been introduced in 0.23. " + "It will be set to None starting from 0.25 which " + "means that results will differ at every function " + "call. 
Set 'random_state' to None to silence this " + "warning, or to 0 to keep the behavior of versions " + "<0.23.") + with pytest.warns(FutureWarning, match=match): + AffinityPropagation().fit(X) + @pytest.mark.parametrize('centers', [csr_matrix(np.zeros((1, 10))), np.zeros((1, 10))]) def test_affinity_propagation_convergence_warning_dense_sparse(centers): @@ -188,7 +224,7 @@ def test_affinity_propagation_convergence_warning_dense_sparse(centers): rng = np.random.RandomState(42) X = rng.rand(40, 10) y = (4 * rng.rand(40)).astype(np.int) - ap = AffinityPropagation() + ap = AffinityPropagation(random_state=46) ap.fit(X, y) ap.cluster_centers_ = centers with pytest.warns(None) as record: diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 903c63a00fd22..f148633021a97 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -15,6 +15,7 @@ from joblib import Parallel, delayed from ..base import clone, TransformerMixin +from ..utils._estimator_html_repr import _VisualBlock from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer from ..utils import Bunch @@ -315,19 +316,18 @@ def _validate_remainder(self, X): self.remainder) # Make it possible to check for reordered named columns on transform - if (hasattr(X, 'columns') and - any(_determine_key_type(cols) == 'str' - for cols in self._columns)): + self._has_str_cols = any(_determine_key_type(cols) == 'str' + for cols in self._columns) + if hasattr(X, 'columns'): self._df_columns = X.columns self._n_features = X.shape[1] cols = [] for columns in self._columns: cols.extend(_get_column_indices(X, columns)) - remaining_idx = list(set(range(self._n_features)) - set(cols)) - remaining_idx = sorted(remaining_idx) or None - self._remainder = ('remainder', self.remainder, remaining_idx) + remaining_idx = sorted(set(range(self._n_features)) - set(cols)) + self._remainder = ('remainder', self.remainder, remaining_idx or None) @property def named_transformers_(self): @@ -356,11 +356,18 @@ def get_feature_names(self): if trans == 'drop' or ( hasattr(column, '__len__') and not len(column)): continue - elif trans == 'passthrough': - raise NotImplementedError( - "get_feature_names is not yet supported when using " - "a 'passthrough' transformer.") - elif not hasattr(trans, 'get_feature_names'): + if trans == 'passthrough': + if hasattr(self, '_df_columns'): + if ((not isinstance(column, slice)) + and all(isinstance(col, str) for col in column)): + feature_names.extend(column) + else: + feature_names.extend(self._df_columns[column]) + else: + indices = np.arange(self._n_features) + feature_names.extend(['x%d' % i for i in indices[column]]) + continue + if not hasattr(trans, 'get_feature_names'): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." % (str(name), type(trans).__name__)) @@ -582,6 +589,7 @@ def transform(self, X): # name order and count. See #14237 for details. 
if (self._remainder[2] is not None and hasattr(self, '_df_columns') and + self._has_str_cols and hasattr(X, 'columns')): n_cols_fit = len(self._df_columns) n_cols_transform = len(X.columns) @@ -630,6 +638,11 @@ def _hstack(self, Xs): Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs] return np.hstack(Xs) + def _sk_visual_block_(self): + names, transformers, name_details = zip(*self.transformers) + return _VisualBlock('parallel', transformers, + names=names, name_details=name_details) + def _check_X(X): """Use check_array only on lists and other non-array-likes / sparse""" diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index d8c062ed423a2..1d6695a808d81 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -42,6 +42,8 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- regressor : object, default=None diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ca1c185c91e06..a9f1764eb97e4 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -668,25 +668,88 @@ def test_column_transformer_get_feature_names(): ct.fit(X) assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1__c'] - # passthrough transformers not supported + # drop transformer + ct = ColumnTransformer( + [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) + ct.fit(X) + assert ct.get_feature_names() == ['col0__a', 'col0__b'] + + # passthrough transformer ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) ct.fit(X) - assert_raise_message( - NotImplementedError, 'get_feature_names is not yet supported', - ct.get_feature_names) + assert ct.get_feature_names() == ['x0', 'x1'] ct = ColumnTransformer([('trans', DictVectorizer(), 0)], remainder='passthrough') ct.fit(X) - assert_raise_message( - NotImplementedError, 'get_feature_names is not yet supported', - ct.get_feature_names) + assert ct.get_feature_names() == ['trans__a', 'trans__b', 'x1'] - # drop transformer - ct = ColumnTransformer( - [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) + ct = ColumnTransformer([('trans', 'passthrough', [1])], + remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b'] + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', lambda x: [1])], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + +def test_column_transformer_get_feature_names_dataframe(): + # passthough transformer with a dataframe + pd = pytest.importorskip('pandas') + X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], + [{'c': 5}, {'c': 6}]], dtype=object).T + X_df = pd.DataFrame(X, columns=['col0', 'col1']) + + ct = ColumnTransformer([('trans', 'passthrough', ['col0', 'col1'])]) + ct.fit(X_df) + assert ct.get_feature_names() == ['col0', 'col1'] + + ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) + ct.fit(X_df) + assert ct.get_feature_names() == ['col0', 'col1'] + + ct = ColumnTransformer([('col0', DictVectorizer(), 0)], + 
remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1'] + + ct = ColumnTransformer([('trans', 'passthrough', ['col1'])], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', + lambda x: x[['col1']].columns)], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', [1])], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] def test_column_transformer_special_strings(): diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 801611943f350..354c0f8998968 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -67,6 +67,8 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + .. versionadded:: 0.20 + raw_location_ : ndarray of shape (n_features,) The raw robust estimated location before correction and re-weighting. diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index c83dbc89697e1..684a57fdeb296 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -79,6 +79,7 @@ def empirical_covariance(X, assume_centered=False): [0.25, 0.25, 0.25]]) """ X = np.asarray(X) + if X.ndim == 1: X = np.reshape(X, (1, -1)) diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 1d0d93db75101..35a398741bc15 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -84,6 +84,9 @@ def graphical_lasso(emp_cov, alpha, cov_init=None, mode='cd', tol=1e-4, Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + graph_lasso has been renamed to graphical_lasso + Parameters ---------- emp_cov : ndarray of shape (n_features, n_features) @@ -283,6 +286,9 @@ class GraphicalLasso(EmpiricalCovariance): Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + GraphLasso has been renamed to GraphicalLasso + Parameters ---------- alpha : float, default=0.01 @@ -509,6 +515,9 @@ class GraphicalLassoCV(GraphicalLasso): Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + GraphLassoCV has been renamed to GraphicalLassoCV + Parameters ---------- alphas : int or array-like of shape (n_alphas,), dtype=float, default=4 @@ -563,6 +572,9 @@ class GraphicalLassoCV(GraphicalLasso): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + verbose : bool, default=False If verbose is True, the objective function and duality gap are printed at each iteration. 
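The `ColumnTransformer.get_feature_names` change above means 'passthrough' columns no longer raise `NotImplementedError`. A small sketch, not taken from the diff, with the expected output inferred from the new tests:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = np.array([['a', 1.0], ['b', 2.0], ['a', 3.0]], dtype=object)
ct = ColumnTransformer([('cat', OneHotEncoder(), [0])],
                       remainder='passthrough')
ct.fit(X)
# The passthrough remainder column is reported with a generated name.
print(ct.get_feature_names())  # e.g. ['cat__x0_a', 'cat__x0_b', 'x1']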
diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index fcc13a84e803e..ec6748aaf721a 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -82,12 +82,12 @@ class ShrunkCovariance(EmpiricalCovariance): Attributes ---------- - location_ : ndarray of shape (n_features,) - Estimated location, i.e. the estimated mean. - covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) @@ -344,12 +344,12 @@ class LedoitWolf(EmpiricalCovariance): Attributes ---------- - location_ : ndarray of shape (n_features,) - Estimated location, i.e. the estimated mean. - covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix. + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) @@ -526,6 +526,9 @@ class OAS(EmpiricalCovariance): covariance_ : ndarray of shape (n_features, n_features) Estimated covariance matrix. + location_ : ndarray of shape (n_features,) + Estimated location, i.e. the estimated mean. + precision_ : ndarray of shape (n_features, n_features) Estimated pseudo inverse matrix. (stored only if store_precision is True) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 508448c3ede39..b6912f81105a8 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -519,7 +519,8 @@ def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) def _more_tags(self): - return {'poor_score': True} + return {'poor_score': True, + 'requires_y': False} class PLSRegression(_PLS): diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index d481288133991..2402fc3a069dc 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -17,6 +17,7 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_pandas_support +from ..utils.validation import _deprecate_positional_args import numpy as np @@ -80,7 +81,8 @@ def _convert_data_dataframe(caller_name, data, target, return combined_df, X, y -def load_files(container_path, description=None, categories=None, +@_deprecate_positional_args +def load_files(container_path, *, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0): """Load text files with categories as subfolder names. @@ -267,7 +269,8 @@ def load_data(module_path, data_file_name): return data, target, target_names -def load_wine(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_wine(*, return_X_y=False, as_frame=False): """Load and return the wine dataset (classification). .. versionadded:: 0.18 @@ -381,7 +384,8 @@ def load_wine(return_X_y=False, as_frame=False): feature_names=feature_names) -def load_iris(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_iris(*, return_X_y=False, as_frame=False): """Load and return the iris dataset (classification). 
The iris dataset is a classic and very easy multi-class classification @@ -439,6 +443,8 @@ def load_iris(return_X_y=False, as_frame=False): filename: str The path to the location of the data. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -493,7 +499,8 @@ def load_iris(return_X_y=False, as_frame=False): filename=iris_csv_filename) -def load_breast_cancer(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_breast_cancer(*, return_X_y=False, as_frame=False): """Load and return the breast cancer wisconsin dataset (classification). The breast cancer dataset is a classic and very easy binary classification @@ -551,6 +558,8 @@ def load_breast_cancer(return_X_y=False, as_frame=False): filename: str The path to the location of the data. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -615,7 +624,8 @@ def load_breast_cancer(return_X_y=False, as_frame=False): filename=csv_filename) -def load_digits(n_class=10, return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_digits(*, n_class=10, return_X_y=False, as_frame=False): """Load and return the digits dataset (classification). Each datapoint is a 8x8 image of a digit. @@ -665,6 +675,9 @@ def load_digits(n_class=10, return_X_y=False, as_frame=False): The names of the dataset columns. target_names: list The names of target classes. + + .. versionadded:: 0.20 + frame: DataFrame of shape (1797, 65) Only present when `as_frame=True`. DataFrame with `data` and `target`. @@ -735,7 +748,8 @@ def load_digits(n_class=10, return_X_y=False, as_frame=False): DESCR=descr) -def load_diabetes(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_diabetes(*, return_X_y=False, as_frame=False): """Load and return the diabetes dataset (regression). ============== ================== @@ -827,7 +841,8 @@ def load_diabetes(return_X_y=False, as_frame=False): target_filename=target_filename) -def load_linnerud(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_linnerud(*, return_X_y=False, as_frame=False): """Load and return the physical excercise linnerud dataset. This dataset is suitable for multi-ouput regression tasks. @@ -885,6 +900,8 @@ def load_linnerud(return_X_y=False, as_frame=False): target_filename: str The path to the location of the target. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -928,7 +945,8 @@ def load_linnerud(return_X_y=False, as_frame=False): target_filename=target_filename) -def load_boston(return_X_y=False): +@_deprecate_positional_args +def load_boston(*, return_X_y=False): """Load and return the boston house-prices dataset (regression). ============== ============== @@ -961,6 +979,7 @@ def load_boston(return_X_y=False): The physical location of boston csv dataset. .. versionadded:: 0.20 + DESCR : str The full description of the dataset. 
feature_names : ndarray diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index e3df2124aab2b..107458e2d515d 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -36,6 +36,8 @@ from ._base import _pkl_filepath from ._base import RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args + # The original data can be found at: # https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz @@ -48,7 +50,8 @@ logger = logging.getLogger(__name__) -def fetch_california_housing(data_home=None, download_if_missing=True, +@_deprecate_positional_args +def fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False): """Load the California housing dataset (regression). diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 6b23f913e05a7..de93b22ac4f56 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -28,6 +28,8 @@ from ..utils import Bunch from ._base import _pkl_filepath from ..utils import check_random_state +from ..utils.validation import _deprecate_positional_args + # The original data can be found in: # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz @@ -40,7 +42,8 @@ logger = logging.getLogger(__name__) -def fetch_covtype(data_home=None, download_if_missing=True, +@_deprecate_positional_args +def fetch_covtype(*, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the covertype dataset (classification). diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index c0ba00fa46f04..4e2f6856d89b1 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -23,6 +23,8 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import shuffle as shuffle_method +from ..utils.validation import _deprecate_positional_args + # The original data can be found at: # https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz @@ -43,7 +45,8 @@ logger = logging.getLogger(__name__) -def fetch_kddcup99(subset=None, data_home=None, shuffle=False, +@_deprecate_positional_args +def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False, random_state=None, percent10=True, download_if_missing=True, return_X_y=False): """Load the kddcup99 dataset (classification). 
diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index 3dc3833db3417..b8db75010e8f2 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -20,6 +20,7 @@ from ._base import get_data_home, _fetch_remote, RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -215,7 +216,8 @@ def _fetch_lfw_people(data_folder_path, slice_=None, color=False, resize=None, return faces, target, target_names -def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, +@_deprecate_positional_args +def fetch_lfw_people(*, data_home=None, funneled=True, resize=0.5, min_faces_per_person=0, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True, return_X_y=False): @@ -385,7 +387,9 @@ def _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None, return pairs, target, np.array(['Different persons', 'Same person']) -def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, +@_deprecate_positional_args +def fetch_lfw_pairs(*, subset='train', data_home=None, funneled=True, + resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True): """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification). diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index d5f163d468214..dfa459880a5c4 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -25,6 +25,7 @@ from ._base import RemoteFileMetadata from ._base import _pkl_filepath from ..utils import check_random_state, Bunch +from ..utils.validation import _deprecate_positional_args # The original data can be found at: # https://cs.nyu.edu/~roweis/data/olivettifaces.mat @@ -35,7 +36,8 @@ 'd5fca46a4b8906c18e454d41af987794')) -def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, +@_deprecate_positional_args +def fetch_olivetti_faces(*, data_home=None, shuffle=False, random_state=0, download_if_missing=True, return_X_y=False): """Load the Olivetti faces data-set from AT&T (classification). 
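The `_deprecate_positional_args` decorator applied throughout `sklearn.datasets` makes the parameters after the `*` keyword-only: passing them positionally still works but emits a FutureWarning. A hedged sketch of that behaviour, not part of the diff:

import warnings
from sklearn.datasets import load_iris

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    load_iris(True)  # return_X_y passed positionally
print([w.category.__name__ for w in caught])  # expected to contain 'FutureWarning'

X, y = load_iris(return_X_y=True)  # keyword form, no warning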
diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index cef0e6cb1f411..112cd9c0e525e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -9,6 +9,7 @@ import itertools from collections.abc import Generator from collections import OrderedDict +from functools import partial from urllib.request import urlopen, Request @@ -22,6 +23,7 @@ from ..utils import get_chunk_n_rows from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa +from ..utils.validation import _deprecate_positional_args __all__ = ['fetch_openml'] @@ -44,11 +46,11 @@ def _retry_with_clean_cache(openml_path, data_home): """ def decorator(f): @wraps(f) - def wrapper(): + def wrapper(*args, **kw): if data_home is None: - return f() + return f(*args, **kw) try: - return f() + return f(*args, **kw) except HTTPError: raise except Exception: @@ -56,7 +58,7 @@ def wrapper(): local_path = _get_local_path(openml_path, data_home) if os.path.exists(local_path): os.unlink(local_path) - return f() + return f(*args, **kw) return wrapper return decorator @@ -217,7 +219,7 @@ def _sparse_data_to_array(arff_data, include_columns): return y -def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): +def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the @@ -225,8 +227,8 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): Parameters ---------- - arff_data : list or dict - as obtained from liac-arff object + arff : dict + As obtained from liac-arff object. col_slice_x : list The column indices that are sliced from the original array to return @@ -241,6 +243,7 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): X : np.array or scipy.sparse.csr_matrix y : np.array """ + arff_data = arff['data'] if isinstance(arff_data, Generator): if shape[0] == -1: count = -1 @@ -300,7 +303,8 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): Returns ------- - dataframe : pandas DataFrame + result : tuple + tuple with the resulting dataframe """ pd = check_pandas_support('fetch_openml with as_frame=True') @@ -327,7 +331,7 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): if dtype == 'category': dtype = pd.api.types.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) - return df + return (df, ) def _get_data_info_by_name(name, version, data_home): @@ -448,27 +452,119 @@ def _get_num_samples(data_qualities): return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): - # Accesses an ARFF file on the OpenML server. Documentation: - # https://www.openml.org/api_data_docs#!/data/get_download_id - # encode_nominal argument is to ensure unit testing, do not alter in - # production! 
- url = _DATA_FILE.format(file_id) +def _load_arff_response(url, data_home, return_type, encode_nominal, + parse_arff): + """Load arff data with url and parses arff response with parse_arff""" + response = _open_openml_url(url, data_home) - @_retry_with_clean_cache(url, data_home) - def _arff_load(): - with closing(_open_openml_url(url, data_home)) as response: - if sparse is True: - return_type = _arff.COO - else: - return_type = _arff.DENSE_GEN + with closing(response): + # Note that if the data is dense, no reading is done until the data + # generator is iterated. + arff = _arff.load((line.decode('utf-8') for line in response), + return_type=return_type, + encode_nominal=encode_nominal) + return parse_arff(arff) + + +def _download_data_to_bunch(url, sparse, data_home, *, + as_frame, features_list, data_columns, + target_columns, shape): + """Download OpenML ARFF and convert to Bunch of data + """ + # NB: this function is long in order to handle retry for any failure + # during the streaming parse of the ARFF. + + # Prepare which columns and data types should be returned for the X and y + features_dict = {feature['name']: feature for feature in features_list} + + # XXX: col_slice_y should be all nominal or all numeric + _verify_target_data_type(features_dict, target_columns) + + col_slice_y = [int(features_dict[col_name]['index']) + for col_name in target_columns] + + col_slice_x = [int(features_dict[col_name]['index']) + for col_name in data_columns] + for col_idx in col_slice_y: + feat = features_list[col_idx] + nr_missing = int(feat['number_of_missing_values']) + if nr_missing > 0: + raise ValueError('Target column {} has {} missing values. ' + 'Missing values are not supported for target ' + 'columns. '.format(feat['name'], nr_missing)) + + # Access an ARFF file on the OpenML server. Documentation: + # https://www.openml.org/api_data_docs#!/data/get_download_id - arff_file = _arff.loads(response.read().decode('utf-8'), - encode_nominal=encode_nominal, - return_type=return_type) - return arff_file + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE_GEN - return _arff_load() + frame = nominal_attributes = None + if as_frame: + columns = data_columns + target_columns + parse_arff = partial(_convert_arff_data_dataframe, columns=columns, + features_dict=features_dict) + + def postprocess(frame): # type:ignore + X = frame[data_columns] + if len(target_columns) >= 2: + y = frame[target_columns] + elif len(target_columns) == 1: + y = frame[target_columns[0]] + else: + y = None + return X, y, frame, nominal_attributes + else: + def parse_arff(arff): + X, y = _convert_arff_data(arff, col_slice_x, col_slice_y, shape) + # nominal attributes is a dict mapping from the attribute name to + # the possible values. 
Includes also the target column (which will + # be popped off below, before it will be packed in the Bunch + # object) + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list) and + k in data_columns + target_columns} + return X, y, nominal_attributes + + def postprocess(X, y, nominal_attributes): # type:ignore + is_classification = {col_name in nominal_attributes + for col_name in target_columns} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([ + np.take( + np.asarray(nominal_attributes.pop(col_name), + dtype='O'), + y[:, i:i + 1].astype(int, copy=False)) + for i, col_name in enumerate(target_columns) + ]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is ' + 'not currently supported') + + # reshape y back to 1-D array, if there is only 1 target column; + # back to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + return X, y, frame, nominal_attributes + + out = _retry_with_clean_cache(url, data_home)( + _load_arff_response)(url, data_home, + return_type=return_type, + encode_nominal=not as_frame, + parse_arff=parse_arff) + X, y, frame, nominal_attributes = postprocess(*out) + + return Bunch(data=X, target=y, frame=frame, + categories=nominal_attributes, + feature_names=data_columns, + target_names=target_columns) def _verify_target_data_type(features_dict, target_columns): @@ -513,7 +609,8 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -def fetch_openml(name=None, version='active', data_id=None, data_home=None, +@_deprecate_positional_args +def fetch_openml(name=None, *, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, as_frame=False): """Fetch dataset from openml by name or dataset id. @@ -526,6 +623,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + .. note:: EXPERIMENTAL The API is experimental (particularly the return value structure), @@ -703,25 +802,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_columns = _valid_data_column_names(features_list, target_columns) - # prepare which columns and data types should be returned for the X and y - features_dict = {feature['name']: feature for feature in features_list} - - # XXX: col_slice_y should be all nominal or all numeric - _verify_target_data_type(features_dict, target_columns) - - col_slice_y = [int(features_dict[col_name]['index']) - for col_name in target_columns] - - col_slice_x = [int(features_dict[col_name]['index']) - for col_name in data_columns] - for col_idx in col_slice_y: - feat = features_list[col_idx] - nr_missing = int(feat['number_of_missing_values']) - if nr_missing > 0: - raise ValueError('Target column {} has {} missing values. ' - 'Missing values are not supported for target ' - 'columns. 
'.format(feat['name'], nr_missing)) - # determine arff encoding to return if not return_sparse: # The shape must include the ignored features to keep the right indexes @@ -732,66 +812,21 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, shape = None # obtain the data - arff = _download_data_arff(data_description['file_id'], return_sparse, - data_home, encode_nominal=not as_frame) + url = _DATA_FILE.format(data_description['file_id']) + bunch = _download_data_to_bunch(url, return_sparse, data_home, + as_frame=as_frame, + features_list=features_list, shape=shape, + target_columns=target_columns, + data_columns=data_columns) + + if return_X_y: + return bunch.data, bunch.target description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) - nominal_attributes = None - frame = None - if as_frame: - columns = data_columns + target_columns - frame = _convert_arff_data_dataframe(arff, columns, features_dict) - X = frame[data_columns] - if len(target_columns) >= 2: - y = frame[target_columns] - elif len(target_columns) == 1: - y = frame[target_columns[0]] - else: - y = None - else: - # nominal attributes is a dict mapping from the attribute name to the - # possible values. Includes also the target column (which will be - # popped off below, before it will be packed in the Bunch object) - nominal_attributes = {k: v for k, v in arff['attributes'] - if isinstance(v, list) and - k in data_columns + target_columns} - - X, y = _convert_arff_data(arff['data'], col_slice_x, - col_slice_y, shape) - - is_classification = {col_name in nominal_attributes - for col_name in target_columns} - if not is_classification: - # No target - pass - elif all(is_classification): - y = np.hstack([ - np.take( - np.asarray(nominal_attributes.pop(col_name), dtype='O'), - y[:, i:i + 1].astype(int, copy=False)) - for i, col_name in enumerate(target_columns) - ]) - elif any(is_classification): - raise ValueError('Mix of nominal and non-nominal targets is not ' - 'currently supported') - - # reshape y back to 1-D array, if there is only 1 target column; back - # to None if there are not target columns - if y.shape[1] == 1: - y = y.reshape((-1,)) - elif y.shape[1] == 0: - y = None - - if return_X_y: - return X, y - - bunch = Bunch( - data=X, target=y, frame=frame, feature_names=data_columns, - target_names=target_columns, + bunch.update( DESCR=description, details=data_description, - categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) return bunch diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 4f1c5cc4af199..abb9881700614 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -25,6 +25,7 @@ from ._svmlight_format_io import load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args # The original vectorized data can be found at: @@ -75,7 +76,8 @@ logger = logging.getLogger(__name__) -def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, +@_deprecate_positional_args +def fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the RCV1 multilabel dataset (classification). 
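The openml refactoring above changes how the ARFF payload is parsed and packaged, but the public contract of `fetch_openml` is unchanged: it returns a Bunch whose fields mirror `_download_data_to_bunch`, or `(X, y)` when `return_X_y=True`, which now short-circuits before the description is attached. A sketch of that contract, not from the diff, assuming network access and that the 'iris' dataset is available on OpenML:

from sklearn.datasets import fetch_openml

# as_frame=True fills the 'frame' field with a single pandas DataFrame.
bunch = fetch_openml(name='iris', version=1, as_frame=True)
print(type(bunch.frame))
print(bunch.feature_names[:2])
print(bunch.target_names)

# return_X_y=True returns only the data and target.
X, y = fetch_openml(name='iris', version=1, return_X_y=True)
print(X.shape, y.shape)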
diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 62ef492f42f5e..9ccdf8700fcf0 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -18,6 +18,7 @@ from ..utils import check_array, check_random_state from ..utils import shuffle as util_shuffle from ..utils.random import sample_without_replacement +from ..utils.validation import _deprecate_positional_args def _generate_hypercube(samples, dimensions, rng): @@ -33,7 +34,8 @@ def _generate_hypercube(samples, dimensions, rng): return out -def make_classification(n_samples=100, n_features=20, n_informative=2, +@_deprecate_positional_args +def make_classification(n_samples=100, n_features=20, *, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, @@ -102,7 +104,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, flip_y : float, optional (default=0.01) The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification - task harder. + task harder. Note that the default setting flip_y > 0 might lead + to less than n_classes in y in some cases. class_sep : float, optional (default=1.0) The factor multiplying the hypercube size. Larger values spread @@ -260,7 +263,9 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, return X, y -def make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, +@_deprecate_positional_args +def make_multilabel_classification(n_samples=100, n_features=20, *, + n_classes=5, n_labels=2, length=50, allow_unlabeled=True, sparse=False, return_indicator='dense', return_distributions=False, @@ -421,7 +426,8 @@ def sample_example(): return X, Y -def make_hastie_10_2(n_samples=12000, random_state=None): +@_deprecate_positional_args +def make_hastie_10_2(n_samples=12000, *, random_state=None): """Generates data for binary classification used in Hastie et al. 2009, Example 10.2. @@ -469,7 +475,8 @@ def make_hastie_10_2(n_samples=12000, random_state=None): return X, y -def make_regression(n_samples=100, n_features=100, n_informative=10, +@_deprecate_positional_args +def make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None): @@ -591,7 +598,8 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, return X, y -def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, +@_deprecate_positional_args +def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=.8): """Make a large circle containing a smaller circle in 2d. 
@@ -667,7 +675,8 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, return X, y -def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): +@_deprecate_positional_args +def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): """Make two interleaving half circles A simple toy dataset to visualize clustering and classification @@ -730,7 +739,8 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): return X, y -def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, +@_deprecate_positional_args +def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False): """Generate isotropic Gaussian blobs for clustering. @@ -745,6 +755,9 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, If array-like, each element of the sequence indicates the number of samples per cluster. + .. versionchanged:: v0.20 + one can now pass an array-like to the ``n_samples`` parameter + n_features : int, optional (default=2) The number of features for each sample. @@ -885,7 +898,9 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, return X, y -def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, + random_state=None): """Generate the "Friedman #1" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -947,7 +962,8 @@ def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): return X, y -def make_friedman2(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #2" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -1012,7 +1028,8 @@ def make_friedman2(n_samples=100, noise=0.0, random_state=None): return X, y -def make_friedman3(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #3" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -1076,7 +1093,8 @@ def make_friedman3(n_samples=100, noise=0.0, random_state=None): return X, y -def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, +@_deprecate_positional_args +def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, tail_strength=0.5, random_state=None): """Generate a mostly low rank matrix with bell-shaped singular values @@ -1145,7 +1163,8 @@ def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, return np.dot(np.dot(u, s), v.T) -def make_sparse_coded_signal(n_samples, n_components, n_features, +@_deprecate_positional_args +def make_sparse_coded_signal(n_samples, *, n_components, n_features, n_nonzero_coefs, random_state=None): """Generate a signal as a sparse combination of dictionary elements. 
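The versionchanged note added above refers to make_blobs accepting an array-like n_samples (one entry per cluster), a behaviour available since 0.20. A quick illustration using the keyword-only convention introduced in this patch:

    import numpy as np
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=[30, 50, 20], n_features=2,
                      cluster_std=1.0, random_state=0)
    print(X.shape)                   # (100, 2)
    print(np.bincount(y))            # [30 50 20], one count per cluster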
@@ -1207,7 +1226,9 @@ def make_sparse_coded_signal(n_samples, n_components, n_features, return map(np.squeeze, (Y, D, X)) -def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None): +@_deprecate_positional_args +def make_sparse_uncorrelated(n_samples=100, n_features=10, *, + random_state=None): """Generate a random regression problem with sparse uncorrelated design This dataset is described in Celeux et al [1]. as:: @@ -1258,7 +1279,8 @@ def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None): return X, y -def make_spd_matrix(n_dim, random_state=None): +@_deprecate_positional_args +def make_spd_matrix(n_dim, *, random_state=None): """Generate a random symmetric, positive-definite matrix. Read more in the :ref:`User Guide `. @@ -1285,13 +1307,14 @@ def make_spd_matrix(n_dim, random_state=None): generator = check_random_state(random_state) A = generator.rand(n_dim, n_dim) - U, s, V = linalg.svd(np.dot(A.T, A)) - X = np.dot(np.dot(U, 1.0 + np.diag(generator.rand(n_dim))), V) + U, _, Vt = linalg.svd(np.dot(A.T, A)) + X = np.dot(np.dot(U, 1.0 + np.diag(generator.rand(n_dim))), Vt) return X -def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, +@_deprecate_positional_args +def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, smallest_coef=.1, largest_coef=.9, random_state=None): """Generate a sparse symmetric definite positive matrix. @@ -1365,7 +1388,8 @@ def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, return prec -def make_swiss_roll(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): """Generate a swiss roll dataset. Read more in the :ref:`User Guide `. @@ -1417,7 +1441,8 @@ def make_swiss_roll(n_samples=100, noise=0.0, random_state=None): return X, t -def make_s_curve(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): """Generate an S curve dataset. Read more in the :ref:`User Guide `. @@ -1459,7 +1484,8 @@ def make_s_curve(n_samples=100, noise=0.0, random_state=None): return X, t -def make_gaussian_quantiles(mean=None, cov=1., n_samples=100, +@_deprecate_positional_args +def make_gaussian_quantiles(*, mean=None, cov=1., n_samples=100, n_features=2, n_classes=3, shuffle=True, random_state=None): r"""Generate isotropic Gaussian and label samples by quantile @@ -1554,7 +1580,8 @@ def _shuffle(data, random_state=None): return result, row_idx, col_idx -def make_biclusters(shape, n_clusters, noise=0.0, minval=10, +@_deprecate_positional_args +def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with constant block diagonal structure for biclustering. @@ -1645,7 +1672,8 @@ def make_biclusters(shape, n_clusters, noise=0.0, minval=10, return result, rows, cols -def make_checkerboard(shape, n_clusters, noise=0.0, minval=10, +@_deprecate_positional_args +def make_checkerboard(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with block checkerboard structure for biclustering. 
diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index 7f621d1de74eb..e17ab419c5d7e 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -50,6 +50,7 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args from ._base import _pkl_filepath # The original data can be found at: @@ -137,7 +138,8 @@ def construct_grids(batch): return (xgrid, ygrid) -def fetch_species_distributions(data_home=None, +@_deprecate_positional_args +def fetch_species_distributions(*, data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 91bb35ff2ec75..8360ee4402b40 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -25,6 +25,7 @@ from .. import __version__ from ..utils import check_array, IS_PYPY +from ..utils.validation import _deprecate_positional_args if not IS_PYPY: from ._svmlight_format_fast import _load_svmlight_file @@ -37,7 +38,8 @@ def _load_svmlight_file(*args, **kwargs): 'for the status updates).') -def load_svmlight_file(f, n_features=None, dtype=np.float64, +@_deprecate_positional_args +def load_svmlight_file(f, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): """Load datasets in the svmlight / libsvm format into sparse CSR matrix @@ -151,8 +153,13 @@ def get_data(): X, y = get_data() """ - return tuple(load_svmlight_files([f], n_features, dtype, multilabel, - zero_based, query_id, offset, length)) + return tuple(load_svmlight_files([f], n_features=n_features, + dtype=dtype, + multilabel=multilabel, + zero_based=zero_based, + query_id=query_id, + offset=offset, + length=length)) def _gen_open(f): @@ -196,7 +203,8 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id, return data, indices, indptr, labels, query -def load_svmlight_files(files, n_features=None, dtype=np.float64, +@_deprecate_positional_args +def load_svmlight_files(files, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): """Load dataset from multiple files in SVMlight format @@ -380,7 +388,9 @@ def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id): f.write((line_pattern % feat).encode('ascii')) -def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None, +@_deprecate_positional_args +def dump_svmlight_file(X, y, f, *, zero_based=True, comment=None, + query_id=None, multilabel=False): """Dump the dataset in svmlight / libsvm file format. diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index ebbd191069c49..c5d322c88ef0c 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -45,6 +45,7 @@ from ..feature_extraction.text import CountVectorizer from .. 
import preprocessing from ..utils import check_random_state, Bunch +from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -146,7 +147,8 @@ def strip_newsgroup_footer(text): return text -def fetch_20newsgroups(data_home=None, subset='train', categories=None, +@_deprecate_positional_args +def fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True, return_X_y=False): @@ -322,7 +324,8 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, return data -def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None, +@_deprecate_positional_args +def fetch_20newsgroups_vectorized(*, subset="train", remove=(), data_home=None, download_if_missing=True, return_X_y=False, normalize=True): """Load the 20 newsgroups dataset and vectorize it into token counts \ diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 224538b181696..3ec60074a4015 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -152,7 +152,7 @@ def test_load_digits(): def test_load_digits_n_class_lt_10(): - digits = load_digits(9) + digits = load_digits(n_class=9) assert digits.data.shape == (1617, 64) assert numpy.unique(digits.target).size == 9 diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index f9969c75d5c8e..44fe392e42e74 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -12,8 +12,9 @@ from sklearn import config_context from sklearn.datasets import fetch_openml from sklearn.datasets._openml import (_open_openml_url, + _arff, + _DATA_FILE, _get_data_description_by_id, - _download_data_arff, _get_local_path, _retry_with_clean_cache, _feature_to_dtype) @@ -56,8 +57,13 @@ def decode_column(data_bunch, col_idx): if sparse is True: raise ValueError('This test is not intended for sparse data, to keep ' 'code relatively simple') - data_arff = _download_data_arff(data_description['file_id'], - sparse, None, False) + url = _DATA_FILE.format(data_description['file_id']) + with _open_openml_url(url, data_home=None) as f: + data_arff = _arff.load((line.decode('utf-8') for line in f), + return_type=(_arff.COO if sparse + else _arff.DENSE_GEN), + encode_nominal=False) + data_downloaded = np.array(list(data_arff['data']), dtype='O') for i in range(len(data_bunch.feature_names)): @@ -176,6 +182,15 @@ def info(self): return {'Content-Encoding': 'gzip'} return {} + def __iter__(self): + return iter(self.data) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + def _file_name(url, suffix): return (re.sub(r'\W', '-', url[len("https://openml.org/"):]) + suffix + path_suffix) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index a09b89bda6d6e..cc0178b70e447 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -201,17 +201,17 @@ def fit(self, X, y=None): # to allow for unified computation of loglikelihood if self.svd_method == 'lapack': def my_svd(X): - _, s, V = linalg.svd(X, full_matrices=False) - return (s[:n_components], V[:n_components], + _, s, Vt = linalg.svd(X, full_matrices=False) + return (s[:n_components], Vt[:n_components], squared_norm(s[n_components:])) elif self.svd_method == 'randomized': random_state = check_random_state(self.random_state) def my_svd(X): - 
_, s, V = randomized_svd(X, n_components, - random_state=random_state, - n_iter=self.iterated_power) - return s, V, squared_norm(X) - squared_norm(s) + _, s, Vt = randomized_svd(X, n_components, + random_state=random_state, + n_iter=self.iterated_power) + return s, Vt, squared_norm(X) - squared_norm(s) else: raise ValueError('SVD method %s is not supported. Please consider' ' the documentation' % self.svd_method) @@ -219,11 +219,11 @@ def my_svd(X): for i in range(self.max_iter): # SMALL helps numerics sqrt_psi = np.sqrt(psi) + SMALL - s, V, unexp_var = my_svd(X / (sqrt_psi * nsqrt)) + s, Vt, unexp_var = my_svd(X / (sqrt_psi * nsqrt)) s **= 2 # Use 'maximum' here to avoid sqrt problems. - W = np.sqrt(np.maximum(s - 1., 0.))[:, np.newaxis] * V - del V + W = np.sqrt(np.maximum(s - 1., 0.))[:, np.newaxis] * Vt + del Vt W *= sqrt_psi # loglikelihood diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index bc34c17326f19..77d9b13da0dfa 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -295,13 +295,13 @@ def partial_fit(self, X, y=None, check_input=True): X = np.vstack((self.singular_values_.reshape((-1, 1)) * self.components_, X, mean_correction)) - U, S, V = linalg.svd(X, full_matrices=False) - U, V = svd_flip(U, V, u_based_decision=False) + U, S, Vt = linalg.svd(X, full_matrices=False) + U, Vt = svd_flip(U, Vt, u_based_decision=False) explained_variance = S ** 2 / (n_total_samples - 1) explained_variance_ratio = S ** 2 / np.sum(col_var * n_total_samples) self.n_samples_seen_ = n_total_samples - self.components_ = V[:self.n_components_] + self.components_ = Vt[:self.n_components_] self.singular_values_ = S[:self.n_components_] self.mean_ = col_mean self.var_ = col_var diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 527f78d34bbb5..617bf8541d830 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -236,7 +236,7 @@ def _fit_transform(self, K): # if v is an eigenvector of K # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' # if u is an eigenvector of Phi(X)Phi(X)' - # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)'Phi(X) # # At this stage our self.alphas_ (the v) have norm 1, we need to scale # them so that eigenvectors in kernel feature space (the u) have norm=1 diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 641e68cd7fc8b..a6e253aab1e6e 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -143,6 +143,9 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): n_components : int, optional (default=10) Number of topics. + .. versionchanged:: 0.19 + ``n_topics `` was renamed to ``n_components`` + doc_topic_prior : float, optional (default=None) Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 87092d7ccd17e..b30637ac50a14 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -320,7 +320,7 @@ class PCA(_BasePCA): [6.30061...] 
""" @_deprecate_positional_args - def __init__(self, n_components=None, copy=True, whiten=False, + def __init__(self, n_components=None, *, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): self.n_components = n_components @@ -373,14 +373,14 @@ def fit_transform(self, X, y=None): This method returns a Fortran-ordered array. To convert it to a C-ordered array, use 'np.ascontiguousarray'. """ - U, S, V = self._fit(X) + U, S, Vt = self._fit(X) U = U[:, :self.n_components_] if self.whiten: # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples) U *= sqrt(X.shape[0] - 1) else: - # X_new = X * V = U * S * V^T * V = U * S + # X_new = X * V = U * S * Vt * V = U * S U *= S[:self.n_components_] return U @@ -451,11 +451,11 @@ def _fit_full(self, X, n_components): self.mean_ = np.mean(X, axis=0) X -= self.mean_ - U, S, V = linalg.svd(X, full_matrices=False) + U, S, Vt = linalg.svd(X, full_matrices=False) # flip eigenvectors' sign to enforce deterministic output - U, V = svd_flip(U, V) + U, Vt = svd_flip(U, Vt) - components_ = V + components_ = Vt # Get variance explained by singular values explained_variance_ = (S ** 2) / (n_samples - 1) @@ -491,7 +491,7 @@ def _fit_full(self, X, n_components): explained_variance_ratio_[:n_components] self.singular_values_ = singular_values_[:n_components] - return U, S, V + return U, S, Vt def _fit_truncated(self, X, n_components, svd_solver): """Fit the model by computing truncated SVD (by ARPACK or randomized) @@ -530,22 +530,22 @@ def _fit_truncated(self, X, n_components, svd_solver): if svd_solver == 'arpack': # random init solution, as ARPACK does it internally v0 = random_state.uniform(-1, 1, size=min(X.shape)) - U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0) + U, S, Vt = svds(X, k=n_components, tol=self.tol, v0=v0) # svds doesn't abide by scipy.linalg.svd/randomized_svd # conventions, so reverse its outputs. S = S[::-1] # flip eigenvectors' sign to enforce deterministic output - U, V = svd_flip(U[:, ::-1], V[::-1]) + U, Vt = svd_flip(U[:, ::-1], Vt[::-1]) elif svd_solver == 'randomized': # sign flipping is done inside - U, S, V = randomized_svd(X, n_components=n_components, - n_iter=self.iterated_power, - flip_sign=True, - random_state=random_state) + U, S, Vt = randomized_svd(X, n_components=n_components, + n_iter=self.iterated_power, + flip_sign=True, + random_state=random_state) self.n_samples_, self.n_features_ = n_samples, n_features - self.components_ = V + self.components_ = Vt self.n_components_ = n_components # Get variance explained by singular values @@ -562,7 +562,7 @@ def _fit_truncated(self, X, n_components, svd_solver): else: self.noise_variance_ = 0. - return U, S, V + return U, S, Vt def score_samples(self, X): """Return the log-likelihood of each sample. diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 888d5d79e1e4b..53f3ed3bf23ca 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -103,6 +103,11 @@ class SparsePCA(TransformerMixin, BaseEstimator): error_ : array Vector of errors at each iteration. + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + n_iter_ : int Number of iterations run. 
@@ -197,6 +202,7 @@ def fit(self, X, y=None): self.components_, axis=1)[:, np.newaxis] components_norm[components_norm == 0] = 1 self.components_ /= components_norm + self.n_components_ = len(self.components_) self.error_ = E return self @@ -234,7 +240,7 @@ def transform(self, X): def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { "check_methods_subset_invariance": "fails for the transform method" } @@ -312,6 +318,11 @@ class MiniBatchSparsePCA(SparsePCA): components_ : array, [n_components, n_features] Sparse components extracted from the data. + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + n_iter_ : int Number of iterations run. @@ -403,5 +414,6 @@ def fit(self, X, y=None): self.components_, axis=1)[:, np.newaxis] components_norm[components_norm == 0] = 1 self.components_ /= components_norm + self.n_components_ = len(self.components_) return self diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index f3d14e31f3e1b..9ee0339a192b4 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -207,3 +207,18 @@ def test_spca_error_unormalized_components(spca): err_msg = "normalize_components=False is not supported starting " with pytest.raises(NotImplementedError, match=err_msg): spca(normalize_components=False).fit(Y) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +@pytest.mark.parametrize("n_components", [None, 3]) +def test_spca_n_components_(SPCA, n_components): + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=n_components).fit(X) + + if n_components is not None: + assert model.n_components_ == n_components + else: + assert model.n_components_ == n_features diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 45e623904b9ea..96a17fc5a34a5 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -94,7 +94,9 @@ def _class_means(X, y): def _class_cov(X, y, priors, shrinkage=None): - """Compute class covariance matrix. + """Compute weighted within-class covariance matrix. + + The per-class covariance are weighted by the class priors. Parameters ---------- @@ -116,7 +118,7 @@ def _class_cov(X, y, priors, shrinkage=None): Returns ------- cov : array-like of shape (n_features, n_features) - Class covariance matrix. + Weighted within-class covariance matrix """ classes = np.unique(y) cov = np.zeros(shape=(X.shape[1], X.shape[1])) @@ -137,7 +139,8 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, share the same covariance matrix. The fitted model can also be used to reduce the dimensionality of the input - by projecting it to the most discriminative directions. + by projecting it to the most discriminative directions, using the + `transform` method. .. versionadded:: 0.17 *LinearDiscriminantAnalysis*. @@ -163,21 +166,27 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Note that shrinkage works only with 'lsqr' and 'eigen' solvers. priors : array-like of shape (n_classes,), default=None - Class priors. + The class prior probabilities. By default, the class proportions are + inferred from the training data. n_components : int, default=None Number of components (<= min(n_classes - 1, n_features)) for dimensionality reduction. If None, will be set to - min(n_classes - 1, n_features). 
+ min(n_classes - 1, n_features). This parameter only affects the + `transform` method. store_covariance : bool, default=False - Additionally compute class covariance matrix (default False), used - only in 'svd' solver. + If True, explicitely compute the weighted within-class covariance + matrix when solver is 'svd'. The matrix is always computed + and stored for the other solvers. .. versionadded:: 0.17 tol : float, default=1.0e-4 - Threshold used for rank estimation in SVD solver. + Absolute threshold for a singular value of X to be considered + significant, used to estimate the rank of X. Dimensions whose + singular values are non-significant are discarded. Only used if + solver is 'svd'. .. versionadded:: 0.17 @@ -190,8 +199,11 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Intercept term. covariance_ : array-like of shape (n_features, n_features) - Covariance matrix (shared by all classes). Only available - `store_covariance` is True. + Weighted within-class covariance matrix. It corresponds to + `sum_k prior_k * C_k` where `C_k` is the covariance matrix of the + samples in class `k`. The `C_k` are estimated using the (potentially + shrunk) biased estimator of covariance. If solver is 'svd', only + exists when `store_covariance` is True. explained_variance_ratio_ : ndarray of shape (n_components,) Percentage of variance explained by each of the selected components. @@ -200,16 +212,17 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, or svd solver is used. means_ : array-like of shape (n_classes, n_features) - Class means. + Class-wise means. priors_ : array-like of shape (n_classes,) Class priors (sum to 1). scalings_ : array-like of shape (rank, n_classes - 1) Scaling of the features in the space spanned by the class centroids. + Only available for 'svd' and 'eigen' solvers. xbar_ : array-like of shape (n_features,) - Overall mean. + Overall mean. Only present if solver is 'svd'. classes_ : array-like of shape (n_classes,) Unique class labels. @@ -219,22 +232,6 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis: Quadratic Discriminant Analysis - Notes - ----- - The default solver is 'svd'. It can perform both classification and - transform, and it does not rely on the calculation of the covariance - matrix. This can be an advantage in situations where the number of features - is large. However, the 'svd' solver cannot be used with shrinkage. - - The 'lsqr' solver is an efficient algorithm that only works for - classification. It supports shrinkage. - - The 'eigen' solver is based on the optimization of the between class - scatter to within class scatter ratio. It can be used for both - classification and transform, and it supports shrinkage. However, the - 'eigen' solver needs to compute the covariance matrix, so it might not be - suitable for situations with a high number of features. 
- Examples -------- >>> import numpy as np @@ -381,11 +378,11 @@ def _solve_svd(self, X, y): # 2) Within variance scaling X = np.sqrt(fac) * (Xc / std) # SVD of centered (within)scaled data - U, S, V = linalg.svd(X, full_matrices=False) + U, S, Vt = linalg.svd(X, full_matrices=False) rank = np.sum(S > self.tol) # Scaling of within covariance is: V' 1/S - scalings = (V[:rank] / std).T / S[:rank] + scalings = (Vt[:rank] / std).T / S[:rank] # 3) Between variance scaling # Scale weighted centers @@ -394,12 +391,12 @@ def _solve_svd(self, X, y): # Centers are living in a space with n_classes-1 dim (maximum) # Use SVD to find projection in the space spanned by the # (n_classes) centers - _, S, V = linalg.svd(X, full_matrices=0) + _, S, Vt = linalg.svd(X, full_matrices=0) self.explained_variance_ratio_ = (S**2 / np.sum( S**2))[:self._max_components] rank = np.sum(S > self.tol * S[0]) - self.scalings_ = np.dot(scalings, V.T[:, :rank]) + self.scalings_ = np.dot(scalings, Vt.T[:, :rank]) coef = np.dot(self.means_ - self.xbar_, self.scalings_) self.intercept_ = (-0.5 * np.sum(coef ** 2, axis=1) + np.log(self.priors_)) @@ -542,6 +539,29 @@ def predict_log_proba(self, X): """ return np.log(self.predict_proba(X)) + def decision_function(self, X): + """Apply decision function to an array of samples. + + The decision function is equal (up to a constant factor) to the + log-posterior of the model, i.e. `log p(y = k | x)`. In a binary + classification setting this instead corresponds to the difference + `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Array of samples (test vectors). + + Returns + ------- + C : ndarray of shape (n_samples,) or (n_samples, n_classes) + Decision function values related to each class, per sample. + In the two-class case, the shape is (n_samples,), giving the + log likelihood ratio of the positive class. + """ + # Only override for the doc + return super().decision_function(X) + class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): """Quadratic Discriminant Analysis @@ -560,47 +580,60 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): Parameters ---------- priors : ndarray of shape (n_classes,), default=None - Priors on classes + Class priors. By default, the class proportions are inferred from the + training data. reg_param : float, default=0.0 - Regularizes the covariance estimate as - ``(1-reg_param)*Sigma + reg_param*np.eye(n_features)`` + Regularizes the per-class covariance estimates by transforming S2 as + ``S2 = (1 - reg_param) * S2 + reg_param * np.eye(n_features)``, + where S2 corresponds to the `scaling_` attribute of a given class. store_covariance : bool, default=False - If True the covariance matrices are computed and stored in the - `self.covariance_` attribute. + If True, the class covariance matrices are explicitely computed and + stored in the `self.covariance_` attribute. .. versionadded:: 0.17 tol : float, default=1.0e-4 - Threshold used for rank estimation. + Absolute threshold for a singular value to be considered significant, + used to estimate the rank of `Xk` where `Xk` is the centered matrix + of samples in class k. This parameter does not affect the + predictions. It only controls a warning that is raised when features + are considered to be colinear. .. versionadded:: 0.17 Attributes ---------- - covariance_ : list of array-like of shape (n_features, n_features) - Covariance matrices of each class. 
Only available + covariance_ : list of len n_classes of ndarray \ + of shape (n_features, n_features) + For each class, gives the covariance matrix estimated using the + samples of that class. The estimations are unbiased. Only present if `store_covariance` is True. means_ : array-like of shape (n_classes, n_features) - Class means. + Class-wise means. priors_ : array-like of shape (n_classes,) Class priors (sum to 1). - rotations_ : list of ndarrays - For each class k an array of shape (n_features, n_k), with + rotations_ : list of len n_classes of ndarray of shape (n_features, n_k) + For each class k an array of shape (n_features, n_k), where ``n_k = min(n_features, number of elements in class k)`` It is the rotation of the Gaussian distribution, i.e. its - principal axis. - - scalings_ : list of ndarrays - For each class k an array of shape (n_k,). It contains the scaling - of the Gaussian distributions along its principal axes, i.e. the - variance in the rotated coordinate system. - - classes_ : array-like of shape (n_classes,) + principal axis. It corresponds to `V`, the matrix of eigenvectors + coming from the SVD of `Xk = U S Vt` where `Xk` is the centered + matrix of samples from class k. + + scalings_ : list of len n_classes of ndarray of shape (n_k,) + For each class, contains the scaling of + the Gaussian distributions along its principal axes, i.e. the + variance in the rotated coordinate system. It corresponds to `S^2 / + (n_samples - 1)`, where `S` is the diagonal matrix of singular values + from the SVD of `Xk`, where `Xk` is the centered matrix of samples + from class k. + + classes_ : ndarray of shape (n_classes,) Unique class labels. Examples @@ -676,7 +709,7 @@ def fit(self, X, y): 'is ill defined.' % str(self.classes_[ind])) Xgc = Xg - meang # Xgc = U * S * V.T - U, S, Vt = np.linalg.svd(Xgc, full_matrices=False) + _, S, Vt = np.linalg.svd(Xgc, full_matrices=False) rank = np.sum(S > self.tol) if rank < n_features: warnings.warn("Variables are collinear") @@ -695,6 +728,7 @@ def fit(self, X, y): return self def _decision_function(self, X): + # return log posterior, see eq (4.12) p. 110 of the ESL. check_is_fitted(self) X = check_array(X) @@ -704,7 +738,7 @@ def _decision_function(self, X): S = self.scalings_[i] Xm = X - self.means_[i] X2 = np.dot(Xm, R * (S ** (-0.5))) - norm2.append(np.sum(X2 ** 2, 1)) + norm2.append(np.sum(X2 ** 2, axis=1)) norm2 = np.array(norm2).T # shape = [len(X), n_classes] u = np.asarray([np.sum(np.log(s)) for s in self.scalings_]) return (-0.5 * (norm2 + u) + np.log(self.priors_)) @@ -712,6 +746,11 @@ def _decision_function(self, X): def decision_function(self, X): """Apply decision function to an array of samples. + The decision function is equal (up to a constant factor) to the + log-posterior of the model, i.e. `log p(y = k | x)`. In a binary + classification setting this instead corresponds to the difference + `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`. + Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -721,7 +760,7 @@ def decision_function(self, X): ------- C : ndarray of shape (n_samples,) or (n_samples, n_classes) Decision function values related to each class, per sample. - In the two-class case, the shape is [n_samples,], giving the + In the two-class case, the shape is (n_samples,), giving the log likelihood ratio of the positive class. 
""" dec_func = self._decision_function(X) @@ -768,7 +807,7 @@ def predict_proba(self, X): return likelihood / likelihood.sum(axis=1)[:, np.newaxis] def predict_log_proba(self, X): - """Return posterior probabilities of classification. + """Return log of posterior probabilities of classification. Parameters ---------- diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 634943231860f..17b2c6cfd2e5d 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -358,7 +358,7 @@ def predict_log_proba(self, X): def _more_tags(self): return { 'poor_score': True, 'no_validation': True, - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the predict method' } @@ -561,6 +561,8 @@ def predict(self, X, return_std=False): Whether to return the standard deviation of posterior prediction. All zeros in this case. + .. versionadded:: 0.20 + Returns ------- y : array-like of shape (n_samples,) or (n_samples, n_outputs) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 40a1c2434316c..98d606961c1e1 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -159,9 +159,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, if class_weight == 'subsample': with catch_warnings(): simplefilter('ignore', DeprecationWarning) - curr_sample_weight *= compute_sample_weight('auto', y, indices) + curr_sample_weight *= compute_sample_weight('auto', y, + indices=indices) elif class_weight == 'balanced_subsample': - curr_sample_weight *= compute_sample_weight('balanced', y, indices) + curr_sample_weight *= compute_sample_weight('balanced', y, + indices=indices) tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) else: diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 32e534fdc8517..439500c1917d8 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -405,15 +405,15 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. 
- X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=DTYPE) + + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE, multi_output=True) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None sample_weight = _check_sample_weight(sample_weight, X) - y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) y = column_or_1d(y, warn=True) y = self._validate_y(y, sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 1ecee3c9ee27e..4e11abfcabdf8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -16,6 +16,9 @@ from libc.math cimport isnan from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C +np.import_array() + + def _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, const unsigned char missing_values_bin_idx, diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 8d307c3806532..18f1b6a365421 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -13,6 +13,8 @@ cimport numpy as np from .common import Y_DTYPE from .common cimport Y_DTYPE_C +np.import_array() + def _update_raw_predictions( Y_DTYPE_C [::1] raw_predictions, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 821a81a48fcf3..4114cd24aa8df 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -10,11 +10,13 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from libc.math cimport exp +from libc.math cimport exp, log from .common cimport Y_DTYPE_C from .common cimport G_H_DTYPE_C +np.import_array() + def _update_gradients_least_squares( G_H_DTYPE_C [::1] gradients, # OUT @@ -27,7 +29,7 @@ def _update_gradients_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): - # Note: a more correct exp is 2 * (raw_predictions - y_true) + # Note: a more correct expression is 2 * (raw_predictions - y_true) # but since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. gradients[i] = raw_predictions[i] - y_true[i] @@ -87,6 +89,35 @@ def _update_gradients_least_absolute_deviation( gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 +def _update_gradients_hessians_poisson( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + Y_DTYPE_C y_pred + + n_samples = raw_predictions.shape[0] + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + # Note: We use only half of the deviance loss. Therefore, there is + # no factor of 2. + y_pred = exp(raw_predictions[i]) + gradients[i] = (y_pred - y_true[i]) + hessians[i] = y_pred + else: + for i in prange(n_samples, schedule='static', nogil=True): + # Note: We use only half of the deviance loss. Therefore, there is + # no factor of 2. 
+ y_pred = exp(raw_predictions[i]) + gradients[i] = (y_pred - y_true[i]) * sample_weight[i] + hessians[i] = y_pred * sample_weight[i] + + def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, # OUT G_H_DTYPE_C [::1] hessians, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index b3234cb5ba945..d346aabdac070 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -18,6 +18,8 @@ from .common import Y_DTYPE from .common cimport X_BINNED_DTYPE_C from .common cimport node_struct +np.import_array() + def _predict_from_numeric_data( node_struct [:] nodes, diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index 60399c2fbdd70..161ad114829fe 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -2,6 +2,8 @@ import numpy as np cimport numpy as np +np.import_array() + ctypedef np.npy_float64 X_DTYPE_C ctypedef np.npy_uint8 X_BINNED_DTYPE_C diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 796f4f060dda5..8287cda367a10 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -23,6 +23,7 @@ from .binning import _BinMapper from .grower import TreeGrower from .loss import _LOSSES +from .loss import BaseLoss class BaseHistGradientBoosting(BaseEstimator, ABC): @@ -58,7 +59,8 @@ def _validate_parameters(self): The parameters that are directly passed to the grower are checked in TreeGrower.""" - if self.loss not in self._VALID_LOSSES: + if (self.loss not in self._VALID_LOSSES and + not isinstance(self.loss, BaseLoss)): raise ValueError( "Loss {} is not supported for {}. Accepted losses: " "{}.".format(self.loss, self.__class__.__name__, @@ -150,7 +152,11 @@ def fit(self, X, y, sample_weight=None): # data. self._in_fit = True - self.loss_ = self._get_loss(sample_weight=sample_weight) + if isinstance(self.loss, str): + self.loss_ = self._get_loss(sample_weight=sample_weight) + elif isinstance(self.loss, BaseLoss): + self.loss_ = self.loss + if self.early_stopping == 'auto': self.do_early_stopping_ = n_samples > 10000 else: @@ -752,11 +758,13 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Parameters ---------- - loss : {'least_squares', 'least_absolute_deviation'}, \ + loss : {'least_squares', 'least_absolute_deviation', 'poisson'}, \ optional (default='least_squares') The loss function to use in the boosting process. Note that the - "least squares" loss actually implements an "half least squares loss" - to simplify the computation of the gradient. + "least squares" and "poisson" losses actually implement + "half least squares loss" and "half poisson deviance" to simplify the + computation of the gradient. Furthermore, "poisson" loss internally + uses a log-link and requires ``y >= 0`` learning_rate : float, optional (default=0.1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no @@ -862,7 +870,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): 0.92... 
""" - _VALID_LOSSES = ('least_squares', 'least_absolute_deviation') + _VALID_LOSSES = ('least_squares', 'least_absolute_deviation', + 'poisson') @_deprecate_positional_args def __init__(self, loss='least_squares', *, learning_rate=0.1, @@ -896,14 +905,20 @@ def predict(self, X): y : ndarray, shape (n_samples,) The predicted values. """ - # Return raw predictions after converting shape - # (n_samples, 1) to (n_samples,) - return self._raw_predict(X).ravel() + check_is_fitted(self) + # Return inverse link of raw predictions after converting + # shape (n_samples, 1) to (n_samples,) + return self.loss_.inverse_link_function(self._raw_predict(X).ravel()) def _encode_y(self, y): # Just convert y to the expected dtype self.n_trees_per_iteration_ = 1 y = y.astype(Y_DTYPE, copy=False) + if self.loss == 'poisson': + # Ensure y >= 0 and sum(y) > 0 + if not (np.all(y >= 0) and np.sum(y) > 0): + raise ValueError("loss='poisson' requires non-negative y and " + "sum(y) > 0.") return y def _get_loss(self, sample_weight): diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 740e5e002cf4e..8bd7c4ee8b350 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -17,6 +17,8 @@ from .common cimport hist_struct from .common cimport X_BINNED_DTYPE_C from .common cimport G_H_DTYPE_C +np.import_array() + # Notes: # - IN views are read-only, OUT views are write-only # - In a lot of functions here, we pass feature_idx and the whole 2d diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index c7884a25a9c41..f256408bf01fb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -9,7 +9,7 @@ from abc import ABC, abstractmethod import numpy as np -from scipy.special import expit, logsumexp +from scipy.special import expit, logsumexp, xlogy from .common import Y_DTYPE from .common import G_H_DTYPE @@ -19,11 +19,13 @@ from ._loss import _update_gradients_hessians_least_absolute_deviation from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy +from ._loss import _update_gradients_hessians_poisson from ...utils.stats import _weighted_percentile class BaseLoss(ABC): """Base class for a loss.""" + def __init__(self, hessians_are_constant): self.hessians_are_constant = hessians_are_constant @@ -153,6 +155,7 @@ class LeastSquares(BaseLoss): the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM). """ + def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are @@ -195,6 +198,7 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ + def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are @@ -265,6 +269,51 @@ def update_leaves_values(self, grower, y_true, raw_predictions, # Note that the regularization is ignored here +class Poisson(BaseLoss): + """Poisson deviance loss with log-link, for regression. 
+ + For a given sample x_i, Poisson deviance loss is defined as:: + + loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i)) + - y_true_i + exp(raw_pred_i)) + + This actually computes half the Poisson deviance to simplify + the computation of the gradients. + """ + + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) + + inverse_link_function = staticmethod(np.exp) + + def pointwise_loss(self, y_true, raw_predictions): + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + # TODO: For speed, we could remove the constant xlogy(y_true, y_true) + # Advantage of this form: minimum of zero at raw_predictions = y_true. + loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1) + + np.exp(raw_predictions)) + return loss + + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + y_pred = np.average(y_train, weights=sample_weight) + eps = np.finfo(y_train.dtype).eps + y_pred = np.clip(y_pred, eps, None) + return np.log(y_pred) + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions, sample_weight): + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + gradients = gradients.reshape(-1) + hessians = hessians.reshape(-1) + _update_gradients_hessians_poisson(gradients, hessians, + y_true, raw_predictions, + sample_weight) + + class BinaryCrossEntropy(BaseLoss): """Binary cross-entropy loss, for binary classification. @@ -372,5 +421,6 @@ def predict_proba(self, raw_predictions): 'least_squares': LeastSquares, 'least_absolute_deviation': LeastAbsoluteDeviation, 'binary_crossentropy': BinaryCrossEntropy, - 'categorical_crossentropy': CategoricalCrossEntropy + 'categorical_crossentropy': CategoricalCrossEntropy, + 'poisson': Poisson, } diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 43405551ef357..984cc6767facf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -27,6 +27,8 @@ from .common cimport hist_struct from .common import HISTOGRAM_DTYPE from .common cimport MonotonicConstraint +np.import_array() + cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in the @@ -809,7 +811,7 @@ cpdef inline Y_DTYPE_C compute_node_value( """ cdef: - Y_DTYPE_C value + Y_DTYPE_C value value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 1b61e65793422..dfed16dafca39 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -2,16 +2,21 @@ import pytest from numpy.testing import assert_allclose, assert_array_equal from sklearn.datasets import make_classification, make_regression +from sklearn.datasets import make_low_rank_matrix from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.base import clone, BaseEstimator, TransformerMixin from sklearn.pipeline import make_pipeline +from sklearn.metrics import mean_poisson_deviance +from sklearn.dummy import DummyRegressor # To use this experimental 
feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares +from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle @@ -192,6 +197,45 @@ def test_least_absolute_deviation(): assert gbdt.score(X, y) > .9 +@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])]) +def test_poisson_y_positive(y): + # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0. + err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0." + gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0) + with pytest.raises(ValueError, match=err_msg): + gbdt.fit(np.zeros(shape=(len(y), 1)), y) + + +def test_poisson(): + # For Poisson distributed target, Poisson loss should give better results + # than least squares measured in Poisson deviance as metric. + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 100, 100 + X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features, + random_state=rng) + # We create a log-linear Poisson model and downscale coef as it will get + # exponentiated. + coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, + random_state=rng) + gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng) + gbdt_ls = HistGradientBoostingRegressor(loss='least_squares', + random_state=rng) + gbdt_pois.fit(X_train, y_train) + gbdt_ls.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y in [(X_train, y_train), (X_test, y_test)]: + metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X)) + # least_squares might produce non-positive predictions => clip + metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15, + None)) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + assert metric_pois < metric_ls + assert metric_pois < metric_dummy + + def test_binning_train_validation_are_separated(): # Make sure training and validation data are binned separately. 
# See issue 13926 @@ -681,3 +725,22 @@ def test_single_node_trees(Est): for predictor in est._predictors) # Still gives correct predictions thanks to the baseline prediction assert_allclose(est.predict(X), y) + + +@pytest.mark.parametrize('Est, loss, X, y', [ + ( + HistGradientBoostingClassifier, + BinaryCrossEntropy(sample_weight=None), + X_classification, + y_classification + ), + ( + HistGradientBoostingRegressor, + LeastSquares(sample_weight=None), + X_regression, + y_regression + ) +]) +def test_custom_loss(Est, loss, X, y): + est = Est(loss=loss, max_iter=20) + est.fit(X, y) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 915dc300e4760..c3f6ded7be39a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -9,6 +9,7 @@ from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.utils._testing import skip_if_32bit def get_derivatives_helper(loss): @@ -52,11 +53,13 @@ def get_hessians(y_true, raw_predictions): # ('binary_crossentropy', 0.3, 0), ('binary_crossentropy', -12, 1), ('binary_crossentropy', 30, 1), + ('poisson', 12., 1.), + ('poisson', 0., 2.), + ('poisson', -22., 10.), ]) @pytest.mark.skipif(sp_version == (1, 2, 0), reason='bug in scipy 1.2.0, see scipy issue #9608') -@pytest.mark.skipif(Y_DTYPE != np.float64, - reason='Newton internally uses float64 != Y_DTYPE') +@skip_if_32bit def test_derivatives(loss, x0, y_true): # Check that gradients are zero when the loss is minimized on 1D array # using Halley's method with the first and second order derivatives @@ -76,10 +79,11 @@ def fprime(x): def fprime2(x): return get_hessians(y_true, x) - optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) + optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2, + maxiter=70, tol=2e-8) assert np.allclose(loss.inverse_link_function(optimum), y_true) assert np.allclose(loss.pointwise_loss(y_true, optimum), 0) - assert np.allclose(get_gradients(y_true, optimum), 0) + assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7) @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ @@ -87,6 +91,7 @@ def fprime2(x): ('least_absolute_deviation', 0, 1), ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), + ('poisson', 0, 1), ]) @pytest.mark.skipif(Y_DTYPE != np.float64, reason='Need 64 bits float precision for numerical checks') @@ -100,6 +105,8 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): n_samples = 100 if loss in ('least_squares', 'least_absolute_deviation'): y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + elif loss in ('poisson'): + y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) raw_predictions = rng.normal( @@ -114,7 +121,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): # Approximate gradients # For multiclass loss, we should only change the predictions of one tree - # (here the first), hence the use of offset[:, 0] += eps + # (here the first), hence the use of offset[0, :] += eps # As a softmax is computed, offsetting the whole array by a constant would # have no effect on the probabilities, and thus on the loss eps = 1e-9 @@ -164,6 +171,27 @@ def 
test_baseline_least_absolute_deviation(): assert baseline_prediction == pytest.approx(np.median(y_train)) +def test_baseline_poisson(): + rng = np.random.RandomState(0) + + loss = _LOSSES['poisson'](sample_weight=None) + y_train = rng.poisson(size=100).astype(np.float64) + # Sanity check, make sure at least one sample is non-zero so we don't take + # log(0) + assert y_train.sum() > 0 + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) + assert np.isscalar(baseline_prediction) + assert baseline_prediction.dtype == y_train.dtype + assert_all_finite(baseline_prediction) + # Make sure baseline prediction produces the log of the mean of all targets + assert_almost_equal(np.log(y_train.mean()), baseline_prediction) + + # Test baseline for y_true = 0 + y_train.fill(0.) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) + assert_all_finite(baseline_prediction) + + def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) @@ -215,7 +243,8 @@ def test_baseline_categorical_crossentropy(): ('least_squares', 'regression'), ('least_absolute_deviation', 'regression'), ('binary_crossentropy', 'classification'), - ('categorical_crossentropy', 'classification') + ('categorical_crossentropy', 'classification'), + ('poisson', 'poisson_regression'), ]) @pytest.mark.parametrize('sample_weight', ['ones', 'random']) def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): @@ -232,6 +261,8 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): if problem == 'regression': y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + elif problem == 'poisson_regression': + y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 7df1e616445fc..3c837844f29e3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -1,5 +1,5 @@ import numpy as np -from sklearn.datasets import load_boston +from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.metrics import r2_score import pytest @@ -12,8 +12,9 @@ @pytest.mark.parametrize('n_bins', [200, 256]) -def test_boston_dataset(n_bins): - X, y = load_boston(return_X_y=True) +def test_regression_dataset(n_bins): + X, y = make_regression(n_samples=500, n_features=10, n_informative=5, + random_state=42) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42) @@ -24,8 +25,8 @@ def test_boston_dataset(n_bins): gradients = -y_train.astype(G_H_DTYPE) hessians = np.ones(1, dtype=G_H_DTYPE) - min_samples_leaf = 8 - max_leaf_nodes = 31 + min_samples_leaf = 10 + max_leaf_nodes = 30 grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes, n_bins=n_bins, @@ -34,8 +35,8 @@ def test_boston_dataset(n_bins): predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) - assert r2_score(y_train, predictor.predict(X_train)) > 0.85 - assert r2_score(y_test, predictor.predict(X_test)) > 0.70 + assert r2_score(y_train, predictor.predict(X_train)) > 0.82 + assert r2_score(y_test, predictor.predict(X_test)) > 0.67 @pytest.mark.parametrize('threshold, expected_predictions', [ diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 
9cec1c08efc9e..0c1bec9ebfb65 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -146,6 +146,8 @@ class IsolationForest(OutlierMixin, BaseBagging): is defined in such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + .. versionadded:: 0.20 + estimators_features_ : list of arrays The subset of drawn features for each base estimator. @@ -391,6 +393,7 @@ def score_samples(self, X): The lower, the more abnormal. """ # code structure from ForestClassifier/predict_proba + check_is_fitted(self) # Check data diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index a75e9236f1612..73aa55c0575a7 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..utils._estimator_html_repr import _VisualBlock from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble @@ -233,6 +234,14 @@ def predict(self, X, **predict_params): self.transform(X), **predict_params ) + def _sk_visual_block_(self, final_estimator): + names, estimators = zip(*self.estimators) + parallel = _VisualBlock('parallel', estimators, names=names, + dash_wrapped=False) + serial = _VisualBlock('serial', (parallel, final_estimator), + dash_wrapped=False) + return _VisualBlock('serial', [serial]) + class StackingClassifier(ClassifierMixin, _BaseStacking): """Stack of estimators with a final classifier. @@ -496,6 +505,15 @@ def transform(self, X): """ return self._transform(X) + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = LogisticRegression() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_(final_estimator) + class StackingRegressor(RegressorMixin, _BaseStacking): """Stack of estimators with a final regressor. @@ -665,3 +683,12 @@ def transform(self, X): Prediction outputs for each estimator. """ return self._transform(X) + + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = RidgeCV() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_(final_estimator) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index cab321702c85d..6a2b5736d8b4e 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -32,6 +32,7 @@ from ..utils.validation import column_or_1d from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError +from ..utils._estimator_html_repr import _VisualBlock class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): @@ -104,6 +105,10 @@ def n_features_in_(self): return self.estimators_[0].n_features_in_ + def _sk_visual_block_(self): + names, estimators = zip(*self.estimators) + return _VisualBlock('parallel', estimators, names=names) + class VotingClassifier(ClassifierMixin, _BaseVoting): """Soft Voting/Majority Rule classifier for unfitted estimators. @@ -141,6 +146,8 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. 
versionadded:: 0.18 + flatten_transform : bool, default=True Affects shape of transform output only when voting='soft' If voting='soft' and flatten_transform=True, transform method returns @@ -161,7 +168,6 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): named_estimators_ : :class:`~sklearn.utils.Bunch` Attribute to access any fitted sub-estimators by name. - .. versionadded:: 0.20 classes_ : array-like of shape (n_predictions,) @@ -233,6 +239,8 @@ def fit(self, X, y, sample_weight=None): Note that this is supported only if all underlying estimators support sample weights. + .. versionadded:: 0.18 + Returns ------- self : object @@ -350,8 +358,8 @@ class VotingRegressor(RegressorMixin, _BaseVoting): .. versionadded:: 0.21 - A voting regressor is an ensemble meta-estimator that fits base - regressors each on the whole dataset. It, then, averages the individual + A voting regressor is an ensemble meta-estimator that fits several base + regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction. Read more in the :ref:`User Guide `. diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 883f0067f5e78..3e8401332aeef 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -29,7 +29,7 @@ from sklearn.pipeline import make_pipeline from sklearn.feature_selection import SelectKBest from sklearn.model_selection import train_test_split -from sklearn.datasets import load_boston, load_iris, make_hastie_10_2 +from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 from sklearn.utils import check_random_state from sklearn.preprocessing import FunctionTransformer @@ -44,12 +44,12 @@ iris.data = iris.data[perm] iris.target = iris.target[perm] -# also load the boston dataset +# also load the diabetes dataset # and randomly permute it -boston = load_boston() -perm = rng.permutation(boston.target.size) -boston.data = boston.data[perm] -boston.target = boston.target[perm] +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] # TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates @@ -140,8 +140,8 @@ def fit(self, X, y): def test_regression(): # Check regression for various parameter settings. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "max_features": [0.5, 1.0], @@ -162,8 +162,8 @@ def test_regression(): def test_sparse_regression(): # Check regression for various parameter settings on sparse input. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) class CustomSVR(SVR): @@ -229,8 +229,8 @@ def fit(self, X, y): def test_bootstrap_samples(): # Test that bootstrapping samples generate non-perfect base estimators. 
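# --- A minimal illustrative sketch (not from the patch): with bootstrap=True
# each base estimator is fit on a resample drawn with replacement, so some
# rows repeat and roughly a third are left out; a single tree therefore
# cannot memorise the full training set, which is the property the bagging
# tests around here rely on. Dataset and parameters are assumptions chosen
# for brevity.
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

X_demo, y_demo = load_diabetes(return_X_y=True)
bag = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=1,
                       bootstrap=True, random_state=0).fit(X_demo, y_demo)
drawn = bag.estimators_samples_[0]          # indices seen by the single tree
assert len(np.unique(drawn)) < len(drawn)   # duplicates => imperfect fit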
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) base_estimator = DecisionTreeRegressor().fit(X_train, y_train) @@ -268,8 +268,8 @@ def test_bootstrap_samples(): def test_bootstrap_features(): # Test that bootstrapping features may generate duplicate features. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(), @@ -278,7 +278,7 @@ def test_bootstrap_features(): random_state=rng).fit(X_train, y_train) for features in ensemble.estimators_features_: - assert boston.data.shape[1] == np.unique(features).shape[0] + assert diabetes.data.shape[1] == np.unique(features).shape[0] ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(), max_features=1.0, @@ -286,7 +286,7 @@ def test_bootstrap_features(): random_state=rng).fit(X_train, y_train) for features in ensemble.estimators_features_: - assert boston.data.shape[1] > np.unique(features).shape[0] + assert diabetes.data.shape[1] > np.unique(features).shape[0] def test_probability(): @@ -355,8 +355,8 @@ def test_oob_score_regression(): # Check that oob prediction is a good estimation of the generalization # error. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(), @@ -383,8 +383,8 @@ def test_oob_score_regression(): def test_single_estimator(): # Check singleton ensembles. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(), @@ -488,8 +488,8 @@ def test_parallel_regression(): # Check parallel regression. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(DecisionTreeRegressor(), @@ -553,8 +553,8 @@ def test_base_estimator(): assert isinstance(ensemble.base_estimator_, Perceptron) # Regression - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(None, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 8144a095cec3a..775ed851d5a53 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -163,17 +163,17 @@ def check_boston_criterion(name, criterion): # Check consistency on dataset boston house prices. 
ForestRegressor = FOREST_REGRESSORS[name] - clf = ForestRegressor(n_estimators=5, criterion=criterion, + reg = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1) - clf.fit(boston.data, boston.target) - score = clf.score(boston.data, boston.target) + reg.fit(boston.data, boston.target) + score = reg.score(boston.data, boston.target) assert score > 0.94, ("Failed with max_features=None, criterion %s " "and score = %f" % (criterion, score)) - clf = ForestRegressor(n_estimators=5, criterion=criterion, + reg = ForestRegressor(n_estimators=5, criterion=criterion, max_features=6, random_state=1) - clf.fit(boston.data, boston.target) - score = clf.score(boston.data, boston.target) + reg.fit(boston.data, boston.target) + score = reg.score(boston.data, boston.target) assert score > 0.95, ("Failed with max_features=6, criterion %s " "and score = %f" % (criterion, score)) @@ -682,10 +682,10 @@ def test_distribution(): y = rng.rand(1000) n_trees = 500 - clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) + reg = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) uniques = defaultdict(int) - for tree in clf.estimators_: + for tree in reg.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) @@ -713,10 +713,10 @@ def test_distribution(): X[:, 1] = np.random.randint(0, 3, 1000) y = rng.rand(1000) - clf = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) + reg = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) uniques = defaultdict(int) - for tree in clf.estimators_: + for tree in reg.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) @@ -1065,25 +1065,25 @@ def check_warm_start(name, random_state=42): # right size and the same results as a normal fit. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf_ws = None + est_ws = None for n_estimators in [5, 10]: - if clf_ws is None: - clf_ws = ForestEstimator(n_estimators=n_estimators, + if est_ws is None: + est_ws = ForestEstimator(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: - clf_ws.set_params(n_estimators=n_estimators) - clf_ws.fit(X, y) - assert len(clf_ws) == n_estimators + est_ws.set_params(n_estimators=n_estimators) + est_ws.fit(X, y) + assert len(est_ws) == n_estimators - clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, + est_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, warm_start=False) - clf_no_ws.fit(X, y) + est_no_ws.fit(X, y) - assert (set([tree.random_state for tree in clf_ws]) == - set([tree.random_state for tree in clf_no_ws])) + assert (set([tree.random_state for tree in est_ws]) == + set([tree.random_state for tree in est_no_ws])) - assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X), + assert_array_equal(est_ws.apply(X), est_no_ws.apply(X), err_msg="Failed with {0}".format(name)) @@ -1096,17 +1096,17 @@ def check_warm_start_clear(name): # Test if fit clears state and grows a new forest when warm_start==False. 
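# --- A minimal illustrative sketch (not from the patch): warm_start=True
# keeps the already grown trees and only adds the missing ones on the next
# fit, while warm_start=False discards them and grows a fresh forest; that is
# the behaviour the warm-start checks here compare. Toy data below is an
# assumption for brevity.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng_demo = np.random.RandomState(0)
X_demo = rng_demo.rand(100, 4)
y_demo = rng_demo.randint(0, 2, 100)

forest = RandomForestClassifier(n_estimators=5, warm_start=True,
                                random_state=0).fit(X_demo, y_demo)
first_trees = list(forest.estimators_)
forest.set_params(n_estimators=10).fit(X_demo, y_demo)
assert forest.estimators_[:5] == first_trees   # old trees reused as-is
assert len(forest.estimators_) == 10           # five new trees appended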
X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True, + est_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True, random_state=2) - clf_2.fit(X, y) # inits state - clf_2.set_params(warm_start=False, random_state=1) - clf_2.fit(X, y) # clears old state and equals clf + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False, random_state=1) + est_2.fit(X, y) # clears old state and equals est - assert_array_almost_equal(clf_2.apply(X), clf.apply(X)) + assert_array_almost_equal(est_2.apply(X), est.apply(X)) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1118,10 +1118,10 @@ def check_warm_start_smaller_n_estimators(name): # Test if warm start second fit with smaller n_estimators raises error. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) - clf.fit(X, y) - clf.set_params(n_estimators=4) - assert_raises(ValueError, clf.fit, X, y) + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=4) + assert_raises(ValueError, est.fit, X, y) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1134,20 +1134,20 @@ def check_warm_start_equal_n_estimators(name): # same forest and raises a warning. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, + est = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, + est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) - clf_2.fit(X, y) - # Now clf_2 equals clf. + est_2.fit(X, y) + # Now est_2 equals est. - clf_2.set_params(random_state=2) - assert_warns(UserWarning, clf_2.fit, X, y) + est_2.set_params(random_state=2) + assert_warns(UserWarning, est_2.fit, X, y) # If we had fit the trees again we would have got a different forest as we # changed the random state. - assert_array_equal(clf.apply(X), clf_2.apply(X)) + assert_array_equal(est.apply(X), est_2.apply(X)) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1160,31 +1160,31 @@ def check_warm_start_oob(name): X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. - clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, + est = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=True) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, + est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=False) - clf_2.fit(X, y) + est_2.fit(X, y) - clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15) - clf_2.fit(X, y) + est_2.set_params(warm_start=True, oob_score=True, n_estimators=15) + est_2.fit(X, y) - assert hasattr(clf_2, 'oob_score_') - assert clf.oob_score_ == clf_2.oob_score_ + assert hasattr(est_2, 'oob_score_') + assert est.oob_score_ == est_2.oob_score_ # Test that oob_score is computed even if we don't need to train # additional trees. 
- clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, + est_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, random_state=1, bootstrap=True, oob_score=False) - clf_3.fit(X, y) - assert not hasattr(clf_3, 'oob_score_') + est_3.fit(X, y) + assert not hasattr(est_3, 'oob_score_') - clf_3.set_params(oob_score=True) - ignore_warnings(clf_3.fit)(X, y) + est_3.set_params(oob_score=True) + ignore_warnings(est_3.fit)(X, y) - assert clf.oob_score_ == clf_3.oob_score_ + assert est.oob_score_ == est_3.oob_score_ @pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 3593bc0422ff7..aeb384ab44503 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -21,7 +21,7 @@ from sklearn.ensemble import IsolationForest from sklearn.ensemble._iforest import _average_path_length from sklearn.model_selection import train_test_split -from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import load_diabetes, load_iris from sklearn.utils import check_random_state from sklearn.metrics import roc_auc_score @@ -37,12 +37,12 @@ iris.data = iris.data[perm] iris.target = iris.target[perm] -# also load the boston dataset +# also load the diabetes dataset # and randomly permute it -boston = load_boston() -perm = rng.permutation(boston.target.size) -boston.data = boston.data[perm] -boston.target = boston.target[perm] +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] def test_iforest(): @@ -63,8 +63,8 @@ def test_iforest(): def test_iforest_sparse(): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]}) @@ -157,8 +157,8 @@ def test_iforest_parallel_regression(): """Check parallel regression.""" rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = IsolationForest(n_jobs=3, @@ -226,8 +226,8 @@ def test_max_samples_consistency(): def test_iforest_subsampled_features(): # It tests non-regression for #5732 which failed at predict. 
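# --- A minimal illustrative sketch (not from the patch): two behaviours
# exercised nearby, shown on toy data (assumed here for brevity).
# score_samples on an unfitted IsolationForest now raises NotFittedError,
# thanks to the check_is_fitted call added earlier in this diff, and predict
# keeps working when only a subset of features is drawn per tree
# (max_features < 1.0).
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.exceptions import NotFittedError

rng_demo = np.random.RandomState(0)
X_demo = rng_demo.randn(50, 5)

try:
    IsolationForest().score_samples(X_demo)
except NotFittedError:
    pass  # expected: the estimator must be fitted first

iso = IsolationForest(max_features=0.8, random_state=0).fit(X_demo)
assert iso.predict(X_demo).shape == (50,)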
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) clf = IsolationForest(max_features=0.8) clf.fit(X_train, y_train) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 4eb47bea0a514..f81b9e59a5f1b 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -33,7 +33,7 @@ iris = datasets.load_iris() X, y = iris.data[:, 1:3], iris.target -X_r, y_r = datasets.load_boston(return_X_y=True) +X_r, y_r = datasets.load_diabetes(return_X_y=True) @pytest.mark.parametrize( @@ -120,7 +120,7 @@ def test_weights_iris(): def test_weights_regressor(): - """Check weighted average regression prediction on boston dataset.""" + """Check weighted average regression prediction on diabetes dataset.""" reg1 = DummyRegressor(strategy='mean') reg2 = DummyRegressor(strategy='median') reg3 = DummyRegressor(strategy='quantile', quantile=.2) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index ea34365afa703..7140b98e53027 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -49,24 +49,6 @@ class ChangedBehaviorWarning(UserWarning): class ConvergenceWarning(UserWarning): """Custom warning to capture convergence problems - Examples - -------- - - >>> import numpy as np - >>> import warnings - >>> from sklearn.cluster import KMeans - >>> from sklearn.exceptions import ConvergenceWarning - >>> warnings.simplefilter("always", ConvergenceWarning) - >>> X = np.asarray([[0, 0], - ... [0, 1], - ... [1, 0], - ... [1, 0]]) # last point is duplicated - >>> with warnings.catch_warnings(record=True) as w: - ... km = KMeans(n_clusters=4).fit(X) - ... print(w[-1].message) - Number of distinct clusters (3) found smaller than n_clusters (4). - Possibly due to duplicate points in X. - .. versionchanged:: 0.18 Moved from sklearn.utils. """ @@ -122,27 +104,6 @@ class FitFailedWarning(RuntimeWarning): and the cross-validation helper function cross_val_score to warn when there is an error while fitting the estimator. - Examples - -------- - >>> from sklearn.model_selection import GridSearchCV - >>> from sklearn.svm import LinearSVC - >>> from sklearn.exceptions import FitFailedWarning - >>> import warnings - >>> warnings.simplefilter('always', FitFailedWarning) - >>> gs = GridSearchCV(LinearSVC(), {'C': [-1, -2]}, error_score=0, cv=2) - >>> X, y = [[1, 2], [3, 4], [5, 6], [7, 8]], [0, 0, 1, 1] - >>> with warnings.catch_warnings(record=True) as w: - ... try: - ... gs.fit(X, y) # This will raise a ValueError since C is < 0 - ... except ValueError: - ... pass - ... print(repr(w[-1].message)) - FitFailedWarning('Estimator fit failed. The score on this train-test - partition for these parameters will be set to 0.000000. - Details:...Traceback (most recent call last):...ValueError: - Penalty term must be positive; got (C=-2)... - - .. versionchanged:: 0.18 Moved from sklearn.cross_validation. """ diff --git a/sklearn/externals/_scipy_linalg.py b/sklearn/externals/_scipy_linalg.py deleted file mode 100644 index 70a6ff5a0c623..0000000000000 --- a/sklearn/externals/_scipy_linalg.py +++ /dev/null @@ -1,118 +0,0 @@ -# This should remained pinned to version 1.2 and not updated like other -# externals. -"""Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -import numpy as np -import scipy.linalg.decomp as decomp - - -def pinvh(a, cond=None, rcond=None, lower=True, return_rank=False, - check_finite=True): - """ - Compute the (Moore-Penrose) pseudo-inverse of a Hermitian matrix. - - Copied in from scipy==1.2.2, in order to preserve the default choice of the - `cond` and `above_cutoff` values which determine which values of the matrix - inversion lie below threshold and are so set to zero. Changes in scipy 1.3 - resulted in a smaller default threshold and thus slower convergence of - dependent algorithms in some cases (see Sklearn github issue #14055). - - Calculate a generalized inverse of a Hermitian or real symmetric matrix - using its eigenvalue decomposition and including all eigenvalues with - 'large' absolute value. - - Parameters - ---------- - a : (N, N) array_like - Real symmetric or complex hermetian matrix to be pseudo-inverted - cond, rcond : float or None - Cutoff for 'small' eigenvalues. - Singular values smaller than rcond * largest_eigenvalue are considered - zero. - - If None or -1, suitable machine precision is used. - lower : bool, optional - Whether the pertinent array data is taken from the lower or upper - triangle of a. (Default: lower) - return_rank : bool, optional - if True, return the effective rank of the matrix - check_finite : bool, optional - Whether to check that the input matrix contains only finite numbers. - Disabling may give a performance gain, but may result in problems - (crashes, non-termination) if the inputs do contain infinities or NaNs. - - Returns - ------- - B : (N, N) ndarray - The pseudo-inverse of matrix `a`. - rank : int - The effective rank of the matrix. 
Returned if return_rank == True - - Raises - ------ - LinAlgError - If eigenvalue does not converge - - Examples - -------- - >>> from scipy.linalg import pinvh - >>> a = np.random.randn(9, 6) - >>> a = np.dot(a, a.T) - >>> B = pinvh(a) - >>> np.allclose(a, np.dot(a, np.dot(B, a))) - True - >>> np.allclose(B, np.dot(B, np.dot(a, B))) - True - - """ - a = decomp._asarray_validated(a, check_finite=check_finite) - s, u = decomp.eigh(a, lower=lower, check_finite=False) - - if rcond is not None: - cond = rcond - if cond in [None, -1]: - t = u.dtype.char.lower() - factor = {'f': 1E3, 'd': 1E6} - cond = factor[t] * np.finfo(t).eps - - # For Hermitian matrices, singular values equal abs(eigenvalues) - above_cutoff = (abs(s) > cond * np.max(abs(s))) - psigma_diag = 1.0 / s[above_cutoff] - u = u[:, above_cutoff] - - B = np.dot(u * psigma_diag, np.conjugate(u).T) - - if return_rank: - return B, len(psigma_diag) - else: - return B diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index b527b0d72e6be..303e34d6f0ab9 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -11,6 +11,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence +from ..utils.validation import _deprecate_positional_args def _tosequence(X): @@ -89,8 +90,8 @@ class DictVectorizer(TransformerMixin, BaseEstimator): sklearn.preprocessing.OrdinalEncoder : handles nominal/categorical features encoded as columns of arbitrary data types. """ - - def __init__(self, dtype=np.float64, separator="=", sparse=True, + @_deprecate_positional_args + def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True): self.dtype = dtype self.separator = separator diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index f52e6f296169b..b9c2abaa25a72 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,6 +7,7 @@ import scipy.sparse as sp from ..utils import IS_PYPY +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator, TransformerMixin if not IS_PYPY: @@ -69,6 +70,10 @@ class FeatureHasher(TransformerMixin, BaseEstimator): approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. + .. versionchanged:: 0.19 + ``alternate_sign`` replaces the now deprecated ``non_negative`` + parameter. + Examples -------- >>> from sklearn.feature_extraction import FeatureHasher @@ -84,8 +89,8 @@ class FeatureHasher(TransformerMixin, BaseEstimator): DictVectorizer : vectorizes string-valued features using a hash table. sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features. 
""" - - def __init__(self, n_features=(2 ** 20), input_type="dict", + @_deprecate_positional_args + def __init__(self, n_features=(2 ** 20), *, input_type="dict", dtype=np.float64, alternate_sign=True): self._validate_params(n_features, input_type) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 588abf3fcf896..737f555bbccda 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,6 +16,7 @@ from numpy.lib.stride_tricks import as_strided from ..utils import check_array, check_random_state, deprecated +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator __all__ = ['PatchExtractor', @@ -519,8 +520,9 @@ class PatchExtractor(BaseEstimator): >>> print('Patches shape: {}'.format(pe_trans.shape)) Patches shape: (545706, 2, 2) """ - - def __init__(self, patch_size=None, max_patches=None, random_state=None): + @_deprecate_positional_args + def __init__(self, *, patch_size=None, max_patches=None, + random_state=None): self.patch_size = patch_size self.max_patches = max_patches self.random_state = random_state diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 9fa7a191ca279..c0cd50cef6e09 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -32,7 +32,7 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher(n_features, input_type="string", + h = FeatureHasher(n_features=n_features, input_type="string", alternate_sign=False) X = h.transform(it) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ebc584b6271a9..661f638b000fc 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -33,6 +33,7 @@ from ..utils import _IS_32BIT, deprecated from ..utils.fixes import _astype_copy_false from ..exceptions import NotFittedError +from ..utils.validation import _deprecate_positional_args __all__ = ['HashingVectorizer', @@ -677,8 +678,8 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): CountVectorizer, TfidfVectorizer """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", @@ -999,8 +1000,8 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): when pickling. This attribute is provided only for introspection and can be safely removed using delattr or set to None before pickling. """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", @@ -1374,6 +1375,8 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): The inverse document frequency (IDF) vector; only defined if ``use_idf`` is True. + .. versionadded:: 0.20 + Examples -------- >>> from sklearn.feature_extraction.text import TfidfTransformer @@ -1409,8 +1412,8 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): Introduction to Information Retrieval. Cambridge University Press, pp. 118-120. 
""" - - def __init__(self, norm='l2', use_idf=True, smooth_idf=True, + @_deprecate_positional_args + def __init__(self, *, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False): self.norm = norm self.use_idf = use_idf @@ -1715,8 +1718,8 @@ class TfidfVectorizer(CountVectorizer): >>> print(X.shape) (4, 9) """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 7e7aada0d70b3..8dc7aecb7dc3e 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -340,7 +340,9 @@ def predict_log_proba(self, X): def _more_tags(self): estimator_tags = self.estimator._get_tags() return {'poor_score': True, - 'allow_nan': estimator_tags.get('allow_nan', True)} + 'allow_nan': estimator_tags.get('allow_nan', True), + 'requires_y': True, + } class RFECV(RFE): @@ -372,6 +374,8 @@ class RFECV(RFE): feature count and ``min_features_to_select`` isn't divisible by ``step``. + .. versionadded:: 0.20 + cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -406,6 +410,8 @@ class RFECV(RFE): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionadded:: 0.18 + Attributes ---------- n_features_ : int @@ -495,6 +501,8 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). + + .. versionadded:: 0.20 """ tags = self._get_tags() X, y = self._validate_data( @@ -504,7 +512,7 @@ def fit(self, X, y, groups=None): ) # Initialization - cv = check_cv(self.cv, y, is_classifier(self.estimator)) + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) scorer = check_scoring(self.estimator, scoring=self.scoring) n_features = X.shape[1] diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 7ca0ce4a36715..6911830099844 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -146,7 +146,7 @@ def f_classif(X, y): chi2: Chi-squared stats of non-negative features for classification tasks. f_regression: F-value between label/feature for regression tasks. """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo']) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo']) args = [X[safe_mask(X, y == k)] for k in np.unique(y)] return f_oneway(*args) @@ -277,7 +277,8 @@ def f_regression(X, y, center=True): SelectPercentile: Select features based on percentile of the highest scores. """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64) n_samples = X.shape[0] # compute centered values @@ -363,6 +364,9 @@ def fit(self, X, y): def _check_params(self, X, y): pass + def _more_tags(self): + return {'requires_y': True} + ###################################################################### # Specific filters @@ -380,6 +384,8 @@ class SelectPercentile(_BaseFilter): Default is f_classif (see below "See also"). 
The default function only works with classification tasks. + .. versionadded:: 0.18 + percentile : int, optional, default=10 Percent of features to keep. @@ -463,6 +469,8 @@ class SelectKBest(_BaseFilter): Default is f_classif (see below "See also"). The default function only works with classification tasks. + .. versionadded:: 0.18 + k : int or "all", optional, default=10 Number of top features to select. The "all" option bypasses selection, for use in a parameter search. diff --git a/sklearn/feature_selection/tests/test_base.py b/sklearn/feature_selection/tests/test_base.py index d1aaccde0efa3..9515bdc32c600 100644 --- a/sklearn/feature_selection/tests/test_base.py +++ b/sklearn/feature_selection/tests/test_base.py @@ -15,7 +15,7 @@ def __init__(self, step=2): self.step = step def fit(self, X, y=None): - X = check_array(X, 'csc') + X = check_array(X, accept_sparse='csc') self.n_input_feats = X.shape[1] return self diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index e70838c6d251a..2c9c0ef483d4f 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -19,6 +19,7 @@ from ..utils.optimize import _check_optimize_result from ..preprocessing import LabelEncoder from ..multiclass import OneVsRestClassifier, OneVsOneClassifier +from ..utils.validation import _deprecate_positional_args # Values required for approximating the logistic sigmoid by @@ -144,7 +145,8 @@ def optimizer(obj_func, initial_theta, bounds): The log-marginal-likelihood of ``self.kernel_.theta`` """ - def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + @_deprecate_positional_args + def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None): self.kernel = kernel @@ -586,7 +588,8 @@ def optimizer(obj_func, initial_theta, bounds): .. 
versionadded:: 0.18 """ - def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + @_deprecate_positional_args + def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None, multi_class="one_vs_rest", n_jobs=None): @@ -623,9 +626,13 @@ def fit(self, X, y): ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( - self.kernel, self.optimizer, self.n_restarts_optimizer, - self.max_iter_predict, self.warm_start, self.copy_X_train, - self.random_state) + kernel=self.kernel, + optimizer=self.optimizer, + n_restarts_optimizer=self.n_restarts_optimizer, + max_iter_predict=self.max_iter_predict, + warm_start=self.warm_start, + copy_X_train=self.copy_X_train, + random_state=self.random_state) self.classes_ = np.unique(y) self.n_classes_ = self.classes_.size diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index caf94ce41c1b4..0ba594a7ffaac 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -17,6 +17,7 @@ from ..utils import check_random_state from ..utils.validation import check_array from ..utils.optimize import _check_optimize_result +from ..utils.validation import _deprecate_positional_args class GaussianProcessRegressor(MultiOutputMixin, @@ -149,7 +150,8 @@ def optimizer(obj_func, initial_theta, bounds): (array([653.0..., 592.1...]), array([316.6..., 316.6...])) """ - def __init__(self, kernel=None, alpha=1e-10, + @_deprecate_positional_args + def __init__(self, kernel=None, *, alpha=1e-10, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None): self.kernel = kernel diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 608f8f54ee162..517de982d8478 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -14,6 +14,7 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils import is_scalar_nan @@ -67,7 +68,7 @@ class _BaseImputer(TransformerMixin, BaseEstimator): It adds automatically support for `add_indicator`. """ - def __init__(self, missing_values=np.nan, add_indicator=False): + def __init__(self, *, missing_values=np.nan, add_indicator=False): self.missing_values = missing_values self.add_indicator = add_indicator @@ -127,7 +128,9 @@ class SimpleImputer(_BaseImputer): ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. strategy : string, default='mean' The imputation strategy. @@ -205,7 +208,8 @@ class SimpleImputer(_BaseImputer): upon :meth:`transform` if strategy is not "constant". 
""" - def __init__(self, missing_values=np.nan, strategy="mean", + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False): super().__init__( missing_values=missing_values, @@ -474,8 +478,9 @@ class MissingIndicator(TransformerMixin, BaseEstimator): ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. features : str, default=None Whether the imputer mask should represent all or a subset of @@ -525,8 +530,8 @@ class MissingIndicator(TransformerMixin, BaseEstimator): [False, False]]) """ - - def __init__(self, missing_values=np.nan, features="missing-only", + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 88eff8503d510..8f80c9723eac3 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -54,7 +54,9 @@ class IterativeImputer(_BaseImputer): missing_values : int, np.nan, default=np.nan The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the @@ -206,9 +208,8 @@ class IterativeImputer(_BaseImputer): Journal of the Royal Statistical Society 22(2): 302-306. `_ """ - def __init__(self, - estimator=None, + estimator=None, *, missing_values=np.nan, sample_posterior=False, max_iter=10, diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index f782a46a6b40d..80a6423bdef79 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -14,6 +14,7 @@ from ..utils import is_scalar_nan from ..utils._mask import _get_mask from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args class KNNImputer(_BaseImputer): @@ -31,7 +32,9 @@ class KNNImputer(_BaseImputer): ---------- missing_values : number, string, np.nan or None, default=`np.nan` The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. n_neighbors : int, default=5 Number of neighboring samples to use for imputation. @@ -94,8 +97,8 @@ class KNNImputer(_BaseImputer): [5.5, 6. , 5. ], [8. , 8. , 7. 
]]) """ - - def __init__(self, missing_values=np.nan, n_neighbors=5, + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, n_neighbors=5, weights="uniform", metric="nan_euclidean", copy=True, add_indicator=False): super().__init__( diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index a8d2fd9d6b2f7..220a335c15285 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -84,3 +84,32 @@ def test_imputers_add_indicator_sparse(imputer, marker): imputer.set_params(add_indicator=False) X_trans_no_indicator = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", IMPUTERS) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, + missing_values=marker) + + X = np.array([ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4] + ]) + # fit on numpy array + X_trans_expected = imputer.fit_transform(X) + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"]) + + # fit on pandas dataframe with IntegerArrays + X_trans = imputer.fit_transform(X_df) + + assert_allclose(X_trans_expected, X_trans) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 50f60ff6e96ad..960f671915e6a 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -16,7 +16,7 @@ # make IterativeImputer available from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor @@ -48,14 +48,14 @@ def _check_statistics(X, X_true, assert_ae = assert_array_almost_equal # Normal matrix - imputer = SimpleImputer(missing_values, strategy=strategy) + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) X_trans = imputer.fit(X).transform(X.copy()) assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False)) assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) # Sparse matrix - imputer = SimpleImputer(missing_values, strategy=strategy) + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) imputer.fit(sparse.csc_matrix(X)) X_trans = imputer.transform(sparse.csc_matrix(X.copy())) @@ -947,7 +947,7 @@ def test_iterative_imputer_early_stopping(): def test_iterative_imputer_catch_warning(): # check that we catch a RuntimeWarning due to a division by zero when a # feature is constant in the dataset - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) n_samples, n_features = X.shape # simulate that a feature only contain one category during fit diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index f0fbc23333266..f3bb10a1a3275 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -20,6 
+20,7 @@ from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError @@ -181,7 +182,8 @@ def _partial_dependence_brute(est, grid, features, X, response_method): return averaged_predictions -def partial_dependence(estimator, X, features, response_method='auto', +@_deprecate_positional_args +def partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto'): """Partial dependence of ``features``. diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 8efafd8a7eef4..e8d77360a7ca0 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -7,6 +7,7 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_array +from ..utils.validation import _deprecate_positional_args def _calculate_permutation_scores(estimator, X, y, col_idx, random_state, @@ -37,7 +38,8 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state, return scores -def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, +@_deprecate_positional_args +def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None): """Permutation importance for feature evaluation [BRE]_. diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index f39c604cac77b..e02717e76dce3 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -13,9 +13,11 @@ from ...utils import check_array from ...utils import check_matplotlib_support # noqa from ...utils import _safe_indexing +from ...utils.validation import _deprecate_positional_args -def plot_partial_dependence(estimator, X, features, feature_names=None, +@_deprecate_positional_args +def plot_partial_dependence(estimator, X, features, *, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=None, verbose=0, fig=None, @@ -322,8 +324,12 @@ def convert_feature(fx): fig.clear() ax = fig.gca() - display = PartialDependenceDisplay(pd_results, features, feature_names, - target_idx, pdp_lim, deciles) + display = PartialDependenceDisplay(pd_results=pd_results, + features=features, + feature_names=feature_names, + target_idx=target_idx, + pdp_lim=pdp_lim, + deciles=deciles) return display.plot(ax=ax, n_cols=n_cols, line_kw=line_kw, contour_kw=contour_kw) @@ -337,7 +343,7 @@ class PartialDependenceDisplay: stored as attributes. Read more in - :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` + :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py` and the :ref:`User Guide `. .. versionadded:: 0.22 @@ -385,28 +391,44 @@ class PartialDependenceDisplay: axes_ : ndarray of matplotlib Axes If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item - in `ax`. Elements that are None corresponds to a nonexisting axes in + in `ax`. Elements that are None correspond to a nonexisting axes in that position. 
lines_ : ndarray of matplotlib Artists - If `ax` is an axes or None, `line_[i, j]` is the partial dependence + If `ax` is an axes or None, `lines_[i, j]` is the partial dependence curve on the i-th row and j-th column. If `ax` is a list of axes, `lines_[i]` is the partial dependence curve corresponding to the i-th - item in `ax`. Elements that are None corresponds to a nonexisting axes + item in `ax`. Elements that are None correspond to a nonexisting axes or an axes that does not include a line plot. + deciles_vlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the x axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a PDP plot. + .. versionadded:: 0.23 + deciles_hlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the y axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a 2-way plot. + .. versionadded:: 0.23 + contours_ : ndarray of matplotlib Artists If `ax` is an axes or None, `contours_[i, j]` is the partial dependence plot on the i-th row and j-th column. If `ax` is a list of axes, `contours_[i]` is the partial dependence plot corresponding to the i-th - item in `ax`. Elements that are None corresponds to a nonexisting axes + item in `ax`. Elements that are None correspond to a nonexisting axes or an axes that does not include a contour plot. figure_ : matplotlib Figure Figure containing partial dependence plots. 
""" - def __init__(self, pd_results, features, feature_names, target_idx, + @_deprecate_positional_args + def __init__(self, pd_results, *, features, feature_names, target_idx, pdp_lim, deciles): self.pd_results = pd_results self.features = features @@ -483,8 +505,6 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): n_rows = int(np.ceil(n_features / float(n_cols))) self.axes_ = np.empty((n_rows, n_cols), dtype=np.object) - self.lines_ = np.empty((n_rows, n_cols), dtype=np.object) - self.contours_ = np.empty((n_rows, n_cols), dtype=np.object) axes_ravel = self.axes_.ravel() @@ -507,14 +527,20 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): self.bounding_ax_ = None self.figure_ = ax.ravel()[0].figure self.axes_ = ax - self.lines_ = np.empty_like(ax, dtype=np.object) - self.contours_ = np.empty_like(ax, dtype=np.object) # create contour levels for two-way plots if 2 in self.pdp_lim: Z_level = np.linspace(*self.pdp_lim[2], num=8) + + self.lines_ = np.empty_like(self.axes_, dtype=np.object) + self.contours_ = np.empty_like(self.axes_, dtype=np.object) + self.deciles_vlines_ = np.empty_like(self.axes_, dtype=np.object) + self.deciles_hlines_ = np.empty_like(self.axes_, dtype=np.object) + # Create 1d views of these 2d arrays for easy indexing lines_ravel = self.lines_.ravel(order='C') contours_ravel = self.contours_.ravel(order='C') + vlines_ravel = self.deciles_vlines_.ravel(order='C') + hlines_ravel = self.deciles_hlines_.ravel(order='C') for i, axi, fx, (avg_preds, values) in zip(count(), self.axes_.ravel(), @@ -540,8 +566,8 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): trans = transforms.blended_transform_factory(axi.transData, axi.transAxes) ylim = axi.get_ylim() - axi.vlines(self.deciles[fx[0]], 0, 0.05, transform=trans, - color='k') + vlines_ravel[i] = axi.vlines(self.deciles[fx[0]], 0, 0.05, + transform=trans, color='k') axi.set_ylim(ylim) # Set xlabel if it is not already set @@ -559,8 +585,8 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): trans = transforms.blended_transform_factory(axi.transAxes, axi.transData) xlim = axi.get_xlim() - axi.hlines(self.deciles[fx[1]], 0, 0.05, transform=trans, - color='k') + hlines_ravel[i] = axi.hlines(self.deciles[fx[1]], 0, 0.05, + transform=trans, color='k') # hline erases xlim axi.set_ylabel(self.feature_names[fx[1]]) axi.set_xlim(xlim) diff --git a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py index abae91d4d2642..41da3f08c9094 100644 --- a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -51,11 +51,20 @@ def test_plot_partial_dependence(grid_resolution, pyplot, clf_boston, boston): assert disp.axes_.shape == (1, 3) assert disp.lines_.shape == (1, 3) assert disp.contours_.shape == (1, 3) + assert disp.deciles_vlines_.shape == (1, 3) + assert disp.deciles_hlines_.shape == (1, 3) assert disp.lines_[0, 2] is None assert disp.contours_[0, 0] is None assert disp.contours_[0, 1] is None + # deciles lines: always show on xaxis, only show on yaxis if 2-way PDP + for i in range(3): + assert disp.deciles_vlines_[0, i] is not None + assert disp.deciles_hlines_[0, 0] is None + assert disp.deciles_hlines_[0, 1] is None + assert disp.deciles_hlines_[0, 2] is not None + assert disp.features == [(0, ), (1, ), (0, 1)] assert np.all(disp.feature_names == feature_names) assert len(disp.deciles) == 2 @@ -132,9 +141,15 @@ 
def test_plot_partial_dependence_str_features(pyplot, clf_boston, boston, assert disp.axes_.shape == (2, 1) assert disp.lines_.shape == (2, 1) assert disp.contours_.shape == (2, 1) + assert disp.deciles_vlines_.shape == (2, 1) + assert disp.deciles_hlines_.shape == (2, 1) assert disp.lines_[0, 0] is None + assert disp.deciles_vlines_[0, 0] is not None + assert disp.deciles_hlines_[0, 0] is not None assert disp.contours_[1, 0] is None + assert disp.deciles_hlines_[1, 0] is None + assert disp.deciles_vlines_[1, 0] is not None # line ax = disp.axes_[1, 0] @@ -309,6 +324,8 @@ def test_plot_partial_dependence_multiclass(pyplot): assert disp_target_0.axes_.shape == (1, 2) assert disp_target_0.lines_.shape == (1, 2) assert disp_target_0.contours_.shape == (1, 2) + assert disp_target_0.deciles_vlines_.shape == (1, 2) + assert disp_target_0.deciles_hlines_.shape == (1, 2) assert all(c is None for c in disp_target_0.contours_.flat) assert disp_target_0.target_idx == 0 @@ -323,6 +340,8 @@ def test_plot_partial_dependence_multiclass(pyplot): assert disp_symbol.axes_.shape == (1, 2) assert disp_symbol.lines_.shape == (1, 2) assert disp_symbol.contours_.shape == (1, 2) + assert disp_symbol.deciles_vlines_.shape == (1, 2) + assert disp_symbol.deciles_hlines_.shape == (1, 2) assert all(c is None for c in disp_symbol.contours_.flat) assert disp_symbol.target_idx == 0 diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index c13638b2fc0c7..2b381e9a20b1a 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -4,7 +4,7 @@ from numpy.testing import assert_allclose from sklearn.compose import ColumnTransformer -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.datasets import load_iris from sklearn.datasets import make_classification from sklearn.datasets import make_regression @@ -33,7 +33,7 @@ def test_permutation_importance_correlated_feature_regression(n_jobs): rng = np.random.RandomState(42) n_repeats = 5 - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) y_with_little_noise = ( y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1) diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index d08c706caefc4..f11501960b29c 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -10,6 +10,7 @@ from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel from .utils.validation import check_is_fitted, _check_sample_weight +from .utils.validation import _deprecate_positional_args class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): @@ -113,8 +114,9 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> clf.fit(X, y) KernelRidge(alpha=1.0) """ - def __init__(self, alpha=1, kernel="linear", gamma=None, degree=3, coef0=1, - kernel_params=None): + @_deprecate_positional_args + def __init__(self, alpha=1, *, kernel="linear", gamma=None, degree=3, + coef0=1, kernel_params=None): self.alpha = alpha self.kernel = kernel self.gamma = gamma diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 8e91767b9ff53..c1f6b8233bdac 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ MultiOutputMixin) from ..utils import check_array from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args 
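# --- A minimal illustrative sketch (not from the patch): the recurring
# pattern in this diff makes constructor arguments after `*` keyword-only,
# with the private _deprecate_positional_args decorator keeping positional
# calls working for now while emitting a deprecation warning (presumably
# until they become errors in a later release). Explicit keywords are the
# form that will keep working:
from sklearn.linear_model import LinearRegression

# LinearRegression(True)                      # positional: deprecated
reg = LinearRegression(fit_intercept=True)    # keyword-only: preferred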
from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale @@ -245,6 +246,9 @@ def _set_intercept(self, X_offset, y_offset, X_scale): else: self.intercept_ = 0. + def _more_tags(self): + return {'requires_y': True} + # XXX Should this derive from LinearModel? It should be a mixin, not an ABC. # Maybe the n_features checking can be moved to LinearModel. @@ -466,8 +470,8 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): >>> reg.predict(np.array([[3, 5]])) array([16.]) """ - - def __init__(self, fit_intercept=True, normalize=False, copy_X=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, normalize=False, copy_X=True, n_jobs=None): self.fit_intercept = fit_intercept self.normalize = normalize diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 397461e73d8be..c69ebc1ce4307 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -12,8 +12,9 @@ from ._base import LinearModel, _rescale_data from ..base import RegressorMixin from ..utils.extmath import fast_logdet -from ..utils.fixes import pinvh +from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -145,8 +146,8 @@ class BayesianRidge(RegressorMixin, LinearModel): M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, Journal of Machine Learning Research, Vol. 1, 2001. """ - - def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + @_deprecate_positional_args + def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None, lambda_init=None, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False): @@ -489,8 +490,8 @@ class ARDRegression(RegressorMixin, LinearModel): which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are discarded. """ - - def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + @_deprecate_positional_args + def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, threshold_lambda=1.e+4, fit_intercept=True, normalize=False, copy_X=True, verbose=False): @@ -553,27 +554,16 @@ def fit(self, X, y): self.scores_ = list() coef_old_ = None - # Compute sigma and mu (using Woodbury matrix identity) - def update_sigma(X, alpha_, lambda_, keep_lambda, n_samples): - sigma_ = pinvh(np.eye(n_samples) / alpha_ + - np.dot(X[:, keep_lambda] * - np.reshape(1. / lambda_[keep_lambda], [1, -1]), - X[:, keep_lambda].T)) - sigma_ = np.dot(sigma_, X[:, keep_lambda] * - np.reshape(1. / lambda_[keep_lambda], [1, -1])) - sigma_ = - np.dot(np.reshape(1. / lambda_[keep_lambda], [-1, 1]) * - X[:, keep_lambda].T, sigma_) - sigma_.flat[::(sigma_.shape[1] + 1)] += 1. 
/ lambda_[keep_lambda] - return sigma_ - def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): coef_[keep_lambda] = alpha_ * np.dot( sigma_, np.dot(X[:, keep_lambda].T, y)) return coef_ + update_sigma = (self._update_sigma if n_samples >= n_features + else self._update_sigma_woodbury) # Iterative procedure of ARDRegression for iter_ in range(self.n_iter): - sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda, n_samples) + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) # Update alpha and lambda @@ -605,9 +595,15 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): break coef_old_ = np.copy(coef_) - # update sigma and mu using updated parameters from the last iteration - sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda, n_samples) - coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + if not keep_lambda.any(): + break + + if keep_lambda.any(): + # update sigma and mu using updated params from the last iteration + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) + coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + else: + sigma_ = np.array([]).reshape(0, 0) self.coef_ = coef_ self.alpha_ = alpha_ @@ -616,6 +612,34 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): self._set_intercept(X_offset_, y_offset_, X_scale_) return self + def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples < n_features and will invert + # a matrix of shape (n_samples, n_samples) making use of the + # woodbury formula: + # https://en.wikipedia.org/wiki/Woodbury_matrix_identity + n_samples = X.shape[0] + X_keep = X[:, keep_lambda] + inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1) + sigma_ = pinvh( + np.eye(n_samples) / alpha_ + np.dot(X_keep * inv_lambda, X_keep.T) + ) + sigma_ = np.dot(sigma_, X_keep * inv_lambda) + sigma_ = - np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_) + sigma_[np.diag_indices(sigma_.shape[1])] += 1. / lambda_[keep_lambda] + return sigma_ + + def _update_sigma(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples >= n_features and will + # invert a matrix of shape (n_features, n_features) + X_keep = X[:, keep_lambda] + gram = np.dot(X_keep.T, X_keep) + eye = np.eye(gram.shape[0]) + sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram + sigma_ = pinvh(sigma_inv) + return sigma_ + def predict(self, X, return_std=False): """Predict using the linear model. 
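As a quick numerical aside (not part of the patch), the equivalence between the two update routes above can be checked with plain NumPy: the Woodbury identity turns the (n_features, n_features) inverse into an (n_samples, n_samples) one, so both expressions for the posterior covariance should agree whenever both are well conditioned:

    import numpy as np
    from scipy.linalg import pinvh

    rng = np.random.RandomState(0)
    n_samples = n_features = 6              # square case, so both routes apply
    X = rng.randn(n_samples, n_features)
    alpha = 2.0                             # noise precision
    lam = np.arange(1.0, n_features + 1)    # per-feature weight precisions

    # direct route: invert an (n_features, n_features) matrix
    sigma_direct = pinvh(np.diag(lam) + alpha * X.T @ X)

    # Woodbury route: invert an (n_samples, n_samples) matrix instead
    A_inv = np.diag(1.0 / lam)
    K = pinvh(np.eye(n_samples) / alpha + X @ A_inv @ X.T)
    sigma_woodbury = A_inv - A_inv @ X.T @ K @ X @ A_inv

    np.testing.assert_allclose(sigma_direct, sigma_woodbury, rtol=1e-8)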
diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx index fcbe46ce77711..5b47f45c2e248 100644 --- a/sklearn/linear_model/_cd_fast.pyx +++ b/sklearn/linear_model/_cd_fast.pyx @@ -19,7 +19,7 @@ from cython cimport floating import warnings from ..exceptions import ConvergenceWarning -from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2, +from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2, _copy, _scal) from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans @@ -154,7 +154,7 @@ def enet_coordinate_descent(floating[::1] w, with nogil: # R = y - np.dot(X, w) _copy(n_samples, &y[0], 1, &R[0], 1) - _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0], + _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0], n_samples, &w[0], 1, 1.0, &R[0], 1) # tol *= np.dot(y, y) @@ -620,18 +620,17 @@ def enet_coordinate_descent_gram(floating[::1] w, return np.asarray(w), gap, tol, n_iter + 1 -def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, - floating l2_reg, - np.ndarray[floating, ndim=2, mode='fortran'] X, - np.ndarray[floating, ndim=2] Y, - int max_iter, floating tol, object rng, - bint random=0): +def enet_coordinate_descent_multi_task( + floating[::1, :] W, floating l1_reg, floating l2_reg, + np.ndarray[floating, ndim=2, mode='fortran'] X, # TODO: use views in 0.24 + np.ndarray[floating, ndim=2, mode='fortran'] Y, + int max_iter, floating tol, object rng, bint random=0): """Cython version of the coordinate descent algorithm for Elastic-Net mult-task regression We minimize - (1/2) * norm(y - X w, 2)^2 + l1_reg ||w||_21 + (1/2) * l2_reg norm(w, 2)^2 + 0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2 """ @@ -651,11 +650,11 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, cdef floating dual_norm_XtA # initial value of the residuals - cdef floating[:, ::1] R = np.zeros((n_samples, n_tasks), dtype=dtype) + cdef floating[::1, :] R = np.zeros((n_samples, n_tasks), dtype=dtype, order='F') - cdef floating[:] norm_cols_X = np.zeros(n_features, dtype=dtype) + cdef floating[::1] norm_cols_X = np.zeros(n_features, dtype=dtype) cdef floating[::1] tmp = np.zeros(n_tasks, dtype=dtype) - cdef floating[:] w_ii = np.zeros(n_tasks, dtype=dtype) + cdef floating[::1] w_ii = np.zeros(n_tasks, dtype=dtype) cdef floating d_w_max cdef floating w_max cdef floating d_w_ii @@ -675,9 +674,7 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, cdef UINT32_t* rand_r_state = &rand_r_state_seed cdef floating* X_ptr = &X[0, 0] - cdef floating* W_ptr = &W[0, 0] cdef floating* Y_ptr = &Y[0, 0] - cdef floating* wii_ptr = &w_ii[0] if l1_reg == 0: warnings.warn("Coordinate descent with l1_reg=0 may lead to unexpected" @@ -686,15 +683,15 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, with nogil: # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0) for ii in range(n_features): - for jj in range(n_samples): - norm_cols_X[ii] += X[jj, ii] ** 2 + norm_cols_X[ii] = _nrm2(n_samples, X_ptr + ii * n_samples, 1) ** 2 # R = Y - np.dot(X, W.T) - for ii in range(n_samples): + _copy(n_samples * n_tasks, Y_ptr, 1, &R[0, 0], 1) + for ii in range(n_features): for jj in range(n_tasks): - R[ii, jj] = Y[ii, jj] - ( - _dot(n_features, X_ptr + ii, n_samples, W_ptr + jj, n_tasks) - ) + if W[jj, ii] != 0: + _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) # tol = tol * linalg.norm(Y, ord='fro') ** 2 tol = 
tol * _nrm2(n_samples * n_tasks, Y_ptr, 1) ** 2 @@ -712,42 +709,59 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, continue # w_ii = W[:, ii] # Store previous value - _copy(n_tasks, W_ptr + ii * n_tasks, 1, wii_ptr, 1) - - # if np.sum(w_ii ** 2) != 0.0: # can do better - if _nrm2(n_tasks, wii_ptr, 1) != 0.0: - # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update - _ger(RowMajor, n_samples, n_tasks, 1.0, - X_ptr + ii * n_samples, 1, - wii_ptr, 1, &R[0, 0], n_tasks) - + _copy(n_tasks, &W[0, ii], 1, &w_ii[0], 1) + + # Using Numpy: + # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update + # Using Blas Level2: + # _ger(RowMajor, n_samples, n_tasks, 1.0, + # &X[0, ii], 1, + # &w_ii[0], 1, &R[0, 0], n_tasks) + # Using Blas Level1 and for loop to avoid slower threads + # for such small vectors + for jj in range(n_tasks): + if w_ii[jj] != 0: + _axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) + + # Using numpy: # tmp = np.dot(X[:, ii][None, :], R).ravel() - _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0], - n_tasks, X_ptr + ii * n_samples, 1, 0.0, &tmp[0], 1) + # Using BLAS Level 2: + # _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0], + # n_tasks, &X[0, ii], 1, 0.0, &tmp[0], 1) + # Using BLAS Level 1 (faster for small vectors like here): + for jj in range(n_tasks): + tmp[jj] = _dot(n_samples, X_ptr + ii * n_samples, 1, + &R[0, jj], 1) # nn = sqrt(np.sum(tmp ** 2)) nn = _nrm2(n_tasks, &tmp[0], 1) # W[:, ii] = tmp * fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg) - _copy(n_tasks, &tmp[0], 1, W_ptr + ii * n_tasks, 1) + _copy(n_tasks, &tmp[0], 1, &W[0, ii], 1) _scal(n_tasks, fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg), - W_ptr + ii * n_tasks, 1) - - # if np.sum(W[:, ii] ** 2) != 0.0: # can do better - if _nrm2(n_tasks, W_ptr + ii * n_tasks, 1) != 0.0: - # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :]) - # Update residual : rank 1 update - _ger(RowMajor, n_samples, n_tasks, -1.0, - X_ptr + ii * n_samples, 1, W_ptr + ii * n_tasks, 1, - &R[0, 0], n_tasks) + &W[0, ii], 1) + + # Using numpy: + # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :]) + # Using BLAS Level 2: + # Update residual : rank 1 update + # _ger(RowMajor, n_samples, n_tasks, -1.0, + # &X[0, ii], 1, &W[0, ii], 1, + # &R[0, 0], n_tasks) + # Using BLAS Level 1 (faster for small vectors like here): + for jj in range(n_tasks): + if W[jj, ii] != 0: + _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1, + &R[0, jj], 1) # update the maximum absolute coefficient update - d_w_ii = diff_abs_max(n_tasks, W_ptr + ii * n_tasks, wii_ptr) + d_w_ii = diff_abs_max(n_tasks, &W[0, ii], &w_ii[0]) if d_w_ii > d_w_max: d_w_max = d_w_ii - W_ii_abs_max = abs_max(n_tasks, W_ptr + ii * n_tasks) + W_ii_abs_max = abs_max(n_tasks, &W[0, ii]) if W_ii_abs_max > w_max: w_max = W_ii_abs_max @@ -760,16 +774,14 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, for ii in range(n_features): for jj in range(n_tasks): XtA[ii, jj] = _dot( - n_samples, X_ptr + ii * n_samples, 1, - &R[0, 0] + jj, n_tasks + n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1 ) - l2_reg * W[jj, ii] # dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1))) dual_norm_XtA = 0.0 for ii in range(n_features): # np.sqrt(np.sum(XtA ** 2, axis=1)) - XtA_axis1norm = _nrm2(n_tasks, - &XtA[0, 0] + ii * n_tasks, 1) + XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1) if XtA_axis1norm > dual_norm_XtA: dual_norm_XtA = XtA_axis1norm @@ -777,7 +789,7 @@ def 
enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, # R_norm = linalg.norm(R, ord='fro') # w_norm = linalg.norm(W, ord='fro') R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1) - w_norm = _nrm2(n_features * n_tasks, W_ptr, 1) + w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1) if (dual_norm_XtA > l1_reg): const = l1_reg / dual_norm_XtA A_norm = R_norm * const @@ -787,16 +799,12 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg, gap = R_norm ** 2 # ry_sum = np.sum(R * y) - ry_sum = 0.0 - for ii in range(n_samples): - for jj in range(n_tasks): - ry_sum += R[ii, jj] * Y[ii, jj] + ry_sum = _dot(n_samples * n_tasks, &R[0, 0], 1, &Y[0, 0], 1) # l21_norm = np.sqrt(np.sum(W ** 2, axis=0)).sum() l21_norm = 0.0 for ii in range(n_features): - # np.sqrt(np.sum(W ** 2, axis=0)) - l21_norm += _nrm2(n_tasks, W_ptr + n_tasks * ii, 1) + l21_norm += _nrm2(n_tasks, &W[0, ii], 1) gap += l1_reg * l21_norm - const * ry_sum + \ 0.5 * l2_reg * (1 + const ** 2) * (w_norm ** 2) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 46e924abbc1d0..51f37a9379a41 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -24,6 +24,7 @@ from ..utils.fixes import _astype_copy_false, _joblib_parallel_args from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' from . import _cd_fast as cd_fast # type: ignore @@ -130,7 +131,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, if Xy is None: X_sparse = sparse.isspmatrix(X) sparse_center = X_sparse and (fit_intercept or normalize) - X = check_array(X, 'csc', + X = check_array(X, accept_sparse='csc', copy=(copy_X and fit_intercept and not X_sparse)) if not X_sparse: # X can be touched inplace thanks to the above line @@ -434,10 +435,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, # We expect X and y to be already Fortran ordered when bypassing # checks if check_input: - X = check_array(X, 'csc', dtype=[np.float64, np.float32], + X = check_array(X, accept_sparse='csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) - y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False, - ensure_2d=False) + y = check_array(y, accept_sparse='csc', dtype=X.dtype.type, + order='F', copy=False, ensure_2d=False) if Xy is not None: # Xy should be a 1d contiguous array or a 2D C ordered array Xy = check_array(Xy, dtype=X.dtype.type, order='C', copy=False, @@ -690,7 +691,8 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): """ path = staticmethod(enet_path) - def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=1e-4, warm_start=False, positive=False, random_state=None, selection='cyclic'): @@ -1003,7 +1005,8 @@ class Lasso(ElasticNet): """ path = staticmethod(enet_path) - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, positive=False, random_state=None, selection='cyclic'): @@ -1065,6 +1068,15 @@ def 
_path_residuals(X, y, train, test, path, path_params, alphas=None, y_train = y[train] X_test = X[test] y_test = y[test] + + if not sparse.issparse(X): + for array, array_input in ((X_train, X), (y_train, y), + (X_test, X), (y_test, y)): + if array.base is not array_input and not array.flags['WRITEABLE']: + # fancy indexing should create a writable copy but it doesn't + # for read-only memmaps (cf. numpy#14132). + array.setflags(write=True) + fit_intercept = path_params['fit_intercept'] normalize = path_params['normalize'] @@ -1092,7 +1104,8 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None, # Do the ordering and type casting here, as if it is done in the path, # X is copied and a reference is kept here - X_train = check_array(X_train, 'csc', dtype=dtype, order=X_order) + X_train = check_array(X_train, accept_sparse='csc', dtype=dtype, + order=X_order) alphas, coefs, _ = path(X_train, y_train, **path_params) del X_train, y_train @@ -1139,6 +1152,14 @@ def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, self.random_state = random_state self.selection = selection + @abstractmethod + def _get_estimator(self): + """Model to be fitted after the best alpha has been determined.""" + + @abstractmethod + def _is_multitask(self): + """Bool indicating if class is meant for multidimensional target.""" + def fit(self, X, y): """Fit linear model with coordinate descent @@ -1154,54 +1175,30 @@ def fit(self, X, y): y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values """ - y = check_array(y, copy=False, dtype=[np.float64, np.float32], - ensure_2d=False) - if y.shape[0] == 0: - raise ValueError("y has 0 samples: %r" % y) - - if hasattr(self, 'l1_ratio'): - model_str = 'ElasticNet' - else: - model_str = 'Lasso' - - if isinstance(self, ElasticNetCV) or isinstance(self, LassoCV): - if model_str == 'ElasticNet': - model = ElasticNet() - else: - model = Lasso() - if y.ndim > 1 and y.shape[1] > 1: - raise ValueError("For multi-task outputs, use " - "MultiTask%sCV" % (model_str)) - y = column_or_1d(y, warn=True) - else: - if sparse.isspmatrix(X): - raise TypeError("X should be dense but a sparse matrix was" - "passed") - elif y.ndim == 1: - raise ValueError("For mono-task outputs, use " - "%sCV" % (model_str)) - if model_str == 'ElasticNet': - model = MultiTaskElasticNet() - else: - model = MultiTaskLasso() - - if self.selection not in ["random", "cyclic"]: - raise ValueError("selection should be either random or cyclic.") - # This makes sure that there is no duplication in memory. # Dealing right with copy_X is important in the following: # Multiple functions touch X and subsamples of X and can induce a # lot of duplication of memory copy_X = self.copy_X and self.fit_intercept + check_y_params = dict(copy=False, dtype=[np.float64, np.float32], + ensure_2d=False) if isinstance(X, np.ndarray) or sparse.isspmatrix(X): # Keep a reference to X reference_to_old_X = X # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself - X = self._validate_data(X, accept_sparse='csc', - dtype=[np.float64, np.float32], copy=False) + + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. 
+ check_X_params = dict(accept_sparse='csc', + dtype=[np.float64, np.float32], copy=False) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) if sparse.isspmatrix(X): if (hasattr(reference_to_old_X, "data") and not np.may_share_memory(reference_to_old_X.data, X.data)): @@ -1212,11 +1209,39 @@ def fit(self, X, y): copy_X = False del reference_to_old_X else: - X = self._validate_data(X, accept_sparse='csc', - dtype=[np.float64, np.float32], order='F', - copy=copy_X) + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. + check_X_params = dict(accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) copy_X = False + if y.shape[0] == 0: + raise ValueError("y has 0 samples: %r" % y) + + if not self._is_multitask(): + if y.ndim > 1 and y.shape[1] > 1: + raise ValueError("For multi-task outputs, use " + "MultiTask%s" % self.__class__.__name__) + y = column_or_1d(y, warn=True) + else: + if sparse.isspmatrix(X): + raise TypeError("X should be dense but a sparse matrix was" + "passed") + elif y.ndim == 1: + raise ValueError("For mono-task outputs, use " + "%sCV" % self.__class__.__name__[9:]) + + model = self._get_estimator() + + if self.selection not in ["random", "cyclic"]: + raise ValueError("selection should be either random or cyclic.") + if X.shape[0] != y.shape[0]: raise ValueError("X and y have inconsistent dimensions (%d != %d)" % (X.shape[0], y.shape[0])) @@ -1466,7 +1491,9 @@ class LassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, + fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection='cyclic'): @@ -1477,6 +1504,12 @@ def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive, random_state=random_state, selection=selection) + def _get_estimator(self): + return Lasso() + + def _is_multitask(self): + return False + def _more_tags(self): return {'multioutput': False} @@ -1662,7 +1695,8 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, + @_deprecate_positional_args + def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, positive=False, random_state=None, @@ -1684,6 +1718,12 @@ def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, self.random_state = random_state self.selection = selection + def _get_estimator(self): + return ElasticNet() + + def _is_multitask(self): + return False + def _more_tags(self): return {'multioutput': False} @@ -1702,9 +1742,9 @@ class MultiTaskElasticNet(Lasso): Where:: - ||W||_21 = sum_i sqrt(sum_j w_ij ^ 2) + ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2) - i.e. the sum of norm of each row. + i.e. the sum of norms of each row. Read more in the :ref:`User Guide `. 
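A tiny worked example of the ``||W||_21`` mixed norm as defined in the docstring above (plain NumPy, for illustration only):

    import numpy as np

    W = np.array([[3.0, 4.0],
                  [0.0, 5.0]])
    # ||W||_21 = sum_i sqrt(sum_j W_ij ** 2): Euclidean norm of each row,
    # then the sum of those norms.
    l21 = np.sqrt((W ** 2).sum(axis=1)).sum()
    print(l21)   # 5.0 + 5.0 = 10.0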
@@ -1798,10 +1838,11 @@ class MultiTaskElasticNet(Lasso): ----- The algorithm used to fit the model is coordinate descent. - To avoid unnecessary memory duplication the X argument of the fit method - should be directly passed as a Fortran-contiguous numpy array. + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. """ - def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): self.l1_ratio = l1_ratio @@ -1835,9 +1876,14 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - X = self._validate_data(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) - y = check_array(y, dtype=X.dtype.type, ensure_2d=False) + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be csr. + check_X_params = dict(dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) + check_y_params = dict(ensure_2d=False, order='F') + X, y = self._validate_data(X, y, validate_separately=(check_X_params, + check_y_params)) + y = y.astype(X.dtype) if hasattr(self, 'l1_ratio'): model_str = 'ElasticNet' @@ -1962,13 +2008,13 @@ class MultiTaskLasso(MultiTaskElasticNet): -------- >>> from sklearn import linear_model >>> clf = linear_model.MultiTaskLasso(alpha=0.1) - >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]) + >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]]) MultiTaskLasso(alpha=0.1) >>> print(clf.coef_) - [[0.89393398 0. ] - [0.89393398 0. ]] + [[0. 0.60809415] + [0. 0.94592424]] >>> print(clf.intercept_) - [0.10606602 0.10606602] + [-0.41888636 -0.87382323] See also -------- @@ -1980,10 +2026,11 @@ class MultiTaskLasso(MultiTaskElasticNet): ----- The algorithm used to fit the model is coordinate descent. - To avoid unnecessary memory duplication the X argument of the fit method - should be directly passed as a Fortran-contiguous numpy array. + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. """ - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): self.alpha = alpha @@ -2157,12 +2204,13 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): ----- The algorithm used to fit the model is coordinate descent. - To avoid unnecessary memory duplication the X argument of the fit method - should be directly passed as a Fortran-contiguous numpy array. + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. 
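To illustrate the note above, a caller can make the inputs Fortran-contiguous with ``numpy.asfortranarray`` before fitting; this is an illustrative usage example with made-up data, not code from the patch:

    import numpy as np
    from sklearn.linear_model import MultiTaskElasticNet

    rng = np.random.RandomState(0)
    X = np.asfortranarray(rng.randn(50, 4))   # column-major: fit avoids a copy
    Y = np.asfortranarray(rng.randn(50, 2))

    est = MultiTaskElasticNet(alpha=0.1).fit(X, Y)
    print(X.flags['F_CONTIGUOUS'], est.coef_.shape)   # True (2, 4)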
""" path = staticmethod(enet_path) - def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, + @_deprecate_positional_args + def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, random_state=None, @@ -2182,6 +2230,12 @@ def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, self.random_state = random_state self.selection = selection + def _get_estimator(self): + return MultiTaskElasticNet() + + def _is_multitask(self): + return True + def _more_tags(self): return {'multioutput_only': True} @@ -2328,12 +2382,14 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): ----- The algorithm used to fit the model is coordinate descent. - To avoid unnecessary memory duplication the X argument of the fit method - should be directly passed as a Fortran-contiguous numpy array. + To avoid unnecessary memory duplication the X and y arguments of the fit + method should be directly passed as Fortran-contiguous numpy arrays. """ path = staticmethod(lasso_path) - def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, + fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, random_state=None, selection='cyclic'): @@ -2344,5 +2400,11 @@ def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, cv=cv, verbose=verbose, n_jobs=n_jobs, random_state=random_state, selection=selection) + def _get_estimator(self): + return MultiTaskLasso() + + def _is_multitask(self): + return True + def _more_tags(self): return {'multioutput_only': True} diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index d9046d3a1ee9b..77e6ff944b78d 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -9,6 +9,7 @@ from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result @@ -222,8 +223,8 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression. 
https://statweb.stanford.edu/~owen/reports/hhu.pdf """ - - def __init__(self, epsilon=1.35, max_iter=100, alpha=0.0001, + @_deprecate_positional_args + def __init__(self, *, epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05): self.epsilon = epsilon self.max_iter = max_iter diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index a3781cf981710..255baacea9a59 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -21,13 +21,16 @@ from ..base import RegressorMixin, MultiOutputMixin # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' from ..utils import arrayfuncs, as_float_array # type: ignore +from ..utils import check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning +from ..utils.validation import _deprecate_positional_args SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, +@_deprecate_positional_args +def lars_path(X, y, Xy=None, *, Gram=None, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False): @@ -156,7 +159,8 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, return_n_iter=return_n_iter, positive=positive) -def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0, +@_deprecate_positional_args +def lars_path_gram(Xy, Gram, *, n_samples, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False): @@ -800,6 +804,16 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + random_state : int, RandomState instance or None (default) + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. 
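With ``lars_path`` now keyword-only past ``Xy``, a typical call passes the remaining options by name; an illustrative sketch against the public API:

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import lars_path

    X, y = load_diabetes(return_X_y=True)
    # everything after `Xy` must now be given as a keyword argument
    alphas, active, coefs = lars_path(X, y, method='lasso', max_iter=50)
    print(coefs.shape)   # coefficients along the path, one column per alpha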
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -844,9 +858,11 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): method = 'lar' positive = False - def __init__(self, fit_intercept=True, verbose=False, normalize=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True): + eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, + jitter=None, random_state=None): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -855,6 +871,8 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.eps = eps self.copy_X = copy_X self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state @staticmethod def _get_gram(precompute, X, y): @@ -954,6 +972,12 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter + if self.jitter is not None: + rng = check_random_state(self.random_state) + + noise = rng.uniform(high=self.jitter, size=len(y)) + y = y + noise + self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy) @@ -1031,6 +1055,16 @@ class LassoLars(Lars): algorithm are typically in congruence with the solution of the coordinate descent Lasso estimator. + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + random_state : int, RandomState instance or None (default) + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. 
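The new ``jitter`` option simply adds small uniform noise to ``y`` before fitting, which can break ties in degenerate designs; a usage sketch on the example from issue #2746 (values taken from the test added further below in this diff):

    import numpy as np
    from sklearn.linear_model import LassoLars

    X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0],
                  [0.0, -1.0, 0.0, 0.0, 0.0]])
    y = [-2.5, -2.5]

    # fit_intercept=False so the recovered coefficients are visible in coef_
    reg = LassoLars(alpha=1e-3, fit_intercept=False,
                    jitter=1e-7, random_state=0).fit(X, y)
    print(reg.coef_)   # close to [0, 2.5, 0, 2.5, 0]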
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -1080,10 +1114,11 @@ class LassoLars(Lars): """ method = 'lasso' - def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False): + positive=False, jitter=None, random_state=None): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -1094,6 +1129,8 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.copy_X = copy_X self.eps = eps self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state ############################################################################### @@ -1335,7 +1372,8 @@ class LarsCV(Lars): method = 'lar' - def __init__(self, fit_intercept=True, verbose=False, max_iter=500, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, copy_X=True): @@ -1576,7 +1614,8 @@ class LassoLarsCV(LarsCV): method = 'lasso' - def __init__(self, fit_intercept=True, verbose=False, max_iter=500, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, copy_X=True, positive=False): @@ -1709,7 +1748,8 @@ class LassoLarsIC(LassoLars): -------- lars_path, LassoLars, LassoLarsCV """ - def __init__(self, criterion='aic', fit_intercept=True, verbose=False, + @_deprecate_positional_args + def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, positive=False): self.criterion = criterion diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 874dc743f4cc2..9ef3a21e4a76d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -29,6 +29,7 @@ from ..utils.extmath import row_norms from ..utils.optimize import _newton_cg, _check_optimize_result from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..model_selection import check_cv @@ -1246,8 +1247,8 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, >>> clf.score(X, y) 0.97... 
""" - - def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, + @_deprecate_positional_args + def __init__(self, penalty='l2', *, dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, @@ -1737,7 +1738,8 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, LogisticRegression """ - def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, + @_deprecate_positional_args + def __init__(self, *, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=1e-4, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1., multi_class='auto', diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 0d572dd17c6d7..44371e9fa76e7 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -16,6 +16,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin from ..utils import as_float_array, check_array +from ..utils.validation import _deprecate_positional_args from ..model_selection import check_cv premature = """ Orthogonal matching pursuit ended prematurely due to linear @@ -262,7 +263,8 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None, return gamma, indices[:n_active], n_active -def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, +@_deprecate_positional_args +def orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, copy_X=True, return_path=False, return_n_iter=False): r"""Orthogonal Matching Pursuit (OMP) @@ -371,7 +373,8 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, norms_squared = np.sum((y ** 2), axis=0) else: norms_squared = None - return orthogonal_mp_gram(G, Xy, n_nonzero_coefs, tol, norms_squared, + return orthogonal_mp_gram(G, Xy, n_nonzero_coefs=n_nonzero_coefs, + tol=tol, norms_squared=norms_squared, copy_Gram=copy_X, copy_Xy=False, return_path=return_path) @@ -404,7 +407,8 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, return np.squeeze(coef) -def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None, +@_deprecate_positional_args +def orthogonal_mp_gram(Gram, Xy, *, n_nonzero_coefs=None, tol=None, norms_squared=None, copy_Gram=True, copy_Xy=True, return_path=False, return_n_iter=False): @@ -616,7 +620,8 @@ class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): decomposition.sparse_encode OrthogonalMatchingPursuitCV """ - def __init__(self, n_nonzero_coefs=None, tol=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, n_nonzero_coefs=None, tol=None, fit_intercept=True, normalize=True, precompute='auto'): self.n_nonzero_coefs = n_nonzero_coefs self.tol = tol @@ -660,7 +665,7 @@ def fit(self, X, y): if Gram is False: coef_, self.n_iter_ = orthogonal_mp( - X, y, self.n_nonzero_coefs_, self.tol, + X, y, n_nonzero_coefs=self.n_nonzero_coefs_, tol=self.tol, precompute=False, copy_X=True, return_n_iter=True) else: @@ -853,7 +858,8 @@ class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): decomposition.sparse_encode """ - def __init__(self, copy=True, fit_intercept=True, normalize=True, + @_deprecate_positional_args + def __init__(self, *, copy=True, fit_intercept=True, normalize=True, max_iter=None, cv=None, n_jobs=None, verbose=False): self.copy = copy self.fit_intercept = fit_intercept diff 
--git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 3b8354f5a7352..22c47fb1fcf07 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -1,6 +1,7 @@ # Authors: Rob Zinkov, Mathieu Blondel # License: BSD 3 clause +from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier from ._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON @@ -163,7 +164,8 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="hinge", n_jobs=None, random_state=None, warm_start=False, @@ -390,7 +392,8 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="epsilon_insensitive", epsilon=DEFAULT_EPSILON, diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index ff50f6ebbc06e..54d7888109702 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -1,6 +1,7 @@ # Author: Mathieu Blondel # License: BSD 3 clause +from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier @@ -143,8 +144,8 @@ class Perceptron(BaseSGDClassifier): https://en.wikipedia.org/wiki/Perceptron and references therein. """ - - def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, penalty=None, alpha=0.0001, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index cd5e3db49842d..fffa29d47d91c 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -9,9 +9,10 @@ from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone from ..base import MultiOutputMixin -from ..utils import check_random_state, check_array, check_consistent_length +from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ._base import LinearRegression from ..utils.validation import has_fit_parameter from ..exceptions import ConvergenceWarning @@ -149,6 +150,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, If the loss on a sample is greater than the ``residual_threshold``, then this sample is classified as an outlier. + .. versionadded:: 0.18 + random_state : int, RandomState instance, default=None The generator used to initialize the centers. Pass an int for reproducible output across multiple function calls. 
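For ``RANSACRegressor`` the base estimator stays positional while everything after it becomes keyword-only; an illustrative call on made-up data:

    import numpy as np
    from sklearn.linear_model import LinearRegression, RANSACRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(100, 1)
    y = 3 * X.ravel() + rng.randn(100)
    y[::10] += 20                          # inject a few gross outliers

    # base_estimator may still be passed positionally; the rest by keyword
    ransac = RANSACRegressor(LinearRegression(), min_samples=0.5,
                             residual_threshold=5.0, random_state=0).fit(X, y)
    print(ransac.inlier_mask_.sum())       # samples kept as inliers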
@@ -201,8 +204,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf """ - - def __init__(self, base_estimator=None, min_samples=None, + @_deprecate_positional_args + def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, @@ -238,6 +241,8 @@ def fit(self, X, y, sample_weight=None): raises error if sample_weight is passed and base_estimator fit method does not support it. + .. versionadded:: 0.18 + Raises ------ ValueError @@ -246,8 +251,12 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. """ - X = self._validate_data(X, accept_sparse='csr') - y = check_array(y, ensure_2d=False) + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be csr. + check_X_params = dict(accept_sparse='csr') + check_y_params = dict(ensure_2d=False) + X, y = self._validate_data(X, y, validate_separately=(check_X_params, + check_y_params)) check_consistent_length(X, y) if self.base_estimator is not None: diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 9c3f703ac478e..ca3fba196d6d3 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -27,6 +27,7 @@ from ..utils import compute_sample_weight from ..utils import column_or_1d from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..metrics import check_scoring @@ -234,7 +235,8 @@ def _get_valid_accept_sparse(is_X_sparse, solver): return ['csr', 'csc', 'coo'] -def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', +@_deprecate_positional_args +def ridge_regression(X, y, alpha, *, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, check_input=True): @@ -518,7 +520,8 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): self.alpha = alpha @@ -625,9 +628,9 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): number. fit_intercept : bool, default=True - Whether to calculate the intercept for this model. If set + Whether to fit the intercept for this model. If set to false, no intercept will be used in calculations - (i.e. data is expected to be centered). + (i.e. ``X`` and ``y`` are expected to be centered). normalize : bool, default=False This parameter is ignored when ``fit_intercept`` is set to False. 
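The recurring ``validate_separately=(check_X_params, check_y_params)`` pattern in these ``fit`` methods runs ``X`` and ``y`` through their own ``check_array`` options instead of a single ``check_X_y`` call, so ``y`` can stay dense and keep its 32/64-bit dtype while ``X`` may be sparse. Roughly, and only as an approximation of the idea rather than the estimators' actual code path:

    import numpy as np
    from sklearn.utils import check_array

    check_X_params = dict(accept_sparse='csr')   # X may be sparse
    check_y_params = dict(ensure_2d=False)       # y may be 1-D, never sparse

    X = np.arange(6.0).reshape(3, 2)
    y = np.array([0.0, 1.0, 2.0])
    X = check_array(X, **check_X_params)
    y = check_array(y, **check_y_params)
    print(X.shape, y.shape)                      # (3, 2) (3,)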
@@ -727,8 +730,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): >>> clf.fit(X, y) Ridge() """ - - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): super().__init__( @@ -885,8 +888,8 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): >>> clf.score(X, y) 0.9595... """ - - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, class_weight=None, solver="auto", random_state=None): super().__init__( @@ -1112,8 +1115,8 @@ class _RidgeGCV(LinearModel): http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ - - def __init__(self, alphas=(0.1, 1.0, 10.0), + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False, @@ -1546,7 +1549,8 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeCV(LinearModel): - def __init__(self, alphas=(0.1, 1.0, 10.0), + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False): @@ -1854,8 +1858,8 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. """ - - def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, class_weight=None, store_cv_values=False): super().__init__( diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index 9fe6f076f5145..caa9b2d133003 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -13,6 +13,7 @@ from ..exceptions import ConvergenceWarning from ..utils import check_array from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms @@ -84,6 +85,7 @@ def get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, return step +@_deprecate_positional_args def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., max_iter=1000, tol=0.001, verbose=0, random_state=None, check_input=True, max_squared_sum=None, diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index 5758a8e5ee34c..141890497fcd2 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -58,6 +58,7 @@ from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 from libc.stdio cimport printf +np.import_array() {{for name, c_type, np_type in get_dispatch(dtypes)}} diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index bf1e77e3e355b..94428f61f1327 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -19,6 +19,7 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call from ..utils.validation import check_is_fitted, 
_check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning from ..model_selection import StratifiedShuffleSplit, ShuffleSplit @@ -68,8 +69,8 @@ def __call__(self, coef, intercept): class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta): """Base class for SGD classification and regression.""" - - def __init__(self, loss, penalty='l2', alpha=0.0001, C=1.0, + @_deprecate_positional_args + def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate="optimal", eta0=0.0, power_t=0.5, @@ -461,7 +462,8 @@ class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): } @abstractmethod - def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate="optimal", eta0=0.0, @@ -485,8 +487,8 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init): - X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, + order="C", accept_large_sparse=False) n_samples, n_features = X.shape @@ -845,6 +847,9 @@ class SGDClassifier(BaseSGDClassifier): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.0 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.0 as eta0 is not used by @@ -861,6 +866,7 @@ class SGDClassifier(BaseSGDClassifier): improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 + Added 'early_stopping' option validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for @@ -868,11 +874,13 @@ class SGDClassifier(BaseSGDClassifier): Only used if `early_stopping` is True. .. versionadded:: 0.20 + Added 'validation_fraction' option n_iter_no_change : int, default=5 Number of iterations with no improvement to wait before early stopping. .. versionadded:: 0.20 + Added 'n_iter_no_change' option class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. 
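The ``.. versionadded:: 0.20`` notes above cover the 'adaptive' learning rate and the early-stopping options; a short usage sketch on synthetic data:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import SGDClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    clf = SGDClassifier(learning_rate='adaptive', eta0=0.1,
                        early_stopping=True, validation_fraction=0.1,
                        n_iter_no_change=5, random_state=0).fit(X, y)
    print(clf.n_iter_)   # epochs actually run (may be fewer than max_iter)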
@@ -950,8 +958,9 @@ class SGDClassifier(BaseSGDClassifier): >>> print(clf.predict([[-0.8, -1]])) [1] """ - - def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, l1_ratio=0.15, + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate="optimal", eta0=0.0, @@ -1097,7 +1106,8 @@ class BaseSGDRegressor(RegressorMixin, BaseSGD): } @abstractmethod - def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, + @_deprecate_positional_args + def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, @@ -1442,6 +1452,9 @@ class SGDRegressor(BaseSGDRegressor): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.01 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.01. @@ -1458,6 +1471,7 @@ class SGDRegressor(BaseSGDRegressor): epochs. .. versionadded:: 0.20 + Added 'early_stopping' option validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for @@ -1465,11 +1479,13 @@ class SGDRegressor(BaseSGDRegressor): Only used if `early_stopping` is True. .. versionadded:: 0.20 + Added 'validation_fraction' option n_iter_no_change : int, default=5 Number of iterations with no improvement to wait before early stopping. .. versionadded:: 0.20 + Added 'n_iter_no_change' option warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as @@ -1543,7 +1559,8 @@ class SGDRegressor(BaseSGDRegressor): Ridge, ElasticNet, Lasso, sklearn.svm.SVR """ - def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, + @_deprecate_positional_args + def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 16f0adae12c9c..28d2dba3f8719 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -20,6 +20,7 @@ from ._base import LinearModel from ..base import RegressorMixin from ..utils import check_random_state +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning _EPSILON = np.finfo(np.double).eps @@ -290,8 +291,8 @@ class TheilSenRegressor(RegressorMixin, LinearModel): Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang http://home.olemiss.edu/~xdang/papers/MTSE.pdf """ - - def __init__(self, fit_intercept=True, copy_X=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=1e4, n_subsamples=None, max_iter=300, tol=1.e-3, random_state=None, n_jobs=None, verbose=False): self.fit_intercept = fit_intercept diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py index e1922a010514f..ff3ac13c2d7f6 100644 --- a/sklearn/linear_model/tests/test_bayes.py +++ b/sklearn/linear_model/tests/test_bayes.py 
@@ -7,6 +7,8 @@ import numpy as np from scipy.linalg import pinvh +import pytest + from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -159,7 +161,7 @@ def test_std_bayesian_ridge_ard_with_constant_input(): # Test BayesianRidge and ARDRegression standard dev. for edge case of # constant target vector # The standard dev. should be relatively small (< 0.01 is tested here) - n_samples = 4 + n_samples = 10 n_features = 5 random_state = check_random_state(42) constant_value = random_state.rand() @@ -181,9 +183,9 @@ def test_update_of_sigma_in_ard(): y = np.array([0, 0]) clf = ARDRegression(n_iter=1) clf.fit(X, y) - # With the inputs above, ARDRegression prunes one of the two coefficients - # in the first iteration. Hence, the expected shape of `sigma_` is (1, 1). - assert clf.sigma_.shape == (1, 1) + # With the inputs above, ARDRegression prunes both of the two coefficients + # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0). + assert clf.sigma_.shape == (0, 0) # Ensure that no error is thrown at prediction stage clf.predict(X, return_std=True) @@ -200,22 +202,19 @@ def test_toy_ard_object(): assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2) -def test_ard_accuracy_on_easy_problem(): +@pytest.mark.parametrize('seed', range(100)) +@pytest.mark.parametrize('n_samples, n_features', ((10, 100), (100, 10))) +def test_ard_accuracy_on_easy_problem(seed, n_samples, n_features): # Check that ARD converges with reasonable accuracy on an easy problem # (Github issue #14055) - # This particular seed seems to converge poorly in the failure-case - # (scipy==1.3.0, sklearn==0.21.2) - seed = 45 X = np.random.RandomState(seed=seed).normal(size=(250, 3)) y = X[:, 1] - regressor = ARDRegression(n_iter=600) + regressor = ARDRegression() regressor.fit(X, y) abs_coef_error = np.abs(1 - regressor.coef_[1]) - # Expect an accuracy of better than 1E-4 in most cases - - # Failure-case produces 0.16! - assert abs_coef_error < 0.01 + assert abs_coef_error < 1e-10 def test_return_std(): @@ -248,3 +247,28 @@ def f_noise(X, noise_mult): m2.fit(X, y) y_mean2, y_std2 = m2.predict(X_test, return_std=True) assert_array_almost_equal(y_std2, noise_mult, decimal=decimal) + + +@pytest.mark.parametrize('seed', range(10)) +def test_update_sigma(seed): + # make sure the two update_sigma() helpers are equivalent. The woodbury + # formula is used when n_samples < n_features, and the other one is used + # otherwise. + + rng = np.random.RandomState(seed) + + # set n_samples == n_features to avoid instability issues when inverting + # the matrices. 
Using the woodbury formula would be unstable when + # n_samples > n_features + n_samples = n_features = 10 + X = rng.randn(n_samples, n_features) + alpha = 1 + lmbda = np.arange(1, n_features + 1) + keep_lambda = np.array([True] * n_features) + + reg = ARDRegression() + + sigma = reg._update_sigma(X, alpha, lmbda, keep_lambda) + sigma_woodbury = reg._update_sigma_woodbury(X, alpha, lmbda, keep_lambda) + + np.testing.assert_allclose(sigma, sigma_woodbury) diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index fdc49599788fe..b1a072ae38996 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -7,8 +7,11 @@ import pytest from scipy import interpolate, sparse from copy import deepcopy +import joblib +from distutils.version import LooseVersion -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes +from sklearn.datasets import make_regression from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -593,7 +596,7 @@ def test_warm_start_convergence(): def test_warm_start_convergence_with_regularizer_decrement(): - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) # Train a model to converge on a lightly regularized problem final_alpha = 1e-5 @@ -879,9 +882,9 @@ def test_convergence_warnings(): X = random_state.standard_normal((1000, 500)) y = random_state.standard_normal((1000, 3)) - # check that the model fails to converge + # check that the model fails to converge (a negative dual gap cannot occur) with pytest.warns(ConvergenceWarning): - MultiTaskElasticNet(max_iter=1, tol=0).fit(X, y) + MultiTaskElasticNet(max_iter=1, tol=-1).fit(X, y) # check that the model converges w/o warnings with pytest.warns(None) as record: @@ -1020,3 +1023,23 @@ def test_enet_sample_weight_sparse(): with pytest.raises(ValueError, match="Sample weights do not.*support " "sparse matrices"): reg.fit(X, y, sample_weight=sw, check_input=True) + + +@pytest.mark.parametrize("backend", ["loky", "threading"]) +@pytest.mark.parametrize("estimator", + [ElasticNetCV, MultiTaskElasticNetCV, + LassoCV, MultiTaskLassoCV]) +def test_linear_models_cv_fit_for_all_backends(backend, estimator): + # LinearModelsCV.fit performs inplace operations on input data which is + # memmapped when using loky backend, causing an error due to unexpected + # behavior of fancy indexing of read-only memmaps (cf. numpy#14132). + + if joblib.__version__ < LooseVersion('0.12') and backend == 'loky': + pytest.skip('loky backend does not exist in joblib <0.12') + + # Create a problem sufficiently large to cause memmapping (1MB). 
+ n_targets = 1 + (estimator in (MultiTaskElasticNetCV, MultiTaskLassoCV)) + X, y = make_regression(20000, 10, n_targets=n_targets) + + with joblib.parallel_backend(backend=backend): + estimator(n_jobs=2, cv=3).fit(X, y) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 6e7c1fb37096a..e198dfb15e323 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -6,6 +6,7 @@ import pytest from scipy import linalg +from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal @@ -17,6 +18,7 @@ from sklearn import linear_model, datasets from sklearn.linear_model._least_angle import _lars_path_residues from sklearn.linear_model import LassoLarsIC, lars_path +from sklearn.linear_model import Lars, LassoLars # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -733,6 +735,28 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) +@pytest.mark.parametrize('est', (LassoLars(alpha=1e-3), Lars())) +def test_lars_with_jitter(est): + # Test that a small amount of jitter helps stability, + # using example provided in issue #2746 + + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], + [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = [-2.5, -2.5] + expected_coef = [0, 2.5, 0, 2.5, 0] + + # set to fit_intercept to False since target is constant and we want check + # the value of coef. coef would be all zeros otherwise. + est.set_params(fit_intercept=False) + est_jitter = clone(est).set_params(jitter=10e-8, random_state=0) + + est.fit(X, y) + est_jitter.fit(X, y) + + assert np.mean((est.coef_ - est_jitter.coef_)**2) > .1 + np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3) + + def test_X_none_gram_not_none(): with pytest.raises(ValueError, match="X cannot be None if Gram is not None"): diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index e742689bcde3d..f3f3080aebe66 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -18,8 +18,11 @@ from sklearn.datasets import make_sparse_coded_signal n_samples, n_features, n_nonzero_coefs, n_targets = 25, 35, 5, 3 -y, X, gamma = make_sparse_coded_signal(n_targets, n_features, n_samples, - n_nonzero_coefs, random_state=0) +y, X, gamma = make_sparse_coded_signal(n_samples=n_targets, + n_components=n_features, + n_features=n_samples, + n_nonzero_coefs=n_nonzero_coefs, + random_state=0) # Make X not of norm 1 for testing X *= 10 y *= 10 @@ -94,8 +97,8 @@ def test_bad_input(): def test_perfect_signal_recovery(): idx, = gamma[:, 0].nonzero() - gamma_rec = orthogonal_mp(X, y[:, 0], 5) - gamma_gram = orthogonal_mp_gram(G, Xy[:, 0], 5) + gamma_rec = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5) + gamma_gram = orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5) assert_array_equal(idx, np.flatnonzero(gamma_rec)) assert_array_equal(idx, np.flatnonzero(gamma_gram)) assert_array_almost_equal(gamma[:, 0], gamma_rec, decimal=2) @@ -110,7 +113,8 @@ def test_orthogonal_mp_gram_readonly(): G_readonly.setflags(write=False) Xy_readonly = Xy.copy() Xy_readonly.setflags(write=False) - gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], 5, + gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], + n_nonzero_coefs=5, copy_Gram=False, copy_Xy=False) 
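For context, the OMP calls above now spell out the sparsity level by keyword. A small usage sketch with made-up shapes (values purely illustrative):

import numpy as np
from sklearn.linear_model import orthogonal_mp

rng = np.random.RandomState(0)
D = rng.randn(20, 30)              # over-complete design matrix
w = np.zeros(30)
w[[2, 7, 11]] = rng.randn(3)       # a 3-sparse coefficient vector
target = D @ w

# the sparsity budget is passed by keyword instead of as the third
# positional argument
w_hat = orthogonal_mp(D, target, n_nonzero_coefs=3)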
assert_array_equal(idx, np.flatnonzero(gamma_gram)) assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2) @@ -163,8 +167,8 @@ def test_swapped_regressors(): gamma[0] = 0.5 new_y = np.dot(X, gamma) new_Xy = np.dot(X.T, new_y) - gamma_hat = orthogonal_mp(X, new_y, 2) - gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, 2) + gamma_hat = orthogonal_mp(X, new_y, n_nonzero_coefs=2) + gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, n_nonzero_coefs=2) assert_array_equal(np.flatnonzero(gamma_hat), [0, 21]) assert_array_equal(np.flatnonzero(gamma_hat_gram), [0, 21]) @@ -172,8 +176,10 @@ def test_swapped_regressors(): def test_no_atoms(): y_empty = np.zeros_like(y) Xy_empty = np.dot(X.T, y_empty) - gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, 1) - gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, 1) + gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, + n_nonzero_coefs=1) + gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, + n_nonzero_coefs=1) assert np.all(gamma_empty == 0) assert np.all(gamma_empty_gram == 0) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 1f7d3c2569bab..3710f3857a2a7 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -288,7 +288,9 @@ def test_ransac_none_estimator(): ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0) - ransac_none_estimator = RANSACRegressor(None, 2, 5, random_state=0) + ransac_none_estimator = RANSACRegressor(None, min_samples=2, + residual_threshold=5, + random_state=0) ransac_estimator.fit(X, y) ransac_none_estimator.fit(X, y) diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index ec80890fd8a58..b15462e597684 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -18,6 +18,8 @@ from cython.parallel cimport prange, parallel from ..neighbors._quad_tree cimport _QuadTree +np.import_array() + cdef char* EMPTY_STRING = "" diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index f26db5cc2028d..3229522d21c6e 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -8,6 +8,7 @@ from ..neighbors import NearestNeighbors, kneighbors_graph from ..utils.deprecation import deprecated from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils.graph import graph_shortest_path from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -122,8 +123,8 @@ class Isomap(TransformerMixin, BaseEstimator): .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric framework for nonlinear dimensionality reduction. 
Science 290 (5500) """ - - def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', p=2, metric_params=None): diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 7b46d51df718d..a1db0c43daccb 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -14,6 +14,7 @@ from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors @@ -97,7 +98,7 @@ def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): sklearn.neighbors.kneighbors_graph sklearn.neighbors.radius_neighbors_graph """ - knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit(X) + knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X) X = knn._fit_X n_samples = knn.n_samples_fit_ ind = knn.kneighbors(X, return_distance=False)[:, 1:] @@ -183,10 +184,11 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver) +@_deprecate_positional_args def locally_linear_embedding( - X, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', tol=1e-6, - max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, - random_state=None, n_jobs=None): + X, *, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', + tol=1e-6, max_iter=100, method='standard', hessian_tol=1E-4, + modified_tol=1E-12, random_state=None, n_jobs=None): """Perform a Locally Linear Embedding analysis on the data. Read more in the :ref:`User Guide `. @@ -628,8 +630,8 @@ class LocallyLinearEmbedding(TransformerMixin, dimensionality reduction via tangent space alignment. Journal of Shanghai Univ. 
8:406 (2004) """ - - def __init__(self, n_neighbors=5, n_components=2, reg=1E-3, + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, n_components=2, reg=1E-3, eigen_solver='auto', tol=1E-6, max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, neighbors_algorithm='auto', random_state=None, n_jobs=None): @@ -647,7 +649,7 @@ def __init__(self, n_neighbors=5, n_components=2, reg=1E-3, self.n_jobs = n_jobs def _fit_transform(self, X): - self.nbrs_ = NearestNeighbors(self.n_neighbors, + self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, n_jobs=self.n_jobs) @@ -656,7 +658,8 @@ def _fit_transform(self, X): self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( - self.nbrs_, self.n_neighbors, self.n_components, + X=self.nbrs_, n_neighbors=self.n_neighbors, + n_components=self.n_components, eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, method=self.method, hessian_tol=self.hessian_tol, modified_tol=self.modified_tol, diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ca8c08ed69f98..0314007264689 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -14,6 +14,7 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression +from ..utils.validation import _deprecate_positional_args def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, @@ -129,9 +130,10 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, return X, stress, it + 1 -def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8, - n_jobs=None, max_iter=300, verbose=0, eps=1e-3, random_state=None, - return_n_iter=False): +@_deprecate_positional_args +def smacof(dissimilarities, *, metric=True, n_components=2, init=None, + n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3, + random_state=None, return_n_iter=False): """Computes multidimensional scaling using the SMACOF algorithm. The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a @@ -357,7 +359,8 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ - def __init__(self, n_components=2, metric=True, n_init=4, + @_deprecate_positional_args + def __init__(self, n_components=2, *, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, random_state=None, dissimilarity="euclidean"): self.n_components = n_components diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index caac2236e1dd6..a42c97bb5d6b4 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -20,6 +20,7 @@ from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph, NearestNeighbors +from ..utils.validation import _deprecate_positional_args def _graph_connected_component(graph, node_id): @@ -132,7 +133,8 @@ def _set_diag(laplacian, value, norm_laplacian): return laplacian -def spectral_embedding(adjacency, n_components=8, eigen_solver=None, +@_deprecate_positional_args +def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True): """Project the sample on the first eigenvectors of the graph Laplacian. 
@@ -299,7 +301,8 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None, # matrix to the solver and afterward set it back to the original. diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) laplacian += diag_shift - ml = smoothed_aggregation_solver(check_array(laplacian, 'csr')) + ml = smoothed_aggregation_solver(check_array(laplacian, + accept_sparse='csr')) laplacian -= diag_shift M = ml.aspreconditioner() @@ -440,8 +443,8 @@ class SpectralEmbedding(BaseEstimator): Jianbo Shi, Jitendra Malik http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 """ - - def __init__(self, n_components=2, affinity="nearest_neighbors", + @_deprecate_positional_args + def __init__(self, n_components=2, *, affinity="nearest_neighbors", gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None): self.n_components = n_components diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 53558f6051283..eef67d5460e22 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -19,6 +19,7 @@ from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative +from ..utils.validation import _deprecate_positional_args from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . import _utils @@ -396,7 +397,8 @@ def _gradient_descent(objective, p0, it, n_iter, return p, error, i -def trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean'): +@_deprecate_positional_args +def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'): r"""Expresses to what extent the local structure is retained. The trustworthiness is within [0, 1]. It is defined as @@ -437,6 +439,8 @@ def trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean'): documentation of argument metric in sklearn.pairwise.pairwise_distances for a list of available metrics. + .. 
versionadded:: 0.20 + Returns ------- trustworthiness : float @@ -450,8 +454,8 @@ def trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean'): np.fill_diagonal(dist_X, np.inf) ind_X = np.argsort(dist_X, axis=1) # `ind_X[i]` is the index of sorted distances between i and other samples - ind_X_embedded = NearestNeighbors(n_neighbors).fit(X_embedded).kneighbors( - return_distance=False) + ind_X_embedded = NearestNeighbors(n_neighbors=n_neighbors).fit( + X_embedded).kneighbors(return_distance=False) # We build an inverted index of neighbors in the input space: For sample i, # we define `inverted_index[i]` as the inverted index of sorted distances: @@ -632,7 +636,8 @@ class TSNE(BaseEstimator): # Control the number of iterations between progress checks _N_ITER_CHECK = 50 - def __init__(self, n_components=2, perplexity=30.0, + @_deprecate_positional_args + def __init__(self, n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-7, metric="euclidean", init="random", verbose=0, diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index 676d3676fb8c1..0cc2b0af137cc 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -5,6 +5,10 @@ cimport cython import numpy as np cimport numpy as np from libc.stdio cimport printf + +np.import_array() + + cdef extern from "numpy/npy_math.h": float NPY_INFINITY diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py index 18133719bf85a..9007772674a99 100644 --- a/sklearn/manifold/tests/test_isomap.py +++ b/sklearn/manifold/tests/test_isomap.py @@ -91,7 +91,7 @@ def test_transform(): X, y = datasets.make_s_curve(n_samples, random_state=0) # Compute isomap embedding - iso = manifold.Isomap(n_components, 2) + iso = manifold.Isomap(n_components=n_components, n_neighbors=2) X_iso = iso.fit_transform(X) # Re-embed a noisy version of the points diff --git a/sklearn/manifold/tests/test_locally_linear.py b/sklearn/manifold/tests/test_locally_linear.py index 0968c5052a1b7..952da3ef41163 100644 --- a/sklearn/manifold/tests/test_locally_linear.py +++ b/sklearn/manifold/tests/test_locally_linear.py @@ -131,7 +131,7 @@ def test_singular_matrix(): M = np.ones((10, 3)) f = ignore_warnings with pytest.raises(ValueError): - f(manifold.locally_linear_embedding(M, 2, 1, + f(manifold.locally_linear_embedding(M, n_neighbors=2, n_components=1, method='standard', eigen_solver='arpack')) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index a916bbe1dd955..2ceccca65203e 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -37,6 +37,7 @@ from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning @@ -121,7 +122,8 @@ def _weighted_sum(sample_score, sample_weight, normalize=False): return sample_score.sum() -def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): +@_deprecate_positional_args +def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): """Accuracy classification score. 
In multilabel classification, this function computes subset accuracy: @@ -193,7 +195,8 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) -def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, +@_deprecate_positional_args +def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None): """Compute confusion matrix to evaluate the accuracy of a classification. @@ -224,6 +227,8 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.18 + normalize : {'true', 'pred', 'all'}, default=None Normalizes confusion matrix over the true (rows), predicted (columns) conditions or all the population. If None, confusion matrix will not be @@ -330,7 +335,8 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, return cm -def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False): """Compute a confusion matrix for each class or sample @@ -533,7 +539,9 @@ def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) -def cohen_kappa_score(y1, y2, labels=None, weights=None, sample_weight=None): +@_deprecate_positional_args +def cohen_kappa_score(y1, y2, *, labels=None, weights=None, + sample_weight=None): r"""Cohen's kappa: a statistic that measures inter-annotator agreement. This function computes Cohen's kappa [1]_, a score that expresses the level @@ -613,7 +621,8 @@ class labels [2]_. return 1 - k -def jaccard_score(y_true, y_pred, labels=None, pos_label=1, +@_deprecate_positional_args +def jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None): """Jaccard similarity coefficient score @@ -752,7 +761,8 @@ def jaccard_score(y_true, y_pred, labels=None, pos_label=1, return np.average(jaccard, weights=weights) -def matthews_corrcoef(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): """Compute the Matthews correlation coefficient (MCC) The Matthews correlation coefficient is used in machine learning as a @@ -781,6 +791,8 @@ def matthews_corrcoef(y_true, y_pred, sample_weight=None): sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.18 + Returns ------- mcc : float @@ -839,7 +851,8 @@ def matthews_corrcoef(y_true, y_pred, sample_weight=None): return mcc -def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): +@_deprecate_positional_args +def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): """Zero-one classification loss. If normalize is ``True``, return the fraction of misclassifications @@ -909,7 +922,8 @@ def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): return n_samples - score -def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', +@_deprecate_positional_args +def f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F1 score, also known as balanced F-score or F-measure @@ -1027,13 +1041,14 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', and ``UndefinedMetricWarning`` will be raised. 
This behavior can be modified with ``zero_division``. """ - return fbeta_score(y_true, y_pred, 1, labels=labels, + return fbeta_score(y_true, y_pred, beta=1, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight, zero_division=zero_division) -def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, +@_deprecate_positional_args +def fbeta_score(y_true, y_pred, *, beta, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F-beta score @@ -1256,7 +1271,8 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): return labels -def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, +@_deprecate_positional_args +def precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', 'f-score'), @@ -1488,7 +1504,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, return precision, recall, f_score, true_sum -def precision_score(y_true, y_pred, labels=None, pos_label=1, +@_deprecate_positional_args +def precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the precision @@ -1607,7 +1624,8 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, return p -def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', +@_deprecate_positional_args +def recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the recall @@ -1724,7 +1742,8 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', return r -def balanced_accuracy_score(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): """Compute the balanced accuracy @@ -1736,6 +1755,8 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- y_true : 1d array-like @@ -1801,7 +1822,8 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None, return score -def classification_report(y_true, y_pred, labels=None, target_names=None, +@_deprecate_positional_args +def classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division="warn"): """Build a text report showing the main classification metrics. @@ -1833,6 +1855,8 @@ def classification_report(y_true, y_pred, labels=None, target_names=None, output_dict : bool (default = False) If True, return output as dict + .. versionadded:: 0.20 + zero_division : "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division. If set to "warn", this acts as 0, but warnings are also raised. @@ -1999,7 +2023,8 @@ class 2 1.00 0.67 0.80 3 return report -def hamming_loss(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def hamming_loss(y_true, y_pred, *, sample_weight=None): """Compute the average Hamming loss. The Hamming loss is the fraction of labels that are incorrectly predicted. 
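As a usage sketch of the keyword-only convention the classification metrics above are moving to (toy labels, invented for illustration):

from sklearn.metrics import fbeta_score, precision_score, classification_report

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1]

# `beta` and the other options must be named; the old positional form
# fbeta_score(y_true, y_pred, 0.5) now goes through the deprecation
# decorator and warns.
f05 = fbeta_score(y_true, y_pred, beta=0.5)
prec = precision_score(y_true, y_pred, pos_label=1)
report = classification_report(y_true, y_pred, target_names=["neg", "pos"])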
@@ -2090,7 +2115,8 @@ def hamming_loss(y_true, y_pred, sample_weight=None): raise ValueError("{0} is not supported".format(y_type)) -def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, +@_deprecate_positional_args +def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None): """Log loss, aka logistic loss or cross-entropy loss. @@ -2134,6 +2160,7 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, If not provided, labels will be inferred from y_true. If ``labels`` is ``None`` and ``y_pred`` has shape (n_samples,) the labels are assumed to be binary and are inferred from ``y_true``. + .. versionadded:: 0.18 Returns @@ -2215,7 +2242,8 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, return _weighted_sum(loss, sample_weight, normalize) -def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): +@_deprecate_positional_args +def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): """Average hinge loss (non-regularized) In binary class case, assuming labels in y_true are encoded with +1 and -1, @@ -2292,7 +2320,7 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): LinearSVC() >>> pred_decision = est.decision_function([[-1], [2], [3]]) >>> y_true = [0, 2, 3] - >>> hinge_loss(y_true, pred_decision, labels) + >>> hinge_loss(y_true, pred_decision, labels=labels) 0.56... """ check_consistent_length(y_true, pred_decision, sample_weight) @@ -2336,7 +2364,8 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): return np.average(losses, weights=sample_weight) -def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None): +@_deprecate_positional_args +def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): """Compute the Brier score. The smaller the Brier score, the better, hence the naming with "loss". diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 8916b523fc273..c858ac3950f86 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -4,6 +4,7 @@ from .. import confusion_matrix from ...utils import check_matplotlib_support +from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -21,8 +22,9 @@ class ConfusionMatrixDisplay: confusion_matrix : ndarray of shape (n_classes, n_classes) Confusion matrix. - display_labels : ndarray of shape (n_classes,) - Display labels for plot. + display_labels : ndarray of shape (n_classes,), default=None + Display labels for plot. If None, display labels are set from 0 to + `n_classes - 1`. Attributes ---------- @@ -39,11 +41,12 @@ class ConfusionMatrixDisplay: figure_ : matplotlib Figure Figure containing the confusion matrix. """ - def __init__(self, confusion_matrix, display_labels): + def __init__(self, confusion_matrix, *, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels - def plot(self, include_values=True, cmap='viridis', + @_deprecate_positional_args + def plot(self, *, include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format=None, ax=None): """Plot visualization. 
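A short usage sketch of the new `display_labels=None` default (toy matrix; rendering requires matplotlib):

import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay

cm = np.array([[10, 0],
               [12, 120]])

# with no labels given, the ticks fall back to 0 ... n_classes - 1
ConfusionMatrixDisplay(cm).plot()

# explicit labels are now passed by keyword
ConfusionMatrixDisplay(cm, display_labels=["cat", "dog"]).plot()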
@@ -108,11 +111,16 @@ def plot(self, include_values=True, cmap='viridis', ha="center", va="center", color=color) + if self.display_labels is None: + display_labels = np.arange(n_classes) + else: + display_labels = self.display_labels + fig.colorbar(self.im_, ax=ax) ax.set(xticks=np.arange(n_classes), yticks=np.arange(n_classes), - xticklabels=self.display_labels, - yticklabels=self.display_labels, + xticklabels=display_labels, + yticklabels=display_labels, ylabel="True label", xlabel="Predicted label") @@ -124,7 +132,8 @@ def plot(self, include_values=True, cmap='viridis', return self -def plot_confusion_matrix(estimator, X, y_true, labels=None, +@_deprecate_positional_args +def plot_confusion_matrix(estimator, X, y_true, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index bfec9276f83be..bb2a91c198c41 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -4,6 +4,7 @@ from .. import precision_recall_curve from ...utils import check_matplotlib_support +from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -23,11 +24,11 @@ class PrecisionRecallDisplay: recall : ndarray Recall values. - average_precision : float - Average precision. + average_precision : float, default=None + Average precision. If None, the average precision is not shown. - estimator_name : str - Name of estimator. + estimator_name : str, default=None + Name of estimator. If None, then the estimator name is not shown. Attributes ---------- @@ -40,14 +41,15 @@ class PrecisionRecallDisplay: figure_ : matplotlib Figure Figure containing the curve. """ - - def __init__(self, precision, recall, average_precision, estimator_name): + def __init__(self, precision, recall, *, + average_precision=None, estimator_name=None): self.precision = precision self.recall = recall self.average_precision = average_precision self.estimator_name = estimator_name - def plot(self, ax=None, name=None, **kwargs): + @_deprecate_positional_args + def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization. Extra keyword arguments will be passed to matplotlib's `plot`. @@ -78,23 +80,30 @@ def plot(self, ax=None, name=None, **kwargs): name = self.estimator_name if name is None else name - line_kwargs = { - "label": "{} (AP = {:0.2f})".format(name, - self.average_precision), - "drawstyle": "steps-post" - } + line_kwargs = {"drawstyle": "steps-post"} + if self.average_precision is not None and name is not None: + line_kwargs["label"] = (f"{name} (AP = " + f"{self.average_precision:0.2f})") + elif self.average_precision is not None: + line_kwargs["label"] = (f"AP = " + f"{self.average_precision:0.2f}") + elif name is not None: + line_kwargs["label"] = name line_kwargs.update(**kwargs) self.line_, = ax.plot(self.recall, self.precision, **line_kwargs) ax.set(xlabel="Recall", ylabel="Precision") - ax.legend(loc='lower left') + + if "label" in line_kwargs: + ax.legend(loc='lower left') self.ax_ = ax self.figure_ = ax.figure return self -def plot_precision_recall_curve(estimator, X, y, +@_deprecate_positional_args +def plot_precision_recall_curve(estimator, X, y, *, sample_weight=None, response_method="auto", name=None, ax=None, **kwargs): """Plot Precision Recall Curve for binary classifiers. 
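The legend logic above can be summarised by a usage sketch (numbers invented; matplotlib required):

import numpy as np
from sklearn.metrics import PrecisionRecallDisplay

precision = np.array([1.0, 0.5, 0.0])
recall = np.array([0.0, 0.5, 1.0])

# the label becomes "AP = 0.80", "my_est" or "my_est (AP = 0.80)" depending
# on which optional attributes are set; with neither set, no legend entry
# is drawn at all
PrecisionRecallDisplay(precision, recall, average_precision=0.8).plot()
PrecisionRecallDisplay(precision, recall, estimator_name="my_est").plot()
PrecisionRecallDisplay(precision, recall, average_precision=0.8,
                       estimator_name="my_est").plot()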
diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index d786ac6659d41..21af0aa388b07 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -4,6 +4,7 @@ from .base import _check_classifer_response_method from ...utils import check_matplotlib_support from ...base import is_classifier +from ...utils.validation import _deprecate_positional_args class RocCurveDisplay: @@ -22,11 +23,11 @@ class RocCurveDisplay: tpr : ndarray True positive rate. - roc_auc : float - Area under ROC curve. + roc_auc : float, default=None + Area under ROC curve. If None, the roc_auc score is not shown. - estimator_name : str - Name of estimator. + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. Attributes ---------- @@ -53,14 +54,14 @@ class RocCurveDisplay: >>> display.plot() # doctest: +SKIP >>> plt.show() # doctest: +SKIP """ - - def __init__(self, fpr, tpr, roc_auc, estimator_name): + def __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None): self.fpr = fpr self.tpr = tpr self.roc_auc = roc_auc self.estimator_name = estimator_name - def plot(self, ax=None, name=None, **kwargs): + @_deprecate_positional_args + def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization Extra keyword arguments will be passed to matplotlib's ``plot``. @@ -88,22 +89,30 @@ def plot(self, ax=None, name=None, **kwargs): name = self.estimator_name if name is None else name - line_kwargs = { - 'label': "{} (AUC = {:0.2f})".format(name, self.roc_auc) - } + line_kwargs = {} + if self.roc_auc is not None and name is not None: + line_kwargs["label"] = f"{name} (AUC = {self.roc_auc:0.2f})" + elif self.roc_auc is not None: + line_kwargs["label"] = f"AUC = {self.roc_auc:0.2f}" + elif name is not None: + line_kwargs["label"] = name + line_kwargs.update(**kwargs) self.line_ = ax.plot(self.fpr, self.tpr, **line_kwargs)[0] ax.set_xlabel("False Positive Rate") ax.set_ylabel("True Positive Rate") - ax.legend(loc='lower right') + + if "label" in line_kwargs: + ax.legend(loc='lower right') self.ax_ = ax self.figure_ = ax.figure return self -def plot_roc_curve(estimator, X, y, sample_weight=None, +@_deprecate_positional_args +def plot_roc_curve(estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method="auto", name=None, ax=None, **kwargs): """Plot Receiver operating characteristic (ROC) curve. 
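RocCurveDisplay follows the same optional-label pattern, and the plotting helper itself now takes its options by keyword only. A minimal sketch (toy data; matplotlib required):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_roc_curve

X, y = make_classification(n_samples=200, random_state=0)
clf = LogisticRegression().fit(X, y)

# sample_weight, response_method, name, ax, ... must be named
plot_roc_curve(clf, X, y, response_method="predict_proba", name="logreg")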
diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index b8a24ae15f1e5..e65c12904b757 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -267,16 +267,33 @@ def test_confusion_matrix_text_format(pyplot, data, y_pred, n_classes, def test_confusion_matrix_standard_format(pyplot): cm = np.array([[10000000, 0], [123456, 12345678]]) - plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + plotted_text = ConfusionMatrixDisplay( + cm, display_labels=[False, True]).plot().text_ # Values should be shown as whole numbers 'd', # except the first number which should be shown as 1e+07 (longer length) - # and the last number will be showns as 1.2e+07 (longer length) + # and the last number will be shown as 1.2e+07 (longer length) test = [t.get_text() for t in plotted_text.ravel()] assert test == ['1e+07', '0', '123456', '1.2e+07'] cm = np.array([[0.1, 10], [100, 0.525]]) - plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + plotted_text = ConfusionMatrixDisplay( + cm, display_labels=[False, True]).plot().text_ # Values should now formatted as '.2g', since there's a float in # Values are have two dec places max, (e.g 100 becomes 1e+02) test = [t.get_text() for t in plotted_text.ravel()] assert test == ['0.1', '10', '1e+02', '0.53'] + + +@pytest.mark.parametrize("display_labels, expected_labels", [ + (None, ["0", "1"]), + (["cat", "dog"], ["cat", "dog"]), +]) +def test_default_labels(pyplot, display_labels, expected_labels): + cm = np.array([[10, 0], [12, 120]]) + disp = ConfusionMatrixDisplay(cm, display_labels=display_labels).plot() + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + assert_array_equal(x_ticks, expected_labels) + assert_array_equal(y_ticks, expected_labels) diff --git a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py index f22b112e96dc7..48305a93d0b3f 100644 --- a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py +++ b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py @@ -4,6 +4,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import plot_precision_recall_curve +from sklearn.metrics import PrecisionRecallDisplay from sklearn.metrics import average_precision_score from sklearn.metrics import precision_recall_curve from sklearn.datasets import make_classification @@ -170,3 +171,22 @@ def test_plot_precision_recall_curve_estimator_name_multiple_calls(pyplot): clf_name = "another_name" disp.plot(name=clf_name) assert clf_name in disp.line_.get_label() + + +@pytest.mark.parametrize( + "average_precision, estimator_name, expected_label", + [ + (0.9, None, "AP = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AP = 0.80)"), + ] +) +def test_default_labels(pyplot, average_precision, estimator_name, + expected_label): + prec = np.array([1, 0.5, 0]) + recall = np.array([0, 0.5, 1]) + disp = PrecisionRecallDisplay(prec, recall, + average_precision=average_precision, + estimator_name=estimator_name) + disp.plot() + assert disp.line_.get_label() == expected_label diff --git a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py index 699387ff4cfa3..50e69ad41af8d 100644 --- a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py 
+++ b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py @@ -4,6 +4,7 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import plot_roc_curve +from sklearn.metrics import RocCurveDisplay from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_curve, auc @@ -150,3 +151,20 @@ def test_plot_roc_curve_estimator_name_multiple_calls(pyplot, data_binary): clf_name = "another_name" disp.plot(name=clf_name) assert clf_name in disp.line_.get_label() + + +@pytest.mark.parametrize( + "roc_auc, estimator_name, expected_label", + [ + (0.9, None, "AUC = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AUC = 0.80)") + ] +) +def test_default_labels(pyplot, roc_auc, estimator_name, + expected_label): + fpr = np.array([0, 0.5, 1]) + tpr = np.array([0, 0.5, 1]) + disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, + estimator_name=estimator_name).plot() + assert disp.line_.get_label() == expected_label diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index e525539c0d706..18d948214bbec 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -31,6 +31,7 @@ from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero +from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize from ..preprocessing._label import _encode @@ -101,7 +102,8 @@ def auc(x, y): return area -def average_precision_score(y_true, y_score, average="macro", pos_label=1, +@_deprecate_positional_args +def average_precision_score(y_true, y_score, *, average="macro", pos_label=1, sample_weight=None): """Compute average precision (AP) from prediction scores @@ -243,7 +245,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, +@_deprecate_positional_args +def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None, max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. 
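The ranking metrics get the same treatment; a toy sketch of the keyword-only calls:

from sklearn.metrics import roc_auc_score, average_precision_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]

# average, sample_weight, max_fpr, multi_class, ... are keyword-only
partial_auc = roc_auc_score(y_true, y_score, max_fpr=0.5)
ap = average_precision_score(y_true, y_score, pos_label=1)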
@@ -383,7 +386,7 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, multi_class, average, sample_weight) elif y_type == "binary": labels = np.unique(y_true) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = label_binarize(y_true, classes=labels)[:, 0] return _average_binary_score(partial(_binary_roc_auc_score, max_fpr=max_fpr), y_true, y_score, average, @@ -489,7 +492,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, y_score, average=average) else: # ovr is same as multi-label - y_true_multilabel = label_binarize(y_true, classes) + y_true_multilabel = label_binarize(y_true, classes=classes) return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight) @@ -594,7 +597,8 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): return fps, tps, y_score[threshold_idxs] -def precision_recall_curve(y_true, probas_pred, pos_label=None, +@_deprecate_positional_args +def precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight=None): """Compute precision-recall pairs for different probability thresholds @@ -683,7 +687,8 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None, return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] -def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, +@_deprecate_positional_args +def roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True): """Compute Receiver operating characteristic (ROC) @@ -813,7 +818,9 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, return fpr, tpr, thresholds -def label_ranking_average_precision_score(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def label_ranking_average_precision_score(y_true, y_score, *, + sample_weight=None): """Compute ranking-based average precision Label ranking average precision (LRAP) is the average over each ground @@ -841,6 +848,8 @@ def label_ranking_average_precision_score(y_true, y_score, sample_weight=None): sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. 
versionadded:: 0.20 + Returns ------- score : float @@ -899,7 +908,8 @@ def label_ranking_average_precision_score(y_true, y_score, sample_weight=None): return out -def coverage_error(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def coverage_error(y_true, y_score, *, sample_weight=None): """Coverage error measure Compute how far we need to go through the ranked scores to cover all @@ -958,7 +968,8 @@ def coverage_error(y_true, y_score, sample_weight=None): return np.average(coverage, weights=sample_weight) -def label_ranking_loss(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def label_ranking_loss(y_true, y_score, *, sample_weight=None): """Compute Ranking loss measure Compute the average number of label pairs that are incorrectly ordered @@ -1022,7 +1033,7 @@ def label_ranking_loss(y_true, y_score, sample_weight=None): unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) + minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank # if the scores are ordered, it's possible to count the number of @@ -1163,7 +1174,8 @@ def _check_dcg_target_type(y_true): supported_fmt, y_type)) -def dcg_score(y_true, y_score, k=None, +@_deprecate_positional_args +def dcg_score(y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False): """Compute Discounted Cumulative Gain. @@ -1239,22 +1251,22 @@ def dcg_score(y_true, y_score, k=None, >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) >>> # we predict scores for the answers >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) - >>> dcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores) 9.49... >>> # we can set k to truncate the sum; only top k answers contribute - >>> dcg_score(true_relevance, scores, k=2) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores, k=2) 5.63... >>> # now we have some ties in our prediction >>> scores = np.asarray([[1, 0, 0, 0, 1]]) >>> # by default ties are averaged, so here we get the average true >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5 - >>> dcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores, k=1) 7.5 >>> # we can choose to ignore ties for faster results, but only >>> # if we know there aren't ties in our scores, otherwise we get >>> # wrong results: >>> dcg_score(true_relevance, - ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS + ... scores, k=1, ignore_ties=True) 5.0 """ @@ -1320,7 +1332,9 @@ def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): return gain -def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): +@_deprecate_positional_args +def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, + ignore_ties=False): """Compute Normalized Discounted Cumulative Gain. Sum the true scores ranked in the order induced by the predicted scores, @@ -1387,29 +1401,29 @@ def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) >>> # we predict some scores (relevance) for the answers >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) - >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores) 0.69... 
>>> scores = np.asarray([[.05, 1.1, 1., .5, .0]]) - >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores) 0.49... >>> # we can set k to truncate the sum; only top k answers contribute. - >>> ndcg_score(true_relevance, scores, k=4) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores, k=4) 0.35... >>> # the normalization takes k into account so a perfect answer >>> # would still get 1.0 - >>> ndcg_score(true_relevance, true_relevance, k=4) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, true_relevance, k=4) 1.0 >>> # now we have some ties in our prediction >>> scores = np.asarray([[1, 0, 0, 0, 1]]) >>> # by default ties are averaged, so here we get the average (normalized) >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75 - >>> ndcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores, k=1) 0.75 >>> # we can choose to ignore ties for faster results, but only >>> # if we know there aren't ties in our scores, otherwise we get >>> # wrong results: >>> ndcg_score(true_relevance, - ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS + ... scores, k=1, ignore_ties=True) 0.5 """ diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 6026a5293806a..afbb469072cf5 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -29,6 +29,7 @@ from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning @@ -117,7 +118,8 @@ def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): return y_type, y_true, y_pred, multioutput -def mean_absolute_error(y_true, y_pred, +@_deprecate_positional_args +def mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Mean absolute error regression loss @@ -188,7 +190,8 @@ def mean_absolute_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) -def mean_squared_error(y_true, y_pred, +@_deprecate_positional_args +def mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True): """Mean squared error regression loss @@ -263,7 +266,8 @@ def mean_squared_error(y_true, y_pred, return mse if squared else np.sqrt(mse) -def mean_squared_log_error(y_true, y_pred, +@_deprecate_positional_args +def mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Mean squared logarithmic error regression loss @@ -326,10 +330,12 @@ def mean_squared_log_error(y_true, y_pred, "targets contain negative values.") return mean_squared_error(np.log1p(y_true), np.log1p(y_pred), - sample_weight, multioutput) + sample_weight=sample_weight, + multioutput=multioutput) -def median_absolute_error(y_true, y_pred, multioutput='uniform_average'): +@_deprecate_positional_args +def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average'): """Median absolute error regression loss Median absolute error output is non-negative floating point. 
The best value @@ -392,7 +398,8 @@ def median_absolute_error(y_true, y_pred, multioutput='uniform_average'): return np.average(output_errors, weights=multioutput) -def explained_variance_score(y_true, y_pred, +@_deprecate_positional_args +def explained_variance_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Explained variance regression score function @@ -484,7 +491,8 @@ def explained_variance_score(y_true, y_pred, return np.average(output_scores, weights=avg_weights) -def r2_score(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"): """R^2 (coefficient of determination) regression score function. @@ -655,7 +663,8 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): +@_deprecate_positional_args +def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. Read more in the :ref:`User Guide `. @@ -719,7 +728,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): return np.average(dev, weights=sample_weight) -def mean_poisson_deviance(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with @@ -756,7 +766,8 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): ) -def mean_gamma_deviance(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 3df175c2ca306..400e92c158ca8 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -43,6 +43,7 @@ from .cluster import fowlkes_mallows_score from ..utils.multiclass import type_of_target +from ..utils.validation import _deprecate_positional_args from ..base import is_regressor @@ -371,7 +372,8 @@ def _passthrough_scorer(estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -def check_scoring(estimator, scoring=None, allow_none=False): +@_deprecate_positional_args +def check_scoring(estimator, scoring=None, *, allow_none=False): """Determine scorer from user options. A TypeError will be thrown if the estimator cannot be scored. @@ -528,7 +530,8 @@ def _check_multimetric_scoring(estimator, scoring=None): return scorers, True -def make_scorer(score_func, greater_is_better=True, needs_proba=False, +@_deprecate_positional_args +def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. 
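Since `make_scorer` keeps its trailing `**kwargs`, fixed metric parameters are still forwarded to the metric while the scorer's own flags become keyword-only. A brief sketch (scorer names invented):

from sklearn.metrics import make_scorer, fbeta_score, mean_squared_error

# extra kwargs (here beta=2) are forwarded to the metric itself
ftwo_scorer = make_scorer(fbeta_score, beta=2)

# make_scorer's own options must be named
neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# the regression metrics above follow the same convention, e.g.
rmse = mean_squared_error([3.0, 2.0, 7.0], [2.5, 2.0, 8.0], squared=False)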
diff --git a/sklearn/metrics/cluster/_bicluster.py b/sklearn/metrics/cluster/_bicluster.py index f8d8d18e9f6b0..ac0d0a454a74a 100644 --- a/sklearn/metrics/cluster/_bicluster.py +++ b/sklearn/metrics/cluster/_bicluster.py @@ -2,6 +2,7 @@ from scipy.optimize import linear_sum_assignment from ...utils.validation import check_consistent_length, check_array +from ...utils.validation import _deprecate_positional_args __all__ = ["consensus_score"] @@ -44,7 +45,8 @@ def _pairwise_similarity(a, b, similarity): return result -def consensus_score(a, b, similarity="jaccard"): +@_deprecate_positional_args +def consensus_score(a, b, *, similarity="jaccard"): """The similarity of two sets of biclusters. Similarity between individual biclusters is computed. Then the diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 973c45a908bf1..d652737bd23c0 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -23,6 +23,7 @@ from ._expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array, check_consistent_length +from ...utils.validation import _deprecate_positional_args from ...utils.fixes import _astype_copy_false @@ -77,7 +78,8 @@ def _generalized_average(U, V, average_method): "'arithmetic', or 'max'") -def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): +@_deprecate_positional_args +def contingency_matrix(labels_true, labels_pred, *, eps=None, sparse=False): """Build a contingency matrix describing the relationship between labels. Parameters @@ -241,7 +243,8 @@ def adjusted_rand_score(labels_true, labels_pred): return (sum_comb - prod_comb) / (mean_comb - prod_comb) -def homogeneity_completeness_v_measure(labels_true, labels_pred, beta=1.0): +@_deprecate_positional_args +def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): """Compute the homogeneity and completeness and V-Measure scores at once. Those metrics are based on normalized conditional entropy measures of @@ -463,7 +466,8 @@ def completeness_score(labels_true, labels_pred): return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] -def v_measure_score(labels_true, labels_pred, beta=1.0): +@_deprecate_positional_args +def v_measure_score(labels_true, labels_pred, *, beta=1.0): """V-measure cluster labeling given a ground truth. This score is identical to :func:`normalized_mutual_info_score` with @@ -563,7 +567,8 @@ def v_measure_score(labels_true, labels_pred, beta=1.0): beta=beta)[2] -def mutual_info_score(labels_true, labels_pred, contingency=None): +@_deprecate_positional_args +def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. The Mutual Information is a measure of the similarity between two labels of @@ -649,7 +654,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): return np.clip(mi.sum(), 0.0, None) -def adjusted_mutual_info_score(labels_true, labels_pred, +@_deprecate_positional_args +def adjusted_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Adjusted Mutual Information between two clusterings. @@ -770,7 +776,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, return ami -def normalized_mutual_info_score(labels_true, labels_pred, +@_deprecate_positional_args +def normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Normalized Mutual Information between two clusterings. 
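A toy sketch of the clustering metrics with their now keyword-only options (labels invented):

from sklearn.metrics import v_measure_score, mutual_info_score
from sklearn.metrics.cluster import contingency_matrix

labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [0, 0, 1, 2, 2, 2]

# beta, contingency, sparse, average_method, ... must be named
v = v_measure_score(labels_true, labels_pred, beta=0.5)
mi = mutual_info_score(labels_true, labels_pred)
C = contingency_matrix(labels_true, labels_pred, sparse=True)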
@@ -870,9 +877,12 @@ def normalized_mutual_info_score(labels_true, labels_pred, return nmi -def fowlkes_mallows_score(labels_true, labels_pred, sparse=False): +@_deprecate_positional_args +def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): """Measure the similarity of two clusterings of a set of points. + .. versionadded:: 0.18 + The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of the precision and recall:: diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index 8841df701c69f..ce5563c4763d3 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -16,6 +16,7 @@ from ..pairwise import pairwise_distances_chunked from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder +from ...utils.validation import _deprecate_positional_args def check_number_of_labels(n_labels, n_samples): @@ -34,7 +35,8 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) -def silhouette_score(X, labels, metric='euclidean', sample_size=None, +@_deprecate_positional_args +def silhouette_score(X, labels, *, metric='euclidean', sample_size=None, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. @@ -147,7 +149,8 @@ def _silhouette_reduce(D_chunk, start, labels, label_freqs): return intra_clust_dists, inter_clust_dists -def silhouette_samples(X, labels, metric='euclidean', **kwds): +@_deprecate_positional_args +def silhouette_samples(X, labels, *, metric='euclidean', **kwds): """Compute the Silhouette Coefficient for each sample. The Silhouette Coefficient is a measure of how well samples are clustered @@ -309,6 +312,8 @@ def davies_bouldin_score(X, labels): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- X : array-like, shape (``n_samples``, ``n_features``) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 25646acb49ea7..2e1332d18a20c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,6 +28,7 @@ from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..utils._mask import _get_mask +from ..utils.validation import _deprecate_positional_args from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -58,7 +59,8 @@ def _return_float_dtype(X, Y): return X, Y, dtype -def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, +@_deprecate_positional_args +def check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, accept_sparse='csr', force_all_finite=True, copy=False): """ Set X and Y appropriately and checks inputs @@ -98,17 +100,20 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, raise an error. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.22 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. 
versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + copy : bool Whether a forced copy will be triggered. If copy=False, a copy might be triggered by a conversion. @@ -192,7 +197,8 @@ def check_paired_arrays(X, Y): # Pairwise distances -def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, +@_deprecate_positional_args +def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None): """ Considering the rows of X (and Y=X) as vectors, compute the @@ -313,7 +319,8 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -def nan_euclidean_distances(X, Y=None, squared=False, +@_deprecate_positional_args +def nan_euclidean_distances(X, Y=None, *, squared=False, missing_values=np.nan, copy=True): """Calculate the euclidean distances in the presence of missing values. @@ -503,7 +510,8 @@ def _argmin_min_reduce(dist, start): return indices, values -def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", +@_deprecate_positional_args +def pairwise_distances_argmin_min(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -589,7 +597,8 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", return indices, values -def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", +@_deprecate_positional_args +def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -659,7 +668,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min(X, Y, axis, metric, + return pairwise_distances_argmin_min(X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs)[0] @@ -711,7 +720,8 @@ def haversine_distances(X, Y=None): return DistanceMetric.get_metric('haversine').pairwise(X, Y) -def manhattan_distances(X, Y=None, sum_over_features=True): +@_deprecate_positional_args +def manhattan_distances(X, Y=None, *, sum_over_features=True): """ Compute the L1 distances between the vectors in X and Y. With sum_over_features equal to False it returns the componentwise @@ -908,7 +918,8 @@ def paired_cosine_distances(X, Y): 'cityblock': paired_manhattan_distances} -def paired_distances(X, Y, metric="euclidean", **kwds): +@_deprecate_positional_args +def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Computes the paired distances between X and Y. 
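Reviewer note: the internal call inside pairwise_distances_argmin above has to be rewritten because axis and metric become keyword-only; user code follows the same pattern (the toy arrays below are illustrative only):

    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances_argmin_min

    X = np.array([[0.0, 0.0], [1.0, 1.0]])
    Y = np.array([[0.0, 1.0], [2.0, 2.0]])

    # keyword form; pairwise_distances_argmin_min(X, Y, 1, "euclidean") would
    # now emit a FutureWarning during the deprecation window
    indices, distances = pairwise_distances_argmin_min(X, Y, axis=1,
                                                       metric="euclidean")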
@@ -1433,18 +1444,25 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): if X is Y: V = np.var(X, axis=0, ddof=1) else: + warnings.warn("from version 0.25, pairwise_distances for " + "metric='seuclidean' will require V to be " + "specified if Y is passed.", FutureWarning) V = np.var(np.vstack([X, Y]), axis=0, ddof=1) return {'V': V} if metric == "mahalanobis" and 'VI' not in kwds: if X is Y: VI = np.linalg.inv(np.cov(X.T)).T else: + warnings.warn("from version 0.25, pairwise_distances for " + "metric='mahalanobis' will require VI to be " + "specified if Y is passed.", FutureWarning) VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T return {'VI': VI} return {} -def pairwise_distances_chunked(X, Y=None, reduce_func=None, +@_deprecate_positional_args +def pairwise_distances_chunked(X, Y=None, *, reduce_func=None, metric='euclidean', n_jobs=None, working_memory=None, **kwds): """Generate a distance matrix chunk by chunk with optional reduction @@ -1606,7 +1624,8 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, yield D_chunk -def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, +@_deprecate_positional_args +def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds): """ Compute the distance matrix from a vector array X and optional Y. @@ -1675,15 +1694,19 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, for more details. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.22 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` **kwds : optional keyword parameters Any further parameters are passed directly to the distance function. @@ -1820,7 +1843,8 @@ def kernel_metrics(): } -def pairwise_kernels(X, Y=None, metric="linear", filter_params=False, +@_deprecate_positional_args +def pairwise_kernels(X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds): """Compute the kernel between arrays X and optional array Y. 
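Reviewer note: the new FutureWarning in _precompute_metric_params means that calling pairwise_distances with metric='seuclidean' (or 'mahalanobis') and a distinct Y now warns unless V (or VI) is supplied explicitly. Pre-computing V from the stacked data, which mirrors today's default, keeps the current behaviour and avoids the warning; the random data below is purely illustrative:

    import numpy as np
    from sklearn.metrics import pairwise_distances

    X = np.random.RandomState(0).rand(5, 3)
    Y = np.random.RandomState(1).rand(4, 3)

    V = np.var(np.vstack([X, Y]), axis=0, ddof=1)  # same default the code computes today
    D = pairwise_distances(X, Y, metric="seuclidean", V=V)  # no FutureWarning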
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index be6364e63b2cd..1f959d95ce844 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -638,7 +638,7 @@ def test_matthews_corrcoef_against_jurman(): for k in range(N) ]) mcc_jurman = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) - mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight) + mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight) assert_almost_equal(mcc_ours, mcc_jurman, 10) @@ -654,7 +654,7 @@ def test_matthews_corrcoef(): y_true_inv = ["b" if i == "a" else "a" for i in y_true] assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1) - y_true_inv2 = label_binarize(y_true, ["a", "b"]) + y_true_inv2 = label_binarize(y_true, classes=["a", "b"]) y_true_inv2 = np.where(y_true_inv2, 'a', 'b') assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1) @@ -725,7 +725,8 @@ def test_matthews_corrcoef_multiclass(): y_true = [0, 0, 1, 1, 2] y_pred = [1, 1, 0, 0, 2] sample_weight = [1, 1, 1, 1, 0] - assert_almost_equal(matthews_corrcoef(y_true, y_pred, sample_weight), -1) + assert_almost_equal(matthews_corrcoef(y_true, y_pred, + sample_weight=sample_weight), -1) # For the zero vector case, the corrcoef cannot be calculated and should # result in a RuntimeWarning @@ -734,7 +735,7 @@ def test_matthews_corrcoef_multiclass(): sample_weight = [1, 1, 0, 0] mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered', matthews_corrcoef, y_true, y_pred, - sample_weight) + sample_weight=sample_weight) # But will output 0 assert_almost_equal(mcc, 0.) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index fdff2c4c3959e..d7a96de12c9e3 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -350,7 +350,7 @@ def test_pairwise_kernels_filter_param(): assert_array_almost_equal(K, K2) with pytest.raises(TypeError): - pairwise_kernels(X, Y, "rbf", **params) + pairwise_kernels(X, Y, metric="rbf", **params) @pytest.mark.parametrize('metric, func', PAIRED_DISTANCES.items()) @@ -1281,8 +1281,16 @@ def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} expected_dist_explicit_params = cdist(X, Y, metric=metric, **params) - dist = np.vstack(tuple(dist_function(X, Y, - metric=metric, n_jobs=n_jobs))) + # TODO: Remove warn_checker in 0.25 + if y_is_x: + warn_checker = pytest.warns(None) + else: + warn_checker = pytest.warns(FutureWarning, + match="to be specified if Y is passed") + with warn_checker: + dist = np.vstack(tuple(dist_function(X, Y, + metric=metric, + n_jobs=n_jobs))) assert_allclose(dist, expected_dist_explicit_params) assert_allclose(dist, expected_dist_default_params) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 64e88f37ed2bc..189d36ae88328 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -190,11 +190,11 @@ def check_scoring_validator_for_single_metric_usecases(scoring_validator): with pytest.raises(TypeError, match=pattern): scoring_validator(estimator) - scorer = scoring_validator(estimator, "accuracy") + scorer = scoring_validator(estimator, scoring="accuracy") assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFit() - scorer = scoring_validator(estimator, "accuracy") + scorer 
= scoring_validator(estimator, scoring="accuracy") assert isinstance(scorer, _PredictScorer) # Test the allow_none parameter for check_scoring alone @@ -274,11 +274,11 @@ def test_check_scoring_gridsearchcv(): # slightly redundant non-regression test. grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3) - scorer = check_scoring(grid, "f1") + scorer = check_scoring(grid, scoring="f1") assert isinstance(scorer, _PredictScorer) pipe = make_pipeline(LinearSVC()) - scorer = check_scoring(pipe, "f1") + scorer = check_scoring(pipe, scoring="f1") assert isinstance(scorer, _PredictScorer) # check that cross_val_score definitely calls the scorer @@ -544,13 +544,13 @@ def test_scorer_memmap_input(name): def test_scoring_is_not_metric(): with pytest.raises(ValueError, match='make_scorer'): - check_scoring(LogisticRegression(), f1_score) + check_scoring(LogisticRegression(), scoring=f1_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(LogisticRegression(), roc_auc_score) + check_scoring(LogisticRegression(), scoring=roc_auc_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(Ridge(), r2_score) + check_scoring(Ridge(), scoring=r2_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(KMeans(), cluster_module.adjusted_rand_score) + check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score) def test_deprecated_scorer(): diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index c68fa260faee3..648fb8d903d38 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -15,6 +15,7 @@ from ._gaussian_mixture import _estimate_gaussian_parameters from ._gaussian_mixture import _estimate_log_gaussian_prob from ..utils import check_array +from ..utils.validation import _deprecate_positional_args def _log_dirichlet_norm(dirichlet_concentration): @@ -307,8 +308,8 @@ class BayesianGaussianMixture(BaseMixture): inference for Dirichlet process mixtures". Bayesian analysis 1.1 `_ """ - - def __init__(self, n_components=1, covariance_type='full', tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, n_components=1, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=None, diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 277f65f929eac..596e66f6a4e64 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -11,6 +11,7 @@ from ._base import BaseMixture, _check_shape from ..utils import check_array from ..utils.extmath import row_norms +from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -585,8 +586,8 @@ class GaussianMixture(BaseMixture): BayesianGaussianMixture : Gaussian mixture model fit with a variational inference. 
""" - - def __init__(self, n_components=1, covariance_type='full', tol=1e-3, + @_deprecate_positional_args + def __init__(self, n_components=1, *, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 3e5b85ed73a02..27bf36d38e6c0 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -34,6 +34,7 @@ from ..utils import check_random_state from ..utils.random import sample_without_replacement from ..utils.validation import indexable, check_is_fitted, _check_fit_params +from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import if_delegate_has_method from ..metrics._scorer import _check_multimetric_scoring from ..metrics import check_scoring @@ -189,7 +190,7 @@ class ParameterSampler: It is highly recommended to use continuous distributions for continuous parameters. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -234,7 +235,8 @@ class ParameterSampler: ... {'b': 1.038159, 'a': 2}] True """ - def __init__(self, param_distributions, n_iter, random_state=None): + @_deprecate_positional_args + def __init__(self, param_distributions, n_iter, *, random_state=None): if not isinstance(param_distributions, (Mapping, Iterable)): raise TypeError('Parameter distribution is not a dict or ' 'a list ({!r})'.format(param_distributions)) @@ -400,9 +402,11 @@ class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, estimator, scoring=None, n_jobs=None, iid='deprecated', - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score=np.nan, return_train_score=True): + @_deprecate_positional_args + def __init__(self, estimator, *, scoring=None, n_jobs=None, + iid='deprecated', refit=True, cv=None, verbose=0, + pre_dispatch='2*n_jobs', error_score=np.nan, + return_train_score=True): self.scoring = scoring self.estimator = estimator @@ -620,7 +624,8 @@ def _run_search(self, evaluate_candidates): """ raise NotImplementedError("_run_search not implemented.") - def fit(self, X, y=None, groups=None, **fit_params): + @_deprecate_positional_args + def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. Parameters @@ -909,6 +914,9 @@ class GridSearchCV(BaseSearchCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + pre_dispatch : int, or str, default=n_jobs Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1002,6 +1010,11 @@ class GridSearchCV(BaseSearchCV): expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + Examples -------- @@ -1126,6 +1139,8 @@ class GridSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. + .. 
versionadded:: 0.20 + Notes ----- The parameters selected are those that maximize the score of the left out @@ -1155,7 +1170,8 @@ class GridSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_grid"] - def __init__(self, estimator, param_grid, scoring=None, + @_deprecate_positional_args + def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, iid='deprecated', refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=False): @@ -1239,6 +1255,9 @@ class RandomizedSearchCV(BaseSearchCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + pre_dispatch : int, or str, default=None Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1268,6 +1287,7 @@ class RandomizedSearchCV(BaseSearchCV): cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: + - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, @@ -1338,6 +1358,11 @@ class RandomizedSearchCV(BaseSearchCV): expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays @@ -1441,6 +1466,8 @@ class RandomizedSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. + .. versionadded:: 0.20 + Notes ----- The parameters selected are those that maximize the score of the held-out @@ -1482,8 +1509,9 @@ class RandomizedSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_distributions"] - def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, - n_jobs=None, iid='deprecated', refit=True, + @_deprecate_positional_args + def __init__(self, estimator, param_distributions, *, n_iter=10, + scoring=None, n_jobs=None, iid='deprecated', refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=False): diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index e728533c3b5cf..9b2087e039f40 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -24,6 +24,7 @@ from ..utils import _approximate_mode from ..utils.validation import _num_samples, column_or_1d from ..utils.validation import check_array +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import type_of_target from ..base import _pprint @@ -50,7 +51,6 @@ class BaseCrossValidator(metaclass=ABCMeta): Implementations must define `_iter_test_masks` or `_iter_test_indices`. """ - def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -270,7 +270,8 @@ class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): """Base class for KFold, GroupKFold, and StratifiedKFold""" @abstractmethod - def __init__(self, n_splits, shuffle, random_state): + @_deprecate_positional_args + def __init__(self, n_splits, *, shuffle, random_state): if not isinstance(n_splits, numbers.Integral): raise ValueError('The number of folds must be of Integral type. ' '%s of type %s was passed.' 
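Reviewer note: for the search estimators and splitters touched here, the estimator, param_grid/param_distributions and n_splits arguments stay positional while everything else becomes keyword-only. A call in the new style (SVC and the grid values are arbitrary choices for the example):

    from sklearn.model_selection import GridSearchCV, KFold
    from sklearn.svm import SVC

    search = GridSearchCV(SVC(), {"C": [0.1, 1.0]}, scoring="accuracy",
                          cv=KFold(5, shuffle=True, random_state=0))
    # GridSearchCV(SVC(), {"C": [0.1, 1.0]}, "accuracy") would now raise a
    # FutureWarning asking for scoring to be passed by keyword.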
@@ -426,10 +427,11 @@ class KFold(_BaseKFold): RepeatedKFold: Repeats K-Fold n times. """ - - def __init__(self, n_splits=5, shuffle=False, + @_deprecate_positional_args + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): - super().__init__(n_splits, shuffle, random_state) + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) def _iter_test_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) @@ -633,9 +635,10 @@ class StratifiedKFold(_BaseKFold): -------- RepeatedStratifiedKFold: Repeats Stratified K-Fold n times. """ - - def __init__(self, n_splits=5, shuffle=False, random_state=None): - super().__init__(n_splits, shuffle, random_state) + @_deprecate_positional_args + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) def _make_test_folds(self, X, y=None): rng = check_random_state(self.random_state) @@ -736,6 +739,8 @@ def split(self, X, y, groups=None): class TimeSeriesSplit(_BaseKFold): """Time Series cross-validator + .. versionadded:: 0.18 + Provides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets. In each split, test indices must be higher than before, and thus shuffling @@ -787,7 +792,8 @@ class TimeSeriesSplit(_BaseKFold): with a test set of size ``n_samples//(n_splits + 1)``, where ``n_samples`` is the number of samples. """ - def __init__(self, n_splits=5, max_train_size=None): + @_deprecate_positional_args + def __init__(self, n_splits=5, *, max_train_size=None): super().__init__(n_splits, shuffle=False, random_state=None) self.max_train_size = max_train_size @@ -1099,7 +1105,8 @@ class _RepeatedSplits(metaclass=ABCMeta): Constructor parameters for cv. Must not contain random_state and shuffle. """ - def __init__(self, cv, n_repeats=10, random_state=None, **cvargs): + @_deprecate_positional_args + def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): if not isinstance(n_repeats, numbers.Integral): raise ValueError("Number of repetitions must be of Integral type.") @@ -1226,9 +1233,11 @@ class RepeatedKFold(_RepeatedSplits): -------- RepeatedStratifiedKFold: Repeats Stratified K-Fold n times. """ - def __init__(self, n_splits=5, n_repeats=10, random_state=None): + @_deprecate_positional_args + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( - KFold, n_repeats, random_state, n_splits=n_splits) + KFold, n_repeats=n_repeats, + random_state=random_state, n_splits=n_splits) class RepeatedStratifiedKFold(_RepeatedSplits): @@ -1280,15 +1289,17 @@ class RepeatedStratifiedKFold(_RepeatedSplits): -------- RepeatedKFold: Repeats K-Fold n times. 
""" - def __init__(self, n_splits=5, n_repeats=10, random_state=None): + @_deprecate_positional_args + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( - StratifiedKFold, n_repeats, random_state, n_splits=n_splits) + StratifiedKFold, n_repeats=n_repeats, random_state=random_state, + n_splits=n_splits) class BaseShuffleSplit(metaclass=ABCMeta): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): self.n_splits = n_splits self.test_size = test_size @@ -1421,7 +1432,8 @@ class ShuffleSplit(BaseShuffleSplit): TRAIN: [3 4 1] TEST: [5 2] TRAIN: [3 5 1] TEST: [2 4] """ - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1510,8 +1522,8 @@ class GroupShuffleSplit(ShuffleSplit): TRAIN: [2 3 4 5 6 7] TEST: [0 1] TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' - - def __init__(self, n_splits=5, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=5, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1626,8 +1638,8 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [4 1 0] TEST: [2 3 5] TRAIN: [0 5 1] TEST: [3 4 2] """ - - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1959,7 +1971,8 @@ def split(self, X=None, y=None, groups=None): yield train, test -def check_cv(cv=5, y=None, classifier=False): +@_deprecate_positional_args +def check_cv(cv=5, y=None, *, classifier=False): """Input checker utility for building a cross-validator Parameters diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ae6151a88727b..dd204ad4a57d0 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -25,6 +25,7 @@ _message_with_time) from ..utils.validation import _check_fit_params from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import _safe_split from ..metrics import check_scoring from ..metrics._scorer import _check_multimetric_scoring, _MultimetricScorer @@ -37,7 +38,8 @@ 'permutation_test_score', 'learning_curve', 'validation_curve'] -def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, +@_deprecate_positional_args +def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=False, return_estimator=False, error_score=np.nan): @@ -134,15 +136,24 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + return_estimator : bool, default=False Whether to return the estimators fitted on each split. + .. versionadded:: 0.20 + error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. 
If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- scores : dict of float arrays of shape (n_splits,) @@ -261,8 +272,9 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, return ret -def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, - n_jobs=None, verbose=0, fit_params=None, +@_deprecate_positional_args +def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, + cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan): """Evaluate a score by cross-validation @@ -351,6 +363,8 @@ def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- scores : array of float, shape=(len(list(cv)),) @@ -487,7 +501,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) + for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights @@ -613,7 +627,8 @@ def _score(estimator, X_test, y_test, scorer): return scores -def cross_val_predict(estimator, X, y=None, groups=None, cv=None, +@_deprecate_positional_args +def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): """Generate cross-validated estimates for each input data point @@ -804,6 +819,9 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, X : array-like of shape (n_samples, n_features) The data to fit. + .. versionchanged:: 0.20 + X is only required to be an object with finite length or shape now + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None The target variable to try to predict in the case of supervised learning. @@ -943,7 +961,8 @@ def _check_is_permutation(indices, n_samples): return True -def permutation_test_score(estimator, X, y, groups=None, cv=None, +@_deprecate_positional_args +def permutation_test_score(estimator, X, y, *, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None): """Evaluate the significance of a cross-validated score with permutations @@ -1083,7 +1102,8 @@ def _shuffle(y, groups, random_state): return _safe_indexing(y, indices) -def learning_curve(estimator, X, y, groups=None, +@_deprecate_positional_args +def learning_curve(estimator, X, y, *, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=None, pre_dispatch="all", verbose=0, shuffle=False, @@ -1186,6 +1206,8 @@ def learning_curve(estimator, X, y, groups=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + return_times : bool, default=False Whether to return the fit and score times. 
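Reviewer note: the cross-validation helpers now take everything after (estimator, X, y) by keyword; a typical call under the new convention (iris and LogisticRegression are arbitrary choices for the example):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    X, y = load_iris(return_X_y=True)
    scores = cross_val_score(LogisticRegression(max_iter=1000), X, y,
                             scoring="accuracy", cv=5)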
@@ -1367,7 +1389,8 @@ def _incremental_fit_estimator(estimator, X, y, classes, train, test, return np.array(ret).T -def validation_curve(estimator, X, y, param_name, param_range, groups=None, +@_deprecate_positional_args +def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, cv=None, scoring=None, n_jobs=None, pre_dispatch="all", verbose=0, error_score=np.nan): """Validation curve. @@ -1379,7 +1402,7 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, will also compute training scores and is merely a utility for plotting the results. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -1449,6 +1472,8 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- train_scores : array of shape (n_ticks, n_cv_folds) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 0205eb8901699..3b984745420f1 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -960,7 +960,7 @@ def test_repeated_kfold_determinstic_split(): def test_get_n_splits_for_repeated_kfold(): n_splits = 3 n_repeats = 4 - rkf = RepeatedKFold(n_splits, n_repeats) + rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats) expected_n_splits = n_splits * n_repeats assert expected_n_splits == rkf.get_n_splits() @@ -968,7 +968,7 @@ def test_get_n_splits_for_repeated_kfold(): def test_get_n_splits_for_repeated_stratified_kfold(): n_splits = 3 n_repeats = 4 - rskf = RepeatedStratifiedKFold(n_splits, n_repeats) + rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats) expected_n_splits = n_splits * n_repeats assert expected_n_splits == rskf.get_n_splits() diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 67b66b6a91431..ec005f51fc260 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -42,7 +42,7 @@ from sklearn.model_selection._validation import _score from sklearn.datasets import make_regression -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.datasets import load_iris from sklearn.datasets import load_digits from sklearn.metrics import explained_variance_score @@ -370,8 +370,8 @@ def test_cross_validate(): for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): # It's okay to evaluate regression metrics on classification too - mse_scorer = check_scoring(est, 'neg_mean_squared_error') - r2_scorer = check_scoring(est, 'r2') + mse_scorer = check_scoring(est, scoring='neg_mean_squared_error') + r2_scorer = check_scoring(est, scoring='r2') train_mse_scores = [] test_mse_scores = [] train_r2_scores = [] @@ -768,7 +768,7 @@ def test_cross_val_score_multilabel(): def test_cross_val_predict(): - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) cv = KFold() est = Ridge() @@ -1251,7 +1251,8 @@ def test_validation_curve_cv_splits_consistency(): X, y = make_classification(n_samples=100, random_state=0) scores1 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples)) # 
The OneTimeSplitter is a non-re-entrant cv splitter. Unless, the @@ -1262,7 +1263,8 @@ def test_validation_curve_cv_splits_consistency(): 2)) scores2 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=KFold(n_splits=n_splits, shuffle=True)) # For scores2, compare the 1st and 2nd parameter's scores @@ -1272,7 +1274,8 @@ def test_validation_curve_cv_splits_consistency(): 2)) scores3 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=KFold(n_splits=n_splits)) # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check. @@ -1679,9 +1682,9 @@ def test_warn_trace(msg): failing_clf, X, y, cv=3, error_score='unvalid-string') assert_raise_message(ValueError, error_message, validation_curve, - failing_clf, X, y, 'parameter', - [FailingClassifier.FAILING_PARAMETER], cv=3, - error_score='unvalid-string') + failing_clf, X, y, param_name='parameter', + param_range=[FailingClassifier.FAILING_PARAMETER], + cv=3, error_score='unvalid-string') assert failing_clf.score() == 0. # FailingClassifier coverage diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 9eeb4248f83fd..1f0bfaf6517b4 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -48,6 +48,7 @@ from .utils.validation import _num_samples from .utils.validation import check_is_fitted from .utils.validation import check_X_y, check_array +from .utils.validation import _deprecate_positional_args from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) @@ -164,6 +165,9 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- estimators_ : list of `n_classes` estimators @@ -201,7 +205,8 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, array([2, 0, 1]) """ - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -390,6 +395,10 @@ def decision_function(self, X): Returns ------- T : array-like of shape (n_samples, n_classes) + + .. versionchanged:: 0.19 + output shape changed to ``(n_samples,)`` to conform to + scikit-learn conventions for binary classification. """ check_is_fitted(self) if len(self.estimators_) == 1: @@ -515,8 +524,8 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Indices of samples used when training the estimators. ``None`` when ``estimator`` does not have ``_pairwise`` attribute. """ - - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -643,6 +652,10 @@ def decision_function(self, X): Returns ------- Y : array-like of shape (n_samples, n_classes) + + .. versionchanged:: 0.19 + output shape changed to ``(n_samples,)`` to conform to + scikit-learn conventions for binary classification. """ check_is_fitted(self) @@ -699,10 +712,9 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): than one-vs-the-rest. random_state : int, RandomState instance or None, optional, default: None - The generator used to initialize the codebook. 
If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + The generator used to initialize the codebook. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. @@ -753,8 +765,8 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Hastie T., Tibshirani R., Friedman J., page 606 (second-edition) 2008. """ - - def __init__(self, estimator, code_size=1.5, random_state=None, + @_deprecate_positional_args + def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.estimator = estimator self.code_size = code_size diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 2f8976a86c8b8..a5ede43f0fe8c 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -25,7 +25,7 @@ from .utils import check_array, check_X_y, check_random_state from .utils.metaestimators import if_delegate_has_method from .utils.validation import (check_is_fitted, has_fit_parameter, - _check_fit_params) + _check_fit_params, _deprecate_positional_args) from .utils.multiclass import check_classification_targets from .utils import deprecated @@ -64,7 +64,8 @@ def _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None, class _MultiOutputEstimator(BaseEstimator, MetaEstimatorMixin, metaclass=ABCMeta): @abstractmethod - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -214,6 +215,8 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): simple strategy for extending regressors that do not natively support multi-target regression. + .. versionadded:: 0.18 + Parameters ---------- estimator : estimator object @@ -229,6 +232,9 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): using `n_jobs>1` can result in slower performance due to the overhead of spawning processes. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- estimators_ : list of ``n_output`` estimators @@ -245,9 +251,9 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): >>> clf.predict(X[[0]]) array([[176..., 35..., 57...]]) """ - - def __init__(self, estimator, n_jobs=None): - super().__init__(estimator, n_jobs) + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): + super().__init__(estimator, n_jobs=n_jobs) @if_delegate_has_method('estimator') def partial_fit(self, X, y, sample_weight=None): @@ -295,6 +301,9 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- classes_ : array, shape = (n_classes,) @@ -315,9 +324,9 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> clf.predict(X[-2:]) array([[1, 1, 0], [1, 1, 1]]) """ - - def __init__(self, estimator, n_jobs=None): - super().__init__(estimator, n_jobs) + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): + super().__init__(estimator, n_jobs=n_jobs) def fit(self, X, Y, sample_weight=None, **fit_params): """Fit the model to data matrix X and targets Y. 
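Reviewer note: for the multiclass/multioutput wrappers the wrapped estimator stays positional and the remaining constructor arguments (n_jobs, code_size, random_state) become keyword-only, for example:

    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OutputCodeClassifier
    from sklearn.multioutput import MultiOutputClassifier

    clf = MultiOutputClassifier(LogisticRegression(), n_jobs=2)
    ecoc = OutputCodeClassifier(LogisticRegression(), code_size=2.0, random_state=0)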
@@ -362,6 +371,11 @@ def predict_proba(self): such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. + + .. versionchanged:: 0.19 + This function now returns a list of arrays where the length of + the list is ``n_outputs``, and each array is (``n_samples``, + ``n_classes``) for that particular output. """ check_is_fitted(self) if not all([hasattr(estimator, "predict_proba") @@ -409,7 +423,9 @@ def _more_tags(self): class _BaseChain(BaseEstimator, metaclass=ABCMeta): - def __init__(self, base_estimator, order=None, cv=None, random_state=None): + @_deprecate_positional_args + def __init__(self, base_estimator, *, order=None, cv=None, + random_state=None): self.base_estimator = base_estimator self.order = order self.cv = cv @@ -689,6 +705,8 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- base_estimator : estimator diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6ef3895ffdb60..e631bb3dcd599 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -32,6 +32,7 @@ from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative, column_or_1d from .utils.validation import _check_sample_weight +from .utils.validation import _deprecate_positional_args __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB'] @@ -139,6 +140,8 @@ class GaussianNB(_BaseNB): Portion of the largest variance of all features that is added to variances for calculation stability. + .. versionadded:: 0.20 + Attributes ---------- class_count_ : ndarray of shape (n_classes,) @@ -177,7 +180,8 @@ class labels known to the classifier [1] """ - def __init__(self, priors=None, var_smoothing=1e-9): + @_deprecate_positional_args + def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing @@ -745,7 +749,8 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior self.class_prior = class_prior @@ -782,6 +787,8 @@ class ComplementNB(_BaseDiscreteNB): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- alpha : float, default=1.0 @@ -847,7 +854,8 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha self.fit_prior = fit_prior @@ -961,7 +969,8 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). 
""" - def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha self.binarize = binarize @@ -1072,7 +1081,8 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior self.class_prior = class_prior diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 945959ef10d9c..a1eebdcf78648 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import BaseEstimator, MultiOutputMixin from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..utils import check_X_y, check_array, gen_even_slices +from ..utils import check_array, gen_even_slices from ..utils import _to_object_array from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted @@ -1104,10 +1104,14 @@ def fit(self, X, y): or [n_samples, n_outputs] """ if not isinstance(X, (KDTree, BallTree)): - X, y = check_X_y(X, y, "csr", multi_output=True) + X, y = self._validate_data(X, y, accept_sparse="csr", + multi_output=True) self._y = y return self._fit(X) + def _more_tags(self): + return {'requires_y': True} + class SupervisedIntegerMixin: def fit(self, X, y): @@ -1124,7 +1128,8 @@ def fit(self, X, y): """ if not isinstance(X, (KDTree, BallTree)): - X, y = check_X_y(X, y, "csr", multi_output=True) + X, y = self._validate_data(X, y, accept_sparse="csr", + multi_output=True) if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: if y.ndim != 1: @@ -1151,6 +1156,9 @@ def fit(self, X, y): return self._fit(X) + def _more_tags(self): + return {'requires_y': True} + class UnsupervisedMixin: def fit(self, X, y=None): diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index ef6a2a2d5d330..599a4e9cc6426 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -239,9 +239,10 @@ cdef NodeData_t[::1] get_memview_NodeData_1D( # Define doc strings, substituting the appropriate class name using # the DOC_DICT variable defined in the pyx files. CLASS_DOC = \ -"""{BinaryTree} for fast generalized N-point problems +""" +{BinaryTree}(X, leaf_size=40, metric='minkowski', **kwargs) -{BinaryTree}(X, leaf_size=40, metric='minkowski', \\**kwargs) +{BinaryTree} for fast generalized N-point problems Parameters ---------- @@ -1159,15 +1160,50 @@ cdef class BinaryTree: self._update_memviews() def get_tree_stats(self): + """ + get_tree_stats(self) + + Get tree status. + + Returns + ------- + tree_stats: tuple of int + (number of trims, number of leaves, number of splits) + """ return (self.n_trims, self.n_leaves, self.n_splits) def reset_n_calls(self): + """ + reset_n_calls(self) + + Reset number of calls to 0. + """ self.n_calls = 0 def get_n_calls(self): + """ + get_n_calls(self) + + Get number of calls. + + Returns + ------- + n_calls: int + number of distance computation calls + """ return self.n_calls def get_arrays(self): + """ + get_arrays(self) + + Get data and node arrays. + + Returns + ------- + arrays: tuple of array + Arrays for storing tree data, index, node data and node bounds. 
+ """ return (self.data_arr, self.idx_array_arr, self.node_data_arr, self.node_bounds_arr) @@ -1362,7 +1398,8 @@ cdef class BinaryTree: def query_radius(self, X, r, int return_distance=False, int count_only=False, int sort_results=False): """ - query_radius(self, X, r, count_only = False): + query_radius(X, r, return_distance=False, + count_only=False, sort_results=False) query the tree for neighbors within a radius r @@ -1694,7 +1731,10 @@ cdef class BinaryTree: return np.exp(log_density_arr) def two_point_correlation(self, X, r, dualtree=False): - """Compute the two-point correlation function + """ + two_point_correlation(X, r, dualtree=False) + + Compute the two-point correlation function Parameters ---------- diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 0580b710afd44..331eb7821a511 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -20,6 +20,7 @@ RadiusNeighborsMixin, SupervisedIntegerMixin from ..base import ClassifierMixin from ..utils import check_array +from ..utils.validation import _deprecate_positional_args class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, @@ -71,10 +72,10 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, default=None @@ -142,7 +143,8 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, + @_deprecate_positional_args + def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): @@ -303,10 +305,10 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
outlier_label : {manual label, 'most_frequent'}, default=None @@ -374,7 +376,8 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, radius=1.0, weights='uniform', + @_deprecate_positional_args + def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, **kwargs): diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 94c67f8ee9fa3..0c24efdd214e6 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -110,8 +110,10 @@ cdef class DistanceMetric: This class provides a uniform interface to fast distance metric functions. The various metrics can be accessed via the :meth:`get_metric` class method and the metric string identifier (see below). - For example, to use the Euclidean distance: + Examples + -------- + >>> from sklearn.neighbors import DistanceMetric >>> dist = DistanceMetric.get_metric('euclidean') >>> X = [[0, 1, 2], [3, 4, 5]] diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 9fc4a6e830cde..6bf8da3f4ef5e 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -9,7 +9,7 @@ from ._base import UnsupervisedMixin from ._unsupervised import NearestNeighbors from ..base import TransformerMixin -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args def _check_params(X, metric, p, metric_params): @@ -37,8 +37,10 @@ def _query_include_self(X, include_self, mode): return X -def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', - p=2, metric_params=None, include_self=False, n_jobs=None): +@_deprecate_positional_args +def kneighbors_graph(X, n_neighbors, *, mode='connectivity', + metric='minkowski', p=2, metric_params=None, + include_self=False, n_jobs=None): """Computes the (weighted) graph of k-Neighbors for points in X Read more in the :ref:`User Guide `. 
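Reviewer note: kneighbors_graph and radius_neighbors_graph keep (X, n_neighbors) and (X, radius) positional while mode, metric, p, etc. become keyword-only, matching the NearestNeighbors(n_neighbors=...) call fixed in the hunk below; a minimal illustration with made-up points:

    import numpy as np
    from sklearn.neighbors import kneighbors_graph

    X = np.array([[0.0], [1.0], [2.5]])
    A = kneighbors_graph(X, 2, mode="connectivity", include_self=False)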
@@ -103,7 +105,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', radius_neighbors_graph """ if not isinstance(X, KNeighborsMixin): - X = NearestNeighbors(n_neighbors, metric=metric, p=p, + X = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs).fit(X) else: _check_params(X, metric, p, metric_params) @@ -112,9 +114,10 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) -def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', - p=2, metric_params=None, include_self=False, - n_jobs=None): +@_deprecate_positional_args +def radius_neighbors_graph(X, radius, *, mode='connectivity', + metric='minkowski', p=2, metric_params=None, + include_self=False, n_jobs=None): """Computes the (weighted) graph of Neighbors for points in X Neighborhoods are restricted the points at a distance lower than @@ -192,8 +195,8 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', return X.radius_neighbors_graph(query, radius, mode) -class KNeighborsTransformer(NeighborsBase, KNeighborsMixin, - UnsupervisedMixin, TransformerMixin): +class KNeighborsTransformer(KNeighborsMixin, UnsupervisedMixin, + TransformerMixin, NeighborsBase): """Transform X into a (weighted) graph of k nearest neighbors The transformed data is a sparse graph as returned by kneighbors_graph. @@ -281,7 +284,8 @@ class KNeighborsTransformer(NeighborsBase, KNeighborsMixin, ... KNeighborsTransformer(n_neighbors=5, mode='distance'), ... Isomap(neighbors_algorithm='precomputed')) """ - def __init__(self, mode='distance', n_neighbors=5, algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): super(KNeighborsTransformer, self).__init__( @@ -335,8 +339,8 @@ def fit_transform(self, X, y=None): return self.fit(X).transform(X) -class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, - UnsupervisedMixin, TransformerMixin): +class RadiusNeighborsTransformer(RadiusNeighborsMixin, UnsupervisedMixin, + TransformerMixin, NeighborsBase): """Transform X into a (weighted) graph of neighbors nearer than a radius The transformed data is a sparse graph as returned by @@ -422,7 +426,8 @@ class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), ... 
DBSCAN(min_samples=30, metric='precomputed')) """ - def __init__(self, mode='distance', radius=1., algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, mode='distance', radius=1., algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): super(RadiusNeighborsTransformer, self).__init__( diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 91a97e2810baa..1a967e301b357 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -9,6 +9,7 @@ from ..base import BaseEstimator from ..utils import check_array, check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE @@ -89,7 +90,8 @@ class KernelDensity(BaseEstimator): >>> log_density array([-1.52955942, -1.51462041, -1.60244657]) """ - def __init__(self, bandwidth=1.0, algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric="euclidean", atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None): self.algorithm = algorithm @@ -146,6 +148,8 @@ def fit(self, X, y=None, sample_weight=None): sample_weight : array_like, shape (n_samples,), optional List of sample weights attached to the data X. + .. versionadded:: 0.20 + Returns ------- self : object diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index fc27b7ed69420..2d456ff3e620f 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -11,13 +11,14 @@ from ..base import OutlierMixin from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils import check_array __all__ = ["LocalOutlierFactor"] -class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, - OutlierMixin): +class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, + OutlierMixin, NeighborsBase): """Unsupervised Outlier Detection using Local Outlier Factor (LOF) The anomaly score of each sample is called Local Outlier Factor. @@ -116,6 +117,8 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, that you should only use predict, decision_function and score_samples on new unseen data and not on the training set. + .. versionadded:: 0.20 + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. @@ -147,6 +150,8 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, case, the offset is defined in such a way we obtain the expected number of outliers in training. + .. versionadded:: 0.20 + Examples -------- >>> import numpy as np @@ -163,7 +168,8 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). LOF: identifying density-based local outliers. In ACM sigmod record. """ - def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30, + @_deprecate_positional_args + def __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination="auto", novelty=False, n_jobs=None): super().__init__( @@ -176,8 +182,9 @@ def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30, @property def fit_predict(self): - """"Fits the model to the training set X and returns the labels. 
+ """Fits the model to the training set X and returns the labels. + **Only available for novelty detection (when novelty is set to True).** Label is 1 for an inlier and -1 for an outlier according to the LOF score and the contamination parameter. @@ -207,7 +214,7 @@ def fit_predict(self): return self._fit_predict def _fit_predict(self, X, y=None): - """"Fits the model to the training set X and returns the labels. + """Fits the model to the training set X and returns the labels. Label is 1 for an inlier and -1 for an outlier according to the LOF score and the contamination parameter. @@ -286,9 +293,9 @@ def fit(self, X, y=None): def predict(self): """Predict the labels (1 inlier, -1 outlier) of X according to LOF. + **Only available for novelty detection (when novelty is set to True).** This method allows to generalize prediction to *new observations* (not - in the training set). Only available for novelty detection (when - novelty is set to True). + in the training set). Parameters ---------- @@ -345,8 +352,8 @@ def decision_function(self): Bigger is better, i.e. large values correspond to inliers. + **Only available for novelty detection (when novelty is set to True).** The shift offset allows a zero threshold for being an outlier. - Only available for novelty detection (when novelty is set to True). The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -381,8 +388,8 @@ def _decision_function(self, X): Bigger is better, i.e. large values correspond to inliers. + **Only available for novelty detection (when novelty is set to True).** The shift offset allows a zero threshold for being an outlier. - Only available for novelty detection (when novelty is set to True). The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -411,7 +418,7 @@ def score_samples(self): It is the opposite as bigger is better, i.e. large values correspond to inliers. - Only available for novelty detection (when novelty is set to True). + **Only available for novelty detection (when novelty is set to True).** The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -447,7 +454,7 @@ def _score_samples(self, X): It is the opposite as bigger is better, i.e. large values correspond to inliers. - Only available for novelty detection (when novelty is set to True). + **Only available for novelty detection (when novelty is set to True).** The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. 
Also, the samples in X are not considered in the neighborhood of any diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index cd87d594281da..8920b2d99ed02 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -23,6 +23,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_scalar +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -161,7 +162,8 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): """ - def __init__(self, n_components=None, init='auto', warm_start=False, + @_deprecate_positional_args + def __init__(self, n_components=None, *, init='auto', warm_start=False, max_iter=50, tol=1e-5, callback=None, verbose=0, random_state=None): self.n_components = n_components @@ -306,7 +308,7 @@ def _validate_params(self, X, y): # Check the preferred dimensionality of the projected space if self.n_components is not None: check_scalar( - self.n_components, 'n_components', numbers.Integral, 1) + self.n_components, 'n_components', numbers.Integral, min_val=1) if self.n_components > X.shape[1]: raise ValueError('The preferred dimensionality of the ' @@ -325,9 +327,9 @@ def _validate_params(self, X, y): .format(X.shape[1], self.components_.shape[1])) - check_scalar(self.max_iter, 'max_iter', numbers.Integral, 1) - check_scalar(self.tol, 'tol', numbers.Real, 0.) - check_scalar(self.verbose, 'verbose', numbers.Integral, 0) + check_scalar(self.max_iter, 'max_iter', numbers.Integral, min_val=1) + check_scalar(self.tol, 'tol', numbers.Real, min_val=0.) + check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0) if self.callback is not None: if not callable(self.callback): @@ -520,3 +522,6 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): sys.stdout.flush() return sign * loss, sign * gradient.ravel() + + def _more_tags(self): + return {'requires_y': True} diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 0fdcd597353f5..62f74940100e7 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -16,6 +16,7 @@ from ..metrics.pairwise import pairwise_distances from ..preprocessing import LabelEncoder from ..utils.validation import check_array, check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import csc_median_axis_0 from ..utils.multiclass import check_classification_targets @@ -41,6 +42,9 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): If the "manhattan" metric is provided, this centroid is the median and for all other metrics, the centroid is now set to be the mean. + .. versionchanged:: 0.19 + ``metric='precomputed'`` was deprecated and now raises an error + shrink_threshold : float, default=None Threshold for shrinking centroids to remove features. 
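The `check_scalar` calls in the `NeighborhoodComponentsAnalysis` hunk above now pass their bounds by keyword. A small caller-side sketch of what those validations do, assuming the helper imported there (`sklearn.utils.validation.check_scalar`, signature `check_scalar(x, name, target_type, min_val=None, max_val=None)`); the exact error wording is not taken from this patch:

import numbers
from sklearn.utils.validation import check_scalar

# Passes silently: 50 is an Integral value and respects min_val=1.
check_scalar(50, 'max_iter', numbers.Integral, min_val=1)

# Violating the keyword-only lower bound raises ValueError.
try:
    check_scalar(0, 'max_iter', numbers.Integral, min_val=1)
except ValueError as exc:
    print(exc)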
@@ -82,7 +86,8 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): """ - def __init__(self, metric='euclidean', shrink_threshold=None): + @_deprecate_positional_args + def __init__(self, metric='euclidean', *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 00d8f10c8880d..845aacbfd4248 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -18,6 +18,7 @@ from ._base import RadiusNeighborsMixin, SupervisedFloatMixin from ..base import RegressorMixin from ..utils import check_array +from ..utils.validation import _deprecate_positional_args class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, @@ -77,10 +78,10 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, default=None @@ -139,7 +140,8 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, weights='uniform', + @_deprecate_positional_args + def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): @@ -253,10 +255,10 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, default=None @@ -307,7 +309,8 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, radius=1.0, weights='uniform', + @_deprecate_positional_args + def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/neighbors/_typedefs.pyx index bbdfd00505b43..789afb4997dd1 100644 --- a/sklearn/neighbors/_typedefs.pyx +++ b/sklearn/neighbors/_typedefs.pyx @@ -4,6 +4,9 @@ import numpy as np cimport numpy as np from libc.math cimport sqrt +np.import_array() + + # use a hack to determine the associated numpy data types # NOTE: the following requires the buffer interface, only available in # numpy 1.5+. We'll choose the DTYPE by hand instead. 
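Nearly every constructor change in this part of the diff follows the same recipe: parameters after the `*` become keyword-only, and the `_deprecate_positional_args` decorator keeps old positional calls working while emitting a FutureWarning. The decorator itself is defined elsewhere in the patch (in `sklearn.utils.validation`); the sketch below is only an illustrative approximation of that mechanism, not the library's implementation:

import warnings
from functools import wraps
from inspect import signature, Parameter


def deprecate_positional_args_sketch(f):
    # Illustrative stand-in for sklearn.utils.validation._deprecate_positional_args.
    sig = signature(f)
    positional = [name for name, p in sig.parameters.items()
                  if p.kind == Parameter.POSITIONAL_OR_KEYWORD]
    keyword_only = [name for name, p in sig.parameters.items()
                    if p.kind == Parameter.KEYWORD_ONLY]

    @wraps(f)
    def inner(*args, **kwargs):
        extra = len(args) - len(positional)
        if extra > 0:
            offending = ", ".join(
                "{}={!r}".format(name, val)
                for name, val in zip(keyword_only[:extra], args[-extra:]))
            warnings.warn("Pass {} as keyword args; passing them positionally "
                          "is deprecated.".format(offending), FutureWarning)
            # Re-map the surplus positional values onto their keyword names.
            kwargs.update(zip(keyword_only[:extra], args[-extra:]))
            args = args[:len(positional)]
        return f(*args, **kwargs)
    return inner


@deprecate_positional_args_sketch
def kneighbors_graph_sketch(X, n_neighbors, *, mode='connectivity', p=2):
    # Hypothetical function used only to demonstrate the calling convention.
    return mode, p


kneighbors_graph_sketch([[0., 1.]], 3, 'distance')       # warns: pass mode='distance'
kneighbors_graph_sketch([[0., 1.]], 3, mode='distance')  # no warning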
diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 20be4f636c2a4..7e120d7587b66 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -3,10 +3,11 @@ from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin from ._base import UnsupervisedMixin +from ..utils.validation import _deprecate_positional_args -class NearestNeighbors(NeighborsBase, KNeighborsMixin, - RadiusNeighborsMixin, UnsupervisedMixin): +class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, + UnsupervisedMixin, NeighborsBase): """Unsupervised learner for implementing neighbor searches. Read more in the :ref:`User Guide `. @@ -43,10 +44,10 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. p : int, default=2 @@ -78,7 +79,7 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, >>> from sklearn.neighbors import NearestNeighbors >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] - >>> neigh = NearestNeighbors(2, 0.4) + >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4) >>> neigh.fit(samples) NearestNeighbors(...) @@ -105,7 +106,8 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, radius=1.0, + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None): super().__init__( diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index e17e8e575f728..cff7ffafe5acd 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -74,7 +74,7 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): for kernel in ['gaussian', 'tophat']: # draw a tophat sample - kde = KernelDensity(bandwidth, kernel=kernel).fit(X) + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) samp = kde.sample(100) assert X.shape == samp.shape @@ -91,7 +91,7 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): # check unsupported kernels for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']: - kde = KernelDensity(bandwidth, kernel=kernel).fit(X) + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) assert_raises(NotImplementedError, kde.sample, 100) # non-regression test: used to return a scalar diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 88e32669777a1..d62b998052656 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1245,9 +1245,9 @@ def custom_metric(x1, x2): return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) X = np.random.RandomState(42).rand(20, 2) - nbrs1 = neighbors.NearestNeighbors(3, algorithm='auto', + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=custom_metric) - nbrs2 = neighbors.NearestNeighbors(3, 
algorithm='brute', + nbrs2 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute', metric=custom_metric) nbrs1.fit(X) @@ -1339,7 +1339,7 @@ def test_non_euclidean_kneighbors(): nbrs_graph = neighbors.kneighbors_graph( X, 3, metric=metric, mode='connectivity', include_self=True).toarray() - nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X) + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X) assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) # Test radiusneighbors_graph @@ -1351,7 +1351,7 @@ def test_non_euclidean_kneighbors(): assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A) # Raise error when wrong parameters are supplied, - X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan') + X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, metric='euclidean') diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 3ec30336c23c1..f9b8fce5eb0c7 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -25,7 +25,7 @@ from ..utils import check_array, column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..utils.multiclass import _check_partial_fit_first_call, unique_labels from ..utils.multiclass import type_of_target from ..utils.optimize import _check_optimize_result @@ -936,7 +936,8 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", learning_rate_init=0.001, power_t=0.5, max_iter=200, @@ -1339,7 +1340,8 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", learning_rate_init=0.001, diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 06e7cc71bad3c..fcb4e90772598 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -20,7 +20,7 @@ from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args class BernoulliRBM(TransformerMixin, BaseEstimator): @@ -106,7 +106,8 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Approximations to the Likelihood Gradient. 
International Conference on Machine Learning (ICML) 2008 """ - def __init__(self, n_components=256, learning_rate=0.1, batch_size=10, + @_deprecate_positional_args + def __init__(self, n_components=256, *, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0, random_state=None): self.n_components = n_components self.learning_rate = learning_rate @@ -356,7 +357,7 @@ def fit(self, X, y=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) batch_slices = list(gen_even_slices(n_batches * self.batch_size, - n_batches, n_samples)) + n_batches, n_samples=n_samples)) verbose = self.verbose begin = time.time() for iteration in range(1, self.n_iter + 1): @@ -375,7 +376,7 @@ def fit(self, X, y=None): def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the decision_function method' } diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c1bbdbd629ff8..6f02cb565e15c 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -18,9 +18,11 @@ from joblib import Parallel, delayed from .base import clone, TransformerMixin +from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import if_delegate_has_method from .utils import Bunch, _print_elapsed_time from .utils.validation import check_memory +from .utils.validation import _deprecate_positional_args from .utils.metaestimators import _BaseComposition @@ -104,7 +106,8 @@ class Pipeline(_BaseComposition): # BaseEstimator interface _required_parameters = ['steps'] - def __init__(self, steps, memory=None, verbose=False): + @_deprecate_positional_args + def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps self.memory = memory self.verbose = verbose @@ -393,6 +396,8 @@ def predict(self, X, **predict_params): transformations in the pipeline are not propagated to the final estimator. + .. versionadded:: 0.20 + Returns ------- y_pred : array-like @@ -619,6 +624,21 @@ def n_features_in_(self): # delegate to first step (which will call _check_is_fitted) return self.steps[0][1].n_features_in_ + def _sk_visual_block_(self): + _, estimators = zip(*self.steps) + + def _get_name(name, est): + if est is None or est == 'passthrough': + return f'{name}: passthrough' + # Is an estimator + return f'{name}: {est.__class__.__name__}' + names = [_get_name(name, est) for name, est in self.steps] + name_details = [str(est) for est in estimators] + return _VisualBlock('serial', estimators, + names=names, + name_details=name_details, + dash_wrapped=False) + def _name_estimators(estimators): """Generate names for estimators.""" @@ -771,6 +791,9 @@ class FeatureUnion(TransformerMixin, _BaseComposition): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + transformer_weights : dict, default=None Multiplicative weights for features per transformer. Keys are transformer names, values the weights. 
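`Pipeline` gains a `_sk_visual_block_` hook above (and `FeatureUnion` gains one further below); these hooks feed the new HTML estimator representation. A short usage sketch of how that display is enabled, relying only on the `config_context(display='diagram')` option and the `_repr_mimebundle_` hook exercised by the tests later in this patch:

from sklearn import config_context
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Default display: plain-text repr only.
assert 'text/html' not in pipe._repr_mimebundle_()

# With display='diagram', notebook front ends also receive an HTML diagram
# built from the _sk_visual_block_ hooks ('serial' for Pipeline steps,
# 'parallel' for FeatureUnion branches).
with config_context(display='diagram'):
    bundle = pipe._repr_mimebundle_()
    assert 'text/plain' in bundle and 'text/html' in bundle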
@@ -797,7 +820,8 @@ class FeatureUnion(TransformerMixin, _BaseComposition): """ _required_parameters = ["transformer_list"] - def __init__(self, transformer_list, n_jobs=None, + @_deprecate_positional_args + def __init__(self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False): self.transformer_list = transformer_list self.n_jobs = n_jobs @@ -996,6 +1020,10 @@ def n_features_in_(self): # X is passed to all transformers so we just delegate to the first one return self.transformer_list[0][1].n_features_in_ + def _sk_visual_block_(self): + names, transformers = zip(*self.transformer_list) + return _VisualBlock('parallel', transformers, names=names) + def make_union(*transformers, **kwargs): """ @@ -1015,6 +1043,9 @@ def make_union(*transformers, **kwargs): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index dd36f8321410f..84fef3f042dc7 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -8,6 +8,7 @@ from scipy.sparse import csr_matrix from numpy cimport ndarray cimport numpy as np +np.import_array() ctypedef np.int32_t INDEX_T ctypedef fused DATA_T: diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index c95351db9d985..cc8776951f114 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -29,7 +29,7 @@ mean_variance_axis, incr_mean_variance_axis, min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, - FLOAT_DTYPES) + FLOAT_DTYPES, _deprecate_positional_args) from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -78,7 +78,8 @@ def _handle_zeros_in_scale(scale, copy=True): return scale -def scale(X, axis=0, with_mean=True, with_std=True, copy=True): +@_deprecate_positional_args +def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): """Standardize a dataset along any axis Center to the mean and component wise scale to unit variance. @@ -291,7 +292,8 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): `. """ - def __init__(self, feature_range=(0, 1), copy=True): + @_deprecate_positional_args + def __init__(self, feature_range=(0, 1), *, copy=True): self.feature_range = feature_range self.copy = copy @@ -435,7 +437,8 @@ def _more_tags(self): return {'allow_nan': True} -def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): +@_deprecate_positional_args +def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): """Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -626,7 +629,8 @@ class StandardScaler(TransformerMixin, BaseEstimator): `. """ # noqa - def __init__(self, copy=True, with_mean=True, with_std=True): + @_deprecate_positional_args + def __init__(self, *, copy=True, with_mean=True, with_std=True): self.with_mean = with_mean self.with_std = with_std self.copy = copy @@ -908,7 +912,8 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): `. 
""" - def __init__(self, copy=True): + @_deprecate_positional_args + def __init__(self, *, copy=True): self.copy = copy def _reset(self): @@ -1024,7 +1029,8 @@ def _more_tags(self): return {'allow_nan': True} -def maxabs_scale(X, axis=0, copy=True): +@_deprecate_positional_args +def maxabs_scale(X, *, axis=0, copy=True): """Scale each feature to the [-1, 1] range without breaking the sparsity. This estimator scales each feature individually such @@ -1172,8 +1178,8 @@ class RobustScaler(TransformerMixin, BaseEstimator): https://en.wikipedia.org/wiki/Median https://en.wikipedia.org/wiki/Interquartile_range """ - - def __init__(self, with_centering=True, with_scaling=True, + @_deprecate_positional_args + def __init__(self, *, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): self.with_centering = with_centering self.with_scaling = with_scaling @@ -1282,7 +1288,8 @@ def _more_tags(self): return {'allow_nan': True} -def robust_scale(X, axis=0, with_centering=True, with_scaling=True, +@_deprecate_positional_args +def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): """Standardize a dataset along any axis @@ -1433,7 +1440,8 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): See :ref:`examples/linear_model/plot_polynomial_interpolation.py ` """ - def __init__(self, degree=2, interaction_only=False, include_bias=True, + @_deprecate_positional_args + def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): self.degree = degree self.interaction_only = interaction_only @@ -1638,7 +1646,8 @@ def transform(self, X): return XP -def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): +@_deprecate_positional_args +def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). Read more in the :ref:`User Guide `. @@ -1698,7 +1707,7 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): else: raise ValueError("'%d' is not a supported axis" % axis) - X = check_array(X, sparse_format, copy=copy, + X = check_array(X, accept_sparse=sparse_format, copy=copy, estimator='the normalize function', dtype=FLOAT_DTYPES) if axis == 0: X = X.T @@ -1797,7 +1806,8 @@ class Normalizer(TransformerMixin, BaseEstimator): normalize: Equivalent function without the estimator API. """ - def __init__(self, norm='l2', copy=True): + @_deprecate_positional_args + def __init__(self, norm='l2', *, copy=True): self.norm = norm self.copy = copy @@ -1833,7 +1843,8 @@ def _more_tags(self): return {'stateless': True} -def binarize(X, threshold=0.0, copy=True): +@_deprecate_positional_args +def binarize(X, *, threshold=0.0, copy=True): """Boolean thresholding of array-like or scipy.sparse matrix Read more in the :ref:`User Guide `. @@ -1931,7 +1942,8 @@ class Binarizer(TransformerMixin, BaseEstimator): binarize: Equivalent function without the estimator API. """ - def __init__(self, threshold=0.0, copy=True): + @_deprecate_positional_args + def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy @@ -2228,7 +2240,8 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): `. 
""" - def __init__(self, n_quantiles=1000, output_distribution='uniform', + @_deprecate_positional_args + def __init__(self, *, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), random_state=None, copy=True): self.n_quantiles = n_quantiles @@ -2560,7 +2573,8 @@ def _more_tags(self): return {'allow_nan': True} -def quantile_transform(X, axis=0, n_quantiles=1000, +@_deprecate_positional_args +def quantile_transform(X, *, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), @@ -2764,7 +2778,8 @@ class PowerTransformer(TransformerMixin, BaseEstimator): .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the Royal Statistical Society B, 26, 211-252 (1964). """ - def __init__(self, method='yeo-johnson', standardize=True, copy=True): + @_deprecate_positional_args + def __init__(self, method='yeo-johnson', *, standardize=True, copy=True): self.method = method self.standardize = standardize self.copy = copy @@ -3034,7 +3049,8 @@ def _more_tags(self): return {'allow_nan': True} -def power_transform(X, method='yeo-johnson', standardize=True, copy=True): +@_deprecate_positional_args +def power_transform(X, method='yeo-johnson', *, standardize=True, copy=True): """ Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. This is useful for diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 67641601e06f5..fa7d574e65ccd 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -16,6 +16,7 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -24,6 +25,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- n_bins : int or array-like, shape (n_features,) (default=5) @@ -113,9 +116,11 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): [-0.5, 2.5, -2.5, -0.5], [ 0.5, 3.5, -1.5, 0.5], [ 0.5, 3.5, -1.5, 1.5]]) + """ - def __init__(self, n_bins=5, encode='onehot', strategy='quantile'): + @_deprecate_positional_args + def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'): self.n_bins = n_bins self.encode = encode self.strategy = strategy diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index aa3d8d9dabca8..3b0e43c151e0c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -8,6 +8,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._label import _encode, _encode_check_unknown @@ -185,6 +186,8 @@ class OneHotEncoder(_BaseEncoder): The used categories can be found in the ``categories_`` attribute. + .. 
versionadded:: 0.20 + drop : {'first', 'if_binary'} or a array-like of shape (n_features,), \ default=None Specifies a methodology to use to drop one of the categories per @@ -292,7 +295,8 @@ class OneHotEncoder(_BaseEncoder): [1., 0., 1., 0.]]) """ - def __init__(self, categories='auto', drop=None, sparse=True, + @_deprecate_positional_args + def __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'): self.categories = categories self.sparse = sparse @@ -601,7 +605,7 @@ class OrdinalEncoder(_BaseEncoder): Read more in the :ref:`User Guide `. - .. versionchanged:: 0.20.1 + .. versionadded:: 0.20 Parameters ---------- @@ -653,7 +657,8 @@ class OrdinalEncoder(_BaseEncoder): ['Female', 2]], dtype=object) """ - def __init__(self, categories='auto', dtype=np.float64): + @_deprecate_positional_args + def __init__(self, *, categories='auto', dtype=np.float64): self.categories = categories self.dtype = dtype diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 9cf365ebb3cdf..c4e6782b7cb19 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -2,6 +2,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import _allclose_dense_sparse +from ..utils.validation import _deprecate_positional_args def _identity(X): @@ -65,9 +66,13 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): kw_args : dict, optional Dictionary of additional keyword arguments to pass to func. + .. versionadded:: 0.18 + inv_kw_args : dict, optional Dictionary of additional keyword arguments to pass to inverse_func. + .. versionadded:: 0.18 + Examples -------- >>> import numpy as np @@ -78,7 +83,9 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): array([[0. , 0.6931...], [1.0986..., 1.3862...]]) """ - def __init__(self, func=None, inverse_func=None, validate=False, + + @_deprecate_positional_args + def __init__(self, func=None, inverse_func=None, *, validate=False, accept_sparse=False, check_inverse=True, kw_args=None, inv_kw_args=None): self.func = func diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 43b6ac642284c..88fad3670cb01 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -21,6 +21,7 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target @@ -396,7 +397,8 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): using a one-hot aka one-of-K scheme. 
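The module-level preprocessing helpers in `_data.py` and `_label.py` get the same keyword-only treatment as the estimators. A brief caller-side sketch of the call style the new signatures expect; the functions are the existing public ones, only the keyword usage is illustrated, and old positional calls keep working with a FutureWarning during the deprecation period:

import numpy as np
from sklearn.preprocessing import minmax_scale, normalize, binarize, label_binarize

X = np.array([[1.0, -1.0, 2.0],
              [2.0, 0.0, 0.0]])

X_mm = minmax_scale(X, feature_range=(0, 1))   # axis and copy are now keyword-only
X_l2 = normalize(X, norm='l2', axis=1)         # axis must be passed by keyword
X_bin = binarize(X, threshold=0.5)             # threshold is now keyword-only

# label_binarize (changed further below) now takes `classes` as a required keyword.
Y = label_binarize([1, 2, 3, 1], classes=[1, 2, 3])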
""" - def __init__(self, neg_label=0, pos_label=1, sparse_output=False): + @_deprecate_positional_args + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) @@ -483,7 +485,7 @@ def transform(self, y): raise ValueError("The object was not fitted with multilabel" " input.") - return label_binarize(y, self.classes_, + return label_binarize(y, classes=self.classes_, pos_label=self.pos_label, neg_label=self.neg_label, sparse_output=self.sparse_output) @@ -541,7 +543,9 @@ def _more_tags(self): return {'X_types': ['1dlabels']} -def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): +@_deprecate_positional_args +def label_binarize(y, *, classes, neg_label=0, pos_label=1, + sparse_output=False): """Binarize labels in a one-vs-all fashion Several regression and binary classification algorithms are @@ -851,7 +855,8 @@ class MultiLabelBinarizer(TransformerMixin, BaseEstimator): using a one-hot aka one-of-K scheme. """ - def __init__(self, classes=None, sparse_output=False): + @_deprecate_positional_args + def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output diff --git a/sklearn/preprocessing/tests/test_common.py b/sklearn/preprocessing/tests/test_common.py index 7938256d482b7..802329fc5ce32 100644 --- a/sklearn/preprocessing/tests/test_common.py +++ b/sklearn/preprocessing/tests/test_common.py @@ -126,3 +126,33 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive): Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) assert len(records) == 0 assert_allclose(Xt_inv_sp.A, Xt_inv_dense) + + +@pytest.mark.parametrize( + "est, func", + [(MaxAbsScaler(), maxabs_scale), + (MinMaxScaler(), minmax_scale), + (StandardScaler(), scale), + (StandardScaler(with_mean=False), scale), + (PowerTransformer('yeo-johnson'), power_transform), + (PowerTransformer('box-cox'), power_transform,), + (QuantileTransformer(n_quantiles=3), quantile_transform), + (RobustScaler(), robust_scale), + (RobustScaler(with_centering=False), robust_scale)] +) +def test_missing_value_pandas_na_support(est, func): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + + X = np.array([[1, 2, 3, np.nan, np.nan, 4, 5, 1], + [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], + [1, 2, 3, 4, 5, 6, 7, 8]]).T + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c']) + X_df['c'] = X_df['c'].astype('int') + + X_trans = est.fit_transform(X) + X_df_trans = est.fit_transform(X_df) + + assert_allclose(X_trans, X_df_trans) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 7999df083631c..f79703610bee5 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2295,7 +2295,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): - power_transform(X_with_negatives, 'box-cox') + power_transform(X_with_negatives, method='box-cox') with pytest.raises(ValueError, match=not_positive_message): pt.transform(np.zeros(X_2d.shape)) @@ -2304,7 +2304,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): - 
power_transform(np.zeros(X_2d.shape), 'box-cox') + power_transform(np.zeros(X_2d.shape), method='box-cox') @pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), @@ -2432,7 +2432,7 @@ def test_power_transformer_fit_transform(method, standardize): if method == 'box-cox': X = np.abs(X) - pt = PowerTransformer(method, standardize) + pt = PowerTransformer(method, standardize=standardize) assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) @@ -2449,7 +2449,7 @@ def test_power_transformer_copy_True(method, standardize): assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) - pt = PowerTransformer(method, standardize, copy=True) + pt = PowerTransformer(method, standardize=standardize, copy=True) pt.fit(X) assert_array_almost_equal(X, X_original) @@ -2477,7 +2477,7 @@ def test_power_transformer_copy_False(method, standardize): assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) - pt = PowerTransformer(method, standardize, copy=False) + pt = PowerTransformer(method, standardize=standardize, copy=False) pt.fit(X) assert_array_almost_equal(X, X_original) # fit didn't change X diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 887fa90c98d61..505c57cb5f1c1 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -178,7 +178,7 @@ def test_label_binarizer_errors(): with pytest.raises(ValueError): LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) with pytest.raises(ValueError): - label_binarize(np.array([[1, 3], [2, 1]]), [1, 2, 3]) + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) @pytest.mark.parametrize( @@ -509,13 +509,13 @@ def check_binarized_results(y, classes, pos_label, neg_label, expected): for sparse_output in [True, False]: if ((pos_label == 0 or neg_label != 0) and sparse_output): with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=neg_label, + label_binarize(y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output) continue # check label_binarize - binarized = label_binarize(y, classes, neg_label=neg_label, + binarized = label_binarize(y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output) assert_array_equal(toarray(binarized), expected) @@ -576,7 +576,7 @@ def test_label_binarize_multiclass(): check_binarized_results(y, classes, pos_label, neg_label, expected) with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=-1, pos_label=pos_label, + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True) @@ -595,7 +595,7 @@ def test_label_binarize_multilabel(): expected) with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=-1, pos_label=pos_label, + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index d18f3bf846901..61eeeea5ef45e 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -39,6 +39,7 @@ from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement from .utils.validation import check_array, check_is_fitted +from .utils.validation import _deprecate_positional_args from .exceptions import DataDimensionalityWarning from .utils import deprecated @@ -310,7 +311,7 @@ class BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod 
- def __init__(self, n_components='auto', eps=0.1, dense_output=False, + def __init__(self, n_components='auto', *, eps=0.1, dense_output=False, random_state=None): self.n_components = n_components self.eps = eps @@ -489,7 +490,8 @@ class GaussianRandomProjection(BaseRandomProjection): SparseRandomProjection """ - def __init__(self, n_components='auto', eps=0.1, random_state=None): + @_deprecate_positional_args + def __init__(self, n_components='auto', *, eps=0.1, random_state=None): super().__init__( n_components=n_components, eps=eps, @@ -626,7 +628,8 @@ class SparseRandomProjection(BaseRandomProjection): https://users.soe.ucsc.edu/~optas/papers/jl.pdf """ - def __init__(self, n_components='auto', density='auto', eps=0.1, + @_deprecate_positional_args + def __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, random_state=None): super().__init__( n_components=n_components, diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index d46dacbe754e4..efa9eb2255ce3 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -68,6 +68,7 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted, check_array +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -105,7 +106,8 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): for more details. """ - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=1e-3, n_jobs=None): self.max_iter = max_iter @@ -129,7 +131,7 @@ def _get_kernel(self, X, y=None): return rbf_kernel(X, y, gamma=self.gamma) elif self.kernel == "knn": if self.nn_fit is None: - self.nn_fit = NearestNeighbors(self.n_neighbors, + self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs).fit(X) if y is None: return self.nn_fit.kneighbors_graph(self.nn_fit._fit_X, @@ -378,7 +380,8 @@ class LabelPropagation(BaseLabelPropagation): _variant = 'propagation' - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, max_iter=1000, tol=1e-3, n_jobs=None): super().__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, max_iter=max_iter, @@ -491,7 +494,8 @@ class LabelSpreading(BaseLabelPropagation): _variant = 'spreading' - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): # this one has different base parameters diff --git a/sklearn/svm/_bounds.py b/sklearn/svm/_bounds.py index 1e1ed8939ce5f..b35728041f6cf 100644 --- a/sklearn/svm/_bounds.py +++ b/sklearn/svm/_bounds.py @@ -6,10 +6,12 @@ from ..preprocessing import LabelBinarizer from ..utils.validation import check_consistent_length, check_array +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot -def l1_min_c(X, y, loss='squared_hinge', fit_intercept=True, +@_deprecate_positional_args +def l1_min_c(X, y, *, loss='squared_hinge', fit_intercept=True, intercept_scaling=1.0): """ Return the lowest bound for C such that for C in (l1_min_C, infinity) diff --git 
a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 46086729af35c..d082c22d0a3bc 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,6 +5,7 @@ from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.deprecation import deprecated @@ -177,9 +178,9 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - - def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=1e-4, - C=1.0, multi_class='ovr', fit_intercept=True, + @_deprecate_positional_args + def __init__(self, penalty='l2', loss='squared_hinge', *, dual=True, + tol=1e-4, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000): self.dual = dual @@ -212,6 +213,8 @@ def fit(self, X, y, sample_weight=None): samples. If not provided, then each sample is given unit weight. + .. versionadded:: 0.18 + Returns ------- self : object @@ -364,7 +367,8 @@ class LinearSVR(RegressorMixin, LinearModel): various loss functions and regularization regimes. """ - def __init__(self, epsilon=0.0, tol=1e-4, C=1.0, + @_deprecate_positional_args + def __init__(self, *, epsilon=0.0, tol=1e-4, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1., dual=True, verbose=0, random_state=None, max_iter=1000): @@ -396,6 +400,8 @@ def fit(self, X, y, sample_weight=None): samples. If not provided, then each sample is given unit weight. + .. versionadded:: 0.18 + Returns ------- self : object @@ -627,7 +633,8 @@ class SVC(BaseSVC): _impl = 'c_svc' - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', @@ -838,7 +845,8 @@ class NuSVC(BaseSVC): _impl = 'nu_svc' - def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, @@ -855,7 +863,7 @@ def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale', def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.' 
@@ -992,7 +1000,8 @@ class SVR(RegressorMixin, BaseLibSVM): _impl = 'epsilon_svr' - def __init__(self, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1): @@ -1137,7 +1146,8 @@ class NuSVR(RegressorMixin, BaseLibSVM): _impl = 'nu_svr' - def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, + @_deprecate_positional_args + def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=1e-3, cache_size=200, verbose=False, max_iter=-1): @@ -1235,6 +1245,8 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): The offset is the opposite of `intercept_` and is provided for consistency with other outlier detection algorithms. + .. versionadded:: 0.20 + fit_status_ : int 0 if correctly fitted, 1 otherwise (will raise warning) @@ -1245,13 +1257,14 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): >>> clf = OneClassSVM(gamma='auto').fit(X) >>> clf.predict(X) array([-1, 1, 1, 1, -1]) - >>> clf.score_samples(X) # doctest: +ELLIPSIS + >>> clf.score_samples(X) array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) """ _impl = 'one_class' - def __init__(self, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): diff --git a/sklearn/svm/_libsvm_sparse.pyx b/sklearn/svm/_libsvm_sparse.pyx index f180560f1d1e7..4b5070a64aad8 100644 --- a/sklearn/svm/_libsvm_sparse.pyx +++ b/sklearn/svm/_libsvm_sparse.pyx @@ -4,6 +4,8 @@ cimport numpy as np from scipy import sparse from ..exceptions import ConvergenceWarning +np.import_array() + cdef extern from *: ctypedef char* const_char_p "const char*" @@ -186,7 +188,7 @@ def libsvm_sparse_train ( int n_features, # copy model.nSV # TODO: do only in classification - cdef np.ndarray n_class_SV + cdef np.ndarray n_class_SV n_class_SV = np.empty(n_class, dtype=np.int32) copy_nSV(n_class_SV.data, model) diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp index cc603b435f655..29a5581c280dc 100644 --- a/sklearn/svm/src/liblinear/linear.cpp +++ b/sklearn/svm/src/liblinear/linear.cpp @@ -26,7 +26,7 @@ Modified 2020: - Improved random number generator by using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets. - Sylvain Marie + Sylvain Marie, Schneider Electric See */ diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index c9a5df10c4924..a5f735d8111cf 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -52,7 +52,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Improved random number generator by using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets. - Sylvain Marie, + Sylvain Marie, Schneider Electric see */ diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index b46861b71e765..7cd7b4c9fbf2b 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -3,7 +3,7 @@ - New random number generator using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets for libsvm and liblinear. 
- Sylvain Marie + Sylvain Marie, Schneider Electric See */ diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 664a39c790b9b..8454ebf64de1a 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -37,11 +37,12 @@ def test_l1_min_c(loss, X_label, Y_label, intercept_label): def test_l1_min_c_l2_loss(): # loss='l2' should raise ValueError assert_raise_message(ValueError, "loss type not in", - l1_min_c, dense_X, Y1, "l2") + l1_min_c, dense_X, Y1, loss="l2") def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None): - min_c = l1_min_c(X, y, loss, fit_intercept, intercept_scaling) + min_c = l1_min_c(X, y, loss=loss, fit_intercept=fit_intercept, + intercept_scaling=intercept_scaling) clf = { 'log': LogisticRegression(penalty='l1', solver='liblinear'), @@ -72,4 +73,4 @@ def test_ill_posed_min_c(): def test_unsupported_loss(): with pytest.raises(ValueError): - l1_min_c(dense_X, Y1, 'l1') + l1_min_c(dense_X, Y1, loss='l1') diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 95f7b01f27058..db5c88051346a 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -23,6 +23,7 @@ from sklearn.base import TransformerMixin from sklearn.utils._mocking import MockDataFrame +from sklearn import config_context import pickle @@ -211,10 +212,10 @@ def test_repr(): test = T(K(), K()) assert ( repr(test) == - "T(a=K(c=None, d=None), b=K(c=None, d=None))") + "T(a=K(), b=K())") some_est = T(a=["long_params"] * 1000) - assert len(repr(some_est)) == 495 + assert len(repr(some_est)) == 485 def test_str(): @@ -511,3 +512,28 @@ def fit(self, X, y=None): params = est.get_params() assert params['param'] is None + + +def test_repr_mimebundle_(): + # Checks the display configuration flag controls the json output + tree = DecisionTreeClassifier() + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" not in output + + with config_context(display='diagram'): + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" in output + + +def test_repr_html_wraps(): + # Checks the display configuration flag controls the html output + tree = DecisionTreeClassifier() + msg = "_repr_html_ is only defined when" + with pytest.raises(AttributeError, match=msg): + output = tree._repr_html_() + + with config_context(display='diagram'): + output = tree._repr_html_() + assert "' + f'
<div class="sk-top-container"><div class="sk-container">') + _write_estimator_html(out, estimator, estimator.__class__.__name__, + str(estimator), first_call=True) + out.write('</div></div>
') + + html_output = out.getvalue() + return html_output diff --git a/sklearn/utils/_logistic_sigmoid.pyx b/sklearn/utils/_logistic_sigmoid.pyx index 4ca32193c5ce6..3531d99bc4f44 100644 --- a/sklearn/utils/_logistic_sigmoid.pyx +++ b/sklearn/utils/_logistic_sigmoid.pyx @@ -7,6 +7,7 @@ from libc.math cimport log, exp import numpy as np cimport numpy as np +np.import_array() ctypedef np.float64_t DTYPE_t diff --git a/sklearn/utils/_show_versions.py b/sklearn/utils/_show_versions.py index 53bcf2f35269d..e9c7c687c5aaa 100644 --- a/sklearn/utils/_show_versions.py +++ b/sklearn/utils/_show_versions.py @@ -73,7 +73,10 @@ def get_version(module): def show_versions(): - """Print useful debugging information""" + """Print useful debugging information" + + .. versionadded:: 0.20 + """ sys_info = _get_sys_info() deps_info = _get_deps_info() diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 5f785cb36df45..8c64e33e1d0d4 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,6 +4,8 @@ import numpy as np +from .validation import _deprecate_positional_args + def compute_class_weight(class_weight, classes, y): """Estimate class weights for unbalanced datasets. @@ -69,7 +71,8 @@ def compute_class_weight(class_weight, classes, y): return weight -def compute_sample_weight(class_weight, y, indices=None): +@_deprecate_positional_args +def compute_sample_weight(class_weight, y, *, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 34a0e25c7fcaa..5fe5fa1458ecc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -33,7 +33,7 @@ from ..linear_model import Ridge from ..base import (clone, ClusterMixin, is_classifier, is_regressor, - _DEFAULT_TAGS, RegressorMixin, is_outlier_detector) + RegressorMixin, is_outlier_detector, BaseEstimator) from ..metrics import accuracy_score, adjusted_rand_score, f1_score from ..random_projection import BaseRandomProjection @@ -58,22 +58,9 @@ BOSTON = None CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'] -def _safe_tags(estimator, key=None): - # if estimator doesn't have _get_tags, use _DEFAULT_TAGS - # if estimator has tags but not key, use _DEFAULT_TAGS[key] - if hasattr(estimator, "_get_tags"): - if key is not None: - return estimator._get_tags().get(key, _DEFAULT_TAGS[key]) - tags = estimator._get_tags() - return {key: tags.get(key, _DEFAULT_TAGS[key]) - for key in _DEFAULT_TAGS.keys()} - if key is not None: - return _DEFAULT_TAGS[key] - return _DEFAULT_TAGS - def _yield_checks(name, estimator): - tags = _safe_tags(estimator) + tags = estimator._get_tags() yield check_no_attributes_set_in_init yield check_estimators_dtypes yield check_fit_score_takes_y @@ -116,7 +103,7 @@ def _yield_checks(name, estimator): def _yield_classifier_checks(name, classifier): - tags = _safe_tags(classifier) + tags = classifier._get_tags() # test classifiers can handle non-array data and pandas objects yield check_classifier_data_not_an_array @@ -171,7 +158,7 @@ def check_supervised_y_no_nan(name, estimator_orig): def _yield_regressor_checks(name, regressor): - tags = _safe_tags(regressor) + tags = regressor._get_tags() # TODO: test with intercept # TODO: test with multiple responses # basic testing @@ -198,12 +185,12 @@ def _yield_regressor_checks(name, regressor): def _yield_transformer_checks(name, transformer): # All transformers should either deal with 
sparse data or raise an # exception with type TypeError and an intelligible error message - if not _safe_tags(transformer, "no_validation"): + if not transformer._get_tags()["no_validation"]: yield check_transformer_data_not_an_array # these don't actually fit the data, so don't raise errors yield check_transformer_general yield partial(check_transformer_general, readonly_memmap=True) - if not _safe_tags(transformer, "stateless"): + if not transformer._get_tags()["stateless"]: yield check_transformers_unfitted # Dependent on external solvers and hence accessing the iter # param is non-trivial. @@ -237,12 +224,12 @@ def _yield_outliers_checks(name, estimator): # test outlier detectors can handle non-array data yield check_classifier_data_not_an_array # test if NotFittedError is raised - if _safe_tags(estimator, "requires_fit"): + if estimator._get_tags()["requires_fit"]: yield check_estimators_unfitted def _yield_all_checks(name, estimator): - tags = _safe_tags(estimator) + tags = estimator._get_tags() if "2darray" not in tags["X_types"]: warnings.warn("Can't test estimator {} which requires input " " of type {}".format(name, tags["X_types"]), @@ -283,6 +270,8 @@ def _yield_all_checks(name, estimator): yield check_fit_idempotent if not tags["no_validation"]: yield check_n_features_in + if tags["requires_y"]: + yield check_requires_y_none if tags["requires_positive_X"]: yield check_fit_non_negative @@ -344,12 +333,15 @@ def _construct_instance(Estimator): return estimator +# TODO: probably not needed anymore in 0.24 since _generate_class_checks should +# be removed too. Just put this in check_estimator() def _generate_instance_checks(name, estimator): """Generate instance checks.""" yield from ((estimator, partial(check, name)) for check in _yield_all_checks(name, estimator)) +# TODO: remove this in 0.24 def _generate_class_checks(Estimator): """Generate class checks.""" name = Estimator.__name__ @@ -359,50 +351,74 @@ def _generate_class_checks(Estimator): def _mark_xfail_checks(estimator, check, pytest): - """Mark estimator check pairs with xfail""" + """Mark (estimator, check) pairs with xfail according to the + _xfail_checks_ tag""" if isinstance(estimator, type): - # try to construct estimator to get tags, if it is unable to then - # return the estimator class + # try to construct estimator instance, if it is unable to then + # return the estimator class, ignoring the tag + # TODO: remove this if block in 0.24 since passing instances isn't + # supported anymore try: - xfail_checks = _safe_tags(_construct_instance(estimator), - '_xfail_test') + estimator = _construct_instance(estimator) except Exception: return estimator, check - else: - xfail_checks = _safe_tags(estimator, '_xfail_test') - - if not xfail_checks: - return estimator, check + xfail_checks = estimator._get_tags()['_xfail_checks'] or {} check_name = _set_check_estimator_ids(check) - msg = xfail_checks.get(check_name, None) - if msg is None: + if check_name not in xfail_checks: + # check isn't part of the xfail_checks tags, just return it return estimator, check - - return pytest.param( - estimator, check, marks=pytest.mark.xfail(reason=msg)) + else: + # check is in the tag, mark it as xfail for pytest + reason = xfail_checks[check_name] + return pytest.param(estimator, check, + marks=pytest.mark.xfail(reason=reason)) def parametrize_with_checks(estimators): """Pytest specific decorator for parametrizing estimator checks. 
- The `id` of each test is set to be a pprint version of the estimator + The `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. + This allows to use `pytest -k` to specify which tests to run:: - Read more in the :ref:`User Guide`. + pytest test_check_estimators.py -k check_estimators_fit_returns_self Parameters ---------- estimators : list of estimators objects or classes Estimators to generated checks for. + .. deprecated:: 0.23 + Passing a class is deprecated from version 0.23, and won't be + supported in 0.24. Pass an instance instead. + Returns ------- decorator : `pytest.mark.parametrize` + + Examples + -------- + >>> from sklearn.utils.estimator_checks import parametrize_with_checks + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.tree import DecisionTreeRegressor + + >>> @parametrize_with_checks([LogisticRegression(), + ... DecisionTreeRegressor()]) + ... def test_sklearn_compatible_estimator(estimator, check): + ... check(estimator) + """ import pytest + if any(isinstance(est, type) for est in estimators): + # TODO: remove class support in 0.24 and update docstrings + msg = ("Passing a class is deprecated since version 0.23 " + "and won't be supported in 0.24." + "Please pass an instance instead.") + warnings.warn(msg, FutureWarning) + checks_generator = chain.from_iterable( check_estimator(estimator, generate_only=True) for estimator in estimators) @@ -419,22 +435,36 @@ def check_estimator(Estimator, generate_only=False): """Check if estimator adheres to scikit-learn conventions. This estimator will run an extensive test-suite for input validation, - shapes, etc. + shapes, etc, making sure that the estimator complies with `scikit-learn` + conventions as detailed in :ref:`rolling_your_own_estimator`. Additional tests for classifiers, regressors, clustering or transformers will be run if the Estimator class inherits from the corresponding mixin from sklearn.base. This test can be applied to classes or instances. Classes currently have some additional tests that related to construction, - while passing instances allows the testing of multiple options. + while passing instances allows the testing of multiple options. However, + support for classes is deprecated since version 0.23 and will be removed + in version 0.24 (class checks will still be run on the instances). + + Setting `generate_only=True` returns a generator that yields (estimator, + check) tuples where the check can be called independently from each + other, i.e. `check(estimator)`. This allows all checks to be run + independently and report the checks that are failing. - Read more in :ref:`rolling_your_own_estimator`. + scikit-learn provides a pytest specific decorator, + :func:`~sklearn.utils.parametrize_with_checks`, making it easier to test + multiple estimators. Parameters ---------- - estimator : estimator object or class + estimator : estimator object Estimator to check. Estimator is a class object or instance. + .. deprecated:: 0.23 + Passing a class is deprecated from version 0.23, and won't be + supported in 0.24. Pass an instance instead. + generate_only : bool, optional (default=False) When `False`, checks are evaluated when `check_estimator` is called. When `True`, `check_estimator` returns a generator that yields @@ -449,8 +479,14 @@ def check_estimator(Estimator, generate_only=False): Generator that yields (estimator, check) tuples. Returned when `generate_only=True`. 
""" + # TODO: remove class support in 0.24 and update docstrings if isinstance(Estimator, type): # got a class + msg = ("Passing a class is deprecated since version 0.23 " + "and won't be supported in 0.24." + "Please pass an instance instead.") + warnings.warn(msg, FutureWarning) + checks_generator = _generate_class_checks(Estimator) else: # got an instance @@ -683,7 +719,7 @@ def check_estimator_sparse_data(name, estimator_orig): X[X < .8] = 0 X = _pairwise_estimator_convert_X(X, estimator_orig) X_csr = sparse.csr_matrix(X) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y = (2 * rng.rand(40)).astype(np.int) else: @@ -749,7 +785,7 @@ def check_sample_weights_pandas_series(name, estimator_orig): X = pd.DataFrame(_pairwise_estimator_convert_X(X, estimator_orig)) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) - if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: y = pd.DataFrame(y) try: estimator.fit(X, y, sample_weight=weights) @@ -774,7 +810,7 @@ def check_sample_weights_not_an_array(name, estimator_orig): X = _NotAnArray(pairwise_estimator_convert_X(X, estimator_orig)) y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = _NotAnArray([1] * 12) - if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: y = _NotAnArray(y.data.reshape(-1, 1)) estimator.fit(X, y, sample_weight=weights) @@ -788,8 +824,8 @@ def check_sample_weights_list(name, estimator_orig): rnd = np.random.RandomState(0) n_samples = 30 X = _pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)), - estimator_orig) - if _safe_tags(estimator, 'binary_only'): + estimator_orig) + if estimator._get_tags()['binary_only']: y = np.arange(n_samples) % 2 else: y = np.arange(n_samples) % 3 @@ -868,7 +904,7 @@ def check_dtype_object(name, estimator_orig): rng = np.random.RandomState(0) X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y = (X[:, 0] * 2).astype(np.int) else: @@ -972,7 +1008,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - if _safe_tags(estimator, 'binary_only'): + if estimator._get_tags()['binary_only']: y[y == 2] = 1 y = _enforce_estimator_tags_y(estimator, y) @@ -1023,7 +1059,7 @@ def check_fit2d_predict1d(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y[y == 2] = 1 estimator = clone(estimator_orig) @@ -1074,7 +1110,7 @@ def check_methods_subset_invariance(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y[y == 2] = 1 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1175,7 +1211,7 @@ def check_fit1d(name, estimator_orig): X = 3 * rnd.uniform(size=(20)) y = X.astype(np.int) estimator = clone(estimator_orig) - tags = _safe_tags(estimator) + tags = estimator._get_tags() if tags["no_validation"]: # FIXME this is a bit loose return @@ -1267,7 +1303,7 @@ def _check_transformer(name, 
transformer_orig, X, y): X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) - if _safe_tags(transformer_orig, 'non_deterministic'): + if transformer_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' raise SkipTest(msg) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): @@ -1298,7 +1334,7 @@ def _check_transformer(name, transformer_orig, X, y): # raises error on malformed input for transform if hasattr(X, 'shape') and \ - not _safe_tags(transformer, "stateless") and \ + not transformer._get_tags()["stateless"] and \ X.ndim == 2 and X.shape[1] > 1: # If it's not an array, it does not have a 'T' property @@ -1312,7 +1348,7 @@ def _check_transformer(name, transformer_orig, X, y): @ignore_warnings def check_pipeline_consistency(name, estimator_orig): - if _safe_tags(estimator_orig, 'non_deterministic'): + if estimator_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' raise SkipTest(msg) @@ -1347,7 +1383,7 @@ def check_fit_score_takes_y(name, estimator_orig): n_samples = 30 X = rnd.uniform(size=(n_samples, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y = np.arange(n_samples) % 2 else: y = np.arange(n_samples) % 3 @@ -1380,7 +1416,7 @@ def check_estimators_dtypes(name, estimator_orig): X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) y = X_train_int_64[:, 0] - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y[y == 2] = 1 y = _enforce_estimator_tags_y(estimator_orig, y) @@ -1516,7 +1552,7 @@ def check_estimators_pickle(name, estimator_orig): X -= X.min() X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() # include NaN values when the estimator should deal with them if tags['allow_nan']: # set randomly 10 elements to np.nan @@ -1531,11 +1567,6 @@ def check_estimators_pickle(name, estimator_orig): set_random_state(estimator) estimator.fit(X, y) - result = dict() - for method in check_methods: - if hasattr(estimator, method): - result[method] = getattr(estimator, method)(X) - # pickle and unpickle! 
pickled_estimator = pickle.dumps(estimator) if estimator.__module__.startswith('sklearn.'): @@ -1581,7 +1612,7 @@ def check_estimators_partial_fit_n_features(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_classifier_multioutput(name, estimator): n_samples, n_labels, n_classes = 42, 5, 3 - tags = _safe_tags(estimator) + tags = estimator._get_tags() estimator = clone(estimator) X, y = make_multilabel_classification(random_state=42, n_samples=n_samples, @@ -1688,7 +1719,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False): pred = clusterer.labels_ assert pred.shape == (n_samples,) assert adjusted_rand_score(pred, y) > 0.4 - if _safe_tags(clusterer, 'non_deterministic'): + if clusterer._get_tags()['non_deterministic']: return set_random_state(clusterer) with warnings.catch_warnings(record=True): @@ -1787,7 +1818,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b]) problems = [(X_b, y_b)] - tags = _safe_tags(classifier_orig) + tags = classifier_orig._get_tags() if not tags['binary_only']: problems.append((X_m, y_m)) @@ -2026,7 +2057,7 @@ def check_classifiers_multilabel_representation_invariance(name, def check_estimators_fit_returns_self(name, estimator_orig, readonly_memmap=False): """Check if self is returned when calling fit""" - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: n_centers = 2 else: n_centers = 3 @@ -2063,7 +2094,7 @@ def check_estimators_unfitted(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_supervised_y_2d(name, estimator_orig): - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['multioutput_only']: # These only work on 2d, so this test makes no sense return @@ -2179,7 +2210,7 @@ def check_classifiers_classes(name, classifier_orig): y_names_binary = np.take(labels_binary, y_binary) problems = [(X_binary, y_binary, y_names_binary)] - if not _safe_tags(classifier_orig, 'binary_only'): + if not classifier_orig._get_tags()['binary_only']: problems.append((X_multiclass, y_multiclass, y_names_multiclass)) for X, y, y_names in problems: @@ -2264,7 +2295,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, # TODO: find out why PLS and CCA fail. 
RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - if not _safe_tags(regressor, "poor_score"): + if not regressor._get_tags()["poor_score"]: assert regressor.score(X, y_) > 0.5 @@ -2297,7 +2328,7 @@ def check_regressors_no_decision_function(name, regressor_orig): @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig): - if _safe_tags(classifier_orig, 'binary_only'): + if classifier_orig._get_tags()['binary_only']: problems = [2] else: problems = [2, 3] @@ -2400,7 +2431,7 @@ def check_class_weight_balanced_linear_classifier(name, Classifier): @ignore_warnings(category=FutureWarning) def check_estimators_overwrite_params(name, estimator_orig): - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: n_centers = 2 else: n_centers = 3 @@ -2563,6 +2594,12 @@ def check_parameters_default_constructible(name, Estimator): # this check works on classes, not instances # test default-constructibility # get rid of deprecation warnings + if isinstance(Estimator, BaseEstimator): + # Convert estimator instance to its class + # TODO: Always convert to class in 0.24, because check_estimator() will + # only accept instances, not classes + Estimator = Estimator.__class__ + with ignore_warnings(category=FutureWarning): estimator = _construct_instance(Estimator) # test cloning @@ -2636,13 +2673,13 @@ def enforce_estimator_tags_y(estimator, y): def _enforce_estimator_tags_y(estimator, y): # Estimators with a `requires_positive_y` tag only accept strictly positive # data - if _safe_tags(estimator, "requires_positive_y"): + if estimator._get_tags()["requires_positive_y"]: # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. y += 1 + abs(y.min()) # Estimators in mono_output_task_error raise ValueError if y is of 1-D # Convert into a 2-D y for those estimators. - if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: return np.reshape(y, (-1, 1)) return y @@ -2654,11 +2691,11 @@ def _enforce_estimator_tags_x(estimator, X): X = X.dot(X.T) # Estimators with `1darray` in `X_types` tag only accept # X of shape (`n_samples`,) - if '1darray' in _safe_tags(estimator, 'X_types'): + if '1darray' in estimator._get_tags()['X_types']: X = X[:, 0] # Estimators with a `requires_positive_X` tag only accept # strictly positive data - if _safe_tags(estimator, 'requires_positive_X'): + if estimator._get_tags()['requires_positive_X']: X -= X.min() return X @@ -2796,7 +2833,7 @@ def check_classifiers_regression_target(name, estimator_orig): X, y = load_boston(return_X_y=True) e = clone(estimator_orig) msg = 'Unknown label type: ' - if not _safe_tags(e, "no_validation"): + if not e._get_tags()["no_validation"]: assert_raises_regex(ValueError, msg, e.fit, X, y) @@ -2971,3 +3008,35 @@ def check_n_features_in(name, estimator_orig): "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html", # noqa FutureWarning ) + + +def check_requires_y_none(name, estimator_orig): + # Make sure that an estimator with requires_y=True fails gracefully when + # given y=None + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + + n_samples = 100 + X = rng.normal(loc=100, size=(n_samples, 2)) + X = _pairwise_estimator_convert_X(X, estimator) + + warning_msg = ("As of scikit-learn 0.23, estimators should have a " + "'requires_y' tag set to the appropriate value. 
" + "The default value of the tag is False. " + "An error will be raised from version 0.25 when calling " + "check_estimator() if the tag isn't properly set.") + + expected_err_msgs = ( + "requires y to be passed, but the target y is None", + "Expected array-like (array or non-string sequence), got None", + "y should be a 1d array" + ) + + try: + estimator.fit(X, None) + except ValueError as ve: + if not any(msg in str(ve) for msg in expected_err_msgs): + warnings.warn(warning_msg, FutureWarning) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index d11b307e7500e..488b142d9cae9 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -349,24 +349,24 @@ def randomized_svd(M, n_components, n_oversamples=10, n_iter='auto', B = safe_sparse_dot(Q.T, M) # compute the SVD on the thin matrix: (k + p) wide - Uhat, s, V = linalg.svd(B, full_matrices=False) + Uhat, s, Vt = linalg.svd(B, full_matrices=False) del B U = np.dot(Q, Uhat) if flip_sign: if not transpose: - U, V = svd_flip(U, V) + U, Vt = svd_flip(U, Vt) else: # In case of transpose u_based_decision=false # to actually flip based on u and not v. - U, V = svd_flip(U, V, u_based_decision=False) + U, Vt = svd_flip(U, Vt, u_based_decision=False) if transpose: # transpose back the results according to the input convention - return V[:n_components, :].T, s[:n_components], U[:, :n_components].T + return Vt[:n_components, :].T, s[:n_components], U[:, :n_components].T else: - return U[:, :n_components], s[:n_components], V[:n_components, :] + return U[:, :n_components], s[:n_components], Vt[:n_components, :] def weighted_mode(a, w, axis=0): @@ -508,6 +508,8 @@ def svd_flip(u, v, u_based_decision=True): u and v are the output of `linalg.svd` or :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner dimensions so one can compute `np.dot(u * s, v)`. + The input v should really be called vt to be consistent with scipy's + ouput. u_based_decision : boolean, (default=True) If True, use the columns of u as the basis for sign flipping. diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 622c102fbbd0b..6635c7345aec5 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -42,14 +42,6 @@ def _parse_version(version_string): # mypy error: Name 'lobpcg' already defined (possibly by an import) from ..externals._lobpcg import lobpcg # type: ignore # noqa -if sp_version >= (1, 3): - # Preserves earlier default choice of pinvh cutoff `cond` value. - # Can be removed once issue #14055 is fully addressed. 
- from ..externals._scipy_linalg import pinvh -else: - # mypy error: Name 'pinvh' already defined (possibly by an import) - from scipy.linalg import pinvh # type: ignore # noqa - def _object_dtype_isnan(X): return X != X diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 3301ac977b4b9..8e471d5fdf577 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,7 +27,9 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) + return np.arange( + check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] + ) _FN_UNIQUE_LABELS = { @@ -83,7 +85,8 @@ def unique_labels(*ys): # Check consistency for the indicator format if (label_type == "multilabel-indicator" and - len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] + len(set(check_array(y, + accept_sparse=['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") diff --git a/sklearn/utils/tests/conftest.py b/sklearn/utils/tests/conftest.py new file mode 100644 index 0000000000000..148225a481f69 --- /dev/null +++ b/sklearn/utils/tests/conftest.py @@ -0,0 +1,10 @@ +import pytest + +import sklearn + + +@pytest.fixture +def print_changed_only_false(): + sklearn.set_config(print_changed_only=False) + yield + sklearn.set_config(print_changed_only=True) # reset to default diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py index 487da5b431be0..067b12cc32f28 100644 --- a/sklearn/utils/tests/test_class_weight.py +++ b/sklearn/utils/tests/test_class_weight.py @@ -192,39 +192,41 @@ def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. # Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(4)) + sample_weight = compute_sample_weight("balanced", y, indices=range(4)) assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3, 2. 
/ 3, 2., 2., 2.]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced ** 2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) @@ -237,15 +239,15 @@ def test_compute_sample_weight_errors(): with pytest.raises(ValueError): compute_sample_weight("ni", y) with pytest.raises(ValueError): - compute_sample_weight("ni", y, range(4)) + compute_sample_weight("ni", y, indices=range(4)) with pytest.raises(ValueError): compute_sample_weight("ni", y_) with pytest.raises(ValueError): - compute_sample_weight("ni", y_, range(4)) + compute_sample_weight("ni", y_, indices=range(4)) # Not "balanced" for subsample with pytest.raises(ValueError): - compute_sample_weight({1: 2, 2: 1}, y, range(4)) + compute_sample_weight({1: 2, 2: 1}, y, indices=range(4)) # Not a list or preset for multi-output with pytest.raises(ValueError): diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index a7f4911791467..594ff65f9e889 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -356,6 +356,7 @@ def fit(self, X, y): check_fit_score_takes_y("test", TestEstimatorWithDeprecatedFitMethod()) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator(): # tests that the estimator actually fails on "bad" estimators. # not a complete test of all checks, which are very extensive. 
@@ -363,7 +364,8 @@ def test_check_estimator(): # check that we have a set_params and can clone msg = "it does not implement a 'get_params' method" assert_raises_regex(TypeError, msg, check_estimator, object) - assert_raises_regex(TypeError, msg, check_estimator, object()) + msg = "object has no attribute '_get_tags'" + assert_raises_regex(AttributeError, msg, check_estimator, object()) # check that values returned by get_params match set_params msg = "get_params result does not match what was passed to set_params" assert_raises_regex(AssertionError, msg, check_estimator, @@ -578,7 +580,10 @@ def test_check_regressor_data_not_an_array(): EstimatorInconsistentForPandas()) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator_required_parameters_skip(): + # TODO: remove whole test in 0.24 since passes classes to check_estimator() + # isn't supported anymore class MyEstimator(BaseEstimator): _required_parameters = ["special_parameter"] diff --git a/sklearn/utils/tests/test_estimator_html_repr.py b/sklearn/utils/tests/test_estimator_html_repr.py new file mode 100644 index 0000000000000..47d33051bd9a7 --- /dev/null +++ b/sklearn/utils/tests/test_estimator_html_repr.py @@ -0,0 +1,267 @@ +from contextlib import closing +from io import StringIO + +import pytest + +from sklearn import config_context +from sklearn.linear_model import LogisticRegression +from sklearn.neural_network import MLPClassifier +from sklearn.impute import SimpleImputer +from sklearn.decomposition import PCA +from sklearn.decomposition import TruncatedSVD +from sklearn.pipeline import Pipeline +from sklearn.pipeline import FeatureUnion +from sklearn.compose import ColumnTransformer +from sklearn.ensemble import VotingClassifier +from sklearn.feature_selection import SelectPercentile +from sklearn.cluster import Birch +from sklearn.cluster import AgglomerativeClustering +from sklearn.preprocessing import OneHotEncoder +from sklearn.svm import LinearSVC +from sklearn.svm import LinearSVR +from sklearn.tree import DecisionTreeClassifier +from sklearn.multiclass import OneVsOneClassifier +from sklearn.ensemble import StackingClassifier +from sklearn.ensemble import StackingRegressor +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RationalQuadratic +from sklearn.utils._estimator_html_repr import _write_label_html +from sklearn.utils._estimator_html_repr import _get_visual_block +from sklearn.utils._estimator_html_repr import estimator_html_repr + + +@pytest.mark.parametrize("checked", [True, False]) +def test_write_label_html(checked): + # Test checking logic and labeling + name = "LogisticRegression" + tool_tip = "hello-world" + + with closing(StringIO()) as out: + _write_label_html(out, name, tool_tip, checked=checked) + html_label = out.getvalue() + assert 'LogisticRegression' in html_label + assert html_label.startswith('
') + assert '
hello-world
' in html_label + if checked: + assert 'checked>' in html_label + + +@pytest.mark.parametrize('est', ['passthrough', 'drop', None]) +def test_get_visual_block_single_str_none(est): + # Test estimators that are represnted by strings + est_html_info = _get_visual_block(est) + assert est_html_info.kind == 'single' + assert est_html_info.estimators == est + assert est_html_info.names == str(est) + assert est_html_info.name_details == str(est) + + +def test_get_visual_block_single_estimator(): + est = LogisticRegression(C=10.0) + est_html_info = _get_visual_block(est) + assert est_html_info.kind == 'single' + assert est_html_info.estimators == est + assert est_html_info.names == est.__class__.__name__ + assert est_html_info.name_details == str(est) + + +def test_get_visual_block_pipeline(): + pipe = Pipeline([ + ('imputer', SimpleImputer()), + ('do_nothing', 'passthrough'), + ('do_nothing_more', None), + ('classifier', LogisticRegression()) + ]) + est_html_info = _get_visual_block(pipe) + assert est_html_info.kind == 'serial' + assert est_html_info.estimators == tuple(step[1] for step in pipe.steps) + assert est_html_info.names == ['imputer: SimpleImputer', + 'do_nothing: passthrough', + 'do_nothing_more: passthrough', + 'classifier: LogisticRegression'] + assert est_html_info.name_details == [str(est) for _, est in pipe.steps] + + +def test_get_visual_block_feature_union(): + f_union = FeatureUnion([ + ('pca', PCA()), ('svd', TruncatedSVD()) + ]) + est_html_info = _get_visual_block(f_union) + assert est_html_info.kind == 'parallel' + assert est_html_info.names == ('pca', 'svd') + assert est_html_info.estimators == tuple( + trans[1] for trans in f_union.transformer_list) + assert est_html_info.name_details == (None, None) + + +def test_get_visual_block_voting(): + clf = VotingClassifier([ + ('log_reg', LogisticRegression()), + ('mlp', MLPClassifier()) + ]) + est_html_info = _get_visual_block(clf) + assert est_html_info.kind == 'parallel' + assert est_html_info.estimators == tuple(trans[1] + for trans in clf.estimators) + assert est_html_info.names == ('log_reg', 'mlp') + assert est_html_info.name_details == (None, None) + + +def test_get_visual_block_column_transformer(): + ct = ColumnTransformer([ + ('pca', PCA(), ['num1', 'num2']), + ('svd', TruncatedSVD, [0, 3]) + ]) + est_html_info = _get_visual_block(ct) + assert est_html_info.kind == 'parallel' + assert est_html_info.estimators == tuple( + trans[1] for trans in ct.transformers) + assert est_html_info.names == ('pca', 'svd') + assert est_html_info.name_details == (['num1', 'num2'], [0, 3]) + + +def test_estimator_html_repr_pipeline(): + num_trans = Pipeline(steps=[ + ('pass', 'passthrough'), + ('imputer', SimpleImputer(strategy='median')) + ]) + + cat_trans = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', + missing_values='empty')), + ('one-hot', OneHotEncoder(drop='first')) + ]) + + preprocess = ColumnTransformer([ + ('num', num_trans, ['a', 'b', 'c', 'd', 'e']), + ('cat', cat_trans, [0, 1, 2, 3]) + ]) + + feat_u = FeatureUnion([ + ('pca', PCA(n_components=1)), + ('tsvd', Pipeline([('first', TruncatedSVD(n_components=3)), + ('select', SelectPercentile())])) + ]) + + clf = VotingClassifier([ + ('lr', LogisticRegression(solver='lbfgs', random_state=1)), + ('mlp', MLPClassifier(alpha=0.001)) + ]) + + pipe = Pipeline([ + ('preprocessor', preprocess), ('feat_u', feat_u), ('classifier', clf) + ]) + html_output = estimator_html_repr(pipe) + + # top level estimators show estimator with changes + assert str(pipe) in html_output 
+ for _, est in pipe.steps: + assert (f"
" + f"
{str(est)}") in html_output
+
+    # low level estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert str(num_trans['pass']) in html_output
+        assert 'passthrough' in html_output
+        assert str(num_trans['imputer']) in html_output
+
+        for _, _, cols in preprocess.transformers:
+            assert f"
{cols}
" in html_output + + # feature union + for name, _ in feat_u.transformer_list: + assert f"" in html_output + + pca = feat_u.transformer_list[0][1] + assert f"
{str(pca)}
" in html_output + + tsvd = feat_u.transformer_list[1][1] + first = tsvd['first'] + select = tsvd['select'] + assert f"
{str(first)}
" in html_output + assert f"
{str(select)}
" in html_output + + # voting classifer + for name, est in clf.estimators: + assert f"" in html_output + assert f"
{str(est)}
" in html_output + + +@pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) +def test_stacking_classsifer(final_estimator): + estimators = [('mlp', MLPClassifier(alpha=0.001)), + ('tree', DecisionTreeClassifier())] + clf = StackingClassifier( + estimators=estimators, final_estimator=final_estimator) + + html_output = estimator_html_repr(clf) + + assert str(clf) in html_output + # If final_estimator's default changes from LogisticRegression + # this should be updated + if final_estimator is None: + assert "LogisticRegression(" in html_output + else: + assert final_estimator.__class__.__name__ in html_output + + +@pytest.mark.parametrize("final_estimator", [None, LinearSVR()]) +def test_stacking_regressor(final_estimator): + reg = StackingRegressor( + estimators=[('svr', LinearSVR())], final_estimator=final_estimator) + html_output = estimator_html_repr(reg) + + assert str(reg.estimators[0][0]) in html_output + assert "LinearSVR" in html_output + if final_estimator is None: + assert "RidgeCV" in html_output + else: + assert final_estimator.__class__.__name__ in html_output + + +def test_birch_duck_typing_meta(): + # Test duck typing meta estimators with Birch + birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3)) + html_output = estimator_html_repr(birch) + + # inner estimators do not show changes + with config_context(print_changed_only=True): + assert f"
{str(birch.n_clusters)}" in html_output
+        assert "AgglomerativeClustering" in html_output
+
+    # outer estimator contains all changes
+    assert f"
{str(birch)}" in html_output
+
+
+def test_ovo_classifier_duck_typing_meta():
+    # Test duck typing metaestimators with OVO
+    ovo = OneVsOneClassifier(LinearSVC(penalty='l1'))
+    html_output = estimator_html_repr(ovo)
+
+    # inner estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert f"
{str(ovo.estimator)}" in html_output
+        assert "LinearSVC" in html_output
+
+    # outer estimator
+    assert f"
{str(ovo)}" in html_output
+
+
+def test_duck_typing_nested_estimator():
+    # Test duck typing metaestimators with GP
+    kernel = RationalQuadratic(length_scale=1.0, alpha=0.1)
+    gp = GaussianProcessRegressor(kernel=kernel)
+    html_output = estimator_html_repr(gp)
+
+    assert f"
{str(kernel)}" in html_output
+    assert f"
{str(gp)}" in html_output
+
+
+@pytest.mark.parametrize('print_changed_only', [True, False])
+def test_one_estimator_print_change_only(print_changed_only):
+    pca = PCA(n_components=10)
+
+    with config_context(print_changed_only=print_changed_only):
+        pca_repr = str(pca)
+        html_output = estimator_html_repr(pca)
+        assert pca_repr in html_output
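The tests above exercise the new `estimator_html_repr` helper added elsewhere in this patch. A minimal usage sketch (not part of the patch; it assumes the patch is applied and imports the helper from the same private module the tests use):

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.utils._estimator_html_repr import estimator_html_repr

pipe = Pipeline([('imputer', SimpleImputer()),
                 ('classifier', LogisticRegression())])

# The helper returns an HTML snippet describing the nested structure; it can
# be written to a file or embedded in a notebook.
html = estimator_html_repr(pipe)
assert 'LogisticRegression' in html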
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 2abcbfa3c74e7..80151be8c7e20 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -100,12 +100,12 @@ def check_randomized_svd_low_rank(dtype):
     assert X.shape == (n_samples, n_features)
 
     # compute the singular values of X using the slow exact method
-    U, s, V = linalg.svd(X, full_matrices=False)
+    U, s, Vt = linalg.svd(X, full_matrices=False)
 
     # Convert the singular values to the specific dtype
     U = U.astype(dtype, copy=False)
     s = s.astype(dtype, copy=False)
-    V = V.astype(dtype, copy=False)
+    Vt = Vt.astype(dtype, copy=False)
 
     for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
         # compute the singular values of X using the fast approximate method
@@ -133,7 +133,7 @@ def check_randomized_svd_low_rank(dtype):
         assert_almost_equal(s[:k], sa, decimal=decimal)
 
         # check the singular vectors too (while not checking the sign)
-        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
+        assert_almost_equal(np.dot(U[:, :k], Vt[:k, :]), np.dot(Ua, Va),
                             decimal=decimal)
 
         # check the sparse matrix representation
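The renames in this test file follow the `V` -> `Vt` rename in `sklearn.utils.extmath.randomized_svd` earlier in this patch. A small sketch (not part of the patch) of the convention being tested: like `scipy.linalg.svd`, `randomized_svd` returns the right singular vectors already transposed.

import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = rng.randn(50, 20) @ rng.randn(20, 30)     # a matrix of rank <= 20

U, s, Vt = randomized_svd(X, n_components=10, random_state=0)
assert U.shape == (50, 10) and s.shape == (10,) and Vt.shape == (10, 30)
X_approx = U @ np.diag(s) @ Vt                # rank-10 approximation of X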
@@ -306,28 +306,28 @@ def test_randomized_svd_power_iteration_normalizer():
     n_components = 50
 
     # Check that it diverges with many (non-normalized) power iterations
-    U, s, V = randomized_svd(X, n_components, n_iter=2,
-                             power_iteration_normalizer='none')
-    A = X - U.dot(np.diag(s).dot(V))
+    U, s, Vt = randomized_svd(X, n_components, n_iter=2,
+                              power_iteration_normalizer='none')
+    A = X - U.dot(np.diag(s).dot(Vt))
     error_2 = linalg.norm(A, ord='fro')
-    U, s, V = randomized_svd(X, n_components, n_iter=20,
-                             power_iteration_normalizer='none')
-    A = X - U.dot(np.diag(s).dot(V))
+    U, s, Vt = randomized_svd(X, n_components, n_iter=20,
+                              power_iteration_normalizer='none')
+    A = X - U.dot(np.diag(s).dot(Vt))
     error_20 = linalg.norm(A, ord='fro')
     assert np.abs(error_2 - error_20) > 100
 
     for normalizer in ['LU', 'QR', 'auto']:
-        U, s, V = randomized_svd(X, n_components, n_iter=2,
-                                 power_iteration_normalizer=normalizer,
-                                 random_state=0)
-        A = X - U.dot(np.diag(s).dot(V))
+        U, s, Vt = randomized_svd(X, n_components, n_iter=2,
+                                  power_iteration_normalizer=normalizer,
+                                  random_state=0)
+        A = X - U.dot(np.diag(s).dot(Vt))
         error_2 = linalg.norm(A, ord='fro')
 
         for i in [5, 10, 50]:
-            U, s, V = randomized_svd(X, n_components, n_iter=i,
-                                     power_iteration_normalizer=normalizer,
-                                     random_state=0)
-            A = X - U.dot(np.diag(s).dot(V))
+            U, s, Vt = randomized_svd(X, n_components, n_iter=i,
+                                      power_iteration_normalizer=normalizer,
+                                      random_state=0)
+            A = X - U.dot(np.diag(s).dot(Vt))
             error = linalg.norm(A, ord='fro')
             assert 15 > np.abs(error_2 - error)
 
@@ -355,20 +355,20 @@ def test_svd_flip():
     X = rs.randn(n_samples, n_features)
 
     # Check matrix reconstruction
-    U, S, V = linalg.svd(X, full_matrices=False)
-    U1, V1 = svd_flip(U, V, u_based_decision=False)
+    U, S, Vt = linalg.svd(X, full_matrices=False)
+    U1, V1 = svd_flip(U, Vt, u_based_decision=False)
     assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)
 
     # Check transposed matrix reconstruction
     XT = X.T
-    U, S, V = linalg.svd(XT, full_matrices=False)
-    U2, V2 = svd_flip(U, V, u_based_decision=True)
+    U, S, Vt = linalg.svd(XT, full_matrices=False)
+    U2, V2 = svd_flip(U, Vt, u_based_decision=True)
     assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)
 
     # Check that different flip methods are equivalent under reconstruction
-    U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True)
+    U_flip1, V_flip1 = svd_flip(U, Vt, u_based_decision=True)
     assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
-    U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False)
+    U_flip2, V_flip2 = svd_flip(U, Vt, u_based_decision=False)
     assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
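A short sketch (not part of the patch) of the identity the assertions above rely on: `svd_flip` only flips signs consistently in `U` and `Vt`, so the reconstruction still recovers `X`.

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import svd_flip

rng = np.random.RandomState(0)
X = rng.randn(20, 5)

U, s, Vt = linalg.svd(X, full_matrices=False)
U_flipped, Vt_flipped = svd_flip(U, Vt)       # deterministic sign convention
assert np.allclose(U_flipped @ np.diag(s) @ Vt_flipped, X)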
 
 
diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py
index 146ccf781ae8a..866d872a9b65c 100644
--- a/sklearn/utils/tests/test_pprint.py
+++ b/sklearn/utils/tests/test_pprint.py
@@ -174,7 +174,7 @@ def __init__(self, missing_values=np.nan, strategy="mean",
         self.copy = copy
 
 
-def test_basic():
+def test_basic(print_changed_only_false):
     # Basic pprint test
     lr = LogisticRegression()
     expected = """
@@ -189,8 +189,7 @@ def test_basic():
 
 
 def test_changed_only():
-    # Make sure the changed_only param is correctly used
-    set_config(print_changed_only=True)
+    # Make sure the changed_only param is correctly used when True (default)
     lr = LogisticRegression(C=99)
     expected = """LogisticRegression(C=99)"""
     assert lr.__repr__() == expected
@@ -216,10 +215,8 @@ def test_changed_only():
     # make sure array parameters don't throw error (see #13583)
     repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))
 
-    set_config(print_changed_only=False)
 
-
-def test_pipeline():
+def test_pipeline(print_changed_only_false):
     # Render a pipeline object
     pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
     expected = """
@@ -240,7 +237,7 @@ def test_pipeline():
     assert pipeline.__repr__() == expected
 
 
-def test_deeply_nested():
+def test_deeply_nested(print_changed_only_false):
     # Render a deeply nested estimator
     rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
     expected = """
@@ -277,7 +274,7 @@ def test_deeply_nested():
     assert rfe.__repr__() == expected
 
 
-def test_gridsearch():
+def test_gridsearch(print_changed_only_false):
     # render a gridsearch
     param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                    'C': [1, 10, 100, 1000]},
@@ -302,7 +299,7 @@ def test_gridsearch():
     assert gs.__repr__() == expected
 
 
-def test_gridsearch_pipeline():
+def test_gridsearch_pipeline(print_changed_only_false):
     # render a pipeline inside a gridsearch
     pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
 
@@ -372,7 +369,7 @@ def test_gridsearch_pipeline():
     assert repr_ == expected
 
 
-def test_n_max_elements_to_show():
+def test_n_max_elements_to_show(print_changed_only_false):
 
     n_max_elements_to_show = 30
     pp = _EstimatorPrettyPrinter(
@@ -461,7 +458,7 @@ def test_n_max_elements_to_show():
     assert pp.pformat(gs) == expected
 
 
-def test_bruteforce_ellipsis():
+def test_bruteforce_ellipsis(print_changed_only_false):
     # Check that the bruteforce ellipsis (used when the number of non-blank
     # characters exceeds N_CHAR_MAX) renders correctly.
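The tests above now lean on the `print_changed_only_false` fixture added in `sklearn/utils/tests/conftest.py` by this patch instead of calling `set_config` inside each test. A minimal sketch of the same pattern (the test name is illustrative only):

import pytest
import sklearn
from sklearn.linear_model import LogisticRegression


@pytest.fixture
def print_changed_only_false():
    sklearn.set_config(print_changed_only=False)
    yield
    sklearn.set_config(print_changed_only=True)  # restore the new default


def test_full_repr(print_changed_only_false):
    # With print_changed_only=False the repr spells out default parameters.
    assert 'penalty=' in repr(LogisticRegression())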
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 5f6df9685a25c..bcfd8fcd8d50e 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -63,7 +63,7 @@ def test_as_float_array():
     X = X.astype(np.int64)
     X2 = as_float_array(X, copy=True)
     # Checking that the array wasn't overwritten
-    assert as_float_array(X, False) is not X
+    assert as_float_array(X, copy=False) is not X
     assert X2.dtype == np.float64
     # Test int dtypes <= 32bit
     tested_dtypes = [np.bool,
@@ -349,6 +349,37 @@ def test_check_array():
             check_array(X, dtype="numeric")
 
 
+@pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"])
+@pytest.mark.parametrize("dtype, expected_dtype", [
+    ([np.float32, np.float64], np.float32),
+    (np.float64, np.float64),
+    ("numeric", np.float64),
+])
+def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype):
+    # Test pandas IntegerArray with pd.NA
+    pd = pytest.importorskip('pandas', minversion="1.0")
+
+    X_np = np.array([[1, 2, 3, np.nan, np.nan],
+                     [np.nan, np.nan, 8, 4, 6],
+                     [1, 2, 3, 4, 5]]).T
+
+    # Creates dataframe with IntegerArrays with pd.NA
+    X = pd.DataFrame(X_np, dtype=pd_dtype, columns=['a', 'b', 'c'])
+    # column c has no nans
+    X['c'] = X['c'].astype('float')
+    X_checked = check_array(X, force_all_finite='allow-nan', dtype=dtype)
+    assert_allclose(X_checked, X_np)
+    assert X_checked.dtype == expected_dtype
+
+    X_checked = check_array(X, force_all_finite=False, dtype=dtype)
+    assert_allclose(X_checked, X_np)
+    assert X_checked.dtype == expected_dtype
+
+    msg = "Input contains NaN, infinity"
+    with pytest.raises(ValueError, match=msg):
+        check_array(X, force_all_finite=True)
+
+
 def test_check_array_pandas_dtype_object_conversion():
     # test that data-frame like objects with dtype object
     # get converted
@@ -912,7 +943,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val):
     """Test that check_scalar returns no error/warning if valid inputs are
     provided"""
     with pytest.warns(None) as record:
-        check_scalar(x, "test_name", target_type, min_val, max_val)
+        check_scalar(x, "test_name", target_type=target_type,
+                     min_val=min_val, max_val=max_val)
     assert len(record) == 0
 
 
@@ -1097,6 +1129,15 @@ def f2(a=1, *, b=1, c=1, d=1):
                       match=r"Pass b=2 as keyword args"):
         f2(1, 2)
 
+    # The * is placed before a keyword-only argument without a default value
+    @_deprecate_positional_args
+    def f3(a, *, b, c=1, d=1):
+        pass
+
+    with pytest.warns(FutureWarning,
+                      match=r"Pass b=2 as keyword args"):
+        f3(1, 2)
+
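Outside of pytest, the behaviour the new f3 test checks looks roughly like this (a sketch assuming the patch is applied, so `_deprecate_positional_args` is importable from `sklearn.utils.validation`):

import warnings
from sklearn.utils.validation import _deprecate_positional_args


@_deprecate_positional_args
def f3(a, *, b, c=1, d=1):
    return a, b, c, d


with warnings.catch_warnings(record=True) as rec:
    warnings.simplefilter('always')
    result = f3(1, 2)   # b is passed positionally: warns, but still works
assert result == (1, 2, 1, 1)
assert any('Pass b=2 as keyword args' in str(w.message) for w in rec)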
 
 def test_deprecate_positional_args_warns_for_class():
 
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 4bb50c3deb5e7..7a6ef1e05fdde 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -36,6 +36,44 @@
 warnings.simplefilter('ignore', NonBLASDotWarning)
 
 
+def _deprecate_positional_args(f):
+    """Decorator for methods that issues warnings for positional arguments
+
+    Using the keyword-only argument syntax in pep 3102, arguments after the
+    * will issue a warning when passed as a positional argument.
+
+    Parameters
+    ----------
+    f : function
+        function to check arguments on
+    """
+    sig = signature(f)
+    kwonly_args = []
+    all_args = []
+
+    for name, param in sig.parameters.items():
+        if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
+            all_args.append(name)
+        elif param.kind == Parameter.KEYWORD_ONLY:
+            kwonly_args.append(name)
+
+    @wraps(f)
+    def inner_f(*args, **kwargs):
+        extra_args = len(args) - len(all_args)
+        if extra_args > 0:
+            # ignore first 'self' argument for instance methods
+            args_msg = ['{}={}'.format(name, arg)
+                        for name, arg in zip(kwonly_args[:extra_args],
+                                             args[-extra_args:])]
+            warnings.warn("Pass {} as keyword args. From version 0.25 "
+                          "passing these as positional arguments will "
+                          "result in an error".format(", ".join(args_msg)),
+                          FutureWarning)
+        kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
+        return f(**kwargs)
+    return inner_f
+
+
 def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
     """Like assert_all_finite, but only for ndarray."""
     # validation is also imported in extmath
@@ -67,7 +105,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
             raise ValueError("Input contains NaN")
 
 
-def assert_all_finite(X, allow_nan=False):
+@_deprecate_positional_args
+def assert_all_finite(X, *, allow_nan=False):
     """Throw a ValueError if X contains NaN or infinity.
 
     Parameters
@@ -79,7 +118,8 @@ def assert_all_finite(X, allow_nan=False):
     _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)
 
 
-def as_float_array(X, copy=True, force_all_finite=True):
+@_deprecate_positional_args
+def as_float_array(X, *, copy=True, force_all_finite=True):
     """Converts an array-like to an array of floats.
 
     The new dtype will be np.float32 or np.float64, depending on the original
@@ -95,17 +135,20 @@ def as_float_array(X, copy=True, force_all_finite=True):
         returned if X's dtype is not a floating point type.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in X. The possibilities
-        are:
+        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
+        possibilities are:
 
         - True: Force all values of X to be finite.
-        - False: accept both np.inf and np.nan in X.
-        - 'allow-nan': accept only np.nan values in X. Values cannot be
-          infinite.
+        - False: accepts np.inf, np.nan, pd.NA in X.
+        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
+          be infinite.
 
         .. versionadded:: 0.20
            ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     Returns
     -------
     XT : {array, sparse matrix}
@@ -113,9 +156,9 @@ def as_float_array(X, copy=True, force_all_finite=True):
     """
     if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray)
                                     and not sp.issparse(X)):
-        return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64,
-                           copy=copy, force_all_finite=force_all_finite,
-                           ensure_2d=False)
+        return check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+                           dtype=np.float64, copy=copy,
+                           force_all_finite=force_all_finite, ensure_2d=False)
     elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
         return X.copy() if copy else X
     elif X.dtype in [np.float32, np.float64]:  # is numpy array
@@ -277,17 +320,20 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
         be triggered by a conversion.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in X. The possibilities
-        are:
+        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
+        possibilities are:
 
         - True: Force all values of X to be finite.
-        - False: accept both np.inf and np.nan in X.
-        - 'allow-nan': accept only np.nan values in X. Values cannot be
-          infinite.
+        - False: accepts np.inf, np.nan, pd.NA in X.
+        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
+          be infinite.
 
         .. versionadded:: 0.20
            ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     Returns
     -------
     spmatrix_converted : scipy sparse matrix.
@@ -349,7 +395,8 @@ def _ensure_no_complex_data(array):
                          "{}\n".format(array))
 
 
-def check_array(array, accept_sparse=False, accept_large_sparse=True,
+@_deprecate_positional_args
+def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
                 dtype="numeric", order=None, copy=False, force_all_finite=True,
                 ensure_2d=True, allow_nd=False, ensure_min_samples=1,
                 ensure_min_features=1, estimator=None):
@@ -397,19 +444,20 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
         be triggered by a conversion.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in array. The
+        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
         possibilities are:
 
         - True: Force all values of array to be finite.
-        - False: accept both np.inf and np.nan in array.
-        - 'allow-nan': accept only np.nan values in array. Values cannot
-          be infinite.
-
-        For object dtyped data, only np.nan is checked and not np.inf.
+        - False: accepts np.inf, np.nan, pd.NA in array.
+        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
+          cannot be infinite.
 
         .. versionadded:: 0.20
            ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     ensure_2d : boolean (default=True)
         Whether to raise a value error if array is not 2D.
 
@@ -450,6 +498,7 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     # check if the object contains several dtypes (typically a pandas
     # DataFrame), and store them. If not, store None.
     dtypes_orig = None
+    has_pd_integer_array = False
     if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
         # throw warning if columns are sparse. If all columns are sparse, then
         # array.sparse exists and sparsity will be perserved (later).
@@ -466,7 +515,20 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
         # pandas boolean dtype __array__ interface coerces bools to objects
         for i, dtype_iter in enumerate(dtypes_orig):
             if dtype_iter.kind == 'b':
-                dtypes_orig[i] = np.object
+                dtypes_orig[i] = np.dtype(np.object)
+            elif dtype_iter.name.startswith(("Int", "UInt")):
+                # name looks like an Integer Extension Array, now check for
+                # the dtype
+                with suppress(ImportError):
+                    from pandas import (Int8Dtype, Int16Dtype,
+                                        Int32Dtype, Int64Dtype,
+                                        UInt8Dtype, UInt16Dtype,
+                                        UInt32Dtype, UInt64Dtype)
+                    if isinstance(dtype_iter, (Int8Dtype, Int16Dtype,
+                                               Int32Dtype, Int64Dtype,
+                                               UInt8Dtype, UInt16Dtype,
+                                               UInt32Dtype, UInt64Dtype)):
+                        has_pd_integer_array = True
 
         if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
             dtype_orig = np.result_type(*dtypes_orig)
@@ -487,6 +549,10 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
             # list of accepted types.
             dtype = dtype[0]
 
+    if has_pd_integer_array:
+        # If there are any pandas integer extension arrays, cast to dtype
+        array = array.astype(dtype)
+
     if force_all_finite not in (True, False, 'allow-nan'):
         raise ValueError('force_all_finite should be a bool or "allow-nan"'
                          '. Got {!r} instead'.format(force_all_finite))
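The net effect of the `has_pd_integer_array` branch above, sketched through the public API (an illustration, assuming pandas >= 1.0 as in the test added by this patch):

import numpy as np
import pandas as pd
from sklearn.utils.validation import check_array

# A nullable integer column containing pd.NA
X = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int16')})

X_checked = check_array(X, force_all_finite='allow-nan', dtype=np.float64)
assert X_checked.dtype == np.float64
assert np.isnan(X_checked[2, 0])   # pd.NA was converted to np.nan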
@@ -620,7 +686,8 @@ def _check_large_sparse(X, accept_large_sparse=False):
                                  % indices_datatype)
 
 
-def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
+@_deprecate_positional_args
+def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True,
               dtype="numeric", order=None, copy=False, force_all_finite=True,
               ensure_2d=True, allow_nd=False, multi_output=False,
               ensure_min_samples=1, ensure_min_features=1, y_numeric=False,
@@ -670,18 +737,21 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
         be triggered by a conversion.
 
     force_all_finite : boolean or 'allow-nan', (default=True)
-        Whether to raise an error on np.inf and np.nan in X. This parameter
-        does not influence whether y can have np.inf or np.nan values.
+        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
+        does not influence whether y can have np.inf, np.nan, pd.NA values.
         The possibilities are:
 
         - True: Force all values of X to be finite.
-        - False: accept both np.inf and np.nan in X.
-        - 'allow-nan': accept only np.nan values in X. Values cannot be
-          infinite.
+        - False: accepts np.inf, np.nan, pd.NA in X.
+        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
+          be infinite.
 
         .. versionadded:: 0.20
            ``force_all_finite`` accepts the string ``'allow-nan'``.
 
+        .. versionchanged:: 0.23
+           Accepts `pd.NA` and converts it into `np.nan`
+
     ensure_2d : boolean (default=True)
         Whether to raise a value error if X is not 2D.
 
@@ -732,8 +802,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
                     ensure_min_features=ensure_min_features,
                     estimator=estimator)
     if multi_output:
-        y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
-                        dtype=None)
+        y = check_array(y, accept_sparse='csr', force_all_finite=True,
+                        ensure_2d=False, dtype=None)
     else:
         y = column_or_1d(y, warn=True)
         _assert_all_finite(y)
@@ -745,7 +815,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True,
     return X, y
 
 
-def column_or_1d(y, warn=False):
+@_deprecate_positional_args
+def column_or_1d(y, *, warn=False):
     """ Ravel column or 1d numpy array, else raises an error
 
     Parameters
@@ -825,7 +896,8 @@ def has_fit_parameter(estimator, parameter):
     return parameter in signature(estimator.fit).parameters
 
 
-def check_symmetric(array, tol=1E-10, raise_warning=True,
+@_deprecate_positional_args
+def check_symmetric(array, *, tol=1E-10, raise_warning=True,
                     raise_exception=False):
     """Make sure that array is 2D, square and symmetric.
 
@@ -881,7 +953,8 @@ def check_symmetric(array, tol=1E-10, raise_warning=True,
     return array
 
 
-def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all):
+@_deprecate_positional_args
+def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
     """Perform is_fitted validation for estimator.
 
     Checks if the estimator is fitted by verifying the presence of
@@ -974,7 +1047,7 @@ def check_non_negative(X, whom):
         raise ValueError("Negative values in data passed to %s" % whom)
 
 
-def check_scalar(x, name, target_type, min_val=None, max_val=None):
+def check_scalar(x, name, target_type, *, min_val=None, max_val=None):
     """Validate scalar parameters type and value.
 
     Parameters
@@ -1268,44 +1341,6 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
                      "matrix and an array")
 
 
-def _deprecate_positional_args(f):
-    """Decorator for methods that issues warnings for positional arguments
-
-    Using the keyword-only argument syntax in pep 3102, arguments after the
-    * will issue a warning when passed as a positional argument.
-
-    Parameters
-    ----------
-    f : function
-        function to check arguments on
-    """
-    sig = signature(f)
-    kwonly_args = []
-    all_args = []
-
-    for name, param in sig.parameters.items():
-        if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
-            all_args.append(name)
-        elif param.kind == Parameter.KEYWORD_ONLY:
-            kwonly_args.append(name)
-
-    @wraps(f)
-    def inner_f(*args, **kwargs):
-        extra_args = len(args) - len(all_args)
-        if extra_args > 0:
-            # ignore first 'self' argument for instance methods
-            args_msg = ['{}={}'.format(name, arg)
-                        for name, arg in zip(kwonly_args[:extra_args],
-                                             args[-extra_args:])]
-            warnings.warn("Pass {} as keyword args. From version 0.25 "
-                          "passing these as positional arguments will "
-                          "result in an error".format(", ".join(args_msg)),
-                          FutureWarning)
-        kwargs.update({k: arg for k, arg in zip(all_args, args)})
-        return f(**kwargs)
-    return inner_f
-
-
 def _check_fit_params(X, fit_params, indices=None):
     """Check and validate the parameters passed during `fit`.