
ENH Adds TargetEncoder #25334

Merged (91 commits, Mar 16, 2023)

Changes from 7 commits

Commits (91 commits)
f29c2d8
ENH Adds Target Regression Encoder
thomasjpfan Jan 8, 2023
336b17d
DOC Adds pr number
thomasjpfan Jan 8, 2023
c52ce42
DOC Adds pr number
thomasjpfan Jan 8, 2023
1863be9
DOC Adds example
thomasjpfan Jan 9, 2023
22af6a6
DOC Fixes example link
thomasjpfan Jan 9, 2023
f535f54
FIX Use fancy indexing
thomasjpfan Jan 9, 2023
c6d1fc4
FIX Fix issue with 32bit
thomasjpfan Jan 9, 2023
2868b3c
ENH Adds support for binary classification
thomasjpfan Jan 10, 2023
aa6e545
TST Adds test to check target_type_
thomasjpfan Jan 10, 2023
871ea45
DOC Update whats new with better wording
thomasjpfan Jan 10, 2023
71e6bad
DOC Update docs about target encoder itself
thomasjpfan Jan 10, 2023
fdd39d0
TST Update names of tests
thomasjpfan Jan 10, 2023
c8d5546
DOC better names for variables
thomasjpfan Jan 10, 2023
6a5771c
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Jan 10, 2023
6dbcb85
ENH Adds target_type parameter
thomasjpfan Jan 10, 2023
00d5230
TST Fixes test failures
thomasjpfan Jan 10, 2023
e75a51e
DOC Remove mention on RMSE
thomasjpfan Jan 10, 2023
2978e23
DOC Adds concluding statement in example
thomasjpfan Jan 10, 2023
429085d
CLN Refactor names
thomasjpfan Jan 10, 2023
93ea268
DOC Improves user guide
thomasjpfan Jan 13, 2023
cfe7afa
CLN Cleaner implementation
thomasjpfan Jan 13, 2023
3967934
ENH Adds auto for smoothing
thomasjpfan Jan 13, 2023
7875c2e
CLN Address comments
thomasjpfan Jan 13, 2023
5ff5b61
DOC Update example title
thomasjpfan Jan 13, 2023
2568e89
DOC Adds comment to point to equation
thomasjpfan Jan 13, 2023
afc4223
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Jan 13, 2023
6e4bf50
Apply suggestions from code review
thomasjpfan Jan 13, 2023
d613c61
DOC Only say classification
thomasjpfan Jan 13, 2023
ba85edb
DOC Update examples and docstring
thomasjpfan Jan 13, 2023
e3c8473
DOC Imrove example
thomasjpfan Jan 14, 2023
0d6c4a2
DOC Clarify fit_transform vs fit.transform
thomasjpfan Jan 14, 2023
d9cd410
CLN Address comments
thomasjpfan Jan 14, 2023
ee1e26b
DOC Adds more code comments
thomasjpfan Jan 14, 2023
a399791
CLN Code formatting
thomasjpfan Jan 14, 2023
fd2b963
CLN Move test closer to comment
thomasjpfan Jan 14, 2023
cd6c650
DOC Adds more details aboue equations
thomasjpfan Jan 14, 2023
991fd9f
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Jan 16, 2023
962125d
CLN Address comments
thomasjpfan Jan 18, 2023
02d8346
TST Simplify test logic
thomasjpfan Jan 20, 2023
acb4b2e
CLN Address comments
thomasjpfan Jan 20, 2023
3261d10
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Jan 20, 2023
a57b117
DOC Adds note about type_of_target inference
thomasjpfan Jan 20, 2023
5a330be
ENH Use integers for counts
thomasjpfan Jan 23, 2023
f1a14d7
STY Formatting
thomasjpfan Jan 23, 2023
66fb8bd
STY Fix numpydoc linting error
thomasjpfan Jan 23, 2023
f5df8b8
DOC Adds example about low and high smoothing parameters
thomasjpfan Jan 27, 2023
906ac98
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Jan 30, 2023
43630cf
Apply suggestions from code review
thomasjpfan Jan 30, 2023
177128d
STY Black formating
thomasjpfan Jan 31, 2023
e37b6ec
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Feb 2, 2023
86d357e
DOC Adds mixed encoder for high and low cardinality
thomasjpfan Feb 2, 2023
a7cc80f
DOC Address comments
thomasjpfan Feb 2, 2023
0bcad5b
ENH Make sure the colors align between graphs
thomasjpfan Feb 2, 2023
61081db
Apply suggestions from code review
thomasjpfan Feb 2, 2023
2e4bd85
CLN Reformat section headers and move sections around
thomasjpfan Feb 2, 2023
07a225c
CLN Try to work around sphinx 4.0.1
thomasjpfan Feb 2, 2023
7cd0c8d
CLN Address comments
thomasjpfan Feb 6, 2023
94fd097
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Feb 7, 2023
56dd2fa
CLN Better variable names
thomasjpfan Feb 10, 2023
317e13a
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Feb 10, 2023
c4eacfa
ENH Restrict cv to integers
thomasjpfan Feb 10, 2023
0ab474b
DOC Improve docstring
thomasjpfan Feb 10, 2023
c09556c
CLN Better variable names in test
thomasjpfan Feb 10, 2023
b7654bb
Apply suggestions from code review
thomasjpfan Feb 11, 2023
387f42c
DOC Simplify notes
thomasjpfan Feb 11, 2023
efe630c
DOC Adds diagram about cross validation
thomasjpfan Feb 12, 2023
803a988
DOC Explain fit vs fit_transform
thomasjpfan Feb 12, 2023
f5f7ab1
DOC Clarify fit
thomasjpfan Feb 12, 2023
0e784b1
Apply suggestions from code review
thomasjpfan Mar 7, 2023
444f589
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Mar 7, 2023
4994bc4
FIX Fixes compile issue
thomasjpfan Mar 7, 2023
8fb1a11
CLN Removes pow
thomasjpfan Mar 7, 2023
850694e
CLN Use nogil and pointers
thomasjpfan Mar 7, 2023
665e2e9
MNT Use vector instead
thomasjpfan Mar 7, 2023
5247123
DOC Adds note about missing values
thomasjpfan Mar 7, 2023
70f99fe
API Adds shuffle and random_state
thomasjpfan Mar 7, 2023
e09660c
STY Slight reformatting
thomasjpfan Mar 7, 2023
690fcdc
Check random seeds [all random seeds]
thomasjpfan Mar 8, 2023
a327b52
API Change attriubte name to target_mean_
thomasjpfan Mar 8, 2023
6b27afd
Apply suggestions from code review
thomasjpfan Mar 8, 2023
9773765
STY Linting issue
thomasjpfan Mar 8, 2023
8e18746
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Mar 8, 2023
1ea0407
FIX Fixes edge case with smooth=0.0 and unknown categories in cv
thomasjpfan Mar 10, 2023
df5574d
Apply suggestions from code review
thomasjpfan Mar 10, 2023
d681e06
Merge remote-tracking branch 'upstream/main' into target_regression_e…
thomasjpfan Mar 10, 2023
0b458c3
TST Improves testing maintainability
thomasjpfan Mar 13, 2023
26d2429
TST add statistical-integration tests to check the benefit of interna…
ogrisel Mar 14, 2023
d77e519
Fix comment in test to more accurately describe what's happening
ogrisel Mar 14, 2023
05997bb
TST Use RNG for permutation
thomasjpfan Mar 14, 2023
6377419
More fixes in inline comments for the new test
ogrisel Mar 14, 2023
180ee0e
Use pandas to parse the fetched the example dataset.
ogrisel Mar 16, 2023
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -1437,6 +1437,7 @@ details.
preprocessing.RobustScaler
preprocessing.SplineTransformer
preprocessing.StandardScaler
preprocessing.TargetRegressorEncoder

.. autosummary::
:toctree: generated/
47 changes: 47 additions & 0 deletions doc/modules/preprocessing.rst
@@ -830,6 +830,53 @@ lexicon order.
>>> enc.infrequent_categories_
[array(['b', 'c'], dtype=object)]

.. _target_regressor_encoder:

Target Regressor Encoder
------------------------

.. currentmodule:: sklearn.preprocessing

The :class:`TargetRegressorEncoder` uses the target mean conditioned on the
categorical feature to encode the categories [PAR]_ [MIC]_. This encoding
scheme is useful for categorical features with high cardinality, where one-hot
encoding would inflate the feature space, making it more expensive for a
downstream model to process. A classical example of high-cardinality
categories is location-based features such as zip code or region.

    Reviewer comment (Member): I think it would be good to emphasise that
    "categorical features that have no order (nominal categories)" (then we can
    use "nominal categories" later as a shorthand) is the target (haha) use-case
    of the target encoder. High cardinality features could also be dealt with
    using ordinal encoding.

The :class:`TargetRegressorEncoder` implementation mixes the global target mean
with the target mean conditioned on the category:

.. math::

    E_c = \dfrac{\sum_{X_i = c}y_i + s\mu_y}{|X_c| + s}

where :math:`E_c` is the encoding for category :math:`c`, :math:`X_i` is the
category at sample :math:`i`, :math:`y_i` is the target at sample :math:`i`,
:math:`s` is a smoothing parameter, :math:`\mu_y` is the global target mean, and
:math:`X_c` is the set of data points with category :math:`c`.
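
For intuition, the encoding of the equation above can be computed by hand for a
toy column (a minimal illustration with NumPy only; it does not use the
estimator itself)::

    >>> import numpy as np
    >>> X = np.array(["a", "a", "b", "b"])
    >>> y = np.array([10.0, 20.0, 30.0, 40.0])
    >>> s = 2.0  # smoothing parameter
    >>> mu_y = y.mean()  # global target mean, 25.0
    >>> for c in ["a", "b"]:
    ...     mask = X == c
    ...     print(c, (y[mask].sum() + s * mu_y) / (mask.sum() + s))
    a 20.0
    b 30.0

With ``s=2``, each category mean (15 and 35) is pulled towards the global mean
of 25, giving 20 and 30.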

:class:`TargetRegressorEncoder` uses a cross validation scheme in
:meth:`~TargetRegressorEncoder.fit_transform` to prevent leaking the target
during training. In :meth:`~TargetRegressorEncoder.fit_transform`, categorical
encodings are obtained from one split and used to encode the other split.
Afterwards, a final categorical encoding is obtained from all the training data,
which is used to encode data during :meth:`~TargetRegressorEncoder.transform`.
This means that `fit().transform()` does not equal `fit_transform()`.
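
This cross-validation scheme can be sketched as follows (an illustrative
re-implementation for a single column using
:class:`~sklearn.model_selection.KFold`; the estimator performs the equivalent
work internally and handles details such as unknown categories)::

    import numpy as np
    from sklearn.model_selection import KFold

    def out_of_fold_encode(X_cat, y, smooth=5.0, n_splits=5):
        """Sketch of out-of-fold target encoding for one categorical column."""
        X_cat = np.asarray(X_cat)
        y = np.asarray(y, dtype=float)
        encoded = np.empty_like(y)
        for train_idx, test_idx in KFold(n_splits=n_splits).split(X_cat):
            y_train, X_train = y[train_idx], X_cat[train_idx]
            mu = y_train.mean()
            # Encodings are learned on the training split only ...
            encodings = {
                c: (y_train[X_train == c].sum() + smooth * mu)
                / ((X_train == c).sum() + smooth)
                for c in np.unique(X_train)
            }
            # ... and applied to the held-out split; categories unseen in the
            # training split fall back to its global mean.
            encoded[test_idx] = [encodings.get(c, mu) for c in X_cat[test_idx]]
        return encoded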

.. topic:: Examples:

  * :ref:`sphx_glr_auto_examples_preprocessing_plot_target_regressor_encoder.py`

.. topic:: References

  .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
     categorical attributes in classification and prediction problems"
     SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>`

  .. [PAR] :doi:`Pargent, F., Pfisterer, F., Thomas, J. et al. "Regularized target
     encoding outperforms traditional methods in supervised machine learning with
     high cardinality features" Comput Stat 37, 2671–2692 (2022)
     <10.1007/s00180-022-01207-6>`

.. _preprocessing_discretization:

Discretization
4 changes: 4 additions & 0 deletions doc/whats_new/v1.3.rst
@@ -133,6 +133,10 @@ Changelog

:mod:`sklearn.preprocessing`
............................
- |MajorFeature| Introduces :class:`preprocessing.TargetRegressorEncoder`, which uses
  the target mean conditioned on the categories to encode the categories.
  :pr:`25334` by `Thomas Fan`_.

- |Enhancement| Added support for `sample_weight` in
:class:`preprocessing.KBinsDiscretizer`. This allows specifying the parameter
`sample_weight` for each sample to be used while fitting. The option is only
121 changes: 121 additions & 0 deletions examples/preprocessing/plot_target_regressor_encoder.py
@@ -0,0 +1,121 @@
"""
=============================
Target Encoder for Regressors
=============================

.. currentmodule:: sklearn.preprocessing

The :class:`TargetRegressorEncoder` uses target statistics conditioned on
the categorical features for encoding. In this example, we compare
:class:`TargetRegressorEncoder`, :class:`OrdinalEncoder`, and dropping the
categorical features entirely, on a wine review dataset.
"""

# %%
# Loading Data from OpenML
# ========================
# First, we load the wine reviews dataset, where the target is the points given
# by a reviewer:
from sklearn.datasets import fetch_openml

wine_reviews = fetch_openml(data_id=42074, as_frame=True, parser="liac-arff")

df = wine_reviews.frame
df.head()

# %%
# For this example, we use the following subset of numerical and categorical
# features in the data. The categorical features have a cardinality ranging
# from 18 to 14810:
numerical_features = ["price"]
categorical_features = [
    "country",
    "province",
    "region_1",
    "region_2",
    "variety",
    "winery",
]

X = df[numerical_features + categorical_features]
y = df["points"]
X.nunique().sort_values(ascending=False)

# %%
# We split the dataset into a training and test set:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f"Samples in training set: {len(X_train)}\nSamples in test set: {len(X_test)}")

# %%
# Building and Training Pipelines with Different Encoders
# =======================================================
# Dropping the categorical features
# ---------------------------------
# As a baseline, we construct a pipeline where the categorical features are
# dropped.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

prep = ColumnTransformer(
    [
        ("num", "passthrough", numerical_features),
        ("cat", "drop", categorical_features),
    ]
)

reg_drop_cats = Pipeline(
    [("prep", prep), ("hist", HistGradientBoostingRegressor(random_state=0))]
)
reg_drop_cats

# %%
# Here we train the model and use the root mean squared error (RMSE) to
# evaluate the baseline:
from sklearn.metrics import mean_squared_error

reg_drop_cats.fit(X_train, y_train)
reg_drop_cats_rmse = mean_squared_error(
    y_test, reg_drop_cats.predict(X_test), squared=False
)
print(f"RMSE for dropping categorical features: {reg_drop_cats_rmse:.4}")

# %%
# Using the OrdinalEncoder
# ------------------------
# The categorical features may contain categories at prediction time that were
# not seen during training, so we configure the :class:`OrdinalEncoder` to
# encode unknown categories with `-1`:
from sklearn.preprocessing import OrdinalEncoder

cat_prep = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
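
# %%
# As a small aside (a toy illustration, not part of the wine data), this
# configuration maps categories unseen during ``fit`` to ``-1``:
toy_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
toy_enc.fit([["red"], ["white"]])
toy_enc.transform([["red"], ["rose"]])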

# %%
# We modify the original pipeline to use the ordinal categorical preprocessing:
reg_ordinal = reg_drop_cats.set_params(prep__cat=cat_prep)
reg_ordinal

# %%
# When we include the categorical features through ordinal encoding, the RMSE
# improves:
reg_ordinal.fit(X_train, y_train)
reg_ordinal_rmse = mean_squared_error(
    y_test, reg_ordinal.predict(X_test), squared=False
)
print(f"RMSE with ordinal encoding: {reg_ordinal_rmse:.4}")

# %%
# Using the TargetRegressorEncoder
# --------------------------------
# Finally, we replace the ordinal encoder with the
# :class:`TargetRegressorEncoder`:
from sklearn.preprocessing import TargetRegressorEncoder

reg_target = reg_ordinal.set_params(prep__cat=TargetRegressorEncoder())
reg_target

# %%
# The :class:`TargetRegressorEncoder` further improves the RMSE:
reg_target.fit(X_train, y_train)
reg_target_rmse = mean_squared_error(y_test, reg_target.predict(X_test), squared=False)
print(f"RMSE with target encoding: {reg_target_rmse:.4}")
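
# %%
# As a rough summary (assuming the three pipelines above were fit and evaluated
# in order), we gather the scores side by side:
import pandas as pd

pd.Series(
    {
        "drop categories": reg_drop_cats_rmse,
        "ordinal encoding": reg_ordinal_rmse,
        "target encoding": reg_target_rmse,
    },
    name="RMSE",
).sort_values()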
1 change: 1 addition & 0 deletions setup.py
@@ -353,6 +353,7 @@ def check_package_status(package, min_version):
],
"preprocessing": [
{"sources": ["_csr_polynomial_expansion.pyx"], "include_np": True},
{"sources": ["_target_encoder_fast.pyx"], "include_np": True},
],
"neighbors": [
{"sources": ["_ball_tree.pyx"], "include_np": True},
2 changes: 2 additions & 0 deletions sklearn/preprocessing/__init__.py
@@ -26,6 +26,7 @@

from ._encoders import OneHotEncoder
from ._encoders import OrdinalEncoder
from ._target_encoder import TargetRegressorEncoder

from ._label import label_binarize
from ._label import LabelBinarizer
@@ -56,6 +57,7 @@
"RobustScaler",
"SplineTransformer",
"StandardScaler",
"TargetRegressorEncoder",
"add_dummy_feature",
"PolynomialFeatures",
"binarize",