Skip to content

Commit

Permalink
FIX Fixes bug in OneHotEncoder's drop_idx_ when there are infrequent categories (#25589)
Browse files Browse the repository at this point in the history

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com>
Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
  • Loading branch information
4 people committed Mar 8, 2023
1 parent 451f212 commit 1380e3c
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 23 deletions.
4 changes: 4 additions & 0 deletions doc/whats_new/v1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ Changelog
:mod:`sklearn.preprocessing`
............................

- |Fix| :attr:`preprocessing.OneHotEncoder.drop_idx_` now properly
references the dropped category in the `categories_` attribute
when there are infrequent categories. :pr:`25589` by `Thomas Fan`_.

- |Fix| :class:`preprocessing.OrdinalEncoder` now correctly supports
`encoded_missing_value` or `unknown_value` set to a categories' cardinality
when there are missing values in the training data. :pr:`25704` by `Thomas Fan`_.
Expand Down
89 changes: 67 additions & 22 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,10 @@ class OneHotEncoder(_BaseEncoder):
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
should be dropped.
When `max_categories` or `min_frequency` is configured to group
infrequent categories, the dropping behavior is handled after the
grouping.
.. versionadded:: 0.21
The parameter `drop` was added in 0.21.
Expand Down Expand Up @@ -544,7 +548,7 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
"""Convert `drop_idx` into the index for infrequent categories.
If there are no infrequent categories, then `drop_idx` is
returned. This method is called in `_compute_drop_idx` when the `drop`
returned. This method is called in `_set_drop_idx` when the `drop`
parameter is an array-like.
"""
if not self._infrequent_enabled:
Expand All @@ -564,24 +568,35 @@ def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
)
return default_to_infrequent[drop_idx]

def _compute_drop_idx(self):
def _set_drop_idx(self):
"""Compute the drop indices associated with `self.categories_`.
If `self.drop` is:
- `None`, returns `None`.
- `'first'`, returns all zeros to drop the first category.
- `'if_binary'`, returns zero if the category is binary and `None`
- `None`, No categories have been dropped.
- `'first'`, All zeros to drop the first category.
- `'if_binary'`, All zeros if the category is binary and `None`
otherwise.
- array-like, returns the indices of the categories that match the
- array-like, The indices of the categories that match the
categories in `self.drop`. If the dropped category is an infrequent
category, then the index for the infrequent category is used. This
means that the entire infrequent category is dropped.
This methods defines a public `drop_idx_` and a private
`_drop_idx_after_grouping`.
- `drop_idx_`: Public facing API that references the drop category in
`self.categories_`.
- `_drop_idx_after_grouping`: Used internally to drop categories *after* the
infrequent categories are grouped together.
If there are no infrequent categories or drop is `None`, then
`drop_idx_=_drop_idx_after_grouping`.
"""
if self.drop is None:
return None
drop_idx_after_grouping = None
elif isinstance(self.drop, str):
if self.drop == "first":
return np.zeros(len(self.categories_), dtype=object)
drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
elif self.drop == "if_binary":
n_features_out_no_drop = [len(cat) for cat in self.categories_]
if self._infrequent_enabled:
Expand All @@ -590,7 +605,7 @@ def _compute_drop_idx(self):
continue
n_features_out_no_drop[i] -= infreq_idx.size - 1

return np.array(
drop_idx_after_grouping = np.array(
[
0 if n_features_out == 2 else None
for n_features_out in n_features_out_no_drop
Expand Down Expand Up @@ -647,7 +662,29 @@ def _compute_drop_idx(self):
)
)
raise ValueError(msg)
return np.array(drop_indices, dtype=object)
drop_idx_after_grouping = np.array(drop_indices, dtype=object)

# `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
# categories are grouped together. If needed, we remap `drop_idx` back
# to the categories seen in `self.categories_`.
self._drop_idx_after_grouping = drop_idx_after_grouping

if not self._infrequent_enabled or drop_idx_after_grouping is None:
self.drop_idx_ = self._drop_idx_after_grouping
else:
drop_idx_ = []
for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
default_to_infrequent = self._default_to_infrequent_mappings[
feature_idx
]
if drop_idx is None or default_to_infrequent is None:
orig_drop_idx = drop_idx
else:
orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]

drop_idx_.append(orig_drop_idx)

self.drop_idx_ = np.asarray(drop_idx_, dtype=object)

def _identify_infrequent(self, category_count, n_samples, col_idx):
"""Compute the infrequent indices.
Expand Down Expand Up @@ -809,16 +846,19 @@ def _compute_transformed_categories(self, i, remove_dropped=True):

def _remove_dropped_categories(self, categories, i):
"""Remove dropped categories."""
if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
return np.delete(categories, self.drop_idx_[i])
if (
self._drop_idx_after_grouping is not None
and self._drop_idx_after_grouping[i] is not None
):
return np.delete(categories, self._drop_idx_after_grouping[i])
return categories

def _compute_n_features_outs(self):
"""Compute the n_features_out for each input feature."""
output = [len(cats) for cats in self.categories_]

if self.drop_idx_ is not None:
for i, drop_idx in enumerate(self.drop_idx_):
if self._drop_idx_after_grouping is not None:
for i, drop_idx in enumerate(self._drop_idx_after_grouping):
if drop_idx is not None:
output[i] -= 1

Expand Down Expand Up @@ -875,7 +915,7 @@ def fit(self, X, y=None):
self._fit_infrequent_category_mapping(
fit_results["n_samples"], fit_results["category_counts"]
)
self.drop_idx_ = self._compute_drop_idx()
self._set_drop_idx()
self._n_features_outs = self._compute_n_features_outs()
return self

Expand Down Expand Up @@ -914,8 +954,8 @@ def transform(self, X):

n_samples, n_features = X_int.shape

if self.drop_idx_ is not None:
to_drop = self.drop_idx_.copy()
if self._drop_idx_after_grouping is not None:
to_drop = self._drop_idx_after_grouping.copy()
# We remove all the dropped categories from mask, and decrement all
# categories that occur after them to avoid an empty column.
keep_cells = X_int != to_drop
Expand Down Expand Up @@ -1014,7 +1054,7 @@ def inverse_transform(self, X):
# category. In this case we just fill the column with this
# unique category value.
if n_categories == 0:
X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
j += n_categories
continue
sub = X[:, j : j + n_categories]
Expand All @@ -1031,14 +1071,19 @@ def inverse_transform(self, X):
if unknown.any():
# if categories were dropped then unknown categories will
# be mapped to the dropped category
if self.drop_idx_ is None or self.drop_idx_[i] is None:
if (
self._drop_idx_after_grouping is None
or self._drop_idx_after_grouping[i] is None
):
found_unknown[i] = unknown
else:
X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]
X_tr[unknown, i] = self.categories_[i][
self._drop_idx_after_grouping[i]
]
else:
dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
if dropped.any():
if self.drop_idx_ is None:
if self._drop_idx_after_grouping is None:
all_zero_samples = np.flatnonzero(dropped)
raise ValueError(
f"Samples {all_zero_samples} can not be inverted "
Expand All @@ -1047,7 +1092,7 @@ def inverse_transform(self, X):
)
# we can safely assume that all of the nulls in each column
# are the dropped value
drop_idx = self.drop_idx_[i]
drop_idx = self._drop_idx_after_grouping[i]
X_tr[dropped, i] = transformed_features[i][drop_idx]

j += n_categories
Expand Down
38 changes: 37 additions & 1 deletion sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,7 @@ def test_ohe_infrequent_two_levels_drop_frequent(drop):
max_categories=2,
drop=drop,
).fit(X_train)
assert_array_equal(ohe.drop_idx_, [0])
assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"

X_test = np.array([["b"], ["c"]])
X_trans = ohe.transform(X_test)
Expand Down Expand Up @@ -2015,3 +2015,39 @@ def test_ordinal_encoder_missing_unknown_encoding_max():
X_test = np.array([["snake"]])
X_trans = enc.transform(X_test)
assert_allclose(X_trans, [[2]])


def test_drop_idx_infrequent_categories():
    """Check drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    """

    def column(spec):
        # Build a single-column object array from (category, count) pairs.
        values = []
        for category, count in spec:
            values.extend([category] * count)
        return np.array([values], dtype=object).T

    # drop="first": the first category *after* infrequent grouping is dropped;
    # drop_idx_ must point at that category within the original categories_.
    X_train = column([("a", 2), ("b", 4), ("c", 4), ("d", 4), ("e", 4)])
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(
        X_train
    )
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "b"

    # drop="if_binary": grouping leaves two effective categories, so one is dropped.
    X_binary = column([("a", 2), ("b", 2), ("c", 10)])
    encoder = OneHotEncoder(
        min_frequency=4, sparse_output=False, drop="if_binary"
    ).fit(X_binary)
    assert_array_equal(encoder.get_feature_names_out(), ["x0_infrequent_sklearn"])
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "c"

    # drop=[...]: an explicitly named frequent category is dropped.
    X_train = column([("a", 2), ("b", 4), ("c", 4), ("d", 4), ("e", 4)])
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(
        X_train
    )
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "d"

    # drop=None: nothing is dropped, so drop_idx_ stays None.
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(
        X_train
    )
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.drop_idx_ is None

0 comments on commit 1380e3c

Please sign in to comment.