BUG: value_counts returning incorrect dtype for string dtype #55627

Merged (8 commits, Oct 25, 2023)
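A minimal sketch of the kind of reproducer behind this report (not taken verbatim from the linked issue); it assumes pandas 2.1.x with pyarrow installed, since `string[pyarrow_numpy]` requires it:

```python
import pandas as pd

# Hypothetical frame with Arrow-backed string columns (dtype added in pandas 2.1).
df = pd.DataFrame(
    {"key": ["a", "a", "b"], "val": ["x", "y", "x"]},
    dtype="string[pyarrow_numpy]",
)

counts = df.groupby("key")["val"].value_counts()

# Before this patch, any ArrowExtensionArray-backed column routed the result through
# dtype_backend="pyarrow", so the counts came back Arrow-backed; with the patch the
# counts for string[pyarrow_numpy] stay plain int64, matching object-dtype columns.
print(counts.dtype)
```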
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.2.rst
@@ -23,6 +23,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55626`)
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
15 changes: 13 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -108,6 +108,10 @@ class providing the base-class of operations.
    SparseArray,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.arrays.string_arrow import (
    ArrowStringArray,
    ArrowStringArrayNumpySemantics,
)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
@@ -2855,7 +2859,9 @@ def _value_counts(
            result_series.name = name
            result_series.index = index.set_names(range(len(columns)))
            result_frame = result_series.reset_index()
            result_frame.columns = columns + [name]
            result_frame.columns = Index(
                columns + [name], dtype=self.grouper.groupings[0].obj.columns.dtype
Member (Author): @rhshadrach is there a better way of getting the dtype of the original column index?

Member: self.obj.columns.dtype I think.

Member (Author): That's a Series in SeriesGroupBy unfortunately.

            )
            result = result_frame
        return result.__finalize__(self.obj, method="value_counts")
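To make the exchange above concrete, here is a small illustration (not part of the patch) of why `self.obj.columns` is unavailable on the `SeriesGroupBy` path; the frame contents are made up, and the string column index assumes pyarrow is installed:

```python
import pandas as pd

df = pd.DataFrame({"country": ["US", "FR"], "education": ["low", "high"]})
# Give the column index a string dtype so there is a dtype worth preserving.
df.columns = df.columns.astype("string[pyarrow_numpy]")

dfgb = df.groupby("country")              # DataFrameGroupBy: .obj is the DataFrame
sgb = df.groupby("country")["education"]  # SeriesGroupBy: .obj is the selected Series

print(dfgb.obj.columns.dtype)       # the original column index dtype is reachable here
print(hasattr(sgb.obj, "columns"))  # False -- a Series has no .columns attribute
```

Because a `Series` has no `.columns`, the patch instead reaches the original frame through `self.grouper.groupings[0].obj`.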

@@ -3007,7 +3013,12 @@ def size(self) -> DataFrame | Series:
        dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None
        if isinstance(self.obj, Series):
            if isinstance(self.obj.array, ArrowExtensionArray):
                dtype_backend = "pyarrow"
                if isinstance(self.obj.array, ArrowStringArrayNumpySemantics):
                    dtype_backend = None
                elif isinstance(self.obj.array, ArrowStringArray):
                    dtype_backend = "numpy_nullable"
                else:
                    dtype_backend = "pyarrow"
            elif isinstance(self.obj.array, BaseMaskedArray):
                dtype_backend = "numpy_nullable"
        # TODO: For DataFrames what if columns are mixed arrow/numpy/masked?
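A hedged illustration (not from the PR) of what the branching above aims for: `ArrowStringArrayNumpySemantics` is checked before `ArrowStringArray` because the former subclasses the latter, and the expected dtypes below simply follow from the `dtype_backend` chosen in each branch (pyarrow required):

```python
import pandas as pd

s_numpy_sem = pd.Series(["a", "a", "b"], dtype="string[pyarrow_numpy]")
s_nullable = pd.Series(["a", "a", "b"], dtype="string[pyarrow]")

# ArrowStringArrayNumpySemantics -> dtype_backend stays None: plain numpy counts.
print(s_numpy_sem.groupby(s_numpy_sem).size().dtype)  # expected: int64

# ArrowStringArray -> dtype_backend "numpy_nullable": nullable integer counts.
print(s_nullable.groupby(s_nullable).size().dtype)    # expected: Int64
```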
25 changes: 25 additions & 0 deletions pandas/tests/groupby/methods/test_value_counts.py
@@ -9,6 +9,8 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0

from pandas import (
    Categorical,
    CategoricalIndex,
@@ -369,6 +371,20 @@ def test_against_frame_and_seriesgroupby(
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype",
    [
        object,
        pytest.param(
            "string[pyarrow_numpy]",
            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
        ),
        pytest.param(
            "string[pyarrow]",
            marks=pytest.mark.skipif(pa_version_under7p0, reason="arrow not installed"),
        ),
    ],
)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -386,7 +402,10 @@ def test_compound(
    expected_rows,
    expected_count,
    expected_group_size,
    dtype,
):
    education_df = education_df.astype(dtype)
    education_df.columns = education_df.columns.astype(dtype)
    # Multiple groupby keys and as_index=False
    gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
    result = gp["education"].value_counts(
@@ -395,11 +414,17 @@
    expected = DataFrame()
    for column in ["country", "gender", "education"]:
        expected[column] = [education_df[column][row] for row in expected_rows]
    expected = expected.astype(dtype)
    expected.columns = expected.columns.astype(dtype)
    if normalize:
        expected["proportion"] = expected_count
        expected["proportion"] /= expected_group_size
        if dtype == "string[pyarrow]":
            expected["proportion"] = expected["proportion"].convert_dtypes()
    else:
        expected["count"] = expected_count
        if dtype == "string[pyarrow]":
            expected["count"] = expected["count"].convert_dtypes()
    tm.assert_frame_equal(result, expected)

