BUG: value_counts returning incorrect dtype for string dtype (#55627)
phofl committed Oct 25, 2023
1 parent 43c7d40 commit 4412800
Showing 4 changed files with 68 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.2.rst
@@ -25,6 +25,7 @@ Fixed regressions
Bug fixes
~~~~~~~~~
- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`)
- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`)
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
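
The sketch below illustrates the user-facing behavior behind the :issue:`55627` entry above. It is not part of the commit: the frame, column names, and dtype mirror the tests added later in this commit, and it assumes pandas 2.1.x with pyarrow installed.

import pandas as pd

# With the NumPy-semantics Arrow string dtype, grouped counts should come
# back as plain int64 rather than an Arrow-backed integer dtype.
df = pd.DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype="string[pyarrow_numpy]")
print(df.groupby("a")["b"].value_counts().dtype)  # int64 after this fix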
15 changes: 13 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -108,6 +108,10 @@ class providing the base-class of operations.
    SparseArray,
)
from pandas.core.arrays.string_ import StringDtype
+from pandas.core.arrays.string_arrow import (
+    ArrowStringArray,
+    ArrowStringArrayNumpySemantics,
+)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
@@ -2854,7 +2858,9 @@ def _value_counts(
        result_series.name = name
        result_series.index = index.set_names(range(len(columns)))
        result_frame = result_series.reset_index()
-       result_frame.columns = columns + [name]
+       orig_dtype = self.grouper.groupings[0].obj.columns.dtype  # type: ignore[union-attr]  # noqa: E501
+       cols = Index(columns, dtype=orig_dtype).insert(len(columns), name)
+       result_frame.columns = cols
        result = result_frame
        return result.__finalize__(self.obj, method="value_counts")
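
The added lines rebuild the result's column labels through an Index carrying the original frame's columns dtype, instead of assigning a plain Python list that would be re-inferred as object. A rough standalone illustration of that Index behavior (not from the commit; assumes pyarrow is installed):

import pandas as pd

# String-dtype column labels keep their dtype after the count name is appended.
string_cols = pd.Index(["country", "gender"], dtype="string[pyarrow_numpy]")
print(string_cols.insert(len(string_cols), "count").dtype)  # string dtype preserved

# Integer labels upcast to object once the string name is inserted, which is
# the mixed-label result the new test_value_counts_integer_columns expects.
int_cols = pd.Index([1, 2], dtype="int64")
print(int_cols.insert(len(int_cols), "count").dtype)  # object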

@@ -3006,7 +3012,12 @@ def size(self) -> DataFrame | Series:
        dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None
        if isinstance(self.obj, Series):
            if isinstance(self.obj.array, ArrowExtensionArray):
-               dtype_backend = "pyarrow"
+               if isinstance(self.obj.array, ArrowStringArrayNumpySemantics):
+                   dtype_backend = None
+               elif isinstance(self.obj.array, ArrowStringArray):
+                   dtype_backend = "numpy_nullable"
+               else:
+                   dtype_backend = "pyarrow"
            elif isinstance(self.obj.array, BaseMaskedArray):
                dtype_backend = "numpy_nullable"
        # TODO: For DataFrames what if columns are mixed arrow/numpy/masked?
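
A short usage sketch of the mapping this new branch encodes, mirroring the expectations in the tests added below (illustrative only; assumes pyarrow is installed): the NumPy-semantics Arrow string dtype keeps plain int64 counts, the nullable Arrow string dtype gets masked Int64 counts, and other Arrow-backed values still get a pyarrow-backed count.

import pandas as pd

ser = pd.Series(["x", "x", "y"], index=["a", "a", "b"], dtype="string[pyarrow_numpy]")
print(ser.groupby(level=0).size().dtype)  # int64

print(ser.astype("string[pyarrow]").groupby(level=0).size().dtype)  # Int64

arrow_ints = pd.Series([1, 1, 2], index=["a", "a", "b"], dtype="int64[pyarrow]")
print(arrow_ints.groupby(level=0).size().dtype)  # int64[pyarrow]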
24 changes: 24 additions & 0 deletions pandas/tests/groupby/methods/test_size.py
@@ -1,6 +1,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_integer_dtype

from pandas import (
@@ -104,3 +106,25 @@ def test_size_series_masked_type_returns_Int64(dtype):
    result = ser.groupby(level=0).size()
    expected = Series([2, 1], dtype="Int64", index=["a", "b"])
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "dtype",
    [
        object,
        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
    ],
)
def test_size_strings(dtype):
    # GH#55627
    df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
    result = df.groupby("a")["b"].size()
    exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
    expected = Series(
        [2, 1],
        index=Index(["a", "b"], name="a", dtype=dtype),
        name="b",
        dtype=exp_dtype,
    )
    tm.assert_series_equal(result, expected)
30 changes: 30 additions & 0 deletions pandas/tests/groupby/methods/test_value_counts.py
@@ -9,6 +9,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
    Categorical,
    CategoricalIndex,
@@ -369,6 +371,14 @@ def test_against_frame_and_seriesgroupby(
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "dtype",
    [
        object,
        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
    ],
)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -386,7 +396,10 @@ def test_compound(
    expected_rows,
    expected_count,
    expected_group_size,
    dtype,
):
    education_df = education_df.astype(dtype)
    education_df.columns = education_df.columns.astype(dtype)
    # Multiple groupby keys and as_index=False
    gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
    result = gp["education"].value_counts(
@@ -395,11 +408,17 @@
    expected = DataFrame()
    for column in ["country", "gender", "education"]:
        expected[column] = [education_df[column][row] for row in expected_rows]
    expected = expected.astype(dtype)
    expected.columns = expected.columns.astype(dtype)
    if normalize:
        expected["proportion"] = expected_count
        expected["proportion"] /= expected_group_size
        if dtype == "string[pyarrow]":
            expected["proportion"] = expected["proportion"].convert_dtypes()
    else:
        expected["count"] = expected_count
        if dtype == "string[pyarrow]":
            expected["count"] = expected["count"].convert_dtypes()
    tm.assert_frame_equal(result, expected)


@@ -1146,3 +1165,14 @@ def test_value_counts_time_grouper(utc):
    )
    expected = Series(1, index=index, name="count")
    tm.assert_series_equal(result, expected)


def test_value_counts_integer_columns():
    # GH#55627
    df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]})
    gp = df.groupby([1, 2], as_index=False, sort=False)
    result = gp[3].value_counts()
    expected = DataFrame(
        {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1}
    )
    tm.assert_frame_equal(result, expected)
