Skip to content

Commit

Permalink
Add future_stack to DataFrame.stack (#15015)
Browse files Browse the repository at this point in the history
This PR adds a `future_stack` parameter to the `DataFrame.stack` API. As a consequence, the `dropna` parameter is deprecated.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #15015
  • Loading branch information
galipremsagar committed Feb 10, 2024
1 parent e596480 commit 0c0c7e6
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 13 deletions.
47 changes: 39 additions & 8 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6711,7 +6711,7 @@ def to_orc(
)

@_cudf_nvtx_annotate
def stack(self, level=-1, dropna=True):
def stack(self, level=-1, dropna=no_default, future_stack=False):
"""Stack the prescribed level(s) from columns to index
Return a reshaped DataFrame or Series having a multi-level
Expand Down Expand Up @@ -6843,6 +6843,23 @@ def stack(self, level=-1, dropna=True):
weight kg 3.0
dtype: float64
"""
if future_stack:
if dropna is not no_default:
raise ValueError(
"dropna must be unspecified with future_stack=True as the new "
"implementation does not introduce rows of NA values. This "
"argument will be removed in a future version of cudf."
)
else:
if dropna is not no_default or self._data.nlevels > 1:
warnings.warn(
"The previous implementation of stack is deprecated and will be "
"removed in a future version of cudf. Specify future_stack=True "
"to adopt the new implementation and silence this warning.",
FutureWarning,
)
if dropna is no_default:
dropna = True

if isinstance(level, (int, str)):
level = [level]
Expand All @@ -6858,7 +6875,7 @@ def stack(self, level=-1, dropna=True):

level = [level] if not isinstance(level, list) else level

if len(level) > 1 and not dropna:
if not future_stack and len(level) > 1 and not dropna:
raise NotImplementedError(
"When stacking multiple levels, setting `dropna` to False "
"will generate new column combination that does not exist "
Expand Down Expand Up @@ -6900,7 +6917,9 @@ def stack(self, level=-1, dropna=True):
# Since `level` may only specify a subset of all levels, `unique()` is
# required to remove duplicates. In the legacy (`future_stack=False`)
# implementation the keys in the specified levels are always sorted, so
# the sort below is applied only in that mode.
unique_named_levels = named_levels.unique().sort_values()
unique_named_levels = named_levels.unique()
if not future_stack:
unique_named_levels = unique_named_levels.sort_values()

# Each index from the original dataframe should repeat by the number
# of unique values in the named_levels
Expand Down Expand Up @@ -6949,11 +6968,19 @@ def unnamed_group_generator():
# `unique_named_levels` assigns -1 to these key
# combinations, representing an all-null column that
# is used in the subsequent libcudf call.
yield grpdf.reindex(
unique_named_levels, axis=0, fill_value=-1
).sort_index().values
if future_stack:
yield grpdf.reindex(
unique_named_levels, axis=0, fill_value=-1
).values
else:
yield grpdf.reindex(
unique_named_levels, axis=0, fill_value=-1
).sort_index().values
else:
yield column_idx_df.sort_index().values
if future_stack:
yield column_idx_df.values
else:
yield column_idx_df.sort_index().values

column_indices = list(unnamed_group_generator())

Expand Down Expand Up @@ -7004,6 +7031,10 @@ def unnamed_group_generator():
[
stacked[i]
for i in unnamed_level_values.argsort().argsort()
]
if not future_stack
else [
stacked[i] for i in unnamed_level_values.argsort()
],
)
),
Expand All @@ -7013,7 +7044,7 @@ def unnamed_group_generator():

result = DataFrame._from_data(data, index=new_index)

if dropna:
if not future_stack and dropna:
return result.dropna(how="all")
else:
return result
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,7 +1120,7 @@ def unstack(df, level, fill_value=None):
"Calling unstack() on single index dataframe"
" with different column datatype is not supported."
)
res = df.T.stack(dropna=False)
res = df.T.stack(future_stack=False)
# Result's index is a multiindex
res.index.names = (
tuple(df._data.to_pandas_index().names) + df.index.names
Expand Down
33 changes: 29 additions & 4 deletions python/cudf/cudf/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@

import cudf
from cudf import melt as cudf_melt
from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220
from cudf.core.buffer.spill_manager import get_global_manager
from cudf.testing._utils import (
ALL_TYPES,
DATETIME_TYPES,
NUMERIC_TYPES,
assert_eq,
expect_warning_if,
)

pytest_xfail = pytest.mark.xfail
Expand Down Expand Up @@ -153,6 +155,10 @@ def test_df_stack_reset_index():
assert_eq(expected, actual)


@pytest.mark.skipif(
not PANDAS_GE_210,
reason="Need pandas-2.1.0+ to match `stack` api",
)
@pytest.mark.parametrize(
"columns",
[
Expand Down Expand Up @@ -206,8 +212,15 @@ def test_df_stack_multiindex_column_axis(columns, index, level, dropna):
)
gdf = cudf.from_pandas(pdf)

got = gdf.stack(level=level, dropna=dropna)
expect = pdf.stack(level=level, dropna=dropna)
with pytest.warns(FutureWarning):
got = gdf.stack(level=level, dropna=dropna, future_stack=False)
with expect_warning_if(PANDAS_GE_220):
expect = pdf.stack(level=level, dropna=dropna, future_stack=False)

assert_eq(expect, got, check_dtype=False)

got = gdf.stack(level=level, future_stack=True)
expect = pdf.stack(level=level, future_stack=True)

assert_eq(expect, got, check_dtype=False)

Expand All @@ -228,6 +241,10 @@ def test_df_stack_mixed_dtypes():
assert_eq(expect, got, check_dtype=False)


@pytest.mark.skipif(
not PANDAS_GE_210,
reason="Need pandas-2.1.0+ to match `stack` api",
)
@pytest.mark.parametrize("level", [["animal", "hair_length"], [1, 2]])
def test_df_stack_multiindex_column_axis_pd_example(level):
columns = pd.MultiIndex.from_tuples(
Expand All @@ -242,8 +259,16 @@ def test_df_stack_multiindex_column_axis_pd_example(level):

df = pd.DataFrame(np.random.randn(4, 4), columns=columns)

expect = df.stack(level=level)
got = cudf.from_pandas(df).stack(level=level)
with expect_warning_if(PANDAS_GE_220):
expect = df.stack(level=level, future_stack=False)
gdf = cudf.from_pandas(df)
with pytest.warns(FutureWarning):
got = gdf.stack(level=level, future_stack=False)

assert_eq(expect, got)

expect = df.stack(level=level, future_stack=True)
got = gdf.stack(level=level, future_stack=True)

assert_eq(expect, got)

Expand Down

0 comments on commit 0c0c7e6

Please sign in to comment.