Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate fill_method and limit in pct_change APIs #14277

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
36 changes: 32 additions & 4 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7137,7 +7137,7 @@ def explode(self, column, ignore_index=False):
return super()._explode(column, ignore_index)

def pct_change(
self, periods=1, fill_method="ffill", limit=None, freq=None
self, periods=1, fill_method=no_default, limit=no_default, freq=None
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
):
"""
Calculates the percent change between sequential elements
Expand All @@ -7149,9 +7149,15 @@ def pct_change(
Periods to shift for forming percent change.
fill_method : str, default 'ffill'
How to handle NAs before computing percent changes.

.. deprecated:: 23.12
All options of `fill_method` are deprecated except `fill_method=None`.
limit : int, optional
The number of consecutive NAs to fill before stopping.
Not yet implemented.

.. deprecated:: 23.12
`limit` is deprecated.
freq : str, optional
Increment to use from time series API.
Not yet implemented.
Expand All @@ -7160,16 +7166,38 @@ def pct_change(
-------
DataFrame
"""
if limit is not None:
if limit is not no_default:
raise NotImplementedError("limit parameter not supported yet.")
if freq is not None:
raise NotImplementedError("freq parameter not supported yet.")
elif fill_method not in {"ffill", "pad", "bfill", "backfill"}:
elif fill_method not in {
no_default,
None,
"ffill",
"pad",
"bfill",
"backfill",
}:
raise ValueError(
"fill_method must be one of 'ffill', 'pad', "
"fill_method must be one of None, 'ffill', 'pad', "
"'bfill', or 'backfill'."
)

if fill_method not in (no_default, None) or limit is not no_default:
# Do not remove until pandas 3.0 support is added.
warnings.warn(
"The 'fill_method' and 'limit' keywords in "
f"{type(self).__name__}.pct_change are deprecated and will be "
"removed in a future version. Either fill in any non-leading NA values prior "
"to calling pct_change or specify 'fill_method=None' to not fill NA "
"values.",
FutureWarning,
)
if fill_method is no_default:
fill_method = "ffill"
if limit is no_default:
limit = None

with warnings.catch_warnings():
warnings.simplefilter("ignore")
data = self.fillna(method=fill_method, limit=limit)
Expand Down
38 changes: 28 additions & 10 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from cudf._lib.sort import segmented_sort_by_key
from cudf._lib.types import size_type_dtype
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, arange, as_column
Expand Down Expand Up @@ -2286,7 +2287,12 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):

@_cudf_nvtx_annotate
def pct_change(
self, periods=1, fill_method="ffill", axis=0, limit=None, freq=None
self,
periods=1,
fill_method=no_default,
axis=0,
limit=no_default,
freq=None,
):
"""
Calculates the percent change between sequential elements
Expand All @@ -2298,9 +2304,15 @@ def pct_change(
Periods to shift for forming percent change.
fill_method : str, default 'ffill'
How to handle NAs before computing percent changes.

.. deprecated:: 23.12
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So recently, supported was added back for fill_method=None which means "don't fill the NA values" pandas-dev/pandas#55527

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, updated 👍

All options of `fill_method` are deprecated except `fill_method=None`.
limit : int, optional
The number of consecutive NAs to fill before stopping.
Not yet implemented.

.. deprecated:: 23.12
`limit` is deprecated.
freq : str, optional
Increment to use from time series API.
Not yet implemented.
Expand All @@ -2312,25 +2324,31 @@ def pct_change(
"""
if not axis == 0:
raise NotImplementedError("Only axis=0 is supported.")
if limit is not None:
if limit is not no_default:
raise NotImplementedError("limit parameter not supported yet.")
if freq is not None:
raise NotImplementedError("freq parameter not supported yet.")
elif fill_method not in {"ffill", "pad", "bfill", "backfill"}:
elif fill_method not in {no_default, None, "ffill", "bfill"}:
raise ValueError(
"fill_method must be one of 'ffill', 'pad', "
"'bfill', or 'backfill'."
"fill_method must be one of 'ffill', or" "'bfill'."
)

if fill_method in ("pad", "backfill"):
alternative = "ffill" if fill_method == "pad" else "bfill"
# Do not remove until pandas 2.0 support is added.
if fill_method not in (no_default, None) or limit is not no_default:
# Do not remove until pandas 3.0 support is added.
warnings.warn(
f"{fill_method} is deprecated and will be removed in a future "
f"version. Use f{alternative} instead.",
"The 'fill_method' keyword being not None and the 'limit' keywords in "
f"{type(self).__name__}.pct_change are deprecated and will be "
"removed in a future version. Either fill in any non-leading NA values prior "
"to calling pct_change or specify 'fill_method=None' to not fill NA "
"values.",
FutureWarning,
)

if fill_method in (no_default, None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would expect to skip the fillna step if fill_method=None

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah interesting. OK yeah best to match pandas here then

fill_method = "ffill"
if limit is no_default:
limit = None

with warnings.catch_warnings():
warnings.simplefilter("ignore")
filled = self.fillna(method=fill_method, limit=limit)
Expand Down
36 changes: 32 additions & 4 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3569,7 +3569,7 @@ def explode(self, ignore_index=False):

@_cudf_nvtx_annotate
def pct_change(
self, periods=1, fill_method="ffill", limit=None, freq=None
self, periods=1, fill_method=no_default, limit=no_default, freq=None
):
"""
Calculates the percent change between sequential elements
Expand All @@ -3581,9 +3581,15 @@ def pct_change(
Periods to shift for forming percent change.
fill_method : str, default 'ffill'
How to handle NAs before computing percent changes.

.. deprecated:: 23.12
All options of `fill_method` are deprecated except `fill_method=None`.
limit : int, optional
The number of consecutive NAs to fill before stopping.
Not yet implemented.

.. deprecated:: 23.12
`limit` is deprecated.
freq : str, optional
Increment to use from time series API.
Not yet implemented.
Expand All @@ -3592,15 +3598,37 @@ def pct_change(
-------
Series
"""
if limit is not None:
if limit is not no_default:
raise NotImplementedError("limit parameter not supported yet.")
if freq is not None:
raise NotImplementedError("freq parameter not supported yet.")
elif fill_method not in {"ffill", "pad", "bfill", "backfill"}:
elif fill_method not in {
no_default,
None,
"ffill",
"pad",
"bfill",
"backfill",
}:
raise ValueError(
"fill_method must be one of 'ffill', 'pad', "
"fill_method must be one of None, 'ffill', 'pad', "
"'bfill', or 'backfill'."
)
if fill_method not in (no_default, None) or limit is not no_default:
# Do not remove until pandas 3.0 support is added.
warnings.warn(
"The 'fill_method' and 'limit' keywords in "
f"{type(self).__name__}.pct_change are deprecated and will be "
"removed in a future version. Either fill in any non-leading NA values prior "
"to calling pct_change or specify 'fill_method=None' to not fill NA "
"values.",
FutureWarning,
)

if fill_method is no_default:
fill_method = "ffill"
if limit is no_default:
limit = None

with warnings.catch_warnings():
warnings.simplefilter("ignore")
Expand Down
15 changes: 12 additions & 3 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@
PANDAS_GE_134,
PANDAS_GE_150,
PANDAS_GE_200,
PANDAS_GE_210,
PANDAS_LT_140,
)
from cudf.api.extensions import no_default
from cudf.core.buffer.spill_manager import get_global_manager
from cudf.core.column import column
from cudf.testing import _utils as utils
Expand Down Expand Up @@ -9896,13 +9898,20 @@ def test_dataframe_rename_duplicate_column():
],
)
@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5])
@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"])
@pytest.mark.parametrize(
"fill_method", ["ffill", "bfill", "pad", "backfill", no_default]
)
def test_dataframe_pct_change(data, periods, fill_method):
gdf = cudf.DataFrame(data)
pdf = gdf.to_pandas()

actual = gdf.pct_change(periods=periods, fill_method=fill_method)
expected = pdf.pct_change(periods=periods, fill_method=fill_method)
with expect_warning_if(fill_method is not no_default):
actual = gdf.pct_change(periods=periods, fill_method=fill_method)
with expect_warning_if(
PANDAS_GE_210
and (fill_method is not no_default or pdf.isna().any().any())
):
expected = pdf.pct_change(periods=periods, fill_method=fill_method)

assert_eq(expected, actual)

Expand Down
23 changes: 16 additions & 7 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import cudf
from cudf import DataFrame, Series
from cudf.api.extensions import no_default
from cudf.core._compat import (
PANDAS_GE_150,
PANDAS_LT_140,
Expand Down Expand Up @@ -3062,17 +3063,25 @@ def test_groupby_transform_maintain_index(by):
],
)
@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5])
@pytest.mark.parametrize("fill_method", ["ffill", "bfill"])
@pytest.mark.parametrize("fill_method", ["ffill", "bfill", no_default, None])
def test_groupby_pct_change(data, gkey, periods, fill_method):
gdf = cudf.DataFrame(data)
pdf = gdf.to_pandas()

actual = gdf.groupby(gkey).pct_change(
periods=periods, fill_method=fill_method
)
expected = pdf.groupby(gkey).pct_change(
periods=periods, fill_method=fill_method
)
with expect_warning_if(fill_method not in (no_default, None)):
actual = gdf.groupby(gkey).pct_change(
periods=periods, fill_method=fill_method
)
with expect_warning_if(
PANDAS_GE_210
and (
fill_method not in (no_default, None)
or (fill_method is not None and pdf.isna().any().any())
)
):
expected = pdf.groupby(gkey).pct_change(
periods=periods, fill_method=fill_method
)

assert_eq(expected, actual)

Expand Down
18 changes: 15 additions & 3 deletions python/cudf/cudf/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest

import cudf
from cudf.api.extensions import no_default
from cudf.datasets import randomdata
from cudf.testing._utils import (
_create_cudf_series_float64_default,
Expand All @@ -16,6 +17,7 @@
assert_exceptions_equal,
expect_warning_if,
)
from cudf.core._compat import PANDAS_GE_210

params_dtypes = [np.int32, np.uint32, np.float32, np.float64]
methods = ["min", "max", "sum", "mean", "var", "std"]
Expand Down Expand Up @@ -356,14 +358,24 @@ def test_series_median(dtype, num_na):
],
)
@pytest.mark.parametrize("periods", range(-5, 5))
@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"])
@pytest.mark.parametrize(
"fill_method", ["ffill", "bfill", "pad", "backfill", no_default, None]
)
def test_series_pct_change(data, periods, fill_method):
cs = cudf.Series(data)
ps = cs.to_pandas()

if np.abs(periods) <= len(cs):
got = cs.pct_change(periods=periods, fill_method=fill_method)
expected = ps.pct_change(periods=periods, fill_method=fill_method)
with expect_warning_if(fill_method not in (no_default, None)):
got = cs.pct_change(periods=periods, fill_method=fill_method)
with expect_warning_if(
PANDAS_GE_210
and (
fill_method not in (no_default, None)
or (fill_method is not None and ps.isna().any())
)
):
expected = ps.pct_change(periods=periods, fill_method=fill_method)
np.testing.assert_array_almost_equal(
got.to_numpy(na_value=np.nan), expected
)
Expand Down