Merge pull request #14916 from rapidsai/pandas_2.0_feature_branch

Add `pandas-2.x` support in `cudf`
rapidsai · Jan 30, 2024 · 238a03f · 238a03f
2 parents 57bbe94 + f281b90
commit 238a03f
Show file tree

Hide file tree

Showing 119 changed files with 5,080 additions and 5,608 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -91,6 +91,12 @@ repos:
         entry: '(category=|\s)DeprecationWarning[,)]'
         language: pygrep
         types_or: [python, cython]
+        # We need to exclude just the following file because a few APIs still need
+        # DeprecationWarning: https://github.com/pandas-dev/pandas/issues/54970
+        exclude: |
+          (?x)^(
+            ^python/cudf/cudf/core/dtypes.py
+          )
       - id: no-programmatic-xfail
         name: no-programmatic-xfail
         description: 'Enforce that pytest.xfail is not introduced (see dev docs for details)'

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -59,13 +59,13 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
-- numpy>=1.21,<1.25
+- numpy>=1.21
 - numpydoc
 - nvcc_linux-64=11.8
 - nvcomp==3.0.5
 - nvtx>=0.2.1
 - packaging
-- pandas>=1.3,<1.6.0dev0
+- pandas>=2.0,<2.1.5dev0
 - pandoc
 - pip
 - pre-commit

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -58,12 +58,12 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
-- numpy>=1.21,<1.25
+- numpy>=1.21
 - numpydoc
 - nvcomp==3.0.5
 - nvtx>=0.2.1
 - packaging
-- pandas>=1.3,<1.6.0dev0
+- pandas>=2.0,<2.1.5dev0
 - pandoc
 - pip
 - pre-commit

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -76,12 +76,11 @@ requirements:
     - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }}
     - python
     - typing_extensions >=4.0.0
-    - pandas >=1.3,<1.6.0dev0
+    - pandas >=2.0,<2.1.5dev0
     - cupy >=12.0.0
     # TODO: Pin to numba<0.58 until #14160 is resolved
     - numba >=0.57,<0.58
-    # TODO: Pin to numpy<1.25 until cudf requires pandas 2
-    - numpy >=1.21,<1.25
+    - numpy >=1.21
     - {{ pin_compatible('pyarrow', max_pin='x') }}
     - libcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -266,8 +266,7 @@ dependencies:
           - *cmake_ver
           - cython>=3.0.3
           - *ninja
-          # TODO: Pin to numpy<1.25 until cudf requires pandas 2
-          - &numpy numpy>=1.21,<1.25
+          - &numpy numpy>=1.21
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
           - pyarrow==14.0.1.*
@@ -502,7 +501,7 @@ dependencies:
         packages:
           - fsspec>=0.6.0
           - *numpy
-          - pandas>=1.3,<1.6.0dev0
+          - pandas>=2.0,<2.1.5dev0
   run_cudf:
     common:
       - output_types: [conda, requirements, pyproject]

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
@@ -454,12 +454,6 @@ def on_missing_reference(app, env, node, contnode):
                 _prefixed_domain_objects[f"{prefix}{name}"] = name
 
     reftarget = node.get("reftarget")
-    if reftarget == "cudf.core.index.GenericIndex":
-        # We don't exposed docs for `cudf.core.index.GenericIndex`
-        # hence we would want the docstring & mypy references to
-        # use `cudf.Index`
-        node["reftarget"] = "cudf.Index"
-        return contnode
     if "namespacecudf" in reftarget:
         node["reftarget"] = "cudf"
         return contnode

diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md
@@ -22,7 +22,7 @@ Finally we tie these pieces together to provide a more holistic view of the proj
 % class IndexedFrame
 % class SingleColumnFrame
 % class BaseIndex
-% class GenericIndex
+% class Index
 % class MultiIndex
 % class RangeIndex
 % class DataFrame
@@ -42,8 +42,8 @@ Finally we tie these pieces together to provide a more holistic view of the proj
 % BaseIndex <|-- MultiIndex
 % Frame <|-- MultiIndex
 %
-% BaseIndex <|-- GenericIndex
-% SingleColumnFrame <|-- GenericIndex
+% BaseIndex <|-- Index
+% SingleColumnFrame <|-- Index
 %
 % @enduml
 
@@ -89,31 +89,26 @@ While we've highlighted some exceptional cases of Indexes before, let's start wi
 In practice, `BaseIndex` does have concrete implementations of a small set of methods.
 However, currently many of these implementations are not applicable to all subclasses and will eventually be removed.
 
-Almost all indexes are subclasses of `GenericIndex`, a single-columned index with the class hierarchy:
+Almost all indexes are subclasses of `Index`, a single-columned index with the class hierarchy:
 ```python
-class GenericIndex(SingleColumnFrame, BaseIndex)
+class Index(SingleColumnFrame, BaseIndex)
 ```
 Integer, float, or string indexes are all composed of a single column of data.
-Most `GenericIndex` methods are inherited from `Frame`, saving us the trouble of rewriting them.
+Most `Index` methods are inherited from `Frame`, saving us the trouble of rewriting them.
 
 We now consider the three main exceptions to this model:
 
 - A `RangeIndex` is not backed by a column of data, so it inherits directly from `BaseIndex` alone.
   Wherever possible, its methods have special implementations designed to avoid materializing columns.
-  Where such an implementation is infeasible, we fall back to converting it to an `Int64Index` first instead.
+  Where such an implementation is infeasible, we fall back to converting it to an `Index` of `int64`
+  dtype first instead.
 - A `MultiIndex` is backed by _multiple_ columns of data.
   Therefore, its inheritance hierarchy looks like `class MultiIndex(Frame, BaseIndex)`.
   Some of its more `Frame`-like methods may be inherited,
   but many others must be reimplemented since in many cases a `MultiIndex` is not expected to behave like a `Frame`.
-- Just like in pandas, `Index` itself can never be instantiated.
-  `pandas.Index` is the parent class for indexes,
-  but its constructor returns an appropriate subclass depending on the input data type and shape.
-  Unfortunately, mimicking this behavior requires overriding `__new__`,
-  which in turn makes shared initialization across inheritance trees much more cumbersome to manage.
-  To enable sharing constructor logic across different index classes,
-  we instead define `BaseIndex` as the parent class of all indexes.
+- To enable sharing constructor logic across different index classes,
+  we define `BaseIndex` as the parent class of all indexes.
   `Index` inherits from `BaseIndex`, but it masquerades as a `BaseIndex` to match pandas.
-  This class should contain no implementations since it is simply a factory for other indexes.
 
 
 ## The Column layer

diff --git a/docs/cudf/source/user_guide/api_docs/dataframe.rst b/docs/cudf/source/user_guide/api_docs/dataframe.rst
@@ -105,13 +105,14 @@ Function application, GroupBy & window
 .. autosummary::
    :toctree: api/
 
+   DataFrame.agg
    DataFrame.apply
    DataFrame.applymap
    DataFrame.apply_chunks
    DataFrame.apply_rows
-   DataFrame.pipe
-   DataFrame.agg
    DataFrame.groupby
+   DataFrame.map
+   DataFrame.pipe
    DataFrame.rolling
 
 .. _api.dataframe.stats:
@@ -232,7 +233,6 @@ Combining / comparing / joining / merging
 .. autosummary::
    :toctree: api/
 
-   DataFrame.append
    DataFrame.assign
    DataFrame.join
    DataFrame.merge

diff --git a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst
@@ -42,7 +42,6 @@ Computations / descriptive stats
    :toctree: api/
 
    GroupBy.bfill
-   GroupBy.backfill
    GroupBy.count
    GroupBy.cumcount
    GroupBy.cummax
@@ -63,7 +62,6 @@ Computations / descriptive stats
    GroupBy.ngroup
    GroupBy.nth
    GroupBy.nunique
-   GroupBy.pad
    GroupBy.prod
    GroupBy.shift
    GroupBy.size
@@ -82,7 +80,6 @@ application to columns of a specific data type.
 .. autosummary::
    :toctree: api/
 
-   DataFrameGroupBy.backfill
    DataFrameGroupBy.bfill
    DataFrameGroupBy.count
    DataFrameGroupBy.cumcount
@@ -96,7 +93,6 @@ application to columns of a specific data type.
    DataFrameGroupBy.idxmax
    DataFrameGroupBy.idxmin
    DataFrameGroupBy.nunique
-   DataFrameGroupBy.pad
    DataFrameGroupBy.quantile
    DataFrameGroupBy.shift
    DataFrameGroupBy.size

diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst
@@ -25,7 +25,6 @@ Properties
    Index.empty
    Index.has_duplicates
    Index.hasnans
-   Index.is_monotonic
    Index.is_monotonic_increasing
    Index.is_monotonic_decreasing
    Index.is_unique
@@ -42,6 +41,7 @@ Modifying and computations
 .. autosummary::
    :toctree: api/
 
+   Index.all
    Index.any
    Index.copy
    Index.drop_duplicates
@@ -61,6 +61,7 @@ Modifying and computations
    Index.where
    Index.take
    Index.unique
+   Index.nunique
 
 Compatibility with MultiIndex
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -77,7 +78,9 @@ Missing values
    Index.fillna
    Index.dropna
    Index.isna
+   Index.isnull
    Index.notna
+   Index.notnull
 
 Memory usage
 ~~~~~~~~~~~~
@@ -143,6 +146,7 @@ Selecting
 .. autosummary::
    :toctree: api/
 
+   Index.get_indexer
    Index.get_level_values
    Index.get_loc
    Index.get_slice_bound
@@ -168,9 +172,6 @@ Numeric Index
    RangeIndex.step
    RangeIndex.to_numpy
    RangeIndex.to_arrow
-   Int64Index
-   UInt64Index
-   Float64Index
 
 .. _api.categoricalindex:
 
@@ -212,6 +213,7 @@ IntervalIndex components
 
    IntervalIndex.from_breaks
    IntervalIndex.values
+   IntervalIndex.get_indexer
    IntervalIndex.get_loc
 
 .. _api.multiindex:
@@ -258,6 +260,7 @@ MultiIndex selecting
 .. autosummary::
    :toctree: api/
 
+   MultiIndex.get_indexer
    MultiIndex.get_loc
    MultiIndex.get_level_values
 

diff --git a/docs/cudf/source/user_guide/api_docs/series.rst b/docs/cudf/source/user_guide/api_docs/series.rst
@@ -158,7 +158,6 @@ Computations / descriptive stats
    Series.unique
    Series.nunique
    Series.is_unique
-   Series.is_monotonic
    Series.is_monotonic_increasing
    Series.is_monotonic_decreasing
    Series.value_counts
@@ -226,7 +225,6 @@ Combining / comparing / joining / merging
 .. autosummary::
    :toctree: api/
 
-   Series.append
    Series.update
 
 Time Series-related

diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md
@@ -158,6 +158,27 @@ module, which allow you to compare values up to a desired precision.
 Unlike Pandas, cuDF does not support duplicate column names.
 It is best to use unique strings for column names.
 
+## Writing a DataFrame to Parquet with non-string column names
+
+When there is a DataFrame with non-string column names, pandas casts each
+column name to `str` before writing to a Parquet file. `cudf` raises an
+error by default if this is attempted. However, to achieve behavior similar
+to pandas, you can enable the `mode.pandas_compatible` option, which makes
+`cudf` cast the column names to `str` just like pandas.
+
+```python
+>>> import cudf
+>>> df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]})
+>>> df.to_parquet("df.parquet")
+
+Traceback (most recent call last):
+ValueError: Writing a Parquet file requires string column names
+>>> cudf.set_option("mode.pandas_compatible", True)
+>>> df.to_parquet("df.parquet")
+
+UserWarning: The DataFrame has column names of non-string type. They will be converted to strings on write.
+```
+
 ## No true `"object"` data type
 
 In Pandas and NumPy, the `"object"` data type is used for

diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py
@@ -40,8 +40,8 @@
 In addition to the above fixtures, we also provide the following more
 specialized fixtures:
     - rangeindex: Since RangeIndex always holds int64 data we cannot conflate
-      it with index_dtype_int64 (a true Int64Index), and it cannot hold nulls.
-      As a result, it is provided as a separate fixture.
+      it with index_dtype_int64 (a true Index with int64 dtype), and it
+      cannot hold nulls. As a result, it is provided as a separate fixture.
 """
 
 import os

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 # _setup_numba _must be called before numba.cuda is imported, because
 # it sets the numba config variable responsible for enabling
@@ -41,22 +41,10 @@
     BaseIndex,
     CategoricalIndex,
     DatetimeIndex,
-    Float32Index,
-    Float64Index,
-    GenericIndex,
     Index,
-    Int8Index,
-    Int16Index,
-    Int32Index,
-    Int64Index,
     IntervalIndex,
     RangeIndex,
-    StringIndex,
     TimedeltaIndex,
-    UInt8Index,
-    UInt16Index,
-    UInt32Index,
-    UInt64Index,
     interval_range,
 )
 from cudf.core.missing import NA, NaT
@@ -109,15 +97,8 @@
     "DatetimeIndex",
     "Decimal32Dtype",
     "Decimal64Dtype",
-    "Float32Index",
-    "Float64Index",
-    "GenericIndex",
     "Grouper",
     "Index",
-    "Int16Index",
-    "Int32Index",
-    "Int64Index",
-    "Int8Index",
     "IntervalDtype",
     "IntervalIndex",
     "ListDtype",
@@ -127,13 +108,8 @@
     "RangeIndex",
     "Scalar",
     "Series",
-    "StringIndex",
     "StructDtype",
     "TimedeltaIndex",
-    "UInt16Index",
-    "UInt32Index",
-    "UInt64Index",
-    "UInt8Index",
     "api",
     "concat",
     "crosstab",