Skip to content

Commit

Permalink
Restructure pylibcudf/arrow interop facilities (#15325)
Browse files Browse the repository at this point in the history
Resolves #15310. Contributes to #15193

As an added benefit, this PR also adds pylibcudf.Column <--> pyarrow.Array interconversion.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #15325
  • Loading branch information
vyasr committed Mar 18, 2024
1 parent fa6130f commit c9c95f9
Show file tree
Hide file tree
Showing 20 changed files with 290 additions and 262 deletions.
5 changes: 3 additions & 2 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -65,7 +65,8 @@ rapids_cython_create_modules(

target_link_libraries(strings_udf PUBLIC cudf_strings_udf)

link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
set(targets_using_arrow_headers interop avro csv orc json parquet)
link_to_pyarrow_headers("${targets_using_arrow_headers}")

add_subdirectory(cpp)
add_subdirectory(io)
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,14 @@ from cudf._lib.types cimport (
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf._lib.types import dtype_from_pylibcudf_column

# TODO: We currently need the pylibcudf cimport below for "casting" pylibcudf
# columns of type EMPTY in from_pylibcudf by instead creating an all-null INT8
# column of the same size. We will be able to remove the cimport once column
# factories are exposed to pylibcudf.

cimport cudf._lib.cpp.copying as cpp_copying
cimport cudf._lib.cpp.types as libcudf_types
cimport cudf._lib.cpp.unary as libcudf_unary
from cudf._lib cimport pylibcudf
from cudf._lib.cpp.column.column cimport column, column_contents
from cudf._lib.cpp.column.column_factories cimport (
make_column_from_scalar as cpp_make_column_from_scalar,
Expand Down Expand Up @@ -618,6 +623,24 @@ cdef class Column:
pylibcudf.Column
A new pylibcudf.Column referencing the same data.
"""
cdef libcudf_types.data_type new_dtype
if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS:
col = pylibcudf.unary.cast(
col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS)
)
elif col.type().id() == pylibcudf.TypeId.EMPTY:
new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8)
# TODO: This function call is what requires cimporting pylibcudf.
# We can remove the cimport once we can directly do
# pylibcudf.column_factories.make_numeric_column or equivalent.
col = pylibcudf.Column.from_libcudf(
move(
make_numeric_column(
new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL
)
)
)

dtype = dtype_from_pylibcudf_column(col)

return cudf.core.column.build_column(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def date_range(DeviceScalar start, size_type n, offset):
+ offset.kwds.get("months", 0)
)

cdef const scalar* c_start = start.c_value.get()
cdef const scalar* c_start = start.get_raw_ptr()
with nogil:
c_result = move(calendrical_month_sequence(
n,
Expand Down
77 changes: 28 additions & 49 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cpython cimport pycapsule
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table

from cudf._lib import pylibcudf

from cudf._lib.cpp.interop cimport (
DLManagedTensor,
column_metadata,
from_arrow as cpp_from_arrow,
from_dlpack as cpp_from_dlpack,
to_arrow as cpp_to_arrow,
to_dlpack as cpp_to_dlpack,
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
from cudf._lib.utils cimport (
columns_from_pylibcudf_table,
columns_from_unique_ptr,
table_view_from_columns,
)

from cudf.core.buffer import acquire_spill_lock
from cudf.core.dtypes import ListDtype, StructDtype
Expand Down Expand Up @@ -83,21 +84,19 @@ cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
dlpack_tensor.deleter(dlpack_tensor)


cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *:
def gather_metadata(object cols_dtypes):
"""
Generates a column_metadata vector for each column.
Generates a ColumnMetadata vector for each column.
Parameters
----------
cols_dtypes : iterable
An iterable of ``(column_name, dtype)`` pairs.
"""
cdef vector[column_metadata] cpp_metadata
cpp_metadata.reserve(len(cols_dtypes))

cpp_metadata = []
if cols_dtypes is not None:
for idx, (col_name, col_dtype) in enumerate(cols_dtypes):
cpp_metadata.push_back(column_metadata(col_name.encode()))
cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name))
if isinstance(col_dtype, (ListDtype, StructDtype)):
_set_col_children_metadata(col_dtype, cpp_metadata[idx])
else:
Expand All @@ -108,31 +107,22 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *:
return cpp_metadata


cdef _set_col_children_metadata(dtype,
column_metadata& col_meta):

cdef column_metadata element_metadata

def _set_col_children_metadata(dtype, col_meta):
if isinstance(dtype, StructDtype):
for name, value in dtype.fields.items():
element_metadata = column_metadata(name.encode())
_set_col_children_metadata(
value, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
element_metadata = pylibcudf.interop.ColumnMetadata(name)
_set_col_children_metadata(value, element_metadata)
col_meta.children_meta.append(element_metadata)
elif isinstance(dtype, ListDtype):
col_meta.children_meta.reserve(2)
# Offsets - child 0
col_meta.children_meta.push_back(column_metadata())
col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())

# Element column - child 1
element_metadata = column_metadata()
_set_col_children_metadata(
dtype.element_type, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
element_metadata = pylibcudf.interop.ColumnMetadata()
_set_col_children_metadata(dtype.element_type, element_metadata)
col_meta.children_meta.append(element_metadata)
else:
col_meta.children_meta.push_back(column_metadata())
col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())


@acquire_spill_lock()
Expand All @@ -149,16 +139,11 @@ def to_arrow(list source_columns, object column_dtypes):
-------
pyarrow table
"""
cdef vector[column_metadata] cpp_metadata = gather_metadata(column_dtypes)
cdef table_view input_table_view = table_view_from_columns(source_columns)

cdef shared_ptr[CTable] cpp_arrow_table
with nogil:
cpp_arrow_table = cpp_to_arrow(
input_table_view, cpp_metadata
)

return pyarrow_wrap_table(cpp_arrow_table)
cpp_metadata = gather_metadata(column_dtypes)
return pylibcudf.interop.to_arrow(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
cpp_metadata,
)


@acquire_spill_lock()
Expand All @@ -173,12 +158,6 @@ def from_arrow(object input_table):
-------
A list of columns to construct Frame object
"""
cdef shared_ptr[CTable] cpp_arrow_table = (
pyarrow_unwrap_table(input_table)
return columns_from_pylibcudf_table(
pylibcudf.interop.from_arrow(input_table)
)
cdef unique_ptr[table] c_result

with nogil:
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

return columns_from_unique_ptr(move(c_result))
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -22,4 +22,3 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
link_to_pyarrow_headers(pylibcudf_interop)
2 changes: 0 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ from . cimport (
copying,
filling,
groupby,
interop,
join,
lists,
merge,
Expand Down Expand Up @@ -41,7 +40,6 @@ __all__ = [
"filling",
"gpumemoryview",
"groupby",
"interop",
"join",
"lists",
"merge",
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ cdef class Column:
"""
cdef DataType dtype = DataType.from_libcudf(libcudf_col.get().type())
cdef size_type size = libcudf_col.get().size()

cdef size_type null_count = libcudf_col.get().null_count()

cdef column_contents contents = move(libcudf_col.get().release())
Expand Down
9 changes: 0 additions & 9 deletions python/cudf/cudf/_lib/pylibcudf/interop.pxd

This file was deleted.

0 comments on commit c9c95f9

Please sign in to comment.