Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restructure pylibcudf/arrow interop facilities #15325

Merged
merged 14 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 3 additions & 2 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -65,7 +65,8 @@ rapids_cython_create_modules(

target_link_libraries(strings_udf PUBLIC cudf_strings_udf)

link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
set(targets_using_arrow_headers interop avro csv orc json parquet)
link_to_pyarrow_headers("${targets_using_arrow_headers}")

add_subdirectory(cpp)
add_subdirectory(io)
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,14 @@ from cudf._lib.types cimport (
from cudf._lib.null_mask import bitmask_allocation_size_bytes
from cudf._lib.types import dtype_from_pylibcudf_column

# TODO: We currently need the pylibcudf cimport below for "casting" EMPTY-typed
# pylibcudf columns in from_pylibcudf, which is done by instead creating an
# all-null INT8 column of the same size. We will be able to remove the cimport
# once column factories are exposed to pylibcudf.

cimport cudf._lib.cpp.copying as cpp_copying
cimport cudf._lib.cpp.types as libcudf_types
cimport cudf._lib.cpp.unary as libcudf_unary
from cudf._lib cimport pylibcudf
from cudf._lib.cpp.column.column cimport column, column_contents
from cudf._lib.cpp.column.column_factories cimport (
make_column_from_scalar as cpp_make_column_from_scalar,
Expand Down Expand Up @@ -618,6 +623,24 @@ cdef class Column:
pylibcudf.Column
A new pylibcudf.Column referencing the same data.
"""
cdef libcudf_types.data_type new_dtype
if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS:
col = pylibcudf.unary.cast(
col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS)
)
elif col.type().id() == pylibcudf.TypeId.EMPTY:
new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8)
# TODO: This function call is what requires cimporting pylibcudf.
# We can remove the cimport once we can directly do
# pylibcudf.column_factories.make_numeric_column or equivalent.
col = pylibcudf.Column.from_libcudf(
move(
make_numeric_column(
new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL
)
)
)

dtype = dtype_from_pylibcudf_column(col)

return cudf.core.column.build_column(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def date_range(DeviceScalar start, size_type n, offset):
+ offset.kwds.get("months", 0)
)

cdef const scalar* c_start = start.c_value.get()
cdef const scalar* c_start = start.get_raw_ptr()
with nogil:
c_result = move(calendrical_month_sequence(
n,
Expand Down
77 changes: 28 additions & 49 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cpython cimport pycapsule
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table

from cudf._lib import pylibcudf

from cudf._lib.cpp.interop cimport (
DLManagedTensor,
column_metadata,
from_arrow as cpp_from_arrow,
from_dlpack as cpp_from_dlpack,
to_arrow as cpp_to_arrow,
to_dlpack as cpp_to_dlpack,
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
from cudf._lib.utils cimport (
columns_from_pylibcudf_table,
columns_from_unique_ptr,
table_view_from_columns,
)

from cudf.core.buffer import acquire_spill_lock
from cudf.core.dtypes import ListDtype, StructDtype
Expand Down Expand Up @@ -83,21 +84,19 @@ cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
dlpack_tensor.deleter(dlpack_tensor)


cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *:
def gather_metadata(object cols_dtypes):
"""
Generates a column_metadata vector for each column.
Generates a list of ColumnMetadata objects, one for each column.

Parameters
----------
cols_dtypes : iterable
An iterable of ``(column_name, dtype)`` pairs.
"""
cdef vector[column_metadata] cpp_metadata
cpp_metadata.reserve(len(cols_dtypes))

cpp_metadata = []
if cols_dtypes is not None:
for idx, (col_name, col_dtype) in enumerate(cols_dtypes):
cpp_metadata.push_back(column_metadata(col_name.encode()))
cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name))
if isinstance(col_dtype, (ListDtype, StructDtype)):
_set_col_children_metadata(col_dtype, cpp_metadata[idx])
else:
Expand All @@ -108,31 +107,22 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *:
return cpp_metadata


cdef _set_col_children_metadata(dtype,
column_metadata& col_meta):

cdef column_metadata element_metadata

def _set_col_children_metadata(dtype, col_meta):
if isinstance(dtype, StructDtype):
for name, value in dtype.fields.items():
element_metadata = column_metadata(name.encode())
_set_col_children_metadata(
value, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
element_metadata = pylibcudf.interop.ColumnMetadata(name)
_set_col_children_metadata(value, element_metadata)
col_meta.children_meta.append(element_metadata)
elif isinstance(dtype, ListDtype):
col_meta.children_meta.reserve(2)
# Offsets - child 0
col_meta.children_meta.push_back(column_metadata())
col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())

# Element column - child 1
element_metadata = column_metadata()
_set_col_children_metadata(
dtype.element_type, element_metadata
)
col_meta.children_meta.push_back(element_metadata)
element_metadata = pylibcudf.interop.ColumnMetadata()
_set_col_children_metadata(dtype.element_type, element_metadata)
col_meta.children_meta.append(element_metadata)
else:
col_meta.children_meta.push_back(column_metadata())
col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())


@acquire_spill_lock()
Expand All @@ -149,16 +139,11 @@ def to_arrow(list source_columns, object column_dtypes):
-------
pyarrow table
"""
cdef vector[column_metadata] cpp_metadata = gather_metadata(column_dtypes)
cdef table_view input_table_view = table_view_from_columns(source_columns)

cdef shared_ptr[CTable] cpp_arrow_table
with nogil:
cpp_arrow_table = cpp_to_arrow(
input_table_view, cpp_metadata
)

return pyarrow_wrap_table(cpp_arrow_table)
cpp_metadata = gather_metadata(column_dtypes)
return pylibcudf.interop.to_arrow(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
cpp_metadata,
)


@acquire_spill_lock()
Expand All @@ -173,12 +158,6 @@ def from_arrow(object input_table):
-------
A list of columns to construct Frame object
"""
cdef shared_ptr[CTable] cpp_arrow_table = (
pyarrow_unwrap_table(input_table)
return columns_from_pylibcudf_table(
pylibcudf.interop.from_arrow(input_table)
)
cdef unique_ptr[table] c_result

with nogil:
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

return columns_from_unique_ptr(move(c_result))
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -22,4 +22,3 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)
link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
link_to_pyarrow_headers(pylibcudf_interop)
2 changes: 0 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ from . cimport (
copying,
filling,
groupby,
interop,
join,
lists,
merge,
Expand Down Expand Up @@ -41,7 +40,6 @@ __all__ = [
"filling",
"gpumemoryview",
"groupby",
"interop",
"join",
"lists",
"merge",
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ cdef class Column:
"""
cdef DataType dtype = DataType.from_libcudf(libcudf_col.get().type())
cdef size_type size = libcudf_col.get().size()

cdef size_type null_count = libcudf_col.get().null_count()

cdef column_contents contents = move(libcudf_col.get().release())
Expand Down
9 changes: 0 additions & 9 deletions python/cudf/cudf/_lib/pylibcudf/interop.pxd

This file was deleted.