Skip to content

Commit

Permalink
BUG: Interchange object data buffer has the wrong dtype / from_dataframe incorrect (#55227)
Browse files Browse the repository at this point in the history

* use buffer dtype in interchange from_dataframe

* wip

* wip

* add failing test

* wip

* simplify

* remove unnecessary assertion

* don't double-extract bit width

* Update pandas/core/interchange/from_dataframe.py

Co-authored-by: Stijn de Gooijer <stijn@degooijer.io>

---------

Co-authored-by: Stijn de Gooijer <stijn@degooijer.io>
  • Loading branch information
MarcoGorelli and stinodego committed Nov 7, 2023
1 parent cbbb619 commit ed10a14
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 8 deletions.
16 changes: 8 additions & 8 deletions pandas/core/interchange/from_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,9 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:

assert buffers["offsets"], "String buffers must contain offsets"
# Retrieve the data buffer containing the UTF-8 code units
data_buff, protocol_data_dtype = buffers["data"]
data_buff, _ = buffers["data"]
# We're going to reinterpret the buffer as uint8, so make sure we can do it safely
assert protocol_data_dtype[1] == 8
assert protocol_data_dtype[2] in (
assert col.dtype[2] in (
ArrowCTypes.STRING,
ArrowCTypes.LARGE_STRING,
) # format_str == utf-8
Expand Down Expand Up @@ -377,15 +376,16 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
"""
buffers = col.get_buffers()

_, _, format_str, _ = col.dtype
dbuf, dtype = buffers["data"]
_, col_bit_width, format_str, _ = col.dtype
dbuf, _ = buffers["data"]
# Reinterpret the data buffer as signed integers of the column's bit width;
# the values count the time units elapsed since the Unix epoch (1970-01-01)

data = buffer_to_ndarray(
dbuf,
(
DtypeKind.UINT,
dtype[1],
getattr(ArrowCTypes, f"UINT{dtype[1]}"),
DtypeKind.INT,
col_bit_width,
getattr(ArrowCTypes, f"INT{col_bit_width}"),
Endianness.NATIVE,
),
offset=col.offset,
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
DtypeKind,
)
from pandas.core.interchange.from_dataframe import from_dataframe
from pandas.core.interchange.utils import ArrowCTypes


@pytest.fixture
Expand Down Expand Up @@ -340,3 +341,24 @@ def test_interchange_from_non_pandas_tz_aware(request):
dtype="datetime64[us, Asia/Kathmandu]",
)
tm.assert_frame_equal(expected, result)


def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
# https://github.com/pandas-dev/pandas/issues/54781
df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
interchange = df.__dataframe__()
column = interchange.get_column_by_name("a")
buffers = column.get_buffers()
buffers_data = buffers["data"]
buffer_dtype = buffers_data[1]
buffer_dtype = (
DtypeKind.UINT,
8,
ArrowCTypes.UINT8,
buffer_dtype[3],
)
buffers["data"] = (buffers_data[0], buffer_dtype)
column.get_buffers = lambda: buffers
interchange.get_column_by_name = lambda _: column
monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
pd.api.interchange.from_dataframe(df)

0 comments on commit ed10a14

Please sign in to comment.