diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h
index 489a7a3a3ca4b..6fb3fd3c9187e 100644
--- a/cpp/src/arrow/array/array_nested.h
+++ b/cpp/src/arrow/array/array_nested.h
@@ -69,9 +69,11 @@ class BaseListArray : public Array {
   const TypeClass* list_type() const { return list_type_; }
 
   /// \brief Return array object containing the list's values
+  ///
+  /// Note that this buffer does not account for any slice offset or length.
   std::shared_ptr<Array> values() const { return values_; }
 
-  /// Note that this buffer does not account for any slice offset
+  /// Note that this buffer does not account for any slice offset or length.
   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
 
   std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index f58c151ea6715..2faf7d381a9be 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -738,11 +738,17 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   ArrayVector value_arrays;
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+    // values() does not account for offsets, so we need to slice into it.
+    // We can't use Flatten(), because it removes the values behind a null list
+    // value, and that makes the offsets into original list values and our
+    // flattened_values array different.
+    std::shared_ptr<Array> flattened_values = arr.values()->Slice(
+        arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0));
     if (arr.value_type()->id() == Type::EXTENSION) {
-      const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr.values());
+      const auto& arr_ext = checked_cast<const ExtensionArray&>(*flattened_values);
       value_arrays.emplace_back(arr_ext.storage());
     } else {
-      value_arrays.emplace_back(arr.values());
+      value_arrays.emplace_back(flattened_values);
     }
   }
 
@@ -772,8 +778,12 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
         Py_INCREF(Py_None);
         *out_values = Py_None;
       } else {
-        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset));
-        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset));
+        // Need to subtract value_offset(0) since the original chunk might be a slice
+        // into another array.
+        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset -
+                                           arr.value_offset(0)));
+        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset -
+                                         arr.value_offset(0)));
         OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
 
         if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
@@ -791,7 +801,7 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
       }
       RETURN_IF_PYERROR();
     }
-    chunk_offset += arr.values()->length();
+    chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0);
   }
 
   return Status::OK();
@@ -1083,7 +1093,8 @@ struct ObjectWriterVisitor {
       OwnedRef keywords(PyDict_New());
       PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC);
       OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace"));
-      OwnedRef datetime_utc(PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
+      OwnedRef datetime_utc(
+          PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
       // second step: adjust the datetime to tzinfo timezone (astimezone method)
       *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O",
                                 tzinfo.obj());
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 729a4122c0f2b..4d0ddf875474e 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2308,6 +2308,51 @@ def test_map_array_dictionary_encoded(self):
         actual = arr.to_pandas()
         tm.assert_series_equal(actual, expected, check_names=False)
 
+    def test_list_no_duplicate_base(self):
+        # ARROW-18400
+        arr = pa.array([[1, 2], [3, 4, 5], None, [6, None], [7, 8]])
+        chunked_arr = pa.chunked_array([arr.slice(0, 3), arr.slice(3, 1)])
+
+        np_arr = chunked_arr.to_numpy()
+
+        expected = np.array([[1., 2.], [3., 4., 5.], None,
+                             [6., np.NaN]], dtype="object")
+        for left, right in zip(np_arr, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
+        expected_base = np.array([[1., 2., 3., 4., 5., 6., np.NaN]])
+        npt.assert_array_equal(np_arr[0].base, expected_base)
+
+        np_arr_sliced = chunked_arr.slice(1, 3).to_numpy()
+
+        expected = np.array([[3, 4, 5], None, [6, np.NaN]], dtype="object")
+        for left, right in zip(np_arr_sliced, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
+        expected_base = np.array([[3., 4., 5., 6., np.NaN]])
+        npt.assert_array_equal(np_arr_sliced[0].base, expected_base)
+
+    def test_list_values_behind_null(self):
+        arr = pa.ListArray.from_arrays(
+            offsets=pa.array([0, 2, 4, 6]),
+            values=pa.array([1, 2, 99, 99, 3, None]),
+            mask=pa.array([False, True, False])
+        )
+        np_arr = arr.to_numpy(zero_copy_only=False)
+
+        expected = np.array([[1., 2.], None, [3., np.NaN]], dtype="object")
+        for left, right in zip(np_arr, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
 
 class TestConvertStructTypes:
     """