apache · assignUser · Jan 17, 2023 · Jan 5, 2023 · Jan 5, 2023 · Jan 5, 2023
diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h
@@ -69,9 +69,11 @@ class BaseListArray : public Array {
   const TypeClass* list_type() const { return list_type_; }
 
   /// \brief Return array object containing the list's values
+  ///
+  /// Note that this array does not account for any slice offset or length.
   std::shared_ptr<Array> values() const { return values_; }
 
-  /// Note that this buffer does not account for any slice offset
+  /// Note that this buffer does not account for any slice offset or length.
   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
 
   std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }

diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -738,11 +738,15 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   ArrayVector value_arrays;
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+    // values() ignores this chunk's slice offset and length, so take only the range
+    // its offsets reference. Flatten() is unsuitable: it drops values of null entries.
+    std::shared_ptr<Array> flattened_values = arr.values()->Slice(
+        arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0));
     if (arr.value_type()->id() == Type::EXTENSION) {
-      const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr.values());
+      const auto& arr_ext = checked_cast<const ExtensionArray&>(*flattened_values);
       value_arrays.emplace_back(arr_ext.storage());
     } else {
-      value_arrays.emplace_back(arr.values());
+      value_arrays.emplace_back(flattened_values);
     }
   }
 
@@ -772,8 +776,12 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
         Py_INCREF(Py_None);
         *out_values = Py_None;
       } else {
-        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset));
-        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset));
+        // Need to subtract value_offset(0) since the original chunk might be a slice
+        // into another array.
+        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset -
+                                           arr.value_offset(0)));
+        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset -
+                                         arr.value_offset(0)));
         OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
 
         if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
@@ -791,7 +799,7 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
     }
     RETURN_IF_PYERROR();
 
-    chunk_offset += arr.values()->length();
+    chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0);
   }
 
   return Status::OK();
@@ -1083,7 +1091,8 @@ struct ObjectWriterVisitor {
       OwnedRef keywords(PyDict_New());
       PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC);
       OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace"));
-      OwnedRef datetime_utc(PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
+      OwnedRef datetime_utc(
+          PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
       // second step: adjust the datetime to tzinfo timezone (astimezone method)
       *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj());
 

diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
@@ -4513,3 +4513,27 @@ def test_does_not_mutate_timedelta_nested():
     df = table.to_pandas()
 
     assert df["timedelta_2"][0].to_pytimedelta() == timedelta_2[0]
+
+
+def test_list_no_duplicate_base():
+    # ARROW-18400: the base must hold only the values the chunks reference
+    arr = pa.array([[1, 2], [3, 4, 5], [6], [7, 8]])
+    chunked_arr = pa.chunked_array([arr.slice(0, 2), arr.slice(2, 1)])
+
+    np_arr = chunked_arr.to_numpy()
+
+    expected = np.array([[1, 2], [3, 4, 5], [6]], dtype="object")
+    for left, right in zip(np_arr, expected):
+        npt.assert_array_equal(left, right)
+
+    expected_base = np.array([[1, 2, 3, 4, 5, 6]], dtype="object")
+    npt.assert_array_equal(np_arr[0].base, expected_base)
+
+    np_arr_sliced = chunked_arr.slice(1, 3).to_numpy()
+
+    expected = np.array([[3, 4, 5], [6]], dtype="object")
+    for left, right in zip(np_arr_sliced, expected):
+        npt.assert_array_equal(left, right)
+
+    expected_base = np.array([[3, 4, 5, 6]], dtype="object")
+    npt.assert_array_equal(np_arr_sliced[0].base, expected_base)