From 7319250597b0f4e3b5f859eb073264ce3c72a1bd Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 17 Jan 2023 18:53:02 +0100 Subject: [PATCH] GH-14875: [C++] C Data Interface: check imported buffer for non-null (#14814) The C data interface may expose null data pointers for zero-sized buffers. Make sure that all buffer pointers remain non-null internally. Followup to GH-14805 * Closes: #14875 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/array/validate.cc | 11 ++- cpp/src/arrow/c/bridge.cc | 40 +++++++--- cpp/src/arrow/c/bridge_test.cc | 132 +++++++++++++++++++++++++++++++- 3 files changed, 167 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 56470ac74b0c7..c1a37c4234e6f 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -459,14 +459,17 @@ struct ValidateArrayImpl { if (buffer == nullptr) { continue; } - int64_t min_buffer_size = -1; + int64_t min_buffer_size = 0; switch (spec.kind) { case DataTypeLayout::BITMAP: - min_buffer_size = bit_util::BytesForBits(length_plus_offset); + // If length == 0, buffer size can be 0 regardless of offset + if (data.length > 0) { + min_buffer_size = bit_util::BytesForBits(length_plus_offset); + } break; case DataTypeLayout::FIXED_WIDTH: - if (MultiplyWithOverflow(length_plus_offset, spec.byte_width, - &min_buffer_size)) { + if (data.length > 0 && MultiplyWithOverflow(length_plus_offset, spec.byte_width, + &min_buffer_size)) { return Status::Invalid("Array of type ", type.ToString(), " has impossibly large length and offset"); } diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 7579a1c48968f..d6ea60f520e71 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -31,6 +31,7 @@ #include "arrow/c/util_internal.h" #include "arrow/extension_type.h" #include "arrow/memory_pool.h" +#include "arrow/memory_pool_internal.h" // for kZeroSizeArea #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/stl_allocator.h" @@ -60,6 +61,8 @@ using internal::SchemaExportTraits; using internal::ToChars; +using memory_pool::internal::kZeroSizeArea; + namespace { Status ExportingNotImplemented(const DataType& type) { @@ -1265,7 +1268,8 @@ class ImportedBuffer : public Buffer { }; struct ArrayImporter { - explicit ArrayImporter(const std::shared_ptr& type) : type_(type) {} + explicit ArrayImporter(const std::shared_ptr& type) + : type_(type), zero_size_buffer_(std::make_shared(kZeroSizeArea, 0)) {} Status Import(struct ArrowArray* src) { if (ArrowArrayIsReleased(src)) { @@ -1529,7 +1533,7 @@ struct ArrayImporter { } Status ImportNullBitmap(int32_t buffer_id = 0) { - RETURN_NOT_OK(ImportBitsBuffer(buffer_id)); + RETURN_NOT_OK(ImportBitsBuffer(buffer_id, /*is_null_bitmap=*/true)); if (data_->null_count > 0 && data_->buffers[buffer_id] == nullptr) { return Status::Invalid( "ArrowArray struct has null bitmap buffer but non-zero null_count ", @@ -1538,15 +1542,20 @@ struct ArrayImporter { return Status::OK(); } - Status ImportBitsBuffer(int32_t buffer_id) { + Status ImportBitsBuffer(int32_t buffer_id, bool is_null_bitmap = false) { // Compute visible size of buffer - int64_t buffer_size = bit_util::BytesForBits(c_struct_->length + c_struct_->offset); - return ImportBuffer(buffer_id, buffer_size); + int64_t buffer_size = + (c_struct_->length > 0) + ? bit_util::BytesForBits(c_struct_->length + c_struct_->offset) + : 0; + return ImportBuffer(buffer_id, buffer_size, is_null_bitmap); } Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) { // Compute visible size of buffer - int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset); + int64_t buffer_size = (c_struct_->length > 0) + ? byte_width * (c_struct_->length + c_struct_->offset) + : 0; return ImportBuffer(buffer_id, buffer_size); } @@ -1563,17 +1572,27 @@ struct ArrayImporter { int64_t byte_width = 1) { auto offsets = data_->GetValues(offsets_buffer_id); // Compute visible size of buffer - int64_t buffer_size = byte_width * offsets[c_struct_->length]; + int64_t buffer_size = + (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0; return ImportBuffer(buffer_id, buffer_size); } - Status ImportBuffer(int32_t buffer_id, int64_t buffer_size) { + Status ImportBuffer(int32_t buffer_id, int64_t buffer_size, + bool is_null_bitmap = false) { std::shared_ptr* out = &data_->buffers[buffer_id]; auto data = reinterpret_cast(c_struct_->buffers[buffer_id]); if (data != nullptr) { *out = std::make_shared(data, buffer_size, import_); - } else { + } else if (is_null_bitmap) { out->reset(); + } else { + // Ensure that imported buffers are never null (except for the null bitmap) + if (buffer_size != 0) { + return Status::Invalid( + "ArrowArrayStruct contains null data pointer " + "for a buffer with non-zero computed size"); + } + *out = zero_size_buffer_; } return Status::OK(); } @@ -1585,6 +1604,9 @@ struct ArrayImporter { std::shared_ptr import_; std::shared_ptr data_; std::vector child_importers_; + + // For imported null buffer pointers + std::shared_ptr zero_size_buffer_; }; } // namespace diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 813d4295331cb..90fe9d59657ec 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -124,6 +124,24 @@ class ReleaseCallback { using SchemaReleaseCallback = ReleaseCallback; using ArrayReleaseCallback = ReleaseCallback; +// Whether c_struct or any of its descendents have non-null data pointers. +bool HasData(const ArrowArray* c_struct) { + for (int64_t i = 0; i < c_struct->n_buffers; ++i) { + if (c_struct->buffers[i] != nullptr) { + return true; + } + } + if (c_struct->dictionary && HasData(c_struct->dictionary)) { + return true; + } + for (int64_t i = 0; i < c_struct->n_children; ++i) { + if (HasData(c_struct->children[i])) { + return true; + } + } + return false; +} + static const std::vector kMetadataKeys1{"key1", "key2"}; static const std::vector kMetadataValues1{"", "bar"}; @@ -1659,6 +1677,8 @@ static const uint8_t bits_buffer1[] = {0xed, 0xed}; static const void* buffers_no_nulls_no_data[1] = {nullptr}; static const void* buffers_nulls_no_data1[1] = {bits_buffer1}; +static const void* all_buffers_omitted[3] = {nullptr, nullptr, nullptr}; + static const uint8_t data_buffer1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; static const uint8_t data_buffer2[] = "abcdefghijklmnopqrstuvwxyz"; @@ -1724,10 +1744,13 @@ static const uint8_t string_data_buffer1[] = "foobarquuxxyzzy"; static const int32_t string_offsets_buffer1[] = {0, 3, 3, 6, 10, 15}; static const void* string_buffers_no_nulls1[3] = {nullptr, string_offsets_buffer1, string_data_buffer1}; +static const void* string_buffers_omitted[3] = {nullptr, string_offsets_buffer1, nullptr}; static const int64_t large_string_offsets_buffer1[] = {0, 3, 3, 6, 10}; static const void* large_string_buffers_no_nulls1[3] = { nullptr, large_string_offsets_buffer1, string_data_buffer1}; +static const void* large_string_buffers_omitted[3] = { + nullptr, large_string_offsets_buffer1, nullptr}; static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1}; @@ -1901,9 +1924,9 @@ class TestArrayImport : public ::testing::Test { Reset(); // for further tests ASSERT_OK(array->ValidateFull()); - // Special case: Null array doesn't have any data, so it needn't - // keep the ArrowArray struct alive. - if (type->id() != Type::NA) { + // Special case: arrays without data (such as Null arrays) needn't keep + // the ArrowArray struct alive. + if (HasData(&c_struct_)) { cb.AssertNotCalled(); } AssertArraysEqual(*expected, *array, true); @@ -1990,6 +2013,10 @@ TEST_F(TestArrayImport, Primitive) { CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); FillPrimitive(3, 1, 0, primitive_buffers_nulls1_8); CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(int32(), "[]")); } TEST_F(TestArrayImport, Temporal) { @@ -2070,6 +2097,12 @@ TEST_F(TestArrayImport, PrimitiveWithOffset) { FillPrimitive(4, 0, 7, primitive_buffers_no_nulls1_8); CheckImport(ArrayFromJSON(boolean(), "[false, false, true, false]")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 2, all_buffers_omitted); + CheckImport(ArrayFromJSON(int32(), "[]")); + FillPrimitive(0, 0, 3, all_buffers_omitted); + CheckImport(ArrayFromJSON(boolean(), "[]")); } TEST_F(TestArrayImport, NullWithOffset) { @@ -2092,10 +2125,48 @@ TEST_F(TestArrayImport, String) { FillStringLike(4, 0, 0, large_string_buffers_no_nulls1); CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])")); + // Empty array with null data pointers + FillStringLike(0, 0, 0, string_buffers_omitted); + CheckImport(ArrayFromJSON(utf8(), "[]")); + FillStringLike(0, 0, 0, large_string_buffers_omitted); + CheckImport(ArrayFromJSON(large_binary(), "[]")); +} + +TEST_F(TestArrayImport, StringWithOffset) { + FillStringLike(3, 0, 1, string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(utf8(), R"(["", "bar", "quux"])")); + FillStringLike(2, 0, 2, large_string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_utf8(), R"(["bar", "quux"])")); + + // Empty array with null data pointers + FillStringLike(0, 0, 1, string_buffers_omitted); + CheckImport(ArrayFromJSON(utf8(), "[]")); +} + +TEST_F(TestArrayImport, FixedSizeBinary) { FillPrimitive(2, 0, 0, primitive_buffers_no_nulls2); CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["abc", "def"])")); FillPrimitive(2, 0, 0, primitive_buffers_no_nulls3); CheckImport(ArrayFromJSON(decimal(15, 4), R"(["12345.6789", "98765.4321"])")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]")); + FillPrimitive(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(decimal(15, 4), "[]")); +} + +TEST_F(TestArrayImport, FixedSizeBinaryWithOffset) { + FillPrimitive(1, 0, 1, primitive_buffers_no_nulls2); + CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["def"])")); + FillPrimitive(1, 0, 1, primitive_buffers_no_nulls3); + CheckImport(ArrayFromJSON(decimal(15, 4), R"(["98765.4321"])")); + + // Empty array with null data pointers + FillPrimitive(0, 0, 1, all_buffers_omitted); + CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]")); + FillPrimitive(0, 0, 1, all_buffers_omitted); + CheckImport(ArrayFromJSON(decimal(15, 4), "[]")); } TEST_F(TestArrayImport, List) { @@ -2117,6 +2188,11 @@ TEST_F(TestArrayImport, List) { FillFixedSizeListLike(3, 0, 0, buffers_no_nulls_no_data); CheckImport( ArrayFromJSON(fixed_size_list(int8(), 3), "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]")); + + // Empty child array with null data pointers + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillFixedSizeListLike(0, 0, 0, buffers_no_nulls_no_data); + CheckImport(ArrayFromJSON(fixed_size_list(int8(), 3), "[]")); } TEST_F(TestArrayImport, NestedList) { @@ -2205,6 +2281,15 @@ TEST_F(TestArrayImport, SparseUnion) { FillUnionLike(UnionMode::SPARSE, 4, 0, 0, 2, sparse_union_buffers1_legacy, /*legacy=*/true); CheckImport(expected); + + // Empty array with null data pointers + expected = ArrayFromJSON(type, "[]"); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::SPARSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::SPARSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false); } TEST_F(TestArrayImport, DenseUnion) { @@ -2223,6 +2308,15 @@ TEST_F(TestArrayImport, DenseUnion) { FillUnionLike(UnionMode::DENSE, 5, 0, 0, 2, dense_union_buffers1_legacy, /*legacy=*/true); CheckImport(expected); + + // Empty array with null data pointers + expected = ArrayFromJSON(type, "[]"); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::DENSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false); + FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted); + FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted); + FillUnionLike(UnionMode::DENSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false); } TEST_F(TestArrayImport, StructWithOffset) { @@ -2359,6 +2453,29 @@ TEST_F(TestArrayImport, PrimitiveError) { // Zero null bitmap but non-zero null_count FillPrimitive(3, 1, 0, primitive_buffers_no_nulls1_8); CheckImportError(int8()); + + // Null data pointers with non-zero length + FillPrimitive(1, 0, 0, all_buffers_omitted); + CheckImportError(int8()); + FillPrimitive(1, 0, 0, all_buffers_omitted); + CheckImportError(boolean()); + FillPrimitive(1, 0, 0, all_buffers_omitted); + CheckImportError(fixed_size_binary(3)); +} + +TEST_F(TestArrayImport, StringError) { + // Bad number of buffers + FillStringLike(4, 0, 0, string_buffers_no_nulls1); + c_struct_.n_buffers = 2; + CheckImportError(utf8()); + + // Null data pointers with non-zero length + FillStringLike(4, 0, 0, string_buffers_omitted); + CheckImportError(utf8()); + + // Null offsets pointer + FillStringLike(0, 0, 0, all_buffers_omitted); + CheckImportError(utf8()); } TEST_F(TestArrayImport, StructError) { @@ -2369,6 +2486,13 @@ TEST_F(TestArrayImport, StructError) { CheckImportError(struct_({field("strs", utf8())})); } +TEST_F(TestArrayImport, ListError) { + // Null offsets pointer + FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8); + FillListLike(0, 0, 0, all_buffers_omitted); + CheckImportError(list(int8())); +} + TEST_F(TestArrayImport, MapError) { // Bad number of (struct) children in map child FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); @@ -2859,8 +2983,10 @@ TEST_F(TestArrayRoundtrip, UnknownNullCount) { TEST_F(TestArrayRoundtrip, List) { TestWithJSON(list(int32()), "[]"); TestWithJSON(list(int32()), "[[4, 5], [6, null], null]"); + TestWithJSON(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); TestWithJSONSliced(list(int32()), "[[4, 5], [6, null], null]"); + TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); } TEST_F(TestArrayRoundtrip, Struct) {