Skip to content

Commit

Permalink
Cleanup list casting and support nested lists (#5113) (#5124)
Browse files Browse the repository at this point in the history
* Cleanup list casting and support nested lists (#5113)

* Clippy

* Update can_cast_types
  • Loading branch information
tustvold committed Nov 27, 2023
1 parent e1bafdf commit ef1cc38
Showing 1 changed file with 68 additions and 115 deletions.
183 changes: 68 additions & 115 deletions arrow-cast/src/cast.rs
Expand Up @@ -47,7 +47,7 @@ use crate::parse::{
string_to_datetime, Parser,
};
use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *};
use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer};
use arrow_buffer::{i256, ArrowNativeType, OffsetBuffer};
use arrow_data::transform::MutableArrayData;
use arrow_data::ArrayData;
use arrow_schema::*;
Expand Down Expand Up @@ -124,18 +124,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
}
(Dictionary(_, value_type), _) => can_cast_types(value_type, to_type),
(_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type),
(LargeList(list_from), LargeList(list_to)) => {
(List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => {
can_cast_types(list_from.data_type(), list_to.data_type())
}
(List(list_from), List(list_to)) => {
can_cast_types(list_from.data_type(), list_to.data_type())
}
(List(list_from), LargeList(list_to)) => {
list_from.data_type() == list_to.data_type()
}
(LargeList(list_from), List(list_to)) => {
list_from.data_type() == list_to.data_type()
}
(List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => {
can_cast_types(list_from.data_type(), to_type)
}
Expand Down Expand Up @@ -783,28 +774,10 @@ pub fn cast_with_options(
"Casting from type {from_type:?} to dictionary type {to_type:?} not supported",
))),
},
(List(_), List(ref to)) => cast_list_inner::<i32>(array, to, to_type, cast_options),
(LargeList(_), LargeList(ref to)) => {
cast_list_inner::<i64>(array, to, to_type, cast_options)
}
(List(list_from), LargeList(list_to)) => {
if list_to.data_type() != list_from.data_type() {
Err(ArrowError::CastError(
"cannot cast list to large-list with different child data".into(),
))
} else {
cast_list_container::<i32, i64>(array, cast_options)
}
}
(LargeList(list_from), List(list_to)) => {
if list_to.data_type() != list_from.data_type() {
Err(ArrowError::CastError(
"cannot cast large-list to list with different child data".into(),
))
} else {
cast_list_container::<i64, i32>(array, cast_options)
}
}
(List(_), List(to)) => cast_list_values::<i32>(array, to, cast_options),
(LargeList(_), LargeList(to)) => cast_list_values::<i64>(array, to, cast_options),
(List(_), LargeList(list_to)) => cast_list::<i32, i64>(array, list_to, cast_options),
(LargeList(_), List(list_to)) => cast_list::<i64, i32>(array, list_to, cast_options),
(List(_), FixedSizeList(field, size)) => {
let array = array.as_list::<i32>();
cast_list_to_fixed_size_list::<i32>(array, field, *size, cast_options)
Expand Down Expand Up @@ -3046,28 +3019,6 @@ fn cast_values_to_list<O: OffsetSizeTrait>(
Ok(Arc::new(list))
}

/// Helper function that takes an Generic list container and casts the inner datatype.
fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
array: &dyn Array,
to: &Field,
to_type: &DataType,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let data = array.to_data();
let underlying_array = make_array(data.child_data()[0].clone());
let cast_array = cast_with_options(underlying_array.as_ref(), to.data_type(), cast_options)?;
let builder = data
.into_builder()
.data_type(to_type.clone())
.child_data(vec![cast_array.into_data()]);

// Safety
// Data was valid before
let array_data = unsafe { builder.build_unchecked() };
let list = GenericListArray::<OffsetSize>::from(array_data);
Ok(Arc::new(list) as ArrayRef)
}

/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
/// offset size so re-encoding offset is unnecessary.
fn cast_binary_to_string<O: OffsetSizeTrait>(
Expand Down Expand Up @@ -3221,7 +3172,7 @@ where

fn cast_list_to_fixed_size_list<OffsetSize>(
array: &GenericListArray<OffsetSize>,
field: &Arc<Field>,
field: &FieldRef,
size: i32,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
Expand Down Expand Up @@ -3289,75 +3240,57 @@ where
Ok(Arc::new(array))
}

/// Cast the container type of List/Largelist array but not the inner types.
/// This function can leave the value data intact and only has to cast the offset dtypes.
fn cast_list_container<OffsetSizeFrom, OffsetSizeTo>(
/// Helper function that takes an Generic list container and casts the inner datatype.
fn cast_list_values<O: OffsetSizeTrait>(
array: &dyn Array,
_cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError>
where
OffsetSizeFrom: OffsetSizeTrait + ToPrimitive,
OffsetSizeTo: OffsetSizeTrait + NumCast,
{
let list = array.as_list::<OffsetSizeFrom>();
// the value data stored by the list
let values = list.values();
to: &FieldRef,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let list = array.as_list::<O>();
let values = cast_with_options(list.values(), to.data_type(), cast_options)?;
Ok(Arc::new(GenericListArray::<O>::new(
to.clone(),
list.offsets().clone(),
values,
list.nulls().cloned(),
)))
}

let out_dtype = match array.data_type() {
DataType::List(value_type) => {
assert_eq!(
std::mem::size_of::<OffsetSizeFrom>(),
std::mem::size_of::<i32>()
);
assert_eq!(
std::mem::size_of::<OffsetSizeTo>(),
std::mem::size_of::<i64>()
);
DataType::LargeList(value_type.clone())
}
DataType::LargeList(value_type) => {
assert_eq!(
std::mem::size_of::<OffsetSizeFrom>(),
std::mem::size_of::<i64>()
);
assert_eq!(
std::mem::size_of::<OffsetSizeTo>(),
std::mem::size_of::<i32>()
);
if values.len() > i32::MAX as usize {
return Err(ArrowError::ComputeError(
"LargeList too large to cast to List".into(),
));
}
DataType::List(value_type.clone())
}
// implementation error
_ => unreachable!(),
};
/// Cast the container type of List/Largelist array along with the inner datatype
fn cast_list<I: OffsetSizeTrait, O: OffsetSizeTrait>(
array: &dyn Array,
field: &FieldRef,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let list = array.as_list::<I>();
let values = list.values();
let offsets = list.offsets();
let nulls = list.nulls().cloned();

let iter = list.value_offsets().iter().map(|idx| {
let idx: OffsetSizeTo = NumCast::from(*idx).unwrap();
idx
});
if !O::IS_LARGE && values.len() > i32::MAX as usize {
return Err(ArrowError::ComputeError(
"LargeList too large to cast to List".into(),
));
}

// SAFETY
// A slice produces a trusted length iterator
let offset_buffer = unsafe { Buffer::from_trusted_len_iter(iter) };
// Recursively cast values
let values = cast_with_options(values, field.data_type(), cast_options)?;
let offsets: Vec<_> = offsets.iter().map(|x| O::usize_as(x.as_usize())).collect();

// wrap up
let builder = ArrayData::builder(out_dtype)
.len(list.len())
.add_buffer(offset_buffer)
.add_child_data(values.to_data())
.nulls(list.nulls().cloned());
// Safety: valid offsets and checked for overflow
let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };

let array_data = unsafe { builder.build_unchecked() };
Ok(Arc::new(GenericListArray::<OffsetSizeTo>::from(array_data)))
Ok(Arc::new(GenericListArray::<O>::new(
field.clone(),
offsets,
values,
nulls,
)))
}

#[cfg(test)]
mod tests {
use arrow_buffer::NullBuffer;
use arrow_buffer::{Buffer, NullBuffer};

use super::*;

Expand Down Expand Up @@ -9154,6 +9087,26 @@ mod tests {
assert_eq!(formatted.value(1).to_string(), "[[4], [null], [6]]");
}

#[test]
fn test_nested_list_cast() {
let mut builder = ListBuilder::new(ListBuilder::new(Int32Builder::new()));
builder.append_value([Some([Some(1), Some(2), None]), None]);
builder.append_value([None, Some([]), None]);
builder.append_null();
builder.append_value([Some([Some(2), Some(3)])]);
let start = builder.finish();

let mut builder = LargeListBuilder::new(LargeListBuilder::new(Int8Builder::new()));
builder.append_value([Some([Some(1), Some(2), None]), None]);
builder.append_value([None, Some([]), None]);
builder.append_null();
builder.append_value([Some([Some(2), Some(3)])]);
let expected = builder.finish();

let actual = cast(&start, expected.data_type()).unwrap();
assert_eq!(actual.as_ref(), &expected);
}

const CAST_OPTIONS: CastOptions<'static> = CastOptions {
safe: true,
format_options: FormatOptions::new(),
Expand Down

0 comments on commit ef1cc38

Please sign in to comment.