
Commit 842ea66

Improve size calculations for RunEndEncoded data type and add test coverage for it
Parent: c1890d4

File tree (3 files changed, +60 -18 lines):

arrow-array/src/array/run_array.rs
arrow-ipc/src/reader.rs
arrow-ipc/src/writer.rs


arrow-array/src/array/run_array.rs

Lines changed: 12 additions & 7 deletions

@@ -254,14 +254,19 @@ impl<R: RunEndIndexType> RunArray<R> {
 }
 
 impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
-    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
     fn from(data: ArrayData) -> Self {
-        match data.data_type() {
-            DataType::RunEndEncoded(_, _) => {}
-            _ => {
-                panic!("Invalid data type for RunArray. The data type should be DataType::RunEndEncoded");
-            }
-        }
+        Self::from(&data)
+    }
+}
+
+impl<R: RunEndIndexType> From<&ArrayData> for RunArray<R> {
+    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
+    fn from(data: &ArrayData) -> Self {
+        let DataType::RunEndEncoded(_, _) = data.data_type() else {
+            panic!(
+                "Invalid data type for RunArray. The data type should be DataType::RunEndEncoded"
+            );
+        };
 
         // Safety
         // ArrayData is valid
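For context, a minimal sketch (not part of the commit) of what the new From<&ArrayData> impl enables: rebuilding a RunArray from borrowed ArrayData without cloning it first. It assumes the FromIterator<&str> convenience impl that arrow-array provides for RunArray<Int16Type>.

use arrow_array::{types::Int16Type, Array, RunArray};

fn main() {
    // Build a small run-end encoded array ("a" x2, "b" x3).
    let run_array: RunArray<Int16Type> =
        vec!["a", "a", "b", "b", "b"].into_iter().collect();
    let data = run_array.to_data();

    // Previously only From<ArrayData> existed, which forced `data.clone()`
    // when the caller still needed the ArrayData afterwards.
    let rebuilt = RunArray::<Int16Type>::from(&data);
    assert_eq!(rebuilt.len(), data.len());
}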

arrow-ipc/src/reader.rs

Lines changed: 1 addition & 1 deletion

@@ -1785,7 +1785,7 @@ mod tests {
         // can be compared as such.
         assert_eq!(input_batch.column(1), output_batch.column(1));
 
-        let run_array_1_unsliced = unslice_run_array(run_array_1_sliced.into_data()).unwrap();
+        let run_array_1_unsliced = unslice_run_array(&run_array_1_sliced.into_data()).unwrap();
         assert_eq!(run_array_1_unsliced, output_batch.column(0).into_data());
     }

arrow-ipc/src/writer.rs

Lines changed: 47 additions & 10 deletions

@@ -567,7 +567,7 @@ fn append_variadic_buffer_counts(counts: &mut Vec<i64>, array: &ArrayData) {
     }
 }
 
-pub(crate) fn unslice_run_array(arr: ArrayData) -> Result<ArrayData, ArrowError> {
+pub(crate) fn unslice_run_array(arr: &ArrayData) -> Result<ArrayData, ArrowError> {
     match arr.data_type() {
         DataType::RunEndEncoded(k, _) => match k.data_type() {
             DataType::Int16 => {
@@ -1433,16 +1433,40 @@ fn get_encoded_arr_batch_size<AD: Borrow<ArrayData>>(
         .map(|arr| {
             let arr = arr.borrow();
             arr.get_slice_memory_size_with_alignment(Some(write_options.alignment))
-                .map(|size| {
+                .and_then(|mut size| {
                     let didnt_count_nulls = arr.nulls().is_none();
                     let will_write_nulls = has_validity_bitmap(arr.data_type(), write_options);
 
                     if will_write_nulls && didnt_count_nulls {
                         let null_len = bit_util::ceil(arr.len(), 8);
-                        size + null_len + pad_to_alignment(write_options.alignment, null_len)
-                    } else {
-                        size
+                        size += null_len + pad_to_alignment(write_options.alignment, null_len)
                     }
+
+                    // TODO: This is ugly. We remove the child_data size in RunEndEncoded because
+                    // it was calculated as the size existing in memory but we care about the size
+                    // when it's decoded and then encoded into a flatbuffer. Afaik, this is the
+                    // only data type where the size in memory is not the same size as when encoded
+                    // (since it has a different representation in memory), so it's not horrible,
+                    // but it's definitely not ideal.
+                    if let DataType::RunEndEncoded(_, _) = arr.data_type() {
+                        size -= arr
+                            .child_data()
+                            .iter()
+                            .map(|data| {
+                                data.get_slice_memory_size_with_alignment(Some(
+                                    write_options.alignment,
+                                ))
+                            })
+                            .sum::<Result<usize, ArrowError>>()?;
+
+                        size += unslice_run_array(arr)?
+                            .child_data()
+                            .iter()
+                            .map(|data| get_encoded_arr_batch_size([data], write_options))
+                            .sum::<Result<usize, ArrowError>>()?;
+                    }
+
+                    Ok(size)
                 })
         })
         .sum()
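The TODO above hinges on the fact that a sliced RunEndEncoded array keeps its full run_ends/values children in memory, while the IPC writer re-encodes ("unslices") the array before writing, so the two sizes disagree. Below is a small illustrative sketch, not from the commit, using the public ArrayData::slice and ArrayData::get_slice_memory_size APIs.

use arrow_array::{types::Int16Type, Array, RunArray};

fn main() {
    // Three runs covering six logical values: "a" x2, "b" x3, "c" x1.
    let ree: RunArray<Int16Type> =
        vec!["a", "a", "b", "b", "b", "c"].into_iter().collect();

    // Slicing ArrayData only adjusts offset/length on the parent; the
    // run_ends and values children still describe all three runs.
    let sliced = ree.to_data().slice(0, 2);

    // This in-memory figure therefore still counts every run, whereas the
    // writer unslices the array before encoding it, keeping only the runs
    // that overlap the slice. The child_data adjustment above subtracts the
    // in-memory child size and adds back the size of the unsliced children.
    let in_memory = sliced.get_slice_memory_size().unwrap();
    println!("in-memory size of the sliced REE data: {in_memory} bytes");
}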
@@ -1837,7 +1861,7 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> {
         match array_data.data_type() {
             DataType::Dictionary(_, _) => Ok(()),
             // unslice the run encoded array.
-            DataType::RunEndEncoded(_, _) => write_arr(&unslice_run_array(array_data.clone())?),
+            DataType::RunEndEncoded(_, _) => write_arr(&unslice_run_array(array_data)?),
             // recursively write out nested structures
             _ => write_arr(array_data),
         }
@@ -2945,10 +2969,9 @@ mod tests {
 
         let arr_data = arr.to_data();
 
-        let write_options = IpcWriteOptions {
-            batch_compression_type: None,
-            ..IpcWriteOptions::default()
-        };
+        let write_options = IpcWriteOptions::default()
+            .try_with_compression(None)
+            .unwrap();
 
         let compute_size = get_encoded_arr_batch_size([&arr_data], &write_options).unwrap();
         let num_rows = arr_data.len();
@@ -2992,5 +3015,19 @@ mod tests {
 
         let list = FixedSizeListArray::new(list_field, 2, make_array(int_arr.to_data()), None);
         encode_test(list);
+
+        let vals: Vec<Option<i32>> = vec![Some(1), None, Some(2), Some(3), Some(4), None, Some(5)];
+        let repeats: Vec<usize> = vec![3, 4, 1, 2];
+        let mut input_array: Vec<Option<i32>> = Vec::with_capacity(80);
+        for ix in 0_usize..32 {
+            let repeat: usize = repeats[ix % repeats.len()];
+            let val: Option<i32> = vals[ix % vals.len()];
+            input_array.resize(input_array.len() + repeat, val);
+        }
+        let mut builder =
+            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
+        builder.extend(input_array);
+        let run_array = builder.finish();
+        encode_test(run_array);
     }
 }
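The new test case exercises RunEndEncoded through the crate-private encode_test helper shown above. A rough, self-contained equivalent using only public APIs (the stream writer/reader in place of the internal size check; names and data here are illustrative, not from the commit) might look like this:

use std::sync::Arc;
use arrow_array::{
    builder::PrimitiveRunBuilder,
    types::{Int16Type, Int32Type},
    Array, ArrayRef, RecordBatch,
};
use arrow_ipc::{reader::StreamReader, writer::StreamWriter};
use arrow_schema::{Field, Schema};

fn main() {
    // Runs of Option<i32> values, the same flavor of data as the new test.
    let mut builder = PrimitiveRunBuilder::<Int16Type, Int32Type>::new();
    builder.extend([Some(1), Some(1), Some(1), None, None, Some(2)]);
    let run_array = builder.finish();

    let field = Field::new("ree", run_array.data_type().clone(), true);
    let schema = Arc::new(Schema::new(vec![field]));
    let column: ArrayRef = Arc::new(run_array);
    let batch = RecordBatch::try_new(schema.clone(), vec![column]).unwrap();

    // Round-trip through the IPC stream format; the encoded buffer length is
    // the kind of size get_encoded_arr_batch_size has to predict up front.
    let mut buf = Vec::new();
    {
        let mut writer = StreamWriter::try_new(&mut buf, &schema).unwrap();
        writer.write(&batch).unwrap();
        writer.finish().unwrap();
    }
    let mut reader = StreamReader::try_new(buf.as_slice(), None).unwrap();
    let roundtrip = reader.next().unwrap().unwrap();
    assert_eq!(roundtrip.num_rows(), batch.num_rows());
}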
