Skip to content

Improve concat performance, and add append_array for some array builder implementations #7309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Apr 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 60 additions & 1 deletion arrow-array/src/builder/boolean_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// under the License.

use crate::builder::{ArrayBuilder, BooleanBufferBuilder};
use crate::{ArrayRef, BooleanArray};
use crate::{Array, ArrayRef, BooleanArray};
use arrow_buffer::Buffer;
use arrow_buffer::NullBufferBuilder;
use arrow_data::ArrayData;
Expand Down Expand Up @@ -146,6 +146,18 @@ impl BooleanBuilder {
}
}

/// Appends array values and null to this builder as is
/// (this means that underlying null values are copied as is).
#[inline]
pub fn append_array(&mut self, array: &BooleanArray) {
self.values_builder.append_buffer(array.values());
if let Some(null_buffer) = array.nulls() {
self.null_buffer_builder.append_buffer(null_buffer);
} else {
self.null_buffer_builder.append_n_non_nulls(array.len());
}
}

/// Builds the [BooleanArray] and reset this builder.
pub fn finish(&mut self) -> BooleanArray {
let len = self.len();
Expand Down Expand Up @@ -232,6 +244,7 @@ impl Extend<Option<bool>> for BooleanBuilder {
mod tests {
use super::*;
use crate::Array;
use arrow_buffer::{BooleanBuffer, NullBuffer};

#[test]
fn test_boolean_array_builder() {
Expand Down Expand Up @@ -346,4 +359,50 @@ mod tests {
let values = array.iter().map(|x| x.unwrap()).collect::<Vec<_>>();
assert_eq!(&values, &[true, true, true, false, false])
}

#[test]
fn test_append_array() {
let input = vec![
Some(true),
None,
Some(true),
None,
Some(false),
None,
None,
None,
Some(false),
Some(false),
Some(false),
Some(true),
Some(false),
];
let arr1 = BooleanArray::from(input[..5].to_vec());
let arr2 = BooleanArray::from(input[5..8].to_vec());
let arr3 = BooleanArray::from(input[8..].to_vec());

let mut builder = BooleanBuilder::new();
builder.append_array(&arr1);
builder.append_array(&arr2);
builder.append_array(&arr3);
let actual = builder.finish();
let expected = BooleanArray::from(input);

assert_eq!(actual, expected);
}

#[test]
fn test_append_array_add_underlying_null_values() {
let array = BooleanArray::new(
BooleanBuffer::from(vec![true, false, true, false]),
Some(NullBuffer::from(&[true, true, false, false])),
);

let mut builder = BooleanBuilder::new();
builder.append_array(&array);
let actual = builder.finish();

assert_eq!(actual, array);
assert_eq!(actual.values(), array.values())
}
}
219 changes: 218 additions & 1 deletion arrow-array/src/builder/generic_bytes_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
use arrow_buffer::NullBufferBuilder;
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_data::ArrayDataBuilder;
Expand Down Expand Up @@ -129,6 +129,48 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
self.offsets_builder.append(self.next_offset());
}

/// Appends array values and null to this builder as is
/// (this means that underlying null values are copied as is).
#[inline]
pub fn append_array(&mut self, array: &GenericByteArray<T>) {
if array.len() == 0 {
return;
}

let offsets = array.offsets();

// If the offsets are contiguous, we can append them directly avoiding the need to align
// for example, when the first appended array is not sliced (starts at offset 0)
if self.next_offset() == offsets[0] {
self.offsets_builder.append_slice(&offsets[1..]);
} else {
// Shifting all the offsets
let shift: T::Offset = self.next_offset() - offsets[0];

// Creating intermediate offsets instead of pushing each offset is faster
// (even if we make MutableBuffer to avoid updating length on each push
// and reserve the necessary capacity, it's still slower)
let mut intermediate = Vec::with_capacity(offsets.len() - 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 maybe we need something like extend_from_iter in offsets builder (not for this PR)

Copy link
Contributor

@Dandandan Dandandan Apr 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think many of these instances could be changed to use Vec directly (and use optimized extend, etc. from them)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


for &offset in &offsets[1..] {
intermediate.push(offset + shift)
}

self.offsets_builder.append_slice(&intermediate);
}

// Append underlying values, starting from the first offset and ending at the last offset
self.value_builder.append_slice(
&array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()],
);

if let Some(null_buffer) = array.nulls() {
self.null_buffer_builder.append_buffer(null_buffer);
} else {
self.null_buffer_builder.append_n_non_nulls(array.len());
}
}

/// Builds the [`GenericByteArray`] and reset this builder.
pub fn finish(&mut self) -> GenericByteArray<T> {
let array_type = T::DATA_TYPE;
Expand Down Expand Up @@ -358,6 +400,7 @@ mod tests {
use super::*;
use crate::array::Array;
use crate::GenericStringArray;
use arrow_buffer::NullBuffer;
use std::fmt::Write as _;
use std::io::Write as _;

Expand Down Expand Up @@ -593,4 +636,178 @@ mod tests {
&["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
)
}

#[test]
fn test_append_array_without_nulls() {
let input = vec![
"hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
"thank", "you", "for", "asking",
];
let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec());
let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec());
let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());

let mut builder = GenericStringBuilder::<i32>::new();
builder.append_array(&arr1);
builder.append_array(&arr2);
builder.append_array(&arr3);

let actual = builder.finish();
let expected = GenericStringArray::<i32>::from(input);

assert_eq!(actual, expected);
}

#[test]
fn test_append_array_with_nulls() {
let input = vec![
Some("hello"),
None,
Some("how"),
None,
None,
None,
None,
Some("I"),
Some("am"),
Some("doing"),
Some("well"),
];
let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec());
let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec());
let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());

let mut builder = GenericStringBuilder::<i32>::new();
builder.append_array(&arr1);
builder.append_array(&arr2);
builder.append_array(&arr3);

let actual = builder.finish();
let expected = GenericStringArray::<i32>::from(input);

assert_eq!(actual, expected);
}

#[test]
fn test_append_empty_array() {
let arr = GenericStringArray::<i32>::from(Vec::<&str>::new());
let mut builder = GenericStringBuilder::<i32>::new();
builder.append_array(&arr);
let result = builder.finish();
assert_eq!(result.len(), 0);
}

#[test]
fn test_append_array_with_offset_not_starting_at_0() {
let input = vec![
Some("hello"),
None,
Some("how"),
None,
None,
None,
None,
Some("I"),
Some("am"),
Some("doing"),
Some("well"),
];
let full_array = GenericStringArray::<i32>::from(input);
let sliced = full_array.slice(1, 4);

assert_ne!(sliced.offsets()[0].as_usize(), 0);
assert_ne!(sliced.offsets().last(), full_array.offsets().last());

let mut builder = GenericStringBuilder::<i32>::new();
builder.append_array(&sliced);
let actual = builder.finish();

let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);

assert_eq!(actual, expected);
}

#[test]
fn test_append_underlying_null_values_added_as_is() {
let input_1_array_with_nulls = {
let input = vec![
"hello", "world", "how", "are", "you", "doing", "today", "I", "am",
];
let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts();

GenericStringArray::<i32>::new(
offsets,
buffer,
Some(NullBuffer::from(&[
true, false, true, false, false, true, true, true, false,
])),
)
};
let input_2_array_with_nulls = {
let input = vec!["doing", "well", "thank", "you", "for", "asking"];
let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts();

GenericStringArray::<i32>::new(
offsets,
buffer,
Some(NullBuffer::from(&[false, false, true, false, true, true])),
)
};

let mut builder = GenericStringBuilder::<i32>::new();
builder.append_array(&input_1_array_with_nulls);
builder.append_array(&input_2_array_with_nulls);

let actual = builder.finish();
let expected = GenericStringArray::<i32>::from(vec![
Some("hello"),
None, // world
Some("how"),
None, // are
None, // you
Some("doing"),
Some("today"),
Some("I"),
None, // am
None, // doing
None, // well
Some("thank"),
None, // "you",
Some("for"),
Some("asking"),
]);

assert_eq!(actual, expected);

let expected_underlying_buffer = Buffer::from(
[
"hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing",
"well", "thank", "you", "for", "asking",
]
.join("")
.as_bytes(),
);
assert_eq!(actual.values(), &expected_underlying_buffer);
}

#[test]
fn append_array_with_continues_indices() {
let input = vec![
"hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
"thank", "you", "for", "asking",
];
let full_array = GenericStringArray::<i32>::from(input);
let slice1 = full_array.slice(0, 3);
let slice2 = full_array.slice(3, 4);
let slice3 = full_array.slice(7, full_array.len() - 7);

let mut builder = GenericStringBuilder::<i32>::new();
builder.append_array(&slice1);
builder.append_array(&slice2);
builder.append_array(&slice3);

let actual = builder.finish();

assert_eq!(actual, full_array);
}
}
Loading
Loading