@@ -19,7 +19,7 @@ use std::any::Any;
19
19
use std:: marker:: PhantomData ;
20
20
use std:: sync:: Arc ;
21
21
22
- use arrow_buffer:: { Buffer , NullBufferBuilder , ScalarBuffer } ;
22
+ use arrow_buffer:: { Buffer , BufferBuilder , NullBufferBuilder , ScalarBuffer } ;
23
23
use arrow_data:: ByteView ;
24
24
use arrow_schema:: ArrowError ;
25
25
use hashbrown:: hash_table:: Entry ;
@@ -28,7 +28,7 @@ use hashbrown::HashTable;
28
28
use crate :: builder:: ArrayBuilder ;
29
29
use crate :: types:: bytes:: ByteArrayNativeType ;
30
30
use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
31
- use crate :: { Array , ArrayRef , GenericByteViewArray } ;
31
+ use crate :: { ArrayRef , GenericByteViewArray } ;
32
32
33
33
const STARTING_BLOCK_SIZE : u32 = 8 * 1024 ; // 8KiB
34
34
const MAX_BLOCK_SIZE : u32 = 2 * 1024 * 1024 ; // 2MiB
@@ -79,7 +79,7 @@ impl BlockSizeGrowthStrategy {
79
79
/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
80
80
/// using [`GenericByteViewBuilder::try_append_view`]
81
81
pub struct GenericByteViewBuilder < T : ByteViewType + ?Sized > {
82
- views_buffer : Vec < u128 > ,
82
+ views_builder : BufferBuilder < u128 > ,
83
83
null_buffer_builder : NullBufferBuilder ,
84
84
completed : Vec < Buffer > ,
85
85
in_progress : Vec < u8 > ,
@@ -99,7 +99,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
99
99
/// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
100
100
pub fn with_capacity ( capacity : usize ) -> Self {
101
101
Self {
102
- views_buffer : Vec :: with_capacity ( capacity) ,
102
+ views_builder : BufferBuilder :: new ( capacity) ,
103
103
null_buffer_builder : NullBufferBuilder :: new ( capacity) ,
104
104
completed : vec ! [ ] ,
105
105
in_progress : vec ! [ ] ,
@@ -148,7 +148,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
148
148
pub fn with_deduplicate_strings ( self ) -> Self {
149
149
Self {
150
150
string_tracker : Some ( (
151
- HashTable :: with_capacity ( self . views_buffer . capacity ( ) ) ,
151
+ HashTable :: with_capacity ( self . views_builder . capacity ( ) ) ,
152
152
Default :: default ( ) ,
153
153
) ) ,
154
154
..self
@@ -201,43 +201,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
201
201
let b = b. get_unchecked ( start..end) ;
202
202
203
203
let view = make_view ( b, block, offset) ;
204
- self . views_buffer . push ( view) ;
204
+ self . views_builder . append ( view) ;
205
205
self . null_buffer_builder . append_non_null ( ) ;
206
206
}
207
207
208
- /// Appends an array to the builder.
209
- /// This will flush any in-progress block and append the data buffers
210
- /// and add the (adapted) views.
211
- pub fn append_array ( & mut self , array : & GenericByteViewArray < T > ) {
212
- self . flush_in_progress ( ) ;
213
- // keep original views if this array is the first to be added or if there are no data buffers (all inline views)
214
- let keep_views = self . completed . is_empty ( ) || array. data_buffers ( ) . is_empty ( ) ;
215
-
216
- self . completed . extend ( array. data_buffers ( ) . iter ( ) . cloned ( ) ) ;
217
-
218
- if keep_views {
219
- self . views_buffer . extend_from_slice ( array. views ( ) ) ;
220
- } else {
221
- let starting_buffer = self . completed . len ( ) as u32 ;
222
-
223
- self . views_buffer . extend ( array. views ( ) . iter ( ) . map ( |v| {
224
- let mut byte_view = ByteView :: from ( * v) ;
225
- if byte_view. length > 12 {
226
- // Small views (<=12 bytes) are inlined, so only need to update large views
227
- byte_view. buffer_index += starting_buffer;
228
- } ;
229
-
230
- byte_view. as_u128 ( )
231
- } ) ) ;
232
- }
233
-
234
- if let Some ( null_buffer) = array. nulls ( ) {
235
- self . null_buffer_builder . append_buffer ( null_buffer) ;
236
- } else {
237
- self . null_buffer_builder . append_n_non_nulls ( array. len ( ) ) ;
238
- }
239
- }
240
-
241
208
/// Try to append a view of the given `block`, `offset` and `length`
242
209
///
243
210
/// See [`Self::append_block`]
@@ -288,7 +255,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
288
255
/// Useful if we want to know what value has been inserted into the builder
289
256
/// The index has to be smaller than `self.len()`, otherwise it will panic
290
257
pub fn get_value ( & self , index : usize ) -> & [ u8 ] {
291
- let view = self . views_buffer . as_slice ( ) . get ( index) . unwrap ( ) ;
258
+ let view = self . views_builder . as_slice ( ) . get ( index) . unwrap ( ) ;
292
259
let len = * view as u32 ;
293
260
if len <= 12 {
294
261
// # Safety
@@ -320,7 +287,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
320
287
let mut view_buffer = [ 0 ; 16 ] ;
321
288
view_buffer[ 0 ..4 ] . copy_from_slice ( & length. to_le_bytes ( ) ) ;
322
289
view_buffer[ 4 ..4 + v. len ( ) ] . copy_from_slice ( v) ;
323
- self . views_buffer . push ( u128:: from_le_bytes ( view_buffer) ) ;
290
+ self . views_builder . append ( u128:: from_le_bytes ( view_buffer) ) ;
324
291
self . null_buffer_builder . append_non_null ( ) ;
325
292
return ;
326
293
}
@@ -344,15 +311,16 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
344
311
Entry :: Occupied ( occupied) => {
345
312
// If the string already exists, we will directly use the view
346
313
let idx = occupied. get ( ) ;
347
- self . views_buffer . push ( self . views_buffer [ * idx] ) ;
314
+ self . views_builder
315
+ . append ( self . views_builder . as_slice ( ) [ * idx] ) ;
348
316
self . null_buffer_builder . append_non_null ( ) ;
349
317
self . string_tracker = Some ( ( ht, hasher) ) ;
350
318
return ;
351
319
}
352
320
Entry :: Vacant ( vacant) => {
353
321
// otherwise, we insert the mapping (string hash -> view index)
354
322
// the idx is the current length of views_builder, as we are inserting a new view
355
- vacant. insert ( self . views_buffer . len ( ) ) ;
323
+ vacant. insert ( self . views_builder . len ( ) ) ;
356
324
}
357
325
}
358
326
self . string_tracker = Some ( ( ht, hasher) ) ;
@@ -373,7 +341,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
373
341
buffer_index : self . completed . len ( ) as u32 ,
374
342
offset,
375
343
} ;
376
- self . views_buffer . push ( view. into ( ) ) ;
344
+ self . views_builder . append ( view. into ( ) ) ;
377
345
self . null_buffer_builder . append_non_null ( ) ;
378
346
}
379
347
@@ -390,20 +358,21 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
390
358
#[ inline]
391
359
pub fn append_null ( & mut self ) {
392
360
self . null_buffer_builder . append_null ( ) ;
393
- self . views_buffer . push ( 0 ) ;
361
+ self . views_builder . append ( 0 ) ;
394
362
}
395
363
396
364
/// Builds the [`GenericByteViewArray`] and resets this builder
397
365
pub fn finish ( & mut self ) -> GenericByteViewArray < T > {
398
366
self . flush_in_progress ( ) ;
399
367
let completed = std:: mem:: take ( & mut self . completed ) ;
368
+ let len = self . views_builder . len ( ) ;
369
+ let views = ScalarBuffer :: new ( self . views_builder . finish ( ) , 0 , len) ;
400
370
let nulls = self . null_buffer_builder . finish ( ) ;
401
371
if let Some ( ( ref mut ht, _) ) = self . string_tracker . as_mut ( ) {
402
372
ht. clear ( ) ;
403
373
}
404
- let views = std:: mem:: take ( & mut self . views_buffer ) ;
405
374
// SAFETY: valid by construction
406
- unsafe { GenericByteViewArray :: new_unchecked ( views. into ( ) , completed, nulls) }
375
+ unsafe { GenericByteViewArray :: new_unchecked ( views, completed, nulls) }
407
376
}
408
377
409
378
/// Builds the [`GenericByteViewArray`] without resetting the builder
@@ -412,8 +381,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
412
381
if !self . in_progress . is_empty ( ) {
413
382
completed. push ( Buffer :: from_slice_ref ( & self . in_progress ) ) ;
414
383
}
415
- let len = self . views_buffer . len ( ) ;
416
- let views = Buffer :: from_slice_ref ( self . views_buffer . as_slice ( ) ) ;
384
+ let len = self . views_builder . len ( ) ;
385
+ let views = Buffer :: from_slice_ref ( self . views_builder . as_slice ( ) ) ;
417
386
let views = ScalarBuffer :: new ( views, 0 , len) ;
418
387
let nulls = self . null_buffer_builder . finish_cloned ( ) ;
419
388
// SAFETY: valid by construction
@@ -427,7 +396,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
427
396
428
397
/// Returns the allocated size of this builder in bytes, useful for memory accounting.
429
398
pub fn allocated_size ( & self ) -> usize {
430
- let views = self . views_buffer . capacity ( ) * std:: mem:: size_of :: < u128 > ( ) ;
399
+ let views = self . views_builder . capacity ( ) * std:: mem:: size_of :: < u128 > ( ) ;
431
400
let null = self . null_buffer_builder . allocated_size ( ) ;
432
401
let buffer_size = self . completed . iter ( ) . map ( |b| b. capacity ( ) ) . sum :: < usize > ( ) ;
433
402
let in_progress = self . in_progress . capacity ( ) ;
@@ -449,7 +418,7 @@ impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
449
418
fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
450
419
write ! ( f, "{}ViewBuilder" , T :: PREFIX ) ?;
451
420
f. debug_struct ( "" )
452
- . field ( "views_buffer " , & self . views_buffer )
421
+ . field ( "views_builder " , & self . views_builder )
453
422
. field ( "in_progress" , & self . in_progress )
454
423
. field ( "completed" , & self . completed )
455
424
. field ( "null_buffer_builder" , & self . null_buffer_builder )
0 commit comments