@@ -567,7 +567,7 @@ fn append_variadic_buffer_counts(counts: &mut Vec<i64>, array: &ArrayData) {
567
567
}
568
568
}
569
569
570
- pub ( crate ) fn unslice_run_array ( arr : ArrayData ) -> Result < ArrayData , ArrowError > {
570
+ pub ( crate ) fn unslice_run_array ( arr : & ArrayData ) -> Result < ArrayData , ArrowError > {
571
571
match arr. data_type ( ) {
572
572
DataType :: RunEndEncoded ( k, _) => match k. data_type ( ) {
573
573
DataType :: Int16 => {
@@ -1433,16 +1433,40 @@ fn get_encoded_arr_batch_size<AD: Borrow<ArrayData>>(
1433
1433
. map ( |arr| {
1434
1434
let arr = arr. borrow ( ) ;
1435
1435
arr. get_slice_memory_size_with_alignment ( Some ( write_options. alignment ) )
1436
- . map ( | size| {
1436
+ . and_then ( | mut size| {
1437
1437
let didnt_count_nulls = arr. nulls ( ) . is_none ( ) ;
1438
1438
let will_write_nulls = has_validity_bitmap ( arr. data_type ( ) , write_options) ;
1439
1439
1440
1440
if will_write_nulls && didnt_count_nulls {
1441
1441
let null_len = bit_util:: ceil ( arr. len ( ) , 8 ) ;
1442
- size + null_len + pad_to_alignment ( write_options. alignment , null_len)
1443
- } else {
1444
- size
1442
+ size += null_len + pad_to_alignment ( write_options. alignment , null_len)
1445
1443
}
1444
+
1445
+ // TODO: This is ugly. We remove the child_data size in RunEndEncoded because
1446
+ // it was calculated as the size existing in memory but we care about the size
1447
+ // when it's decoded and then encoded into a flatbuffer. Afaik, this is the
1448
+ // only data type where the size in memory is not the same size as when encoded
1449
+ // (since it has a different representation in memory), so it's not horrible,
1450
+ // but it's definitely not ideal.
1451
+ if let DataType :: RunEndEncoded ( _, _) = arr. data_type ( ) {
1452
+ size -= arr
1453
+ . child_data ( )
1454
+ . iter ( )
1455
+ . map ( |data| {
1456
+ data. get_slice_memory_size_with_alignment ( Some (
1457
+ write_options. alignment ,
1458
+ ) )
1459
+ } )
1460
+ . sum :: < Result < usize , ArrowError > > ( ) ?;
1461
+
1462
+ size += unslice_run_array ( arr) ?
1463
+ . child_data ( )
1464
+ . iter ( )
1465
+ . map ( |data| get_encoded_arr_batch_size ( [ data] , write_options) )
1466
+ . sum :: < Result < usize , ArrowError > > ( ) ?;
1467
+ }
1468
+
1469
+ Ok ( size)
1446
1470
} )
1447
1471
} )
1448
1472
. sum ( )
@@ -1837,7 +1861,7 @@ impl<'fbb> FlatBufferSizeTracker<'fbb> {
1837
1861
match array_data. data_type ( ) {
1838
1862
DataType :: Dictionary ( _, _) => Ok ( ( ) ) ,
1839
1863
// unslice the run encoded array.
1840
- DataType :: RunEndEncoded ( _, _) => write_arr ( & unslice_run_array ( array_data. clone ( ) ) ?) ,
1864
+ DataType :: RunEndEncoded ( _, _) => write_arr ( & unslice_run_array ( array_data) ?) ,
1841
1865
// recursively write out nested structures
1842
1866
_ => write_arr ( array_data) ,
1843
1867
}
@@ -2945,10 +2969,9 @@ mod tests {
2945
2969
2946
2970
let arr_data = arr. to_data ( ) ;
2947
2971
2948
- let write_options = IpcWriteOptions {
2949
- batch_compression_type : None ,
2950
- ..IpcWriteOptions :: default ( )
2951
- } ;
2972
+ let write_options = IpcWriteOptions :: default ( )
2973
+ . try_with_compression ( None )
2974
+ . unwrap ( ) ;
2952
2975
2953
2976
let compute_size = get_encoded_arr_batch_size ( [ & arr_data] , & write_options) . unwrap ( ) ;
2954
2977
let num_rows = arr_data. len ( ) ;
@@ -2992,5 +3015,19 @@ mod tests {
2992
3015
2993
3016
let list = FixedSizeListArray :: new ( list_field, 2 , make_array ( int_arr. to_data ( ) ) , None ) ;
2994
3017
encode_test ( list) ;
3018
+
3019
+ let vals: Vec < Option < i32 > > = vec ! [ Some ( 1 ) , None , Some ( 2 ) , Some ( 3 ) , Some ( 4 ) , None , Some ( 5 ) ] ;
3020
+ let repeats: Vec < usize > = vec ! [ 3 , 4 , 1 , 2 ] ;
3021
+ let mut input_array: Vec < Option < i32 > > = Vec :: with_capacity ( 80 ) ;
3022
+ for ix in 0_usize ..32 {
3023
+ let repeat: usize = repeats[ ix % repeats. len ( ) ] ;
3024
+ let val: Option < i32 > = vals[ ix % vals. len ( ) ] ;
3025
+ input_array. resize ( input_array. len ( ) + repeat, val) ;
3026
+ }
3027
+ let mut builder =
3028
+ PrimitiveRunBuilder :: < Int16Type , Int32Type > :: with_capacity ( input_array. len ( ) ) ;
3029
+ builder. extend ( input_array) ;
3030
+ let run_array = builder. finish ( ) ;
3031
+ encode_test ( run_array) ;
2995
3032
}
2996
3033
}
0 commit comments