@@ -28,6 +28,9 @@ pub const DATETIME_COLUMNS: [&str; 8] = [
2828 "unpublished" ,
2929] ;
3030
31+ /// Names of the string columns whose values tend to repeat (or never vary) across items; when one of these is decoded as `Utf8`, the writer casts it to a `Dictionary(Int32, Utf8)` array instead of storing plain strings.
32+ const DICTIONARY_COLUMNS : [ & str ; 3 ] = [ "type" , "stac_version" , "collection" ] ;
33+
3134/// Encodes items into a record batch.
3235pub fn encode ( items : Vec < Item > ) -> Result < ( RecordBatch , SchemaRef ) > {
3336 encode_with_options ( items, Options :: default ( ) )
@@ -198,8 +201,29 @@ impl Writer {
198201 let mut decoder = ReaderBuilder :: new ( base_schema. clone ( ) ) . build_decoder ( ) ?;
199202 decoder. serialize ( & self . values ) ?;
200203 let record_batch = decoder. flush ( ) ?. ok_or ( Error :: NoItems ) ?;
201- let mut schema_builder = SchemaBuilder :: from ( base_schema. fields ( ) ) ;
202- let mut columns = record_batch. columns ( ) . to_vec ( ) ;
204+
205+ let dict_type = DataType :: Dictionary ( Box :: new ( DataType :: Int32 ) , Box :: new ( DataType :: Utf8 ) ) ;
206+ let mut schema_builder = SchemaBuilder :: new ( ) ;
207+ let mut columns = Vec :: with_capacity ( record_batch. num_columns ( ) ) ;
208+
209+ // Dictionary-encoded columns: type, stac_version, collection
210+ for ( i, field) in record_batch. schema ( ) . fields ( ) . iter ( ) . enumerate ( ) {
211+ let should_dictionary_encode = DICTIONARY_COLUMNS . contains ( & field. name ( ) . as_str ( ) )
212+ && field. data_type ( ) == & DataType :: Utf8 ;
213+
214+ if should_dictionary_encode {
215+ let dict_array = arrow_cast:: cast ( record_batch. column ( i) , & dict_type) ?;
216+ columns. push ( dict_array) ;
217+ schema_builder. push ( Field :: new (
218+ field. name ( ) ,
219+ dict_type. clone ( ) ,
220+ field. is_nullable ( ) ,
221+ ) ) ;
222+ } else {
223+ columns. push ( record_batch. column ( i) . clone ( ) ) ;
224+ schema_builder. push ( field. as_ref ( ) . clone ( ) ) ;
225+ }
226+ }
203227 let geometry_array = self . geometry_builder . finish ( ) ;
204228 columns. push ( geometry_array. to_array_ref ( ) ) ;
205229 schema_builder. push ( geometry_array. data_type ( ) . to_field ( "geometry" , true ) ) ;
@@ -447,4 +471,26 @@ mod tests {
447471 let ( encoder, _) = Encoder :: new ( vec ! [ item. clone( ) ] , Default :: default ( ) ) . unwrap ( ) ;
448472 let _ = encoder. encode ( vec ! [ item] ) . unwrap ( ) ;
449473 }
474+
/// Verifies that the columns named in `DICTIONARY_COLUMNS` come out of
/// `encode` with a dictionary data type.
///
/// The check is driven by the encoder's own constant so that the test cannot
/// drift from the list of columns the writer actually converts.
#[test]
fn dictionary_encoded_columns() {
    use arrow_schema::DataType;

    let item: Item = crate::read("examples/simple-item.json").unwrap();
    let (record_batch, _) = super::encode(vec![item]).unwrap();
    let schema = record_batch.schema();

    // Count the columns we actually inspected: without this, a schema that
    // contains none of the target columns would make the test pass vacuously.
    let mut checked = 0;
    for name in super::DICTIONARY_COLUMNS.iter() {
        // A column may legitimately be missing (e.g. an item with no
        // collection), so absence alone is not a failure.
        if let Ok(field) = schema.field_with_name(name) {
            assert!(
                matches!(field.data_type(), DataType::Dictionary(_, _)),
                "'{}' should be dictionary-encoded",
                field.name()
            );
            checked += 1;
        }
    }

    assert!(
        checked > 0,
        "none of the dictionary columns were present in the encoded schema"
    );
}
450496}
0 commit comments