Skip to content

Commit 7a4243e

Browse files
feat: dictionary-encode repeated string fields (#937)
### Description

Dictionary-encode `type`, `stac_version` and `collection` columns in Arrow record batches.

### Related issues

Closes #775

### Checklist

- [x] Unit tests
- [x] Pull request title follows [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/)
- [x] Pre-commit hooks pass (`prek run --all-files`)

Co-authored-by: Pete Gadomski <[email protected]>
1 parent 69611d5 commit 7a4243e

File tree

1 file changed

+48
-2
lines changed
  • crates/core/src/geoarrow

1 file changed

+48
-2
lines changed

crates/core/src/geoarrow/mod.rs

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ pub const DATETIME_COLUMNS: [&str; 8] = [
2828
"unpublished",
2929
];
3030

31+
/// Columns to dictionary-encode (repeated/invariant string values).
32+
const DICTIONARY_COLUMNS: [&str; 3] = ["type", "stac_version", "collection"];
33+
3134
/// Encodes items into a record batch.
3235
pub fn encode(items: Vec<Item>) -> Result<(RecordBatch, SchemaRef)> {
3336
encode_with_options(items, Options::default())
@@ -198,8 +201,29 @@ impl Writer {
198201
let mut decoder = ReaderBuilder::new(base_schema.clone()).build_decoder()?;
199202
decoder.serialize(&self.values)?;
200203
let record_batch = decoder.flush()?.ok_or(Error::NoItems)?;
201-
let mut schema_builder = SchemaBuilder::from(base_schema.fields());
202-
let mut columns = record_batch.columns().to_vec();
204+
205+
let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
206+
let mut schema_builder = SchemaBuilder::new();
207+
let mut columns = Vec::with_capacity(record_batch.num_columns());
208+
209+
// Dictionary-encoded columns: type, stac_version, collection
210+
for (i, field) in record_batch.schema().fields().iter().enumerate() {
211+
let should_dictionary_encode = DICTIONARY_COLUMNS.contains(&field.name().as_str())
212+
&& field.data_type() == &DataType::Utf8;
213+
214+
if should_dictionary_encode {
215+
let dict_array = arrow_cast::cast(record_batch.column(i), &dict_type)?;
216+
columns.push(dict_array);
217+
schema_builder.push(Field::new(
218+
field.name(),
219+
dict_type.clone(),
220+
field.is_nullable(),
221+
));
222+
} else {
223+
columns.push(record_batch.column(i).clone());
224+
schema_builder.push(field.as_ref().clone());
225+
}
226+
}
203227
let geometry_array = self.geometry_builder.finish();
204228
columns.push(geometry_array.to_array_ref());
205229
schema_builder.push(geometry_array.data_type().to_field("geometry", true));
@@ -447,4 +471,26 @@ mod tests {
447471
let (encoder, _) = Encoder::new(vec![item.clone()], Default::default()).unwrap();
448472
let _ = encoder.encode(vec![item]).unwrap();
449473
}
474+
475+
#[test]
476+
fn dictionary_encoded_columns() {
477+
use arrow_schema::DataType;
478+
479+
let item: Item = crate::read("examples/simple-item.json").unwrap();
480+
let (record_batch, _) = super::encode(vec![item]).unwrap();
481+
let schema = record_batch.schema();
482+
483+
for field in schema.fields() {
484+
match field.name().as_str() {
485+
"type" | "stac_version" | "collection" => {
486+
assert!(
487+
matches!(field.data_type(), DataType::Dictionary(_, _)),
488+
"'{}' should be dictionary-encoded",
489+
field.name()
490+
);
491+
}
492+
_ => {}
493+
}
494+
}
495+
}
450496
}

0 commit comments

Comments
 (0)