Skip to content

Commit 1da70e9

Browse files
committed
fix: support reading compressed metadata
The spec mentions this naming convention here: https://iceberg.apache.org/spec/#naming-for-gzip-compressed-metadata-json-files
1 parent d3d3127 commit 1da70e9

File tree

3 files changed

+43
-3
lines changed

3 files changed

+43
-3
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/iceberg/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ bytes = { workspace = true }
6363
chrono = { workspace = true }
6464
derive_builder = { workspace = true }
6565
expect-test = { workspace = true }
66+
flate2 = "1.1.5"
6667
fnv = { workspace = true }
6768
futures = { workspace = true }
6869
itertools = { workspace = true }

crates/iceberg/src/spec/table_metadata.rs

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,12 @@ use std::cmp::Ordering;
2222
use std::collections::HashMap;
2323
use std::fmt::{Display, Formatter};
2424
use std::hash::Hash;
25+
use std::io::Read as _;
2526
use std::sync::Arc;
2627

2728
use _serde::TableMetadataEnum;
2829
use chrono::{DateTime, Utc};
30+
use flate2::read::GzDecoder;
2931
use serde::{Deserialize, Serialize};
3032
use serde_repr::{Deserialize_repr, Serialize_repr};
3133
use uuid::Uuid;
@@ -413,9 +415,20 @@ impl TableMetadata {
413415
file_io: &FileIO,
414416
metadata_location: impl AsRef<str>,
415417
) -> Result<TableMetadata> {
416-
let input_file = file_io.new_input(metadata_location)?;
418+
let input_file = file_io.new_input(metadata_location.as_ref())?;
417419
let metadata_content = input_file.read().await?;
418-
let metadata = serde_json::from_slice::<TableMetadata>(&metadata_content)?;
420+
421+
let metadata = if metadata_location.as_ref().ends_with(".gz.metadata.json") {
422+
let mut decoder = GzDecoder::new(metadata_content.as_ref());
423+
let mut decompressed_data = Vec::new();
424+
decoder
425+
.read_to_end(&mut decompressed_data)
426+
.map_err(|e| Error::new(ErrorKind::DataInvalid, e.to_string()))?;
427+
serde_json::from_slice(&decompressed_data)?
428+
} else {
429+
serde_json::from_slice(&metadata_content)?
430+
};
431+
419432
Ok(metadata)
420433
}
421434

@@ -1318,6 +1331,7 @@ impl SnapshotLog {
13181331
mod tests {
13191332
use std::collections::HashMap;
13201333
use std::fs;
1334+
use std::io::Write as _;
13211335
use std::sync::Arc;
13221336

13231337
use anyhow::Result;
@@ -3033,7 +3047,7 @@ mod tests {
30333047
let original_metadata: TableMetadata = get_test_table_metadata("TableMetadataV2Valid.json");
30343048

30353049
// Define the metadata location
3036-
let metadata_location = format!("{}/metadata.json", temp_path);
3050+
let metadata_location = format!("{temp_path}/metadata.json");
30373051

30383052
// Write the metadata
30393053
original_metadata
@@ -3053,6 +3067,30 @@ mod tests {
30533067
assert_eq!(read_metadata, original_metadata);
30543068
}
30553069

3070+
#[tokio::test]
3071+
async fn test_table_metadata_read_compressed() {
3072+
let temp_dir = TempDir::new().unwrap();
3073+
let metadata_location = temp_dir.path().join("v1.gz.metadata.json");
3074+
3075+
let original_metadata: TableMetadata = get_test_table_metadata("TableMetadataV2Valid.json");
3076+
let json = serde_json::to_string(&original_metadata).unwrap();
3077+
3078+
let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::default());
3079+
encoder.write_all(json.as_bytes()).unwrap();
3080+
std::fs::write(&metadata_location, encoder.finish().unwrap())
3081+
.expect("failed to write metadata");
3082+
3083+
// Read the metadata back
3084+
let file_io = FileIOBuilder::new_fs_io().build().unwrap();
3085+
let metadata_location = metadata_location.to_str().unwrap();
3086+
let read_metadata = TableMetadata::read_from(&file_io, metadata_location)
3087+
.await
3088+
.unwrap();
3089+
3090+
// Verify the metadata matches
3091+
assert_eq!(read_metadata, original_metadata);
3092+
}
3093+
30563094
#[tokio::test]
30573095
async fn test_table_metadata_read_nonexistent_file() {
30583096
// Create a FileIO instance

0 commit comments

Comments
 (0)