Skip to content

Commit 4574bb4

Browse files
authored
Add ParquetError::NeedMoreData mark ParquetError as non_exhaustive (#6630)
* add ParquetError::NeedMoreData * make ParquetError non_exhaustive * attempt to better describe use of NeedMoreData * add test of multiple retries
1 parent def94a8 commit 4574bb4

File tree

3 files changed

+79
-29
lines changed

3 files changed

+79
-29
lines changed

parquet/src/arrow/async_reader/metadata.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ impl<F: MetadataFetch> MetadataLoader<F> {
119119
return Err(ParquetError::EOF(format!(
120120
"file size of {} is less than footer + metadata {}",
121121
file_size,
122-
length + 8
122+
length + FOOTER_SIZE
123123
)));
124124
}
125125

parquet/src/errors.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use arrow_schema::ArrowError;
2727
// Note: we don't implement PartialEq as the semantics for the
2828
// external variant are not well defined (#4469)
2929
#[derive(Debug)]
30+
#[non_exhaustive]
3031
pub enum ParquetError {
3132
/// General Parquet error.
3233
/// Returned when code violates normal workflow of working with Parquet files.
@@ -47,6 +48,9 @@ pub enum ParquetError {
4748
IndexOutOfBound(usize, usize),
4849
/// An external error variant
4950
External(Box<dyn Error + Send + Sync>),
51+
/// Returned when a function needs more data to complete properly. The `usize` field indicates
52+
/// the total number of bytes required, not the number of additional bytes.
53+
NeedMoreData(usize),
5054
}
5155

5256
impl std::fmt::Display for ParquetError {
@@ -63,6 +67,7 @@ impl std::fmt::Display for ParquetError {
6367
write!(fmt, "Index {index} out of bound: {bound}")
6468
}
6569
ParquetError::External(e) => write!(fmt, "External: {e}"),
70+
ParquetError::NeedMoreData(needed) => write!(fmt, "NeedMoreData: {needed}"),
6671
}
6772
}
6873
}

parquet/src/file/metadata/reader.rs

Lines changed: 73 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,10 @@ impl ParquetMetaDataReader {
178178
///
179179
/// # Errors
180180
///
181-
/// This function will return [`ParquetError::IndexOutOfBound`] in the event `reader` does not
182-
/// provide enough data to fully parse the metadata (see example below).
181+
/// This function will return [`ParquetError::NeedMoreData`] in the event `reader` does not
182+
/// provide enough data to fully parse the metadata (see example below). The returned error
183+
/// will be populated with a `usize` field indicating the number of bytes required from the
184+
/// tail of the file to completely parse the requested metadata.
183185
///
184186
/// Other errors returned include [`ParquetError::General`] and [`ParquetError::EOF`].
185187
///
@@ -192,27 +194,58 @@ impl ParquetMetaDataReader {
192194
/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
193195
/// let file = open_parquet_file("some_path.parquet");
194196
/// let len = file.len() as usize;
195-
/// let bytes = get_bytes(&file, 1000..len);
197+
/// // Speculatively read 1 kilobyte from the end of the file
198+
/// let bytes = get_bytes(&file, len - 1024..len);
196199
/// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true);
197200
/// match reader.try_parse_sized(&bytes, len) {
198201
/// Ok(_) => (),
199-
/// Err(ParquetError::IndexOutOfBound(needed, _)) => {
202+
/// Err(ParquetError::NeedMoreData(needed)) => {
203+
/// // Read the needed number of bytes from the end of the file
200204
/// let bytes = get_bytes(&file, len - needed..len);
201205
/// reader.try_parse_sized(&bytes, len).unwrap();
202206
/// }
203207
/// _ => panic!("unexpected error")
204208
/// }
205209
/// let metadata = reader.finish().unwrap();
206210
/// ```
211+
///
212+
/// Note that it is possible for the file metadata to be completely read, but there are
213+
/// insufficient bytes available to read the page indexes. [`Self::has_metadata()`] can be used
214+
/// to test for this. In the event the file metadata is present, re-parsing of the file
215+
/// metadata can be skipped by using [`Self::read_page_indexes_sized()`], as shown below.
216+
/// ```no_run
217+
/// # use parquet::file::metadata::ParquetMetaDataReader;
218+
/// # use parquet::errors::ParquetError;
219+
/// # use crate::parquet::file::reader::Length;
220+
/// # fn get_bytes(file: &std::fs::File, range: std::ops::Range<usize>) -> bytes::Bytes { unimplemented!(); }
221+
/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
222+
/// let file = open_parquet_file("some_path.parquet");
223+
/// let len = file.len() as usize;
224+
/// // Speculatively read 1 kilobyte from the end of the file
225+
/// let mut bytes = get_bytes(&file, len - 1024..len);
226+
/// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true);
227+
/// // Loop until `bytes` is large enough
228+
/// loop {
229+
/// match reader.try_parse_sized(&bytes, len) {
230+
/// Ok(_) => break,
231+
/// Err(ParquetError::NeedMoreData(needed)) => {
232+
/// // Read the needed number of bytes from the end of the file
233+
/// bytes = get_bytes(&file, len - needed..len);
234+
/// // If file metadata was read only read page indexes, otherwise continue loop
235+
/// if reader.has_metadata() {
236+
/// reader.read_page_indexes_sized(&bytes, len);
237+
/// break;
238+
/// }
239+
/// }
240+
/// _ => panic!("unexpected error")
241+
/// }
242+
/// }
243+
/// let metadata = reader.finish().unwrap();
244+
/// ```
207245
pub fn try_parse_sized<R: ChunkReader>(&mut self, reader: &R, file_size: usize) -> Result<()> {
208246
self.metadata = match self.parse_metadata(reader) {
209247
Ok(metadata) => Some(metadata),
210-
// FIXME: throughout this module ParquetError::IndexOutOfBound is used to indicate the
211-
// need for more data. This is not it's intended use. The plan is to add a NeedMoreData
212-
// value to the enum, but this would be a breaking change. This will be done as
213-
// 54.0.0 draws nearer.
214-
// https://github.com/apache/arrow-rs/issues/6447
215-
Err(ParquetError::IndexOutOfBound(needed, _)) => {
248+
Err(ParquetError::NeedMoreData(needed)) => {
216249
// If reader is the same length as `file_size` then presumably there is no more to
217250
// read, so return an EOF error.
218251
if file_size == reader.len() as usize || needed > file_size {
@@ -223,7 +256,7 @@ impl ParquetMetaDataReader {
223256
));
224257
} else {
225258
// Ask for a larger buffer
226-
return Err(ParquetError::IndexOutOfBound(needed, file_size));
259+
return Err(ParquetError::NeedMoreData(needed));
227260
}
228261
}
229262
Err(e) => return Err(e),
@@ -246,7 +279,8 @@ impl ParquetMetaDataReader {
246279
/// Read the page index structures when a [`ParquetMetaData`] has already been obtained.
247280
/// This variant is used when `reader` cannot access the entire Parquet file (e.g. it is
248281
/// a [`Bytes`] struct containing the tail of the file).
249-
/// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`].
282+
/// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`]. Like
283+
/// [`Self::try_parse_sized()`] this function may return [`ParquetError::NeedMoreData`].
250284
pub fn read_page_indexes_sized<R: ChunkReader>(
251285
&mut self,
252286
reader: &R,
@@ -285,10 +319,7 @@ impl ParquetMetaDataReader {
285319
));
286320
} else {
287321
// Ask for a larger buffer
288-
return Err(ParquetError::IndexOutOfBound(
289-
file_size - range.start,
290-
file_size,
291-
));
322+
return Err(ParquetError::NeedMoreData(file_size - range.start));
292323
}
293324
}
294325

@@ -484,10 +515,7 @@ impl ParquetMetaDataReader {
484515
// check file is large enough to hold footer
485516
let file_size = chunk_reader.len();
486517
if file_size < (FOOTER_SIZE as u64) {
487-
return Err(ParquetError::IndexOutOfBound(
488-
FOOTER_SIZE,
489-
file_size as usize,
490-
));
518+
return Err(ParquetError::NeedMoreData(FOOTER_SIZE));
491519
}
492520

493521
let mut footer = [0_u8; 8];
@@ -500,10 +528,7 @@ impl ParquetMetaDataReader {
500528
self.metadata_size = Some(footer_metadata_len);
501529

502530
if footer_metadata_len > file_size as usize {
503-
return Err(ParquetError::IndexOutOfBound(
504-
footer_metadata_len,
505-
file_size as usize,
506-
));
531+
return Err(ParquetError::NeedMoreData(footer_metadata_len));
507532
}
508533

509534
let start = file_size - footer_metadata_len as u64;
@@ -682,7 +707,7 @@ mod tests {
682707
let err = ParquetMetaDataReader::new()
683708
.parse_metadata(&test_file)
684709
.unwrap_err();
685-
assert!(matches!(err, ParquetError::IndexOutOfBound(8, _)));
710+
assert!(matches!(err, ParquetError::NeedMoreData(8)));
686711
}
687712

688713
#[test]
@@ -701,7 +726,7 @@ mod tests {
701726
let err = ParquetMetaDataReader::new()
702727
.parse_metadata(&test_file)
703728
.unwrap_err();
704-
assert!(matches!(err, ParquetError::IndexOutOfBound(263, _)));
729+
assert!(matches!(err, ParquetError::NeedMoreData(263)));
705730
}
706731

707732
#[test]
@@ -794,7 +819,7 @@ mod tests {
794819
// should fail
795820
match reader.try_parse_sized(&bytes, len).unwrap_err() {
796821
// expected error, try again with provided bounds
797-
ParquetError::IndexOutOfBound(needed, _) => {
822+
ParquetError::NeedMoreData(needed) => {
798823
let bytes = bytes_for_range(len - needed..len);
799824
reader.try_parse_sized(&bytes, len).unwrap();
800825
let metadata = reader.finish().unwrap();
@@ -804,6 +829,26 @@ mod tests {
804829
_ => panic!("unexpected error"),
805830
};
806831

832+
// not enough for file metadata, but keep trying until page indexes are read
833+
let mut reader = ParquetMetaDataReader::new().with_page_indexes(true);
834+
let mut bytes = bytes_for_range(452505..len);
835+
loop {
836+
match reader.try_parse_sized(&bytes, len) {
837+
Ok(_) => break,
838+
Err(ParquetError::NeedMoreData(needed)) => {
839+
bytes = bytes_for_range(len - needed..len);
840+
if reader.has_metadata() {
841+
reader.read_page_indexes_sized(&bytes, len).unwrap();
842+
break;
843+
}
844+
}
845+
_ => panic!("unexpected error"),
846+
}
847+
}
848+
let metadata = reader.finish().unwrap();
849+
assert!(metadata.column_index.is_some());
850+
assert!(metadata.offset_index.is_some());
851+
807852
// not enough for page index but lie about file size
808853
let bytes = bytes_for_range(323584..len);
809854
let reader_result = reader.try_parse_sized(&bytes, len - 323584).unwrap_err();
@@ -818,7 +863,7 @@ mod tests {
818863
// should fail
819864
match reader.try_parse_sized(&bytes, len).unwrap_err() {
820865
// expected error, try again with provided bounds
821-
ParquetError::IndexOutOfBound(needed, _) => {
866+
ParquetError::NeedMoreData(needed) => {
822867
let bytes = bytes_for_range(len - needed..len);
823868
reader.try_parse_sized(&bytes, len).unwrap();
824869
reader.finish().unwrap();

0 commit comments

Comments
 (0)