@@ -178,8 +178,10 @@ impl ParquetMetaDataReader {
178178 ///
179179 /// # Errors
180180 ///
181- /// This function will return [`ParquetError::IndexOutOfBound`] in the event `reader` does not
182- /// provide enough data to fully parse the metadata (see example below).
181+ /// This function will return [`ParquetError::NeedMoreData`] in the event `reader` does not
182+ /// provide enough data to fully parse the metadata (see example below). The returned error
183+ /// will be populated with a `usize` field indicating the number of bytes required from the
184+ /// tail of the file to completely parse the requested metadata.
183185 ///
184186 /// Other errors returned include [`ParquetError::General`] and [`ParquetError::EOF`].
185187 ///
@@ -192,27 +194,58 @@ impl ParquetMetaDataReader {
192194 /// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
193195 /// let file = open_parquet_file("some_path.parquet");
194196 /// let len = file.len() as usize;
195- /// let bytes = get_bytes(&file, 1000..len);
197+ /// // Speculatively read 1 kilobyte from the end of the file
198+ /// let bytes = get_bytes(&file, len - 1024..len);
196199 /// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true);
197200 /// match reader.try_parse_sized(&bytes, len) {
198201 /// Ok(_) => (),
199- /// Err(ParquetError::IndexOutOfBound(needed, _)) => {
202+ /// Err(ParquetError::NeedMoreData(needed)) => {
203+ /// // Read the needed number of bytes from the end of the file
200204 /// let bytes = get_bytes(&file, len - needed..len);
201205 /// reader.try_parse_sized(&bytes, len).unwrap();
202206 /// }
203207 /// _ => panic!("unexpected error")
204208 /// }
205209 /// let metadata = reader.finish().unwrap();
206210 /// ```
211+ ///
212+ /// Note that it is possible for the file metadata to be completely read, but there are
213+ /// insufficient bytes available to read the page indexes. [`Self::has_metadata()`] can be used
214+ /// to test for this. In the event the file metadata is present, re-parsing of the file
215+ /// metadata can be skipped by using [`Self::read_page_indexes_sized()`], as shown below.
216+ /// ```no_run
217+ /// # use parquet::file::metadata::ParquetMetaDataReader;
218+ /// # use parquet::errors::ParquetError;
219+ /// # use crate::parquet::file::reader::Length;
220+ /// # fn get_bytes(file: &std::fs::File, range: std::ops::Range<usize>) -> bytes::Bytes { unimplemented!(); }
221+ /// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); }
222+ /// let file = open_parquet_file("some_path.parquet");
223+ /// let len = file.len() as usize;
224+ /// // Speculatively read 1 kilobyte from the end of the file
225+ /// let mut bytes = get_bytes(&file, len - 1024..len);
226+ /// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true);
227+ /// // Loop until `bytes` is large enough
228+ /// loop {
229+ /// match reader.try_parse_sized(&bytes, len) {
230+ /// Ok(_) => break,
231+ /// Err(ParquetError::NeedMoreData(needed)) => {
232+ /// // Read the needed number of bytes from the end of the file
233+ /// bytes = get_bytes(&file, len - needed..len);
234+ /// // If file metadata was read only read page indexes, otherwise continue loop
235+ /// if reader.has_metadata() {
236+ /// reader.read_page_indexes_sized(&bytes, len);
237+ /// break;
238+ /// }
239+ /// }
240+ /// _ => panic!("unexpected error")
241+ /// }
242+ /// }
243+ /// let metadata = reader.finish().unwrap();
244+ /// ```
207245 pub fn try_parse_sized < R : ChunkReader > ( & mut self , reader : & R , file_size : usize ) -> Result < ( ) > {
208246 self . metadata = match self . parse_metadata ( reader) {
209247 Ok ( metadata) => Some ( metadata) ,
210- // FIXME: throughout this module ParquetError::IndexOutOfBound is used to indicate the
211- // need for more data. This is not it's intended use. The plan is to add a NeedMoreData
212- // value to the enum, but this would be a breaking change. This will be done as
213- // 54.0.0 draws nearer.
214- // https://github.com/apache/arrow-rs/issues/6447
215- Err ( ParquetError :: IndexOutOfBound ( needed, _) ) => {
248+ Err ( ParquetError :: NeedMoreData ( needed) ) => {
216249 // If reader is the same length as `file_size` then presumably there is no more to
217250 // read, so return an EOF error.
218251 if file_size == reader. len ( ) as usize || needed > file_size {
@@ -223,7 +256,7 @@ impl ParquetMetaDataReader {
223256 ) ) ;
224257 } else {
225258 // Ask for a larger buffer
226- return Err ( ParquetError :: IndexOutOfBound ( needed, file_size ) ) ;
259+ return Err ( ParquetError :: NeedMoreData ( needed) ) ;
227260 }
228261 }
229262 Err ( e) => return Err ( e) ,
@@ -246,7 +279,8 @@ impl ParquetMetaDataReader {
246279 /// Read the page index structures when a [`ParquetMetaData`] has already been obtained.
247280 /// This variant is used when `reader` cannot access the entire Parquet file (e.g. it is
248281 /// a [`Bytes`] struct containing the tail of the file).
249- /// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`].
282+ /// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`]. Like
283+ /// [`Self::try_parse_sized()`] this function may return [`ParquetError::NeedMoreData`].
250284 pub fn read_page_indexes_sized < R : ChunkReader > (
251285 & mut self ,
252286 reader : & R ,
@@ -285,10 +319,7 @@ impl ParquetMetaDataReader {
285319 ) ) ;
286320 } else {
287321 // Ask for a larger buffer
288- return Err ( ParquetError :: IndexOutOfBound (
289- file_size - range. start ,
290- file_size,
291- ) ) ;
322+ return Err ( ParquetError :: NeedMoreData ( file_size - range. start ) ) ;
292323 }
293324 }
294325
@@ -484,10 +515,7 @@ impl ParquetMetaDataReader {
484515 // check file is large enough to hold footer
485516 let file_size = chunk_reader. len ( ) ;
486517 if file_size < ( FOOTER_SIZE as u64 ) {
487- return Err ( ParquetError :: IndexOutOfBound (
488- FOOTER_SIZE ,
489- file_size as usize ,
490- ) ) ;
518+ return Err ( ParquetError :: NeedMoreData ( FOOTER_SIZE ) ) ;
491519 }
492520
493521 let mut footer = [ 0_u8 ; 8 ] ;
@@ -500,10 +528,7 @@ impl ParquetMetaDataReader {
500528 self . metadata_size = Some ( footer_metadata_len) ;
501529
502530 if footer_metadata_len > file_size as usize {
503- return Err ( ParquetError :: IndexOutOfBound (
504- footer_metadata_len,
505- file_size as usize ,
506- ) ) ;
531+ return Err ( ParquetError :: NeedMoreData ( footer_metadata_len) ) ;
507532 }
508533
509534 let start = file_size - footer_metadata_len as u64 ;
@@ -682,7 +707,7 @@ mod tests {
682707 let err = ParquetMetaDataReader :: new ( )
683708 . parse_metadata ( & test_file)
684709 . unwrap_err ( ) ;
685- assert ! ( matches!( err, ParquetError :: IndexOutOfBound ( 8 , _ ) ) ) ;
710+ assert ! ( matches!( err, ParquetError :: NeedMoreData ( 8 ) ) ) ;
686711 }
687712
688713 #[ test]
@@ -701,7 +726,7 @@ mod tests {
701726 let err = ParquetMetaDataReader :: new ( )
702727 . parse_metadata ( & test_file)
703728 . unwrap_err ( ) ;
704- assert ! ( matches!( err, ParquetError :: IndexOutOfBound ( 263 , _ ) ) ) ;
729+ assert ! ( matches!( err, ParquetError :: NeedMoreData ( 263 ) ) ) ;
705730 }
706731
707732 #[ test]
@@ -794,7 +819,7 @@ mod tests {
794819 // should fail
795820 match reader. try_parse_sized ( & bytes, len) . unwrap_err ( ) {
796821 // expected error, try again with provided bounds
797- ParquetError :: IndexOutOfBound ( needed, _ ) => {
822+ ParquetError :: NeedMoreData ( needed) => {
798823 let bytes = bytes_for_range ( len - needed..len) ;
799824 reader. try_parse_sized ( & bytes, len) . unwrap ( ) ;
800825 let metadata = reader. finish ( ) . unwrap ( ) ;
@@ -804,6 +829,26 @@ mod tests {
804829 _ => panic ! ( "unexpected error" ) ,
805830 } ;
806831
832+ // not enough for file metadata, but keep trying until page indexes are read
833+ let mut reader = ParquetMetaDataReader :: new ( ) . with_page_indexes ( true ) ;
834+ let mut bytes = bytes_for_range ( 452505 ..len) ;
835+ loop {
836+ match reader. try_parse_sized ( & bytes, len) {
837+ Ok ( _) => break ,
838+ Err ( ParquetError :: NeedMoreData ( needed) ) => {
839+ bytes = bytes_for_range ( len - needed..len) ;
840+ if reader. has_metadata ( ) {
841+ reader. read_page_indexes_sized ( & bytes, len) . unwrap ( ) ;
842+ break ;
843+ }
844+ }
845+ _ => panic ! ( "unexpected error" ) ,
846+ }
847+ }
848+ let metadata = reader. finish ( ) . unwrap ( ) ;
849+ assert ! ( metadata. column_index. is_some( ) ) ;
850+ assert ! ( metadata. offset_index. is_some( ) ) ;
851+
807852 // not enough for page index but lie about file size
808853 let bytes = bytes_for_range ( 323584 ..len) ;
809854 let reader_result = reader. try_parse_sized ( & bytes, len - 323584 ) . unwrap_err ( ) ;
@@ -818,7 +863,7 @@ mod tests {
818863 // should fail
819864 match reader. try_parse_sized ( & bytes, len) . unwrap_err ( ) {
820865 // expected error, try again with provided bounds
821- ParquetError :: IndexOutOfBound ( needed, _ ) => {
866+ ParquetError :: NeedMoreData ( needed) => {
822867 let bytes = bytes_for_range ( len - needed..len) ;
823868 reader. try_parse_sized ( & bytes, len) . unwrap ( ) ;
824869 reader. finish ( ) . unwrap ( ) ;
0 commit comments