 
 package com.samsung.sds.brightics.common.data.parquet.reader.util;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-import java.util.stream.Stream;
-
+import com.samsung.sds.brightics.common.data.parquet.reader.BrighticsParquetReadSupport;
+import com.samsung.sds.brightics.common.data.parquet.reader.DefaultRecord;
+import com.samsung.sds.brightics.common.data.parquet.reader.info.FileIndex;
+import com.samsung.sds.brightics.common.data.parquet.reader.info.ParquetInformation;
+import com.samsung.sds.brightics.common.data.view.Column;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.FileMetaData;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
-import org.apache.parquet.schema.DecimalMetadata;
-import org.apache.parquet.schema.GroupType;
-import org.apache.parquet.schema.MessageType;
-import org.apache.parquet.schema.OriginalType;
-import org.apache.parquet.schema.Type;
+import org.apache.parquet.schema.*;
 import org.apache.parquet.schema.Type.Repetition;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-import com.samsung.sds.brightics.common.core.exception.BrighticsCoreException;
-import com.samsung.sds.brightics.common.data.parquet.reader.BrighticsParquetReadSupport;
-import com.samsung.sds.brightics.common.data.parquet.reader.DefaultRecord;
-import com.samsung.sds.brightics.common.data.parquet.reader.info.FileIndex;
-import com.samsung.sds.brightics.common.data.parquet.reader.info.ParquetInformation;
-import com.samsung.sds.brightics.common.data.view.Column;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
 
 public class BrighticsParquetUtils {
 
-    // private static final Logger LOGGER = LoggerFactory.getLogger("ParquetClient");
+    private static final Logger LOGGER = LoggerFactory.getLogger("ParquetClient");
 
     public static ParquetInformation getParquetInformation(Path path, Configuration conf, int[] filteredColumnIndex) throws IOException {
         FileStatus directory = FileSystem.get(conf).getFileStatus(path);
@@ -68,7 +64,8 @@ public static ParquetInformation getParquetInformation(Path path, Configuration
 
         // set schema
         FileMetaData fileMetaData = footers.get(0).getParquetMetadata().getFileMetaData();
-        List<Type> filteredColumns = getFilteredColumns(fileMetaData.getSchema(), filteredColumnIndex);
+        int[] validatedColumnIndexArray = Arrays.stream(filteredColumnIndex).filter(i -> fileMetaData.getSchema().getColumns().size() > i).toArray();
+        List<Type> filteredColumns = getFilteredColumns(fileMetaData.getSchema(), validatedColumnIndexArray);
         Column[] schema = filteredColumns.stream().map(c -> new Column(c.getName(), convertTypeName(c))).toArray(Column[]::new);
 
         //set count, buffer size
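For reference, a minimal runnable sketch of the index validation added in this hunk; the column count of 3 is a made-up stand-in for fileMetaData.getSchema().getColumns().size():

import java.util.Arrays;

// Indices that point past the last physical column are dropped before the
// schema is filtered, mirroring the validatedColumnIndexArray line above.
public class ValidateIndexSketch {
    public static void main(String[] args) {
        int columnCount = 3;                      // assumed column count
        int[] filteredColumnIndex = {0, 2, 7};    // 7 has no matching column

        int[] validatedColumnIndexArray = Arrays.stream(filteredColumnIndex)
                .filter(i -> columnCount > i)
                .toArray();

        System.out.println(Arrays.toString(validatedColumnIndexArray)); // [0, 2]
    }
}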
@@ -86,7 +83,7 @@ public static ParquetInformation getParquetInformation(Path path, Configuration
             buf.add(new FileIndex(footer.getFile().toString(), previousTotalCount, previousTotalCount + currentCount));
             previousTotalCount += currentCount;
         }
-        return new ParquetInformation(schema, previousTotalCount, totalBytes, buf);
+        return new ParquetInformation(schema, previousTotalCount, totalBytes, buf, validatedColumnIndexArray);
     }
 
     public static ParquetInformation getParquetInformation(Path path, Configuration conf) throws IOException {
@@ -194,7 +191,8 @@ public static List<Type> getFilteredColumns(MessageType schema, int[] filteredCo
         List<Type> copyFields = new ArrayList<Type>();
         //if filteredColumns is null or 0. pass filtering
         if (schema.getColumns().size() < filteredColumns.length) {
-            throw new BrighticsCoreException("3102", "The column size used in the query is larger than the number of existing data columns.");
+            LOGGER.warn("The column size used in the query is larger than the number of existing data columns.");
+            // throw new BrighticsCoreException("3102", "The column size used in the query is larger than the number of existing data columns.");
         }
 
         if (filteredColumns != null && filteredColumns.length > 0) {
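A small sketch of the relaxed guard, using parquet-mr's MessageTypeParser and a hypothetical two-column schema; an oversized index array is now only reported (here via System.out in place of the SLF4J logger) rather than aborting with BrighticsCoreException("3102"), since out-of-range indices are already dropped by the upstream validation shown earlier:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class OversizedFilterSketch {
    public static void main(String[] args) {
        // Hypothetical two-column schema, for illustration only.
        MessageType schema = MessageTypeParser.parseMessageType(
                "message example { required int32 id; required binary name (UTF8); }");

        int[] requested = {0, 1, 5};   // more entries than the schema has columns

        if (schema.getColumns().size() < requested.length) {
            // New behavior: warn and continue instead of throwing.
            System.out.println("WARN: The column size used in the query is larger "
                    + "than the number of existing data columns.");
        }
    }
}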
@@ -220,7 +218,7 @@ public static int[] combineFilteredColumnIndexArray(int start, int end, int[] se
             Stream<Integer> range = IntStream.range(start, end + 1).boxed();
             return Stream.concat(range, selected).distinct().sorted().mapToInt(i -> i).toArray();
         } else {
-            return selected.mapToInt(i -> i).toArray();
+            return selected.distinct().sorted().mapToInt(i -> i).toArray();
         }
     }
 
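A minimal sketch of the effect of the change in the else branch, with a hypothetical selected stream; duplicates are now removed and the indices come back in ascending order, matching the range branch above:

import java.util.Arrays;
import java.util.stream.Stream;

public class CombineIndexSketch {
    public static void main(String[] args) {
        // Hypothetical selected column indices, possibly unsorted and duplicated.
        Stream<Integer> selected = Stream.of(4, 1, 4, 2);

        // Previously this returned [4, 1, 4, 2]; it now deduplicates and sorts.
        int[] combined = selected.distinct().sorted().mapToInt(i -> i).toArray();

        System.out.println(Arrays.toString(combined)); // [1, 2, 4]
    }
}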