18
18
19
19
package com .dtstack .chunjun .connector .s3 .source ;
20
20
21
+ import com .dtstack .chunjun .config .FieldConfig ;
21
22
import com .dtstack .chunjun .config .RestoreConfig ;
22
23
import com .dtstack .chunjun .connector .s3 .config .S3Config ;
23
24
import com .dtstack .chunjun .connector .s3 .enums .CompressType ;
24
25
import com .dtstack .chunjun .connector .s3 .util .ReaderUtil ;
25
26
import com .dtstack .chunjun .connector .s3 .util .S3SimpleObject ;
26
27
import com .dtstack .chunjun .connector .s3 .util .S3Util ;
28
+ import com .dtstack .chunjun .format .excel .common .ExcelData ;
29
+ import com .dtstack .chunjun .format .excel .source .ExcelInputFormat ;
30
+ import com .dtstack .chunjun .format .tika .common .TikaData ;
31
+ import com .dtstack .chunjun .format .tika .source .TikaInputFormat ;
27
32
import com .dtstack .chunjun .restore .FormatState ;
28
33
import com .dtstack .chunjun .source .format .BaseRichInputFormat ;
29
34
import com .dtstack .chunjun .throwable .ChunJunRuntimeException ;
38
43
import com .amazonaws .services .s3 .model .S3Object ;
39
44
import com .amazonaws .services .s3 .model .S3ObjectInputStream ;
40
45
import lombok .extern .slf4j .Slf4j ;
46
+ import org .apache .commons .collections .CollectionUtils ;
47
+ import org .apache .commons .io .FilenameUtils ;
41
48
import org .apache .commons .lang3 .StringUtils ;
42
49
43
50
import java .io .IOException ;
@@ -71,6 +78,12 @@ public class S3InputFormat extends BaseRichInputFormat {
71
78
72
79
private RestoreConfig restoreConf ;
73
80
81
+ private transient TikaData tikaData ;
82
+ private TikaInputFormat tikaInputFormat ;
83
+
84
+ private transient ExcelData excelData ;
85
+ private ExcelInputFormat excelInputFormat ;
86
+
74
87
@ Override
75
88
public void openInputFormat () throws IOException {
76
89
super .openInputFormat ();
@@ -137,7 +150,31 @@ protected InputSplit[] createInputSplitsInternal(int minNumSplits) {
137
150
protected RowData nextRecordInternal (RowData rowData ) throws ReadRecordException {
138
151
String [] fields ;
139
152
try {
140
- fields = readerUtil .getValues ();
153
+ if (s3Config .getTikaReadConfig ().isUseExtract () && tikaData != null ) {
154
+ fields = tikaData .getData ();
155
+ } else if (s3Config .getExcelFormatConfig ().isUseExcelFormat () && excelData != null ) {
156
+ fields = excelData .getData ();
157
+ } else {
158
+ fields = readerUtil .getValues ();
159
+ }
160
+ // Remap the values when the fields are configured with explicit column indexes
161
+ if (s3Config .getExcelFormatConfig ().getColumnIndex () != null ) {
162
+ List <FieldConfig > columns = s3Config .getColumn ();
163
+ String [] fieldsData = new String [columns .size ()];
164
+ for (int i = 0 ; i < CollectionUtils .size (columns ); i ++) {
165
+ FieldConfig fieldConfig = columns .get (i );
166
+ if (fieldConfig .getIndex () >= fields .length ) {
167
+ String errorMessage =
168
+ String .format (
169
+ "The column index is greater than the data size."
170
+ + " The current column index is [%s], but the data size is [%s]. Data loss may occur." ,
171
+ fieldConfig .getIndex (), fields .length );
172
+ throw new IllegalArgumentException (errorMessage );
173
+ }
174
+ fieldsData [i ] = fields [fieldConfig .getIndex ()];
175
+ }
176
+ fields = fieldsData ;
177
+ }
141
178
rowData = rowConverter .toInternal (fields );
142
179
} catch (IOException e ) {
143
180
throw new ChunJunRuntimeException (e );
@@ -164,9 +201,82 @@ protected void closeInternal() {
164
201
165
202
@ Override
166
203
public boolean reachedEnd () throws IOException {
204
+ if (s3Config .getTikaReadConfig ().isUseExtract ()) {
205
+ tikaData = getTikaData ();
206
+ return tikaData == null || tikaData .getData () == null ;
207
+ } else if (s3Config .getExcelFormatConfig ().isUseExcelFormat ()) {
208
+ excelData = getExcelData ();
209
+ return excelData == null || excelData .getData () == null ;
210
+ }
167
211
return reachedEndWithoutCheckState ();
168
212
}
169
213
214
+ public ExcelData getExcelData () {
215
+ if (excelInputFormat == null ) {
216
+ nextExcelDataStream ();
217
+ }
218
+ if (excelInputFormat != null ) {
219
+ if (!excelInputFormat .hasNext ()) {
220
+ excelInputFormat .close ();
221
+ excelInputFormat = null ;
222
+ return getExcelData ();
223
+ }
224
+ String [] record = excelInputFormat .nextRecord ();
225
+ return new ExcelData (record );
226
+ } else {
227
+ return null ;
228
+ }
229
+ }
230
+
231
+ private void nextExcelDataStream () {
232
+ if (splits .hasNext ()) {
233
+ currentObject = splits .next ();
234
+ GetObjectRequest rangeObjectRequest =
235
+ new GetObjectRequest (s3Config .getBucket (), currentObject );
236
+ log .info ("Current read file {}" , currentObject );
237
+ S3Object o = amazonS3 .getObject (rangeObjectRequest );
238
+ S3ObjectInputStream s3is = o .getObjectContent ();
239
+ excelInputFormat = new ExcelInputFormat ();
240
+ excelInputFormat .open (s3is , s3Config .getExcelFormatConfig ());
241
+ } else {
242
+ excelInputFormat = null ;
243
+ }
244
+ }
245
+
246
+ public TikaData getTikaData () {
247
+ if (tikaInputFormat == null ) {
248
+ nextTikaDataStream ();
249
+ }
250
+ if (tikaInputFormat != null ) {
251
+ if (!tikaInputFormat .hasNext ()) {
252
+ tikaInputFormat .close ();
253
+ tikaInputFormat = null ;
254
+ return getTikaData ();
255
+ }
256
+ String [] record = tikaInputFormat .nextRecord ();
257
+ return new TikaData (record );
258
+ } else {
259
+ return null ;
260
+ }
261
+ }
262
+
263
+ private void nextTikaDataStream () {
264
+ if (splits .hasNext ()) {
265
+ currentObject = splits .next ();
266
+ GetObjectRequest rangeObjectRequest =
267
+ new GetObjectRequest (s3Config .getBucket (), currentObject );
268
+ log .info ("Current read file {}" , currentObject );
269
+ S3Object o = amazonS3 .getObject (rangeObjectRequest );
270
+ S3ObjectInputStream s3is = o .getObjectContent ();
271
+ tikaInputFormat =
272
+ new TikaInputFormat (
273
+ s3Config .getTikaReadConfig (), s3Config .getFieldNameList ().size ());
274
+ tikaInputFormat .open (s3is , FilenameUtils .getName (currentObject ));
275
+ } else {
276
+ tikaInputFormat = null ;
277
+ }
278
+ }
279
+
170
280
public boolean reachedEndWithoutCheckState () throws IOException {
171
281
// br is empty, indicating that a new file needs to be read
172
282
if (readerUtil == null ) {
@@ -259,7 +369,11 @@ public List<S3SimpleObject> resolveObjects() {
259
369
if (s3Config .isUseV2 ()) {
260
370
subObjects =
261
371
S3Util .listObjectsKeyByPrefix (
262
- amazonS3 , bucket , prefix , s3Config .getFetchSize ());
372
+ amazonS3 ,
373
+ bucket ,
374
+ prefix ,
375
+ s3Config .getFetchSize (),
376
+ s3Config .getObjectsRegex ());
263
377
} else {
264
378
subObjects =
265
379
S3Util .listObjectsByv1 (
0 commit comments