@@ -19,10 +19,12 @@
 import com.google.auth.oauth2.GoogleCredentials;
 import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.BlobInfo;
 import com.google.cloud.storage.Bucket;
 import com.google.cloud.storage.BucketInfo;
 import com.google.cloud.storage.Storage;
 import com.google.cloud.storage.StorageOptions;
 import com.google.common.base.Preconditions;
+import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.io.Files;
 import com.google.gson.Gson;
@@ -77,6 +79,7 @@
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
+import javax.annotation.Nullable;
 
 /**
  * Tests reading from GCS (Google Cloud Storage) and writing to GCS from within a Dataproc cluster.
@@ -90,6 +93,7 @@ public class GCSTest extends DataprocETLTestBase {
   private static final String GCS_MOVE_PLUGIN_NAME = "GCSMove";
   private static final String GCS_COPY_PLUGIN_NAME = "GCSCopy";
   private static final String SINK_PLUGIN_NAME = "GCS";
+  private static final String MULTI_SINK_PLUGIN_NAME = "GCSMultiFiles";
   private static final String SOURCE_PLUGIN_NAME = "GCSFile";
   private static final Schema ALL_DT_SCHEMA = Schema.recordOf(
     "record",
@@ -118,6 +122,8 @@ public class GCSTest extends DataprocETLTestBase {
 
   private static Storage storage;
   private List<String> markedForDeleteBuckets;
+  private static final String CSV_CONTENT_TYPE = "text/csv";
+  private static final String MULTISINK_RUNTIME_ARG = "multisink.%s";
 
   @BeforeClass
   public static void testClassSetup() throws IOException {
@@ -761,7 +767,8 @@ public void testGcsSourceFormats() throws Exception {
     id,first,last,email,address,city,state,zip
     1,Marilyn,Hawkins,[email protected],238 Melvin Way,Palo Alto,CA,94302
      */
-    ETLStage sink = new ETLStage("sink", createSinkPlugin("csv", createPath(bucket, "output"), schema));
+    ETLStage sink = new ETLStage("sink", createSinkPlugin("csv", createPath(bucket, "output"),
+                                                          schema, CSV_CONTENT_TYPE));
     pipelineConfig = ETLBatchConfig.builder().addStage(sink);
     for (String format : formats) {
       String path = String.format("%s/%s", createPath(bucket, OUTPUT_BLOB_NAME), format);
@@ -776,6 +783,7 @@ public void testGcsSourceFormats() throws Exception {
 
     Map<String, Integer> lineCounts = new HashMap<>();
     List<String> results = getResultBlobsContent(bucket, "output");
+    List<String> resultBlobsContentType = getResultBlobsContentType(bucket, "output");
     for (String result : results) {
       for (String line : result.split("\n")) {
         lineCounts.putIfAbsent(line, 0);
@@ -787,6 +795,72 @@ public void testGcsSourceFormats() throws Exception {
     expected.put(line2, formats.size());
     expected.put(line3, formats.size());
     Assert.assertEquals(expected, lineCounts);
+    Assert.assertEquals(CSV_CONTENT_TYPE, resultBlobsContentType.get(0));
+  }
+
+  @Test
+  public void testMultiSinkContentType() throws Exception {
+    String bucketName = "cask-gcs-multisink-" + UUID.randomUUID().toString();
+    Bucket bucket = createBucket(bucketName);
+
+    Schema schema = Schema.recordOf("customer",
+                                    Schema.Field.of("id", Schema.of(Schema.Type.INT)),
+                                    Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
+                                    Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
+                                    Schema.Field.of("departament", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
+
+    Schema outputSchema = Schema.recordOf("output.schema",
+                                          Schema.Field.of("id", Schema.of(Schema.Type.INT)),
+                                          Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
+                                          Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
+
+    String line1 = "1,Marilyn Hawkins,[email protected],DepartmentA";
+    String line2 = "2,Terry Perez,[email protected],DepartmentB";
+    String line3 = "3,Jack Ferguson,[email protected],DepartmentA";
+    String inputPath = "input";
+
+    bucket.create(inputPath, String.join("\n", Arrays.asList(line1, line2, line3)).getBytes(StandardCharsets.UTF_8));
+
+    Map<String, String> inputSourceConfig = new HashMap<>();
+    inputSourceConfig.put("schema", schema.toString());
+    inputSourceConfig.put("format", "${sourceFormat}");
+    inputSourceConfig.put("referenceName", "source_" + UUID.randomUUID().toString());
+    inputSourceConfig.put("project", getProjectId());
+    inputSourceConfig.put("path", createPath(bucket, inputPath));
+    ETLStage source = new ETLStage("source",
+                                   new ETLPlugin(SOURCE_PLUGIN_NAME,
+                                                 BatchSource.PLUGIN_TYPE,
+                                                 inputSourceConfig,
+                                                 GOOGLE_CLOUD_ARTIFACT));
+
+    ETLBatchConfig.Builder pipelineConfig = ETLBatchConfig.builder().addStage(source);
+
+    String path = createPath(bucket, OUTPUT_BLOB_NAME);
+    ETLStage sink = new ETLStage("multisink", createMultiSinkPlugin("csv"));
+    pipelineConfig.addStage(sink).addConnection(source.getName(), sink.getName());
+
+    AppRequest<ETLBatchConfig> appRequest = getBatchAppRequestV2(pipelineConfig.build());
+    ApplicationId appId = TEST_NAMESPACE.app("GCSMultiSinkContentType");
+    ApplicationManager appManager = deployApplication(appId, appRequest);
+
+    String multisink1 = String.format(MULTISINK_RUNTIME_ARG, "DepartmentA");
+    String multisink2 = String.format(MULTISINK_RUNTIME_ARG, "DepartmentB");
+    Map<String, String> args = new HashMap<>();
+    args.put(multisink1, outputSchema.toString());
+    args.put(multisink2, outputSchema.toString());
+    args.put("sourceFormat", "csv");
+    args.put("multiSinkPath", path);
+    args.put("multiSinkProjectId", getProjectId());
+    args.put("multiSinkSchema", schema.toString());
+    args.put("multiSinkSplitField", "departament");
+    args.put("contentType", CSV_CONTENT_TYPE);
+    startWorkFlow(appManager, ProgramRunStatus.COMPLETED, args);
+
+    List<String> multisinkContentType1 = getResultBlobsContentType(bucket, OUTPUT_BLOB_NAME + "/DepartmentA");
+    List<String> multisinkContentType2 = getResultBlobsContentType(bucket, OUTPUT_BLOB_NAME + "/DepartmentB");
+    Assert.assertEquals(CSV_CONTENT_TYPE, multisinkContentType1.get(0));
+    Assert.assertEquals(CSV_CONTENT_TYPE, multisinkContentType2.get(0));
+
   }
 
   private ETLStage createSourceStage(String format, String path, String regex, Schema schema) {
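A note on the wiring in the test above: each "multisink.<splitValue>" runtime argument (built from MULTISINK_RUNTIME_ARG) carries the output schema for records whose split field equals that value, and the assertions then look under OUTPUT_BLOB_NAME + "/<splitValue>". A minimal standalone sketch of that argument construction, with the schema JSON elided and the class name purely illustrative:

    import java.util.HashMap;
    import java.util.Map;

    public class MultiSinkArgsSketch {
      public static void main(String[] args) {
        Map<String, String> runtimeArgs = new HashMap<>();
        for (String department : new String[] {"DepartmentA", "DepartmentB"}) {
          // each key follows the "multisink.%s" pattern; the value would be
          // the output schema JSON (elided here)
          runtimeArgs.put(String.format("multisink.%s", department), "<output schema JSON>");
        }
        // yields two keys: multisink.DepartmentA and multisink.DepartmentB
        System.out.println(runtimeArgs.keySet());
      }
    }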
@@ -803,14 +877,34 @@ private ETLStage createSourceStage(String format, String path, String regex, Schema schema) {
   }
 
   private ETLPlugin createSinkPlugin(String format, String path, Schema schema) {
-    return new ETLPlugin(SINK_PLUGIN_NAME, BatchSink.PLUGIN_TYPE,
-                         ImmutableMap.of(
-                           "path", path,
-                           "format", format,
-                           "project", getProjectId(),
-                           "referenceName", format,
-                           "schema", schema.toString()),
-                         GOOGLE_CLOUD_ARTIFACT);
+    return createSinkPlugin(format, path, schema, null);
+  }
+
+  private ETLPlugin createSinkPlugin(String format, String path, Schema schema, @Nullable String contentType) {
+    ImmutableMap.Builder<String, String> propertyBuilder = new ImmutableMap.Builder<String, String>()
+      .put("path", path)
+      .put("format", format)
+      .put("project", getProjectId())
+      .put("referenceName", format)
+      .put("schema", schema.toString());
+    if (!Strings.isNullOrEmpty(contentType)) {
+      propertyBuilder.put("contentType", contentType);
+    }
+    return new ETLPlugin(SINK_PLUGIN_NAME, BatchSink.PLUGIN_TYPE, propertyBuilder.build(), GOOGLE_CLOUD_ARTIFACT);
+  }
+
+  private ETLPlugin createMultiSinkPlugin(String sinkFormat) {
+    Map<String, String> map = new HashMap<>();
+    map.put("path", "${multiSinkPath}");
+    map.put("format", sinkFormat);
+    // TODO: make format a macro when https://cdap.atlassian.net/browse/PLUGIN-553 is fixed
+    // map.put("format", "${sinkFormat}");
+    map.put("project", "${multiSinkProjectId}");
+    map.put("schema", "${multiSinkSchema}");
+    map.put("referenceName", "gcs-multi-input");
+    map.put("splitField", "${multiSinkSplitField}");
+    map.put("contentType", "${contentType}");
+    return new ETLPlugin(MULTI_SINK_PLUGIN_NAME, BatchSink.PLUGIN_TYPE, map, GOOGLE_CLOUD_ARTIFACT);
   }
 
   static class DataTypesRecord {
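The overload above keeps existing createSinkPlugin call sites compiling while new tests opt into a content type: the "contentType" property is added only when non-empty. A self-contained sketch of that conditional-builder idiom, assuming Guava and jsr305 on the classpath (class and method names here are illustrative, not part of the change):

    import com.google.common.base.Strings;
    import com.google.common.collect.ImmutableMap;

    import javax.annotation.Nullable;
    import java.util.Map;

    public class SinkPropsSketch {
      static Map<String, String> sinkProperties(String format, String path, @Nullable String contentType) {
        // required properties always go in; optional ones are appended conditionally
        ImmutableMap.Builder<String, String> builder = new ImmutableMap.Builder<String, String>()
          .put("path", path)
          .put("format", format);
        if (!Strings.isNullOrEmpty(contentType)) {
          builder.put("contentType", contentType);
        }
        return builder.build();
      }

      public static void main(String[] args) {
        System.out.println(sinkProperties("csv", "gs://bucket/output", "text/csv"));
        System.out.println(sinkProperties("json", "gs://bucket/output", null)); // no contentType key
      }
    }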
@@ -881,4 +975,19 @@ private static String blobContentToString(Blob blob) {
     return null;
   }
 
+  /**
+   * Reads the content type of the files in the given path.
+   */
+  private List<String> getResultBlobsContentType(Bucket bucket, String path) {
+    String successFile = path + "/_SUCCESS";
+    assertExists(bucket, successFile);
+
+    return StreamSupport.stream(bucket.list().iterateAll().spliterator(), false)
+      .filter(blob -> blob.getName().startsWith(path + "/")
+        && !successFile.equals(blob.getName()) && !blob.getName().endsWith("/"))
+      .map(BlobInfo::getContentType)
+      .filter(Objects::nonNull)
+      .collect(Collectors.toList());
+  }
+
 }
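For reference, the helper's assertion boils down to the Content-Type the google-cloud-storage client reports per object. A standalone sketch of reading one output file's stored content type; the bucket and object names are hypothetical placeholders, and application default credentials are assumed:

    import com.google.cloud.storage.Blob;
    import com.google.cloud.storage.BlobId;
    import com.google.cloud.storage.Storage;
    import com.google.cloud.storage.StorageOptions;

    public class ContentTypeCheck {
      public static void main(String[] args) {
        Storage storage = StorageOptions.getDefaultInstance().getService();
        // placeholder bucket and part-file names, not taken from the change
        Blob blob = storage.get(BlobId.of("my-test-bucket", "output/DepartmentA/part-r-00000"));
        System.out.println(blob.getContentType()); // "text/csv" when the sink set a contentType
      }
    }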