Commit 06270e8

Merge pull request #113 from data-integrations/fix/CDAP-17847-clustering-key
CDAP-17847: Ignore unsupported primary key types while creating clustering on BigQuery tables.
2 parents: 879c98a + 334c09d
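
BigQuery permits clustering only on a limited set of column types (see the
limitations link in the diff below). Before this change, the clustering spec
was built from the first primary keys regardless of type, so a table whose
keys map to an unsupported type such as BYTES could not be created. The fix
filters the keys first and skips clustering altogether when none qualify.
A condensed sketch of the new flow, using the variable names from the diff
(illustrative, not standalone code):

    // Keep only primary keys whose BigQuery type supports clustering.
    List<String> clusteringSupportedKeys = new ArrayList<>();
    for (String key : primaryKeys) {
      if (Schemas.isClusteringSupported(recordSchema.getField(key))) {
        clusteringSupportedKeys.add(key);
      }
    }
    // Fall back to an unclustered table when no key qualifies, instead of
    // handing BigQuery a clustering spec it would reject.
    Clustering clustering = maxClusteringColumns <= 0 || clusteringSupportedKeys.isEmpty() ? null :
      Clustering.newBuilder()
        .setFields(clusteringSupportedKeys.subList(0, Math.min(maxClusteringColumns,
                                                               clusteringSupportedKeys.size())))
        .build();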

File tree: 3 files changed (+159, -46 lines)

src/main/java/io/cdap/delta/bigquery/BigQueryEventConsumer.java

Lines changed: 16 additions & 3 deletions

@@ -226,7 +226,7 @@ public class BigQueryEventConsumer implements EventConsumer {
     // https://cloud.google.com/bigquery/docs/creating-clustered-tables#limitations
     this.maxClusteringColumns = maxClusteringColumnsStr == null ? 4 : Integer.parseInt(maxClusteringColumnsStr);
     this.sourceRowIdSupported =
-      context.getSourceProperties() == null ? false : context.getSourceProperties().isRowIdSupported();
+      context.getSourceProperties() != null && context.getSourceProperties().isRowIdSupported();
     this.sourceEventOrdering = context.getSourceProperties() == null ? SourceProperties.Ordering.ORDERED :
       context.getSourceProperties().getOrdering();
     this.datasetName = datasetName;
@@ -371,9 +371,11 @@ private void handleDDL(DDLEvent event, String normalizedDatabaseName, String nor
     updatePrimaryKeys(tableId, primaryKeys);
     // TODO: check schema of table if it exists already
     if (table == null) {
-      Clustering clustering = maxClusteringColumns <= 0 ? null :
+      List<String> clusteringSupportedKeys = getClusteringSupportedKeys(primaryKeys, event.getSchema());
+      Clustering clustering = maxClusteringColumns <= 0 || clusteringSupportedKeys.isEmpty() ? null :
         Clustering.newBuilder()
-          .setFields(primaryKeys.subList(0, Math.min(maxClusteringColumns, primaryKeys.size())))
+          .setFields(clusteringSupportedKeys.subList(0, Math.min(maxClusteringColumns,
+                                                                 clusteringSupportedKeys.size())))
           .build();
       TableDefinition tableDefinition = StandardTableDefinition.newBuilder()
         .setSchema(Schemas.convert(addSupplementaryColumnsToTargetSchema(event.getSchema())))
@@ -477,6 +479,17 @@ private void handleDDL(DDLEvent event, String normalizedDatabaseName, String nor
     }
   }

+  @VisibleForTesting
+  static List<String> getClusteringSupportedKeys(List<String> primaryKeys, Schema recordSchema) {
+    List<String> result = new ArrayList<>();
+    for (String key : primaryKeys) {
+      if (Schemas.isClusteringSupported(recordSchema.getField(key))) {
+        result.add(key);
+      }
+    }
+    return result;
+  }
+
   private void updatePrimaryKeys(TableId tableId, List<String> primaryKeys) throws DeltaFailureException, IOException {
     if (primaryKeys.isEmpty()) {
       throw new DeltaFailureException(
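
To illustrate the new helper with a hypothetical schema (field names invented
for the example; Schema.Type.INT converts to INT64, which BigQuery can cluster
on, while BYTES cannot be clustered):

    Schema schema = Schema.recordOf("orders",
                                    Schema.Field.of("order_id", Schema.of(Schema.Type.INT)),
                                    Schema.Field.of("checksum", Schema.of(Schema.Type.BYTES)));
    List<String> keys = getClusteringSupportedKeys(Arrays.asList("order_id", "checksum"), schema);
    // keys == ["order_id"]; if every key were BYTES the list would be empty
    // and the table would be created with clustering == null.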

src/main/java/io/cdap/delta/bigquery/Schemas.java

Lines changed: 67 additions & 43 deletions

@@ -22,66 +22,37 @@
 import io.cdap.cdap.api.data.schema.Schema;

 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import javax.annotation.Nullable;

 /**
  * Utilities around BigQuery schemas.
  */
 public class Schemas {

+  // Set of BigQuery types supported for clustering per definition
+  // https://cloud.google.com/bigquery/docs/creating-clustered-tables#limitations
+  private static final Set<StandardSQLTypeName> CLUSTERING_SUPPORTED_TYPES
+    = new HashSet<>(Arrays.asList(StandardSQLTypeName.DATE, StandardSQLTypeName.BOOL, StandardSQLTypeName.GEOGRAPHY,
+                                  StandardSQLTypeName.INT64, StandardSQLTypeName.NUMERIC, StandardSQLTypeName.STRING,
+                                  StandardSQLTypeName.TIMESTAMP, StandardSQLTypeName.DATETIME));
+
   private Schemas() {
     // no-op
   }

+
   public static com.google.cloud.bigquery.Schema convert(Schema schema) {
     return com.google.cloud.bigquery.Schema.of(convertFields(schema.getFields()));
   }

   private static List<Field> convertFields(List<Schema.Field> fields) {
     List<Field> output = new ArrayList<>();
     for (Schema.Field field : fields) {
-      String name = field.getName();
-      boolean isNullable = field.getSchema().isNullable();
-      Schema fieldSchema = field.getSchema();
-      fieldSchema = isNullable ? fieldSchema.getNonNullable() : fieldSchema;
-      Schema.LogicalType logicalType = fieldSchema.getLogicalType();
-      Field.Mode fieldMode = isNullable ? Field.Mode.NULLABLE : Field.Mode.REQUIRED;
-      if (logicalType != null) {
-        StandardSQLTypeName bqType = convertLogicalType(logicalType);
-        // TODO: figure out what the correct behavior should be
-        if (bqType == null) {
-          throw new IllegalArgumentException(
-            String.format("Field '%s' is of type '%s', which is not supported in BigQuery.",
-                          name, logicalType.getToken()));
-        }
-        output.add(Field.newBuilder(name, bqType).setMode(fieldMode).build());
-        continue;
-      }
-
-      Schema.Type type = isNullable ? field.getSchema().getNonNullable().getType() : field.getSchema().getType();
-      if (type == Schema.Type.ARRAY) {
-        Schema componentSchema = fieldSchema.getComponentSchema();
-        componentSchema = componentSchema.isNullable() ? componentSchema.getNonNullable() : componentSchema;
-        StandardSQLTypeName bqType = convertType(componentSchema.getType());
-        if (bqType == null) {
-          throw new IllegalArgumentException(
-            String.format("Field '%s' is an array of '%s', which is not supported in BigQuery.",
-                          name, logicalType.getToken()));
-        }
-        output.add(Field.newBuilder(name, bqType).setMode(Field.Mode.REPEATED).build());
-      } else if (type == Schema.Type.RECORD) {
-        List<Field> subFields = convertFields(fieldSchema.getFields());
-        output.add(Field.newBuilder(name, StandardSQLTypeName.STRUCT, FieldList.of(subFields)).build());
-      } else {
-        StandardSQLTypeName bqType = convertType(type);
-        if (bqType == null) {
-          throw new IllegalArgumentException(
-            String.format("Field '%s' is of type '%s', which is not supported in BigQuery.",
-                          name, type.name().toLowerCase()));
-        }
-        output.add(Field.newBuilder(name, bqType).setMode(fieldMode).build());
-      }
+      output.add(convertToBigQueryField(field));
     }
     return output;
   }
@@ -96,13 +67,12 @@ private static StandardSQLTypeName convertType(Schema.Type type) {
       case DOUBLE:
         return StandardSQLTypeName.FLOAT64;
       case STRING:
+      case ENUM:
         return StandardSQLTypeName.STRING;
       case BOOLEAN:
         return StandardSQLTypeName.BOOL;
       case BYTES:
         return StandardSQLTypeName.BYTES;
-      case ENUM:
-        return StandardSQLTypeName.STRING;
     }
     return null;
   }
@@ -125,4 +95,58 @@ private static StandardSQLTypeName convertLogicalType(Schema.LogicalType logical
     }
     return null;
   }
+
+  /**
+   * Check if the BigQuery data type associated with the {@link Schema.Field} can be added
+   * as a clustering column while creating a BigQuery table.
+   */
+  public static boolean isClusteringSupported(Schema.Field field) {
+    Field bigQueryField = convertToBigQueryField(field);
+    return CLUSTERING_SUPPORTED_TYPES.contains(bigQueryField.getType().getStandardType());
+  }
+
+  private static Field convertToBigQueryField(Schema.Field field) {
+    String name = field.getName();
+    boolean isNullable = field.getSchema().isNullable();
+    Schema fieldSchema = field.getSchema();
+    fieldSchema = isNullable ? fieldSchema.getNonNullable() : fieldSchema;
+    Schema.LogicalType logicalType = fieldSchema.getLogicalType();
+    Field.Mode fieldMode = isNullable ? Field.Mode.NULLABLE : Field.Mode.REQUIRED;
+    if (logicalType != null) {
+      StandardSQLTypeName bqType = convertLogicalType(logicalType);
+      // TODO: figure out what the correct behavior should be
+      if (bqType == null) {
+        throw new IllegalArgumentException(
+          String.format("Field '%s' is of type '%s', which is not supported in BigQuery.",
+                        name, logicalType.getToken()));
+      }
+      return Field.newBuilder(name, bqType).setMode(fieldMode).build();
+    }
+
+    Field output;
+    Schema.Type type = isNullable ? field.getSchema().getNonNullable().getType() : field.getSchema().getType();
+    if (type == Schema.Type.ARRAY) {
+      Schema componentSchema = fieldSchema.getComponentSchema();
+      componentSchema = componentSchema.isNullable() ? componentSchema.getNonNullable() : componentSchema;
+      StandardSQLTypeName bqType = convertType(componentSchema.getType());
+      if (bqType == null) {
+        throw new IllegalArgumentException(
+          String.format("Field '%s' is an array of '%s', which is not supported in BigQuery.",
+                        name, logicalType.getToken()));
+      }
+      output = Field.newBuilder(name, bqType).setMode(Field.Mode.REPEATED).build();
+    } else if (type == Schema.Type.RECORD) {
+      List<Field> subFields = convertFields(fieldSchema.getFields());
+      output = Field.newBuilder(name, StandardSQLTypeName.STRUCT, FieldList.of(subFields)).build();
+    } else {
+      StandardSQLTypeName bqType = convertType(type);
+      if (bqType == null) {
+        throw new IllegalArgumentException(
+          String.format("Field '%s' is of type '%s', which is not supported in BigQuery.",
+                        name, type.name().toLowerCase()));
+      }
+      output = Field.newBuilder(name, bqType).setMode(fieldMode).build();
+    }
+    return output;
+  }
 }
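
A quick illustration of the new predicate (hypothetical fields; the results
follow from convertType and CLUSTERING_SUPPORTED_TYPES above):

    // STRING maps to StandardSQLTypeName.STRING, which is in the supported set.
    Schemas.isClusteringSupported(Schema.Field.of("name", Schema.of(Schema.Type.STRING)));  // true
    // BYTES maps to StandardSQLTypeName.BYTES, which is not clusterable.
    Schemas.isClusteringSupported(Schema.Field.of("blob", Schema.of(Schema.Type.BYTES)));   // false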

src/test/java/io/cdap/delta/bigquery/BigQueryEventConsumerTest.java

Lines changed: 76 additions & 0 deletions

@@ -179,6 +179,82 @@ public void testCreateTableWithClustering() throws Exception {
     cleanupTest(bucket, dataset, eventConsumer);
   }

+  @Test
+  public void testCreateTableWithInvalidTypesForClustering() throws Exception {
+    String bucketName = "bqtest-" + UUID.randomUUID().toString();
+    Bucket bucket = storage.create(BucketInfo.of(bucketName));
+    BigQueryEventConsumer eventConsumer = new BigQueryEventConsumer(new MockContext(300, Collections.emptyMap()),
+                                                                    storage, bigQuery, bucket, project, 0,
+                                                                    STAGING_TABLE_PREFIX, true, null, 1L, null);
+
+    String dataset = "testInvalidTypesForClustering";
+    String allinvalidsTableName = "allinvalids";
+    TableId allInvalidsTable = TableId.of(dataset, allinvalidsTableName);
+    String someInvalidsTableName = "someinvalids";
+    TableId someInvalidsTable = TableId.of(dataset, someInvalidsTableName);
+
+    try {
+      bigQuery.create(DatasetInfo.newBuilder(dataset).build());
+
+      // Primary keys with all un-supported types for clustering
+      List<String> primaryKeys = new ArrayList<>();
+      primaryKeys.add("id1");
+      Schema schema = Schema.recordOf(allinvalidsTableName,
+                                      Schema.Field.of("id1", Schema.of(Schema.Type.BYTES)));
+
+      DDLEvent allInvalidsCreateTable = DDLEvent.builder()
+        .setOperation(DDLOperation.Type.CREATE_TABLE)
+        .setDatabaseName(dataset)
+        .setTableName(allinvalidsTableName)
+        .setSchema(schema)
+        .setPrimaryKey(primaryKeys)
+        .setOffset(new Offset())
+        .build();
+      eventConsumer.applyDDL(new Sequenced<>(allInvalidsCreateTable, 0));
+
+      Table table = bigQuery.getTable(allInvalidsTable);
+      StandardTableDefinition tableDefinition = table.getDefinition();
+      Clustering clustering = tableDefinition.getClustering();
+      // No clustering should be added
+      Assert.assertNull(clustering);
+      bigQuery.delete(allInvalidsTable);
+
+      // Primary keys with some un-supported types for clustering
+      primaryKeys = new ArrayList<>();
+      primaryKeys.add("id1");
+      primaryKeys.add("id2");
+      primaryKeys.add("id3");
+      primaryKeys.add("id4");
+      primaryKeys.add("id5");
+      schema = Schema.recordOf(allinvalidsTableName,
+                               Schema.Field.of("id1", Schema.of(Schema.Type.BYTES)),
+                               Schema.Field.of("id2", Schema.of(Schema.Type.BYTES)),
+                               Schema.Field.of("id3", Schema.of(Schema.Type.BYTES)),
+                               Schema.Field.of("id4", Schema.of(Schema.Type.BYTES)),
+                               // add one valid clustering key
+                               Schema.Field.of("id5", Schema.of(Schema.Type.INT)));
+
+      DDLEvent someInvalidsTableCreate = DDLEvent.builder()
+        .setOperation(DDLOperation.Type.CREATE_TABLE)
+        .setDatabaseName(dataset)
+        .setTableName(someInvalidsTableName)
+        .setSchema(schema)
+        .setPrimaryKey(primaryKeys)
+        .setOffset(new Offset())
+        .build();
+      eventConsumer.applyDDL(new Sequenced<>(someInvalidsTableCreate, 0));
+
+      table = bigQuery.getTable(someInvalidsTable);
+      tableDefinition = table.getDefinition();
+      clustering = tableDefinition.getClustering();
+      Assert.assertNotNull(clustering);
+      Assert.assertEquals(primaryKeys.subList(4, 5), clustering.getFields());
+      bigQuery.delete(someInvalidsTable);
+    } finally {
+      cleanupTest(bucket, dataset, eventConsumer);
+    }
+  }
+
   @Test
   public void testManualDropRetries() throws Exception {
     String bucketName = "bqtest-" + UUID.randomUUID().toString();
