Move data bucket creation out of the data_ingestion TF (#1530)
Summary:
Pull Request resolved: #1530

The data bucket creation was previously bundled into the data_ingestion
Terraform template, which prevented limiting S3 access to only the bucket
the PCE needs. The tightened S3 access will be applied in a follow-up.
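
Concretely, after this change the deploy script owns the data bucket's lifecycle and the Terraform templates only consume its ARN. A condensed sketch of the new wiring, using the helper, variable, and flag names that appear in the deploy.sh diff below (the invocation is abridged, not the full command):

  # Create (or validate) the data bucket up front, outside of Terraform.
  validate_or_create_s3_bucket "$s3_bucket_data_pipeline" "$region" "$aws_account_id"

  # Derive the ARN from the bucket name instead of reading a Terraform output.
  data_bucket_arn="arn:aws:s3:::${s3_bucket_data_pipeline}"

  # Hand the existing bucket to the data_ingestion template as input variables.
  terraform apply \
    -var "data_processing_output_bucket=$s3_bucket_data_pipeline" \
    -var "data_processing_output_bucket_arn=$data_bucket_arn"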

Reviewed By: ankushksingh

Differential Revision: D39197009

fbshipit-source-id: 0f21427da1c90e2b54807ec030fbe29244dbe981
marksliva authored and facebook-github-bot committed Sep 1, 2022
1 parent 7cf948f commit 9207225
Showing 5 changed files with 15 additions and 66 deletions.
40 changes: 1 addition & 39 deletions fbpcs/infra/cloud_bridge/data_ingestion/main.tf
@@ -34,7 +34,7 @@ resource "aws_kinesis_firehose_delivery_stream" "extended_s3_stream" {

extended_s3_configuration {
role_arn = aws_iam_role.firehose_role.arn
- bucket_arn = aws_s3_bucket.bucket.arn
+ bucket_arn = var.data_processing_output_bucket_arn
buffer_size = 128
buffer_interval = 900
prefix = "${var.events_data}/year=!{partitionKeyFromLambda:year}/month=!{partitionKeyFromLambda:month}/day=!{partitionKeyFromLambda:day}/hour=!{partitionKeyFromLambda:hour}/"
@@ -57,44 +57,6 @@ resource "aws_kinesis_firehose_delivery_stream" "extended_s3_stream" {
}
}

resource "aws_s3_bucket" "bucket" {
bucket = var.data_processing_output_bucket
versioning {
enabled = true
}
server_side_encryption_configuration {
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}

}

resource "aws_s3_bucket_policy" "bucket_policy" {
bucket = aws_s3_bucket.bucket.id

policy = <<EOF
{
"Statement": [
{
"Effect": "Deny",
"Action": "s3:*",
"Principal": "*",
"Resource": [
"${aws_s3_bucket.bucket.arn}",
"${aws_s3_bucket.bucket.arn}/*"
],
"Condition": {
"Bool": { "aws:SecureTransport": false }
}
}
]
}
EOF
}

resource "aws_iam_role" "firehose_role" {
name = "cb-data-ingestion-firehose-role${var.tag_postfix}"

10 changes: 0 additions & 10 deletions fbpcs/infra/cloud_bridge/data_ingestion/output.tf
@@ -1,13 +1,3 @@
output "data_processing_output_bucket_id" {
value = aws_s3_bucket.bucket.id
description = "The id of S3 bucked used to store data processing outputs"
}

output "data_processing_output_bucket_arn" {
value = aws_s3_bucket.bucket.arn
description = "The arn of S3 bucked used to store data processing outputs"
}

output "firehose_stream_name" {
value = aws_kinesis_firehose_delivery_stream.extended_s3_stream.name
description = "The Kinesis firehose stream name"
5 changes: 5 additions & 0 deletions fbpcs/infra/cloud_bridge/data_ingestion/variable.tf
@@ -8,6 +8,11 @@ variable "data_processing_output_bucket" {
default = ""
}

variable "data_processing_output_bucket_arn" {
description = "Amazon resource name of the data bucket"
default = ""
}

variable "data_ingestion_lambda_name" {
description = "The data ingestion Lambda function name"
default = ""
15 changes: 9 additions & 6 deletions fbpcs/infra/cloud_bridge/deploy.sh
@@ -198,8 +198,11 @@ deploy_aws_resources() {
#clean up previously generated resources if any
cleanup_generated_resources
# Create the S3 bucket (to store config files) if it doesn't exist
log_streaming_data "creating s3 bucket, if it does not exist"
log_streaming_data "creating s3 config bucket, if it does not exist"
validate_or_create_s3_bucket "$s3_bucket_for_storage" "$region" "$aws_account_id"
+ # Create the S3 data bucket if it doesn't exist
+ log_streaming_data "creating s3 data bucket, if it does not exist"
+ validate_or_create_s3_bucket "$s3_bucket_data_pipeline" "$region" "$aws_account_id"
# Deploy PCE Terraform scripts
onedocker_ecs_container_image='539290649537.dkr.ecr.us-west-2.amazonaws.com/one-docker-prod:latest'
publisher_vpc_cidr='10.0.0.0/16'
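
The validate_or_create_s3_bucket helper invoked above is defined in util.sh and is not part of this diff. As a rough sketch only, a check-then-create helper of this kind could look like the following (hypothetical implementation; the real one may differ in details such as tagging or error handling):

  # Hypothetical sketch -- the actual helper lives in util.sh and may differ.
  validate_or_create_s3_bucket() {
    local bucket="$1" region="$2" account_id="$3"
    if aws s3api head-bucket --bucket "$bucket" --expected-bucket-owner "$account_id" 2>/dev/null
    then
      echo "Bucket $bucket already exists under account $account_id; reusing it."
    else
      echo "Creating bucket $bucket in $region..."
      if [ "$region" = "us-east-1" ]
      then
        aws s3api create-bucket --bucket "$bucket" --region "$region"
      else
        aws s3api create-bucket --bucket "$bucket" --region "$region" \
          --create-bucket-configuration "LocationConstraint=$region"
      fi
      # Re-apply the settings the removed Terraform resource used to manage:
      # versioning and AES256 server-side encryption.
      aws s3api put-bucket-versioning --bucket "$bucket" \
        --versioning-configuration Status=Enabled
      aws s3api put-bucket-encryption --bucket "$bucket" \
        --server-side-encryption-configuration \
        '{"Rules":[{"ApplyServerSideEncryptionByDefault":{"SSEAlgorithm":"AES256"}}]}'
    fi
  }
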
@@ -291,15 +294,14 @@ deploy_aws_resources() {
-var "tag_postfix=$tag_postfix" \
-var "aws_account_id=$aws_account_id" \
-var "data_processing_output_bucket=$s3_bucket_data_pipeline" \
-var "data_processing_output_bucket_arn=$data_bucket_arn" \
-var "data_ingestion_lambda_name=$data_ingestion_lambda_name" \
-var "data_processing_lambda_s3_bucket=$s3_bucket_for_storage" \
-var "data_processing_lambda_s3_key=lambda.zip" \
-var "data_upload_key_path=$data_upload_key_path" \
-var "query_results_key_path=$query_results_key_path"
echo "######################## Deploy Data Ingestion Terraform scripts completed ########################"
# store the outputs from data ingestion pipeline output into variables
- app_data_input_bucket_id=$(terraform output data_processing_output_bucket_id | tr -d '"')
- app_data_input_bucket_arn=$(terraform output data_processing_output_bucket_arn | tr -d '"')
firehose_stream_name=$(terraform output firehose_stream_name | tr -d '"')

if "$build_semi_automated_data_pipeline"
@@ -312,7 +314,7 @@
cp template/lambda_trigger.py .
echo "Updating trigger function configurations..."
sed -i "s/glueJobName = \"TO_BE_UPDATED_DURING_DEPLOYMENT\"/glueJobName = \"glue-ETL$tag_postfix\"/g" lambda_trigger.py
sed -i "s~s3_write_path = \"TO_BE_UPDATED_DURING_DEPLOYMENT\"~s3_write_path = \"$app_data_input_bucket_id/events_data/\"~g" lambda_trigger.py
sed -i "s~s3_write_path = \"TO_BE_UPDATED_DURING_DEPLOYMENT\"~s3_write_path = \"$s3_bucket_data_pipeline/events_data/\"~g" lambda_trigger.py

echo "######################## Initializing terraform working directory started ########################"
terraform init -reconfigure \
@@ -328,8 +330,8 @@
-var "aws_account_id=$aws_account_id" \
-var "lambda_trigger_s3_key=lambda_trigger.zip" \
-var "app_data_input_bucket=$s3_bucket_data_pipeline" \
-var "app_data_input_bucket_id=$app_data_input_bucket_id" \
-var "app_data_input_bucket_arn=$app_data_input_bucket_arn" \
-var "app_data_input_bucket_id=$s3_bucket_data_pipeline" \
-var "app_data_input_bucket_arn=$data_bucket_arn" \
-var "data_upload_key_path=$data_upload_key_path"
echo "######################## Deploy Semi-automated Data Ingestion Terraform scripts completed ########################"
fi
@@ -406,6 +408,7 @@ else
s3_bucket_data_pipeline="$s3_bucket_data_pipeline$tag_postfix"
fi

+ data_bucket_arn="arn:aws:s3:::${s3_bucket_data_pipeline}"
policy_name="fb-pc-policy${tag_postfix}"
database_name="mpc-events-db${tag_postfix}"
glue_crawler_name="mpc-events-crawler${tag_postfix}"
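
For reference, the data_bucket_arn added above is derived purely from the (tag-postfixed) bucket name, so with an illustrative name the value resolves as follows:

  # Illustrative value only; the real name comes from the deployment config.
  s3_bucket_data_pipeline="fb-pc-data-example"
  data_bucket_arn="arn:aws:s3:::${s3_bucket_data_pipeline}"
  echo "$data_bucket_arn"   # prints: arn:aws:s3:::fb-pc-data-example
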
11 changes: 0 additions & 11 deletions fbpcs/infra/cloud_bridge/util.sh
@@ -189,17 +189,6 @@ input_validation () {
echo "The S3 bucket for storing processed data is $s3_bucket_data_pipeline".
validate_bucket_name "$s3_bucket_data_pipeline"

if ! "$undeploy"
then
echo "making sure $s3_bucket_data_pipeline is not an existing bucket..."
if aws s3api head-bucket --bucket "$s3_bucket_data_pipeline" --expected-bucket-owner "$aws_account_id" 2>&1 | grep -q "404" # bucekt doesn't exist
then
echo "The bucket $s3_bucket_data_pipeline doesn't exist. Continue..."
else # bucket exists, we want the data-storage bucket to be new
echo "The bucket $s3_bucket_data_pipeline already exists under Account $aws_account_id. Please choose another bucket name."
exit 1
fi
fi
echo "validate input: aws account id..."
echo "Your AWS acount ID is $aws_account_id"
account_A=$(aws sts get-caller-identity |grep -o 'Account":.*' | tr -d '"' | tr -d ' ' | tr -d ',' | cut -d':' -f2)
