Move data bucket creation out of the data_ingestion TF (#1530)
Summary:
Pull Request resolved: #1530

The data bucket creation was previously bundled into the data_ingestion
Terraform template, which prevented limiting S3 access to only the bucket
the PCE needs. The tightened S3 access will be applied in a follow-up.
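
Concretely, after this change the deploy script owns the data bucket's lifecycle and the Terraform templates only consume its ARN. A condensed sketch of the new wiring, using the helper, variable, and flag names that appear in the deploy.sh diff below (the invocation is abridged, not the full command):

  # Create (or validate) the data bucket up front, outside of Terraform.
  validate_or_create_s3_bucket "$s3_bucket_data_pipeline" "$region" "$aws_account_id"

  # Derive the ARN from the bucket name instead of reading a Terraform output.
  data_bucket_arn="arn:aws:s3:::${s3_bucket_data_pipeline}"

  # Hand the existing bucket to the data_ingestion template as input variables.
  terraform apply \
    -var "data_processing_output_bucket=$s3_bucket_data_pipeline" \
    -var "data_processing_output_bucket_arn=$data_bucket_arn"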

Reviewed By: ankushksingh

Differential Revision: D39197009

fbshipit-source-id: 0f21427da1c90e2b54807ec030fbe29244dbe981
marksliva authored and facebook-github-bot committed Sep 1, 2022
1 parent 7cf948f commit 9207225
Showing 5 changed files with 15 additions and 66 deletions.
40 changes: 1 addition & 39 deletions fbpcs/infra/cloud_bridge/data_ingestion/main.tf
@@ -34,7 +34,7 @@ resource "aws_kinesis_firehose_delivery_stream" "extended_s3_stream" {

extended_s3_configuration {
role_arn = aws_iam_role.firehose_role.arn
- bucket_arn = aws_s3_bucket.bucket.arn
+ bucket_arn = var.data_processing_output_bucket_arn
buffer_size = 128
buffer_interval = 900
prefix = "${var.events_data}/year=!{partitionKeyFromLambda:year}/month=!{partitionKeyFromLambda:month}/day=!{partitionKeyFromLambda:day}/hour=!{partitionKeyFromLambda:hour}/"
@@ -57,44 +57,6 @@ resource "aws_kinesis_firehose_delivery_stream" "extended_s3_stream" {
}
}

resource "aws_s3_bucket" "bucket" {
bucket = var.data_processing_output_bucket
versioning {
enabled = true
}
server_side_encryption_configuration {
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}

}

resource "aws_s3_bucket_policy" "bucket_policy" {
bucket = aws_s3_bucket.bucket.id

policy = <<EOF
{
"Statement": [
{
"Effect": "Deny",
"Action": "s3:*",
"Principal": "*",
"Resource": [
"${aws_s3_bucket.bucket.arn}",
"${aws_s3_bucket.bucket.arn}/*"
],
"Condition": {
"Bool": { "aws:SecureTransport": false }
}
}
]
}
EOF
}

resource "aws_iam_role" "firehose_role" {
name = "cb-data-ingestion-firehose-role${var.tag_postfix}"

10 changes: 0 additions & 10 deletions fbpcs/infra/cloud_bridge/data_ingestion/output.tf
@@ -1,13 +1,3 @@
output "data_processing_output_bucket_id" {
value = aws_s3_bucket.bucket.id
description = "The id of S3 bucked used to store data processing outputs"
}

output "data_processing_output_bucket_arn" {
value = aws_s3_bucket.bucket.arn
description = "The arn of S3 bucked used to store data processing outputs"
}

output "firehose_stream_name" {
value = aws_kinesis_firehose_delivery_stream.extended_s3_stream.name
description = "The Kinesis firehose stream name"
5 changes: 5 additions & 0 deletions fbpcs/infra/cloud_bridge/data_ingestion/variable.tf
@@ -8,6 +8,11 @@ variable "data_processing_output_bucket" {
default = ""
}

variable "data_processing_output_bucket_arn" {
description = "Amazon resource name of the data bucket"
default = ""
}

variable "data_ingestion_lambda_name" {
description = "The data ingestion Lambda function name"
default = ""
15 changes: 9 additions & 6 deletions fbpcs/infra/cloud_bridge/deploy.sh
@@ -198,8 +198,11 @@ deploy_aws_resources() {
#clean up previously generated resources if any
cleanup_generated_resources
# Create the S3 bucket (to store config files) if it doesn't exist
log_streaming_data "creating s3 bucket, if it does not exist"
log_streaming_data "creating s3 config bucket, if it does not exist"
validate_or_create_s3_bucket "$s3_bucket_for_storage" "$region" "$aws_account_id"
+ # Create the S3 data bucket if it doesn't exist
+ log_streaming_data "creating s3 data bucket, if it does not exist"
+ validate_or_create_s3_bucket "$s3_bucket_data_pipeline" "$region" "$aws_account_id"
# Deploy PCE Terraform scripts
onedocker_ecs_container_image='539290649537.dkr.ecr.us-west-2.amazonaws.com/one-docker-prod:latest'
publisher_vpc_cidr='10.0.0.0/16'
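
The validate_or_create_s3_bucket helper invoked above is defined in util.sh and is not part of this diff. As a rough sketch only, a check-then-create helper of this kind could look like the following (hypothetical implementation; the real one may differ in details such as tagging or error handling):

  # Hypothetical sketch -- the actual helper lives in util.sh and may differ.
  validate_or_create_s3_bucket() {
    local bucket="$1" region="$2" account_id="$3"
    if aws s3api head-bucket --bucket "$bucket" --expected-bucket-owner "$account_id" 2>/dev/null
    then
      echo "Bucket $bucket already exists under account $account_id; reusing it."
    else
      echo "Creating bucket $bucket in $region..."
      if [ "$region" = "us-east-1" ]
      then
        aws s3api create-bucket --bucket "$bucket" --region "$region"
      else
        aws s3api create-bucket --bucket "$bucket" --region "$region" \
          --create-bucket-configuration "LocationConstraint=$region"
      fi
      # Re-apply the settings the removed Terraform resource used to manage:
      # versioning and AES256 server-side encryption.
      aws s3api put-bucket-versioning --bucket "$bucket" \
        --versioning-configuration Status=Enabled
      aws s3api put-bucket-encryption --bucket "$bucket" \
        --server-side-encryption-configuration \
        '{"Rules":[{"ApplyServerSideEncryptionByDefault":{"SSEAlgorithm":"AES256"}}]}'
    fi
  }
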
@@ -291,15 +294,14 @@ deploy_aws_resources() {
-var "tag_postfix=$tag_postfix" \
-var "aws_account_id=$aws_account_id" \
-var "data_processing_output_bucket=$s3_bucket_data_pipeline" \
-var "data_processing_output_bucket_arn=$data_bucket_arn" \
-var "data_ingestion_lambda_name=$data_ingestion_lambda_name" \
-var "data_processing_lambda_s3_bucket=$s3_bucket_for_storage" \
-var "data_processing_lambda_s3_key=lambda.zip" \
-var "data_upload_key_path=$data_upload_key_path" \
-var "query_results_key_path=$query_results_key_path"
echo "######################## Deploy Data Ingestion Terraform scripts completed ########################"
# store the outputs from data ingestion pipeline output into variables
- app_data_input_bucket_id=$(terraform output data_processing_output_bucket_id | tr -d '"')
- app_data_input_bucket_arn=$(terraform output data_processing_output_bucket_arn | tr -d '"')
firehose_stream_name=$(terraform output firehose_stream_name | tr -d '"')

if "$build_semi_automated_data_pipeline"
@@ -312,7 +314,7 @@
cp template/lambda_trigger.py .
echo "Updating trigger function configurations..."
sed -i "s/glueJobName = \"TO_BE_UPDATED_DURING_DEPLOYMENT\"/glueJobName = \"glue-ETL$tag_postfix\"/g" lambda_trigger.py
sed -i "s~s3_write_path = \"TO_BE_UPDATED_DURING_DEPLOYMENT\"~s3_write_path = \"$app_data_input_bucket_id/events_data/\"~g" lambda_trigger.py
sed -i "s~s3_write_path = \"TO_BE_UPDATED_DURING_DEPLOYMENT\"~s3_write_path = \"$s3_bucket_data_pipeline/events_data/\"~g" lambda_trigger.py

echo "######################## Initializing terraform working directory started ########################"
terraform init -reconfigure \
@@ -328,8 +330,8 @@
-var "aws_account_id=$aws_account_id" \
-var "lambda_trigger_s3_key=lambda_trigger.zip" \
-var "app_data_input_bucket=$s3_bucket_data_pipeline" \
-var "app_data_input_bucket_id=$app_data_input_bucket_id" \
-var "app_data_input_bucket_arn=$app_data_input_bucket_arn" \
-var "app_data_input_bucket_id=$s3_bucket_data_pipeline" \
-var "app_data_input_bucket_arn=$data_bucket_arn" \
-var "data_upload_key_path=$data_upload_key_path"
echo "######################## Deploy Semi-automated Data Ingestion Terraform scripts completed ########################"
fi
@@ -406,6 +408,7 @@ else
s3_bucket_data_pipeline="$s3_bucket_data_pipeline$tag_postfix"
fi

+ data_bucket_arn="arn:aws:s3:::${s3_bucket_data_pipeline}"
policy_name="fb-pc-policy${tag_postfix}"
database_name="mpc-events-db${tag_postfix}"
glue_crawler_name="mpc-events-crawler${tag_postfix}"
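
For reference, the data_bucket_arn added above is derived purely from the (tag-postfixed) bucket name, so with an illustrative name the value resolves as follows:

  # Illustrative value only; the real name comes from the deployment config.
  s3_bucket_data_pipeline="fb-pc-data-example"
  data_bucket_arn="arn:aws:s3:::${s3_bucket_data_pipeline}"
  echo "$data_bucket_arn"   # prints: arn:aws:s3:::fb-pc-data-example
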
11 changes: 0 additions & 11 deletions fbpcs/infra/cloud_bridge/util.sh
@@ -189,17 +189,6 @@ input_validation () {
echo "The S3 bucket for storing processed data is $s3_bucket_data_pipeline".
validate_bucket_name "$s3_bucket_data_pipeline"

if ! "$undeploy"
then
echo "making sure $s3_bucket_data_pipeline is not an existing bucket..."
if aws s3api head-bucket --bucket "$s3_bucket_data_pipeline" --expected-bucket-owner "$aws_account_id" 2>&1 | grep -q "404" # bucekt doesn't exist
then
echo "The bucket $s3_bucket_data_pipeline doesn't exist. Continue..."
else # bucket exists, we want the data-storage bucket to be new
echo "The bucket $s3_bucket_data_pipeline already exists under Account $aws_account_id. Please choose another bucket name."
exit 1
fi
fi
echo "validate input: aws account id..."
echo "Your AWS acount ID is $aws_account_id"
account_A=$(aws sts get-caller-identity |grep -o 'Account":.*' | tr -d '"' | tr -d ' ' | tr -d ',' | cut -d':' -f2)
