From 48b28cadec1e49c0b91b9403e20747ae78f6d068 Mon Sep 17 00:00:00 2001
From: andrewpeng02
Date: Mon, 26 Feb 2024 16:35:48 -0500
Subject: [PATCH] move backend infra into terraform

---
Review notes (text between "---" and the diffstat is not part of the commit):
- Layout restored to one diff line per line; indentation inside the hunks is
  reconstructed in `terraform fmt` style. Hunk line counts are unchanged.
- alb.tf: the listener reads `.arn` (not `.id`) for the load balancer and the
  target group, matching the `.arn` usage in ecs_training_service.tf.
- ecs.tf: the task/exec IAM role name_prefix values now end with "-" like
  every other name_prefix in this patch.
- ecs_training_service.tf: the service depends on aws_lb_listener.http so the
  target group is attached to the ALB before the service is created; the
  target group itself is already an implicit dependency via target_group_arn.
- main.tf: aws_subnet.public uses local.azs_count, the same count that
  aws_route_table_association.public uses to index the subnets.
- `index` lines are kept from the original patch, so the post-image blob ids
  of the edited files are stale.
- Not changed here, worth a follow-up: aws_eip.main is not referenced by any
  resource in this patch; "awslogs-region" and the ALB hostname appended to
  ALLOWED_HOSTS in settings.py are hard-coded.

 dlp-terraform/ecs/alb.tf                  |  63 +++++++
 dlp-terraform/ecs/ecr.tf                  |  13 ++
 dlp-terraform/ecs/ecs.tf                  | 218 +++++++++++++++-------
 dlp-terraform/ecs/ecs_django_service.tf   |   0
 dlp-terraform/ecs/ecs_training_service.tf | 113 +++++++++++
 dlp-terraform/ecs/main.tf                 |  78 ++++++++
 training/training/core/authenticator.py   |   2 +-
 training/training/settings.py             |   2 +-
 8 files changed, 421 insertions(+), 68 deletions(-)
 create mode 100644 dlp-terraform/ecs/alb.tf
 create mode 100644 dlp-terraform/ecs/ecr.tf
 create mode 100644 dlp-terraform/ecs/ecs_django_service.tf
 create mode 100644 dlp-terraform/ecs/ecs_training_service.tf
 create mode 100644 dlp-terraform/ecs/main.tf

diff --git a/dlp-terraform/ecs/alb.tf b/dlp-terraform/ecs/alb.tf
new file mode 100644
index 000000000..985717240
--- /dev/null
+++ b/dlp-terraform/ecs/alb.tf
@@ -0,0 +1,63 @@
+# --- ALB ---
+resource "aws_security_group" "http" {
+  name_prefix = "http-sg-"
+  description = "Allow all HTTP/HTTPS traffic from public"
+  vpc_id      = aws_vpc.main.id
+
+  dynamic "ingress" {
+    for_each = [80, 443]
+    content {
+      protocol    = "tcp"
+      from_port   = ingress.value
+      to_port     = ingress.value
+      cidr_blocks = ["0.0.0.0/0"]
+    }
+  }
+
+  egress {
+    protocol    = "-1"
+    from_port   = 0
+    to_port     = 0
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+}
+
+resource "aws_lb" "main" {
+  name               = "alb"
+  load_balancer_type = "application"
+  subnets            = aws_subnet.public[*].id
+  security_groups    = [aws_security_group.http.id]
+}
+
+resource "aws_lb_target_group" "app" {
+  name_prefix = "app-"
+  vpc_id      = aws_vpc.main.id
+  protocol    = "HTTP"
+  port        = 8000
+  target_type = "instance"
+
+  health_check {
+    enabled             = true
+    path                = "/health"
+    matcher             = 200
+    interval            = 30
+    timeout             = 5
+    healthy_threshold   = 5
+    unhealthy_threshold = 2
+  }
+}
+
+resource "aws_lb_listener" "http" {
+  load_balancer_arn = aws_lb.main.arn
+  port              = 80
+  protocol          = "HTTP"
+
+  default_action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.app.arn
+  }
+}
+
+output "alb_url" {
+  value = aws_lb.main.dns_name
+}
diff --git a/dlp-terraform/ecs/ecr.tf b/dlp-terraform/ecs/ecr.tf
new file mode 100644
index 000000000..318588be0
--- /dev/null
+++ b/dlp-terraform/ecs/ecr.tf
@@ -0,0 +1,13 @@
+resource "aws_ecr_repository" "training" {
+  name                 = "training"
+  image_tag_mutability = "MUTABLE"
+  force_delete         = true
+
+  image_scanning_configuration {
+    scan_on_push = true
+  }
+}
+
+output "training_repo_url" {
+  value = aws_ecr_repository.training.repository_url
+}
diff --git a/dlp-terraform/ecs/ecs.tf b/dlp-terraform/ecs/ecs.tf
index 133e0faa4..eef9c418f 100644
--- a/dlp-terraform/ecs/ecs.tf
+++ b/dlp-terraform/ecs/ecs.tf
@@ -1,89 +1,175 @@
-terraform {
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = "~> 4.16"
+resource "aws_ecs_cluster" "main" {
+  name = "backend"
+}
+
+# --- ECS Node Role ---
+data "aws_iam_policy_document" "ecs_node_doc" {
+  statement {
+    actions = ["sts:AssumeRole"]
+    effect  = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["ec2.amazonaws.com"]
     }
   }
+}
+
+resource "aws_iam_role" "ecs_node_role" {
+  name_prefix        = "backend-ecs-node-role-"
+  assume_role_policy = data.aws_iam_policy_document.ecs_node_doc.json
+}
 
-  required_version = ">= 1.2.0"
+resource "aws_iam_role_policy_attachment" "ecs_node_role_policy" {
+  role       = aws_iam_role.ecs_node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
 }
 
-provider "aws" {
-  region = "us-west-2"
+resource "aws_iam_instance_profile" "ecs_node" {
+  name_prefix = "backend-ecs-node-profile-"
+  path        = "/ecs/instance/"
+  role        = aws_iam_role.ecs_node_role.name
 }
 
-resource "aws_ecs_cluster" "deep-learning-playground-kernels" {
-  name = "deep-learning-playground-kernels-test"
-  setting {
-    name  = "containerInsights"
-    value = "enabled"
+# --- ECS Node Security Group ---
+resource "aws_security_group" "ecs_node_sg" {
+  name_prefix = "backend-ecs-node-sg-"
+  vpc_id      = aws_vpc.main.id
+
+  ingress {
+    from_port = 0
+    to_port   = 0
+    protocol  = "-1"
+    # cidr_blocks = [aws_vpc.main.cidr_block]
+    security_groups = [aws_security_group.http.id]
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
   }
 }
 
-resource "aws_ecs_service" "dlp-training-service" {
-  name            = "dlp-training-service-test"
-  cluster         = aws_ecs_cluster.deep-learning-playground-kernels.id
-  task_definition = "arn:aws:ecs:us-west-2:521654603461:task-definition/dlp-training-task:9"
-  desired_count   = 1
-  launch_type     = "FARGATE"
+# --- ECS Launch Template ---
+resource "aws_launch_template" "ecs_lt_training" {
+  name_prefix   = "training-ecs-template-"
+  image_id      = "ami-01ff5874b57a57613"
+  instance_type = "g4dn.xlarge"
 
-  deployment_maximum_percent         = "200"
-  deployment_minimum_healthy_percent = "100"
-  scheduling_strategy                = "REPLICA"
+  vpc_security_group_ids = [aws_security_group.ecs_node_sg.id]
+  iam_instance_profile {
+    arn = aws_iam_instance_profile.ecs_node.arn
+  }
+  monitoring {
+    enabled = true
+  }
+
+  user_data = base64encode(<<-EOF
+      #!/bin/bash
+      echo ECS_CLUSTER=${aws_ecs_cluster.main.name} >> /etc/ecs/ecs.config;
+    EOF
+  )
+}
 
-  network_configuration {
-    security_groups  = ["sg-09291eb84a19daeed"]
-    subnets          = ["subnet-0bebe768ad78b896c", "subnet-0f3e41ad21cfe6ff5"]
-    assign_public_ip = true
+# --- ECS ASG ---
+resource "aws_autoscaling_group" "training" {
+  name_prefix               = "training-ecs-asg-"
+  vpc_zone_identifier       = aws_subnet.public[*].id
+  min_size                  = 0
+  max_size                  = 2
+  desired_capacity          = 1
+  health_check_grace_period = 0
+  health_check_type         = "EC2"
+  protect_from_scale_in     = false
+
+  launch_template {
+    id      = aws_launch_template.ecs_lt_training.id
+    version = "$Latest"
+  }
+
+  tag {
+    key                 = "Name"
+    value               = "backend-ecs-cluster"
+    propagate_at_launch = true
+  }
+
+  tag {
+    key                 = "AmazonECSManaged"
+    value               = ""
+    propagate_at_launch = true
   }
 }
-resource "aws_appautoscaling_target" "dev_to_target" {
-  max_capacity       = 1
-  min_capacity       = 1
-  resource_id        = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
-  scalable_dimension = "ecs:service:DesiredCount"
-  service_namespace  = "ecs"
-}
-resource "aws_appautoscaling_policy" "training_service_auto_scaling_policy" {
-  name               = "TrainingServiceAutoScalingPolicy"
-  policy_type        = "StepScaling"
-  resource_id        = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
-  scalable_dimension = "ecs:service:DesiredCount"
-  service_namespace  = "ecs"
-
-  step_scaling_policy_configuration {
-    adjustment_type         = "ChangeInCapacity"
-    cooldown                = 30
-    metric_aggregation_type = "Average"
-
-    step_adjustment {
-      metric_interval_lower_bound = 0
-      scaling_adjustment          = 3
+
+# --- ECS Capacity Provider ---
+resource "aws_ecs_capacity_provider" "training" {
+  name = "training-ecs-ec2"
+
+  auto_scaling_group_provider {
+    auto_scaling_group_arn         = aws_autoscaling_group.training.arn
+    managed_termination_protection = "DISABLED"
+
+    managed_scaling {
+      maximum_scaling_step_size = 2
+      minimum_scaling_step_size = 1
+      status                    = "ENABLED"
+      target_capacity           = 100
     }
   }
-
-  depends_on = [
-    aws_appautoscaling_target.dev_to_target
-  ]
 }
 
-resource "aws_appautoscaling_policy" "dlp-queue-size-too-small-policy" {
-  name               = "DLPQueueSizeTooSmallPolicy"
-  policy_type        = "StepScaling"
-  resource_id        = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
-  scalable_dimension = "ecs:service:DesiredCount"
-  service_namespace  = "ecs"
-  step_scaling_policy_configuration {
-    adjustment_type         = "ExactCapacity"
-    cooldown                = 30
-    metric_aggregation_type = "Average"
+resource "aws_ecs_cluster_capacity_providers" "main" {
+  cluster_name       = aws_ecs_cluster.main.name
+  capacity_providers = [aws_ecs_capacity_provider.training.name]
 
-    step_adjustment {
+  default_capacity_provider_strategy {
+    capacity_provider = aws_ecs_capacity_provider.training.name
+    base              = 1
+    weight            = 100
+  }
+}
 
-      metric_interval_upper_bound = 0
-      scaling_adjustment          = 1
+# --- ECS Task Role ---
+data "aws_iam_policy_document" "ecs_task_doc" {
+  statement {
+    actions = ["sts:AssumeRole"]
+    effect  = "Allow"
+
+    principals {
+      type        = "Service"
+      identifiers = ["ecs-tasks.amazonaws.com"]
     }
   }
-  depends_on = [aws_appautoscaling_target.dev_to_target]
+}
+
+resource "aws_iam_role" "ecs_task_role" {
+  name_prefix        = "backend-ecs-task-role-"
+  assume_role_policy = data.aws_iam_policy_document.ecs_task_doc.json
+}
+
+resource "aws_iam_role_policy_attachment" "ecs_task_role_policy" {
+  for_each = toset([
+    "arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess",
+    "arn:aws:iam::aws:policy/SecretsManagerReadWrite"
+  ])
+
+  role       = aws_iam_role.ecs_task_role.name
+  policy_arn = each.value
+}
+
+
+resource "aws_iam_role" "ecs_exec_role" {
+  name_prefix        = "backend-ecs-exec-role-"
+  assume_role_policy = data.aws_iam_policy_document.ecs_task_doc.json
+}
+
+resource "aws_iam_role_policy_attachment" "ecs_exec_role_policy" {
+  role       = aws_iam_role.ecs_exec_role.name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
+}
+
+resource "aws_cloudwatch_log_group" "ecs" {
+  name              = "/ecs/backend"
+  retention_in_days = 14
 }
diff --git a/dlp-terraform/ecs/ecs_django_service.tf b/dlp-terraform/ecs/ecs_django_service.tf
new file mode 100644
index 000000000..e69de29bb
diff --git a/dlp-terraform/ecs/ecs_training_service.tf b/dlp-terraform/ecs/ecs_training_service.tf
new file mode 100644
index 000000000..e9d3e2ff5
--- /dev/null
+++ b/dlp-terraform/ecs/ecs_training_service.tf
@@ -0,0 +1,113 @@
+resource "aws_ecs_task_definition" "training" {
+  family             = "training"
+  task_role_arn      = aws_iam_role.ecs_task_role.arn
+  execution_role_arn = aws_iam_role.ecs_exec_role.arn
+  network_mode       = "bridge"
+  cpu                = 1024
+  memory             = 4096
+
+  container_definitions = jsonencode([
+    {
+      "name" : "training",
+      "image" : "${aws_ecr_repository.training.repository_url}:latest",
+      "portMappings" : [
+        {
+          "name" : "gunicorn-port",
+          "containerPort" : 8000,
+          "hostPort" : 0,
+          "protocol" : "tcp",
+          "appProtocol" : "http"
+        }
+      ],
+      "essential" : true,
+      "environment" : [],
+      "mountPoints" : [],
+      "volumesFrom" : [],
+      "logConfiguration" : {
+        "logDriver" : "awslogs",
+        "options" : {
+          "awslogs-create-group" : "true",
+          "awslogs-region" : "us-east-1",
+          "awslogs-group" : aws_cloudwatch_log_group.ecs.name,
+          "awslogs-stream-prefix" : "ecs"
+        }
+      }
+    }
+  ])
+}
+
+# --- ECS Service ---
+resource "aws_ecs_service" "training" {
+  name            = "training"
+  cluster         = aws_ecs_cluster.main.id
+  task_definition = aws_ecs_task_definition.training.arn
+  desired_count   = 2
+
+  capacity_provider_strategy {
+    capacity_provider = aws_ecs_capacity_provider.training.name
+    base              = 1
+    weight            = 100
+  }
+
+  ordered_placement_strategy {
+    type  = "spread"
+    field = "attribute:ecs.availability-zone"
+  }
+
+  lifecycle {
+    ignore_changes = [desired_count]
+  }
+
+  load_balancer {
+    target_group_arn = aws_lb_target_group.app.arn
+    container_name   = "training"
+    container_port   = 8000
+  }
+
+  depends_on = [aws_lb_listener.http]
+}
+
+# --- ECS Service Auto Scaling ---
+resource "aws_appautoscaling_target" "training_ecs_target" {
+  service_namespace  = "ecs"
+  scalable_dimension = "ecs:service:DesiredCount"
+  resource_id        = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.training.name}"
+  min_capacity       = 0
+  max_capacity       = 2
+}
+
+resource "aws_appautoscaling_policy" "training_ecs_target_cpu" {
+  name               = "training-application-scaling-policy-cpu"
+  policy_type        = "TargetTrackingScaling"
+  service_namespace  = aws_appautoscaling_target.training_ecs_target.service_namespace
+  resource_id        = aws_appautoscaling_target.training_ecs_target.resource_id
+  scalable_dimension = aws_appautoscaling_target.training_ecs_target.scalable_dimension
+
+  target_tracking_scaling_policy_configuration {
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageCPUUtilization"
+    }
+
+    target_value       = 80
+    scale_in_cooldown  = 300
+    scale_out_cooldown = 300
+  }
+}
+
+resource "aws_appautoscaling_policy" "training_ecs_target_memory" {
+  name               = "training-application-scaling-policy-memory"
+  policy_type        = "TargetTrackingScaling"
+  service_namespace  = aws_appautoscaling_target.training_ecs_target.service_namespace
+  resource_id        = aws_appautoscaling_target.training_ecs_target.resource_id
+  scalable_dimension = aws_appautoscaling_target.training_ecs_target.scalable_dimension
+
+  target_tracking_scaling_policy_configuration {
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
+    }
+
+    target_value       = 80
+    scale_in_cooldown  = 300
+    scale_out_cooldown = 300
+  }
+}
\ No newline at end of file
diff --git a/dlp-terraform/ecs/main.tf b/dlp-terraform/ecs/main.tf
new file mode 100644
index 000000000..f4b20a46f
--- /dev/null
+++ b/dlp-terraform/ecs/main.tf
@@ -0,0 +1,78 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "5.17.0"
+    }
+  }
+}
+
+provider "aws" {
+  region = "us-east-1"
+}
+
+# --- VPC ---
+data "aws_availability_zones" "available" {
+  state = "available"
+}
+
+locals {
+  azs_count = 2
+  azs_names = data.aws_availability_zones.available.names
+}
+
+resource "aws_vpc" "main" {
+  cidr_block           = "10.10.0.0/16"
+  enable_dns_hostnames = true
+  tags = {
+    Name = "backend-vpc"
+  }
+}
+
+resource "aws_subnet" "public" {
+  count = local.azs_count
+
+  vpc_id                  = aws_vpc.main.id
+  availability_zone       = local.azs_names[count.index]
+  cidr_block              = cidrsubnet(aws_vpc.main.cidr_block, 8, 10 + count.index)
+  map_public_ip_on_launch = true
+  tags = {
+    Name = "backend-subnet-public-${local.azs_names[count.index]}"
+  }
+}
+
+# --- Internet Gateway ---
+resource "aws_internet_gateway" "main" {
+  vpc_id = aws_vpc.main.id
+  tags = {
+    Name = "backend-internet-gateway"
+  }
+}
+
+resource "aws_eip" "main" {
+  count      = local.azs_count
+  depends_on = [aws_internet_gateway.main]
+  tags = {
+    Name = "backend-eip-${local.azs_names[count.index]}"
+  }
+}
+
+# --- Public Route Table ---
+resource "aws_route_table" "public" {
+  vpc_id = aws_vpc.main.id
+  tags = {
+    Name = "backend-route-table-public"
+  }
+
+  route {
+    cidr_block = "0.0.0.0/0"
+    gateway_id = aws_internet_gateway.main.id
+  }
+}
+
+resource "aws_route_table_association" "public" {
+  count = local.azs_count
+
+  subnet_id      = aws_subnet.public[count.index].id
+  route_table_id = aws_route_table.public.id
+}
diff --git a/training/training/core/authenticator.py b/training/training/core/authenticator.py
index 469574ebe..e4f5ad7fe 100644
--- a/training/training/core/authenticator.py
+++ b/training/training/core/authenticator.py
@@ -9,9 +9,9 @@ class FirebaseAuth(HttpBearer):
 
 
     def authenticate(self, request, token):
-        app = init_firebase()
         if token is None or not token:
             return
+        app = init_firebase()
         try:
             firebase_admin.auth.verify_id_token(token)
             firebase_admin.delete_app(app)
diff --git a/training/training/settings.py b/training/training/settings.py
index a50dab7e6..12bba5624 100644
--- a/training/training/settings.py
+++ b/training/training/settings.py
@@ -37,7 +37,7 @@
         for ip in network["IPv4Addresses"]
     ]
     ALLOWED_HOSTS += ELB_HEALTHCHECK_HOSTNAMES
-    ALLOWED_HOSTS.append("backend-load-balancer-296304048.us-east-1.elb.amazonaws.com")
+    ALLOWED_HOSTS.append("alb-912662400.us-east-1.elb.amazonaws.com")
 
 
 # Application definition