Skip to content

Commit

Permalink
move backend infra into terraform
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewpeng02 committed Feb 26, 2024
1 parent ee9ec1a commit 48b28ca
Show file tree
Hide file tree
Showing 8 changed files with 421 additions and 68 deletions.
63 changes: 63 additions & 0 deletions dlp-terraform/ecs/alb.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# --- ALB ---
# Security group for the public-facing load balancer.
# Ingress: TCP 80 and 443 from any IPv4 address. Egress: everything.
resource "aws_security_group" "http" {
  name_prefix = "http-sg-"
  description = "Allow all HTTP/HTTPS traffic from public"
  vpc_id      = aws_vpc.main.id

  # Generates one ingress rule per port in the list.
  dynamic "ingress" {
    for_each = [80, 443]
    content {
      protocol    = "tcp"
      from_port   = ingress.value
      to_port     = ingress.value
      cidr_blocks = ["0.0.0.0/0"]
    }
  }

  egress {
    protocol    = "-1" # all protocols
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }

  # Because the name is generated from name_prefix, a replacement group can
  # coexist with the old one. Creating it first lets dependants (the ALB)
  # switch over before the old group is deleted; otherwise the delete blocks
  # on the group still being in use.
  lifecycle {
    create_before_destroy = true
  }
}

# Application Load Balancer placed in the public subnets and guarded by the
# "http" security group.
resource "aws_lb" "main" {
  name               = "alb"
  load_balancer_type = "application"

  security_groups = [aws_security_group.http.id]
  subnets         = aws_subnet.public[*].id
}

# Target group the ALB forwards to: instance targets, HTTP on port 8000.
resource "aws_lb_target_group" "app" {
  name_prefix = "app-"
  vpc_id      = aws_vpc.main.id
  protocol    = "HTTP"
  port        = 8000
  target_type = "instance"

  # A target is healthy after 5 consecutive 200 responses from /health and
  # unhealthy after 2 consecutive failures; probes run every 30s with a 5s
  # timeout.
  health_check {
    enabled             = true
    path                = "/health"
    matcher             = "200" # matcher is a string (it may also hold ranges/lists)
    interval            = 30
    timeout             = 5
    healthy_threshold   = 5
    unhealthy_threshold = 2
  }

  # A target group that is referenced by a listener cannot be deleted, so a
  # replacement must be created (under a fresh name_prefix-generated name)
  # and wired into the listener before the old one is destroyed.
  lifecycle {
    create_before_destroy = true
  }
}

# Plain-HTTP listener on port 80 that forwards every request to the "app"
# target group.
# NOTE(review): the ALB security group also opens 443, but no HTTPS listener
# is visible in this file — confirm whether one is defined elsewhere.
resource "aws_lb_listener" "http" {
  # Reference the exported ARN attributes explicitly instead of relying on
  # "id" happening to hold the same value.
  load_balancer_arn = aws_lb.main.arn
  port              = 80
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.app.arn
  }
}

# Exposes the DNS name of the load balancer.
output "alb_url" {
  value = aws_lb.main.dns_name
}
13 changes: 13 additions & 0 deletions dlp-terraform/ecs/ecr.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# ECR repository named "training". Tags are mutable and every pushed image is
# scanned.
resource "aws_ecr_repository" "training" {
  name = "training"

  # force_delete lets the repository be destroyed even if it still holds images.
  force_delete         = true
  image_tag_mutability = "MUTABLE"

  image_scanning_configuration {
    scan_on_push = true
  }
}

# Exposes the URL of the "training" ECR repository.
output "training_repo_url" {
  value = aws_ecr_repository.training.repository_url
}
218 changes: 152 additions & 66 deletions dlp-terraform/ecs/ecs.tf
Original file line number Diff line number Diff line change
@@ -1,89 +1,175 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.16"
# ECS cluster named "backend".
resource "aws_ecs_cluster" "main" {
  name = "backend"
}

# --- ECS Node Role ---
# Trust policy document: allows the EC2 service principal to call
# sts:AssumeRole.
data "aws_iam_policy_document" "ecs_node_doc" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]

    principals {
      identifiers = ["ec2.amazonaws.com"]
      type        = "Service"
    }
  }
}

# IAM role for the ECS nodes; its trust policy is the ecs_node_doc document.
resource "aws_iam_role" "ecs_node_role" {
  assume_role_policy = data.aws_iam_policy_document.ecs_node_doc.json
  name_prefix        = "backend-ecs-node-role-"
}

required_version = ">= 1.2.0"
# Attaches the AWS-managed AmazonEC2ContainerServiceforEC2Role policy to the
# node role.
resource "aws_iam_role_policy_attachment" "ecs_node_role_policy" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
  role       = aws_iam_role.ecs_node_role.name
}

provider "aws" {
region = "us-west-2"
# Instance profile that carries the node role, created under the
# /ecs/instance/ IAM path.
resource "aws_iam_instance_profile" "ecs_node" {
  name_prefix = "backend-ecs-node-profile-"
  role        = aws_iam_role.ecs_node_role.name
  path        = "/ecs/instance/"
}

resource "aws_ecs_cluster" "deep-learning-playground-kernels" {
name = "deep-learning-playground-kernels-test"
setting {
name = "containerInsights"
value = "enabled"
# --- ECS Node Security Group ---
# Security group for the ECS nodes.
# Ingress: any protocol/port, but only from members of the ALB ("http")
# security group. Egress: everything.
resource "aws_security_group" "ecs_node_sg" {
  name_prefix = "backend-ecs-node-sg-"
  vpc_id      = aws_vpc.main.id

  ingress {
    from_port = 0
    to_port   = 0
    protocol  = "-1" # all protocols
    # Alternative left disabled by the author: admit the whole VPC range.
    # cidr_blocks = [aws_vpc.main.cidr_block]
    security_groups = [aws_security_group.http.id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  # The name is generated from name_prefix, so a replacement can be created
  # alongside the old group. Doing so first lets the launch template move to
  # the new group before the old one, which may still be attached, is deleted.
  lifecycle {
    create_before_destroy = true
  }
}
resource "aws_ecs_service" "dlp-training-service" {
name = "dlp-training-service-test"
cluster = aws_ecs_cluster.deep-learning-playground-kernels.id
task_definition = "arn:aws:ecs:us-west-2:521654603461:task-definition/dlp-training-task:9"
desired_count = 1

launch_type = "FARGATE"
# --- ECS Launch Template ---
# Launch template for the g4dn.xlarge instances that register with the ECS
# cluster. Instances get the node security group, the node instance profile,
# and monitoring enabled.
resource "aws_launch_template" "ecs_lt_training" {
name_prefix = "training-ecs-template-"
# NOTE(review): hard-coded AMI ID; AMI IDs are region-specific, so confirm it
# matches the region this stack is deployed to.
image_id = "ami-01ff5874b57a57613"
instance_type = "g4dn.xlarge"

# NOTE(review): the next three arguments look like removed-side diff lines
# from the old ECS service definition rather than launch-template arguments —
# confirm against the real file.
deployment_maximum_percent = "200"
deployment_minimum_healthy_percent = "100"
scheduling_strategy = "REPLICA"
vpc_security_group_ids = [aws_security_group.ecs_node_sg.id]
iam_instance_profile {
arn = aws_iam_instance_profile.ecs_node.arn
}
monitoring {
enabled = true
}

# Boot script: appends ECS_CLUSTER=<cluster name> to /etc/ecs/ecs.config.
# The script is base64-encoded before being passed as user data.
user_data = base64encode(<<-EOF
#!/bin/bash
echo ECS_CLUSTER=${aws_ecs_cluster.main.name} >> /etc/ecs/ecs.config;
EOF
)
}

network_configuration {
security_groups = ["sg-09291eb84a19daeed"]
subnets = ["subnet-0bebe768ad78b896c", "subnet-0f3e41ad21cfe6ff5"]
assign_public_ip = true
# --- ECS ASG ---
# Auto Scaling group (0-2 instances, initially 1) in the public subnets,
# launched from the latest version of the training launch template.
resource "aws_autoscaling_group" "training" {
  name_prefix               = "training-ecs-asg-"
  vpc_zone_identifier       = aws_subnet.public[*].id
  min_size                  = 0
  max_size                  = 2
  desired_capacity          = 1
  health_check_grace_period = 0
  health_check_type         = "EC2"
  protect_from_scale_in     = false

  launch_template {
    id      = aws_launch_template.ecs_lt_training.id
    version = "$Latest"
  }

  tag {
    key                 = "Name"
    value               = "backend-ecs-cluster"
    propagate_at_launch = true
  }

  # Tag that ECS puts on groups managed by a capacity provider; declaring it
  # here keeps Terraform from stripping it on the next apply.
  tag {
    key                 = "AmazonECSManaged"
    value               = ""
    propagate_at_launch = true
  }

  # The ECS capacity provider attached to this group has managed scaling
  # enabled and therefore changes the desired capacity at runtime. Treat the
  # value above as the initial size only, so later applies do not reset it and
  # fight the scaler.
  lifecycle {
    ignore_changes = [desired_capacity]
  }
}
resource "aws_appautoscaling_target" "dev_to_target" {
max_capacity = 1
min_capacity = 1
resource_id = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"
}
resource "aws_appautoscaling_policy" "training_service_auto_scaling_policy" {
name = "TrainingServiceAutoScalingPolicy"
policy_type = "StepScaling"
resource_id = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"

step_scaling_policy_configuration {
adjustment_type = "ChangeInCapacity"
cooldown = 30
metric_aggregation_type = "Average"

step_adjustment {
metric_interval_lower_bound = 0
scaling_adjustment = 3

# --- ECS Capacity Provider ---
# Capacity provider backed by the "training" Auto Scaling group.
resource "aws_ecs_capacity_provider" "training" {
name = "training-ecs-ec2"

auto_scaling_group_provider {
auto_scaling_group_arn = aws_autoscaling_group.training.arn
managed_termination_protection = "DISABLED"

# Managed scaling is on: target capacity 100, scaling in steps of 1 to 2.
managed_scaling {
maximum_scaling_step_size = 2
minimum_scaling_step_size = 1
status = "ENABLED"
target_capacity = 100
}
}

# NOTE(review): this depends_on refers to an app-autoscaling target and looks
# like a removed-side diff line from the old configuration — confirm against
# the real file.
depends_on = [
aws_appautoscaling_target.dev_to_target
]
}
resource "aws_appautoscaling_policy" "dlp-queue-size-too-small-policy" {
name = "DLPQueueSizeTooSmallPolicy"
policy_type = "StepScaling"
resource_id = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"

step_scaling_policy_configuration {
adjustment_type = "ExactCapacity"
cooldown = 30
metric_aggregation_type = "Average"
# Registers the "training" capacity provider on the main cluster and makes it
# the default strategy (base 1, weight 100).
resource "aws_ecs_cluster_capacity_providers" "main" {
cluster_name = aws_ecs_cluster.main.name
capacity_providers = [aws_ecs_capacity_provider.training.name]

# NOTE(review): "step_adjustment {" looks like a removed-side diff line from
# the old scaling policy, not part of this resource — confirm against the
# real file.
step_adjustment {
default_capacity_provider_strategy {
capacity_provider = aws_ecs_capacity_provider.training.name
base = 1
weight = 100
}
}

metric_interval_upper_bound = 0
scaling_adjustment = 1
# --- ECS Task Role ---
# Trust policy document: allows the ecs-tasks service principal to call
# sts:AssumeRole. Used by both the task role and the exec role below.
data "aws_iam_policy_document" "ecs_task_doc" {
statement {
actions = ["sts:AssumeRole"]
effect = "Allow"

principals {
type = "Service"
identifiers = ["ecs-tasks.amazonaws.com"]
}
}
# NOTE(review): this depends_on refers to an app-autoscaling target and looks
# like a removed-side diff line from the old configuration — confirm against
# the real file.
depends_on = [aws_appautoscaling_target.dev_to_target]
}

# IAM role assumed by ECS tasks; its trust policy is the ecs_task_doc document.
resource "aws_iam_role" "ecs_task_role" {
  # Trailing dash separates the prefix from the generated suffix, matching the
  # other name_prefix values in this file (e.g. "backend-ecs-node-role-").
  name_prefix        = "backend-ecs-task-role-"
  assume_role_policy = data.aws_iam_policy_document.ecs_task_doc.json
}

# Attaches each listed AWS-managed policy to the task role, one attachment
# per policy ARN.
# NOTE(review): both policies grant broad access (full DynamoDB, read/write
# Secrets Manager) — confirm the tasks need this much.
resource "aws_iam_role_policy_attachment" "ecs_task_role_policy" {
  for_each = toset([
    "arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess",
    "arn:aws:iam::aws:policy/SecretsManagerReadWrite",
  ])

  policy_arn = each.value
  role       = aws_iam_role.ecs_task_role.name
}


# IAM role that shares the ecs-tasks trust policy (ecs_task_doc) and receives
# the task-execution managed policy.
resource "aws_iam_role" "ecs_exec_role" {
  # Trailing dash separates the prefix from the generated suffix, matching the
  # other name_prefix values in this file (e.g. "backend-ecs-node-role-").
  name_prefix        = "backend-ecs-exec-role-"
  assume_role_policy = data.aws_iam_policy_document.ecs_task_doc.json
}

# Attaches the AWS-managed AmazonECSTaskExecutionRolePolicy to the exec role.
resource "aws_iam_role_policy_attachment" "ecs_exec_role_policy" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.ecs_exec_role.name
}

# CloudWatch log group /ecs/backend; log events are kept for 14 days.
resource "aws_cloudwatch_log_group" "ecs" {
  name              = "/ecs/backend"
  retention_in_days = 14
}
Empty file.
Loading

0 comments on commit 48b28ca

Please sign in to comment.