
I created an ECS cluster, along with a load balancer, to expose a basic hello-world Node app on Fargate using Terraform. Terraform creates my AWS resources just fine and deploys the correct image to ECS Fargate, but the task never passes the initial health check and restarts indefinitely. I suspect a port-forwarding problem, but as far as I can tell my Dockerfile, load balancer, and task definition all expose the correct ports.

Below is the error I see when looking at my service's "events" tab on the ECS dashboard:

service my-first-service (port 2021) is unhealthy in target-group target-group due to (reason Request timed out).

Below is my application code, the Dockerfile, and the Terraform files I am using to deploy to Fargate:

index.js

const express = require('express')
const app = express()
const port = 2021

app.get('/', (req, res) => res.send('Hello World!'))

app.listen(port, () => console.log(`Example app listening on port ${port}!`))

Dockerfile

# Use an official Node runtime as a parent image
FROM node:12.7.0-alpine

# Set the working directory to /app
WORKDIR /app

# Copy package.json to the working directory
COPY package.json .

# Install any needed packages specified in package.json
RUN yarn

# Copying the rest of the code to the working directory
COPY . .

# Make port 2021 available to the world outside this container
EXPOSE 2021

# Run index.js when the container launches
CMD ["node", "index.js"]

application_load_balancer_target_group.tf

resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 80
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = "${aws_default_vpc.default_vpc.id}" # Referencing the default VPC
  health_check {
    matcher = "200,301,302"
    path = "/"
  }
}

resource "aws_lb_listener" "listener" {
  load_balancer_arn = "${aws_alb.application_load_balancer.arn}" # Referencing our load balancer
  port              = "80"
  protocol          = "HTTP"
  default_action {
    type             = "forward"
    target_group_arn = "${aws_lb_target_group.target_group.arn}" # Referencing our target group
  }
}

application_load_balancer.tf

resource "aws_alb" "application_load_balancer" {
  name               = "test-lb-tf" # Naming our load balancer
  load_balancer_type = "application"
  subnets = [ # Referencing the default subnets
    "${aws_default_subnet.default_subnet_a.id}",
    "${aws_default_subnet.default_subnet_b.id}",
    "${aws_default_subnet.default_subnet_c.id}"
  ]
  # Referencing the security group
  security_groups = ["${aws_security_group.load_balancer_security_group.id}"]
}

# Creating a security group for the load balancer:
resource "aws_security_group" "load_balancer_security_group" {
  ingress {
    from_port   = 80 # Allowing traffic in from port 80
    to_port     = 80
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"] # Allowing traffic in from all sources
  }

  egress {
    from_port   = 0 # Allowing any outgoing port
    to_port     = 0
    protocol    = "-1" # Allowing any outgoing protocol
    cidr_blocks = ["0.0.0.0/0"] # Allowing traffic out to all IP addresses
  }
}

ecs_cluster.tf

resource "aws_ecs_cluster" "my_cluster" {
  name = "my-cluster" # Naming the cluster
}

ecs_service.tf

# Providing a reference to our default VPC (these are needed by the aws_ecs_service at the bottom of this file)
resource "aws_default_vpc" "default_vpc" {
}

# Providing a reference to our default subnets (NOTE: Make sure the availability zones match your zone)
resource "aws_default_subnet" "default_subnet_a" {
  availability_zone = "us-east-2a"
}

resource "aws_default_subnet" "default_subnet_b" {
  availability_zone = "us-east-2b"
}

resource "aws_default_subnet" "default_subnet_c" {
  availability_zone = "us-east-2c"
}


resource "aws_ecs_service" "my_first_service" {
  name            = "my-first-service"                             # Naming our first service
  cluster         = "${aws_ecs_cluster.my_cluster.id}"             # Referencing our created Cluster
  task_definition = "${aws_ecs_task_definition.my_first_task.arn}" # Referencing the task our service will spin up
  launch_type     = "FARGATE"
  desired_count   = 1 # Setting the number of containers we want deployed to 1

  # NOTE: The following 'load_balancer' snippet was added here after the creation of the application_load_balancer files.
  load_balancer {
    target_group_arn = "${aws_lb_target_group.target_group.arn}" # Referencing our target group
    container_name   = "${aws_ecs_task_definition.my_first_task.family}"
    container_port   = 2021 # Specifying the container port
  }

  network_configuration {
    subnets          = ["${aws_default_subnet.default_subnet_a.id}", "${aws_default_subnet.default_subnet_b.id}", "${aws_default_subnet.default_subnet_c.id}"]
    assign_public_ip = true # Providing our containers with public IPs
  }
}


resource "aws_security_group" "service_security_group" {
  ingress {
    from_port = 0
    to_port   = 0
    protocol  = "-1"
    # Only allowing traffic in from the load balancer security group
    security_groups = ["${aws_security_group.load_balancer_security_group.id}"]
  }

  egress {
    from_port   = 0 # Allowing any outgoing port
    to_port     = 0
    protocol    = "-1" # Allowing any outgoing protocol
    cidr_blocks = ["0.0.0.0/0"] # Allowing traffic out to all IP addresses
  }
}

ecs_task_definition.tf

resource "aws_ecs_task_definition" "my_first_task" {
  family                   = "my-first-task" # Naming our first task
  container_definitions    = <<DEFINITION
  [
    {
      "name": "my-first-task",
      "image": "${var.ECR_IMAGE_URL}",
      "essential": true,
      "portMappings": [
        {
          "containerPort": 2021,
          "hostPort": 2021
        }
      ],
      "memory": 512,
      "cpu": 256
    }
  ]
  DEFINITION
  requires_compatibilities = ["FARGATE"] # Stating that we are using ECS Fargate
  network_mode             = "awsvpc"    # Using awsvpc as our network mode as this is required for Fargate
  memory                   = 512         # Specifying the memory our container requires
  cpu                      = 256         # Specifying the CPU our container requires
  execution_role_arn       = "${aws_iam_role.ecsTaskExecutionRole.arn}"
}

resource "aws_iam_role" "ecsTaskExecutionRole" {
  name               = "ecsTaskExecutionRole"
  assume_role_policy = "${data.aws_iam_policy_document.assume_role_policy.json}"
}

data "aws_iam_policy_document" "assume_role_policy" {
  statement {
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}

resource "aws_iam_role_policy_attachment" "ecsTaskExecutionRole_policy" {
  role       = "${aws_iam_role.ecsTaskExecutionRole.name}"
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

Where am I going wrong here?


2 Answers


By the look of it, you are creating a new VPC with subnets, but there are no route tables defined and no internet gateway attached to the VPC. So your VPC is effectively private: it is not reachable from the internet, nor can it reach ECR to pull your Docker image.
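
If you do want to keep a self-managed VPC, the missing pieces would look roughly like this (a minimal sketch; aws_vpc.main and aws_subnet.public_a are hypothetical names standing in for your own VPC and subnet resources):

resource "aws_internet_gateway" "igw" {
  vpc_id = aws_vpc.main.id # Hypothetical VPC resource
}

resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  # Send all non-local traffic to the internet gateway
  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.igw.id
  }
}

resource "aws_route_table_association" "public_a" {
  subnet_id      = aws_subnet.public_a.id # Hypothetical subnet resource
  route_table_id = aws_route_table.public.id
}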

Maybe, instead of creating a new VPC called default_vpc, you want to use the existing default VPC. If so, you have to use data sources:

data "aws_vpc" "default_vpc" {
  default = true
}

to get subnets:

data "aws_subnet_ids" "default" {
  vpc_id = data.aws_vpc.default_vpc.id
}

and modify the rest of the code to reference these data sources.
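
For example, the target group from the question would then point at the data source (a sketch reusing the resource names from the question):

resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 80
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = data.aws_vpc.default_vpc.id # Data source instead of the aws_default_vpc resource
  health_check {
    matcher = "200,301,302"
    path    = "/"
  }
}

Note that on newer versions of the AWS provider, the aws_subnet_ids data source has been deprecated in favor of aws_subnets, which exposes the same ids attribute:

data "aws_subnets" "default" {
  filter {
    name   = "vpc-id"
    values = [data.aws_vpc.default_vpc.id]
  }
}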

Also, for Fargate, you should remove:

"hostPort": 2021

You also forgot to set a security group for your ECS service. It should be:

  network_configuration {
    subnets          = data.aws_subnet_ids.default.ids
    assign_public_ip = true # Providing our containers with public IPs
    security_groups  = [aws_security_group.service_security_group.id]
  }

I had a similar issue when I was migrating from k8s to ECS Fargate. My task could not start; it was a nightmare. The same image, with the same health checks, worked great in k8s.

I can see that you are missing a healthCheck in your task definition; at least, that was the issue for me.

Here are my container_definitions:

  container_definitions = jsonencode([{
    name        = "${var.app_name}-container-${var.environment}"
    image       = "${var.container_repository}:${var.container_image_version}"
    essential   = true

    environment = concat(
      var.custom_env_variables,
      [
        {
          name  = "JAVA_TOOL_OPTIONS"
          value = "-Xmx${var.container_memory_max_ram}m -XX:MaxRAM=${var.container_memory_max_ram}m -XX:+UseParallelGC -XX:MinHeapFreeRatio=5 -XX:MaxHeapFreeRatio=10 -XX:GCTimeRatio=4"
        },
        {
          name  = "SPRING_PROFILES_ACTIVE"
          value = var.spring_profile
        },
        {
          name  = "APP_NAME"
          value = var.spring_app_name
        }
      ]
    )

    portMappings = [
      {
        protocol      = "tcp"
        containerPort = var.container_port
      },
      {
        protocol      = "tcp"
        containerPort = var.container_actuator_port
      }
    ]
    healthCheck = {
      retries     = 10
      command     = ["CMD-SHELL", "curl -f http://localhost:8081/actuator/liveness || exit 1"]
      timeout     = 5
      interval    = 10
      startPeriod = var.health_start_period
    }
    logConfiguration = {
      logDriver = "awslogs"
      options = {
        awslogs-group         = aws_cloudwatch_log_group.main.name
        awslogs-stream-prefix = "ecs"
        awslogs-region        = var.aws_region
      }
    }
    mountPoints = [{
        sourceVolume = "backend_efs",
        containerPath = "/data",
        readOnly = false
    }]
  }])

Here is the healthCheck part:

healthCheck = {
  retries     = 10
  command     = ["CMD-SHELL", "curl -f http://localhost:8081/actuator/liveness || exit 1"]
  timeout     = 5
  interval    = 10
  startPeriod = var.health_start_period
}

In order to start, the container needs a way to check that the task is running OK, and I could only get that via curl. I have one endpoint that tells me whether the app is live. You need to specify your own; it just needs to return a 200.
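
For the hello-world app in the question, the existing / route already returns a 200, so something along these lines should work (a sketch; the retry, timeout, and interval values are arbitrary, and curl has to exist in the image, which is the next point):

    healthCheck = {
      retries     = 3
      command     = ["CMD-SHELL", "curl -f http://localhost:2021/ || exit 1"]
      timeout     = 5
      interval    = 30
      startPeriod = 10
    }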

Also, there is no curl command in the image by default; you need to add it in your Dockerfile. That was the next issue, where I spent a few hours, as there was no clear error on ECS.

I added this line:

RUN apt-get update && apt-get install -y --no-install-recommends curl
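
Note that the image in the question is node:12.7.0-alpine, which has no apt-get; on Alpine the equivalent would be:

RUN apk add --no-cache curl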