4
votes

Could anyone please shed some light on why my ECS stack is not scaling out new EC2 instances?

I configured my ECS stack using CloudFormation. The initial configuration runs fine. As soon as I launch my stack, a process is triggered that keeps the CPU load above 90% so that the scale-out alarm fires for testing purposes.

I set up a scale-out alarm that triggers a scale-out policy when CPU > 15%, and a scale-in alarm that triggers a scale-in policy when CPU < 4%.

The log messages then report the following:

Message: service ECSService-12BBO1EE3SRUF was unable to place a task because no container instance met all of its requirements. The closest matching container-instance 149e8eea-a8bc-433f-abbb-9a49c3a3c5b5 has insufficient memory available. For more information, see the Troubleshooting section. Message: Successfully set the desired count to 2. Waiting for change to be fulfilled by ecs. Cause: monitor alarm CPU utilization greater than 5% in state ALARM triggered policy ServiceScaleOutPolicy 155194fc-ee07-46ff-a822-018bd704602b

It looks like ECS is trying to place more tasks on the same instance instead of scaling out the number of instances and placing a new task on a new instance. How do I get ECS to scale out to a new instance and to place a new task on the new instance?

My CloudFormation scaling configuration looks like this:

ECSAutoScalingGroup:
    # EC2 Auto Scaling group that launches the cluster's container instances
    # from the `ContainerInstances` launch configuration.
    Type: AWS::AutoScaling::AutoScalingGroup
    DependsOn: ECSALB
    # Creation waits up to 15 minutes for resource signals.
    CreationPolicy:
      ResourceSignal:
        Timeout: PT15M
    UpdatePolicy:
      # Rolling update: one instance per batch, keeping at least one in
      # service and waiting up to 15 minutes for each batch to signal.
      AutoScalingRollingUpdate:
        MaxBatchSize: "1"
        MinInstancesInService: "1"
        PauseTime: PT15M
        WaitOnResourceSignals: "true"
      AutoScalingReplacingUpdate:
        WillReplace: "true"
    Properties:
      LaunchConfigurationName: !Ref ContainerInstances
      VPCZoneIdentifier: !Ref SubnetId
      # MinSize and DesiredCapacity use the same `DesiredCapacity` reference,
      # so the group starts out at its minimum size.
      DesiredCapacity: !Ref DesiredCapacity
      MinSize: !Ref DesiredCapacity
      MaxSize: !Ref MaxSize
      HealthCheckGracePeriod: 320

ServiceScalingTarget:
    # Registers the ECS service's desired task count as an Application Auto
    # Scaling target, bounded between 1 and 3.
    Type: AWS::ApplicationAutoScaling::ScalableTarget
    DependsOn: ECSService
    Properties:
      ServiceNamespace: ecs
      ScalableDimension: ecs:service:DesiredCount
      # Resolves to "service/<ECSCluster>/<ECSService.Name>".
      ResourceId: !Sub 'service/${ECSCluster}/${ECSService.Name}'
      MinCapacity: 1
      MaxCapacity: 3
      RoleARN: !GetAtt AutoscalingRole.Arn

ServiceScaleOutPolicy:
    # Step-scaling policy on ServiceScalingTarget: raises the scalable
    # target's capacity by 1 each time it is triggered.
    Type: AWS::ApplicationAutoScaling::ScalingPolicy
    Properties:
      PolicyName: ServiceScaleOutPolicy
      PolicyType: StepScaling
      ScalingTargetId: !Ref ServiceScalingTarget
      StepScalingPolicyConfiguration:
        AdjustmentType: ChangeInCapacity
        MetricAggregationType: Average
        Cooldown: 60
        StepAdjustments:
          # Lower bound 0 with no upper bound: a single step covering every
          # value at or above the alarm threshold.
          - ScalingAdjustment: 1
            MetricIntervalLowerBound: 0

  ServiceScaleInPolicy:
    # Step-scaling policy on ServiceScalingTarget: lowers the scalable
    # target's capacity by 1 each time it is triggered.
    Type: AWS::ApplicationAutoScaling::ScalingPolicy
    Properties:
      PolicyName: ServiceScaleInPolicy
      PolicyType: StepScaling
      ScalingTargetId: !Ref ServiceScalingTarget
      StepScalingPolicyConfiguration:
        AdjustmentType: ChangeInCapacity
        MetricAggregationType: Average
        Cooldown: 60
        StepAdjustments:
          # Upper bound 0 with no lower bound: a single step covering every
          # value at or below the alarm threshold.
          - ScalingAdjustment: -1
            MetricIntervalUpperBound: 0

  CPUScaleOutAlarm:
    # CloudWatch alarm on the service's AWS/ECS CPUUtilization metric; when
    # the 60-second maximum exceeds 15 for one period it invokes
    # ServiceScaleOutPolicy.
    Type: AWS::CloudWatch::Alarm
    Properties:
      AlarmName: 'CPU utilization greater than 15%'
      AlarmDescription: 'Alarm if cpu utilization greater than 15% of reserved cpu'
      # Metric identity: the service-level CPU metric for this cluster.
      Namespace: AWS/ECS
      MetricName: CPUUtilization
      Dimensions:
        - Name: ClusterName
          Value: !Ref ECSCluster
        - Name: ServiceName
          Value: !GetAtt ECSService.Name
      # Evaluation: a single 60-second period above the threshold.
      Statistic: Maximum
      ComparisonOperator: GreaterThanThreshold
      Threshold: "15"
      Period: "60"
      EvaluationPeriods: "1"
      AlarmActions:
        - !Ref ServiceScaleOutPolicy

  CPUScaleInAlarm:
    # CloudWatch alarm on the service's AWS/ECS CPUUtilization metric; when
    # the 60-second maximum stays below 4 for four consecutive periods it
    # invokes ServiceScaleInPolicy.
    Type: AWS::CloudWatch::Alarm
    Properties:
      AlarmName: CPU utilization less than 4%
      # Description now matches the LessThanThreshold comparison below
      # (it previously said "greater than").
      AlarmDescription: Alarm if cpu utilization less than 4% of reserved cpu
      Namespace: AWS/ECS
      MetricName: CPUUtilization
      Dimensions:
      - Name: ClusterName
        Value: !Ref ECSCluster
      - Name: ServiceName
        Value: !GetAtt ECSService.Name
      Statistic: Maximum
      Period: '60'
      # Four periods, so scale-in reacts more slowly than scale-out.
      EvaluationPeriods: '4'
      Threshold: '4'
      ComparisonOperator: LessThanThreshold
      AlarmActions:
        - !Ref ServiceScaleInPolicy
AutoscalingRole:
    # IAM role assumed by Application Auto Scaling to read alarms and update
    # the ECS service.
    Type: AWS::IAM::Role
    Properties:
      Path: /
      # Trust policy: only the Application Auto Scaling service may assume
      # this role.
      AssumeRolePolicyDocument:
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - application-autoscaling.amazonaws.com
            Action:
              - 'sts:AssumeRole'
      Policies:
        - PolicyName: service-autoscaling
          PolicyDocument:
            Statement:
              - Effect: Allow
                Action:
                  - 'application-autoscaling:*'
                  - 'cloudwatch:DescribeAlarms'
                  - 'cloudwatch:PutMetricAlarm'
                  - 'ecs:DescribeServices'
                  - 'ecs:UpdateService'
                # Not scoped to a specific service or alarm.
                Resource: '*'

TaskDefinition:
    # ECS task with two containers: an nginx front end linked to a
    # non-essential "kestrel-container". Both log through the awslogs driver
    # to CloudwatchLogsGroup.
    Type: AWS::ECS::TaskDefinition
    Properties:
      # Resolves to "<stack name>-frontend-task".
      Family: !Sub '${AWS::StackName}-frontend-task'
      Volumes:
        # Host file exposed to the nginx container as its default site config.
        - Name: 'volume-nginx-conf'
          Host:
            SourcePath: '/docker-volumes/nginx/nginx.conf'
      ContainerDefinitions:
        - Name: nginx-container
          Image: nginx:latest
          Essential: "true"
          Cpu: "64"
          Memory: "150"
          Links:
            - 'kestrel-container'
          PortMappings:
            - ContainerPort: 80
            - ContainerPort: 443
          MountPoints:
            - SourceVolume: 'volume-nginx-conf'
              ContainerPath: '/etc/nginx/conf.d/default.conf'
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group: !Ref CloudwatchLogsGroup
              awslogs-region: !Ref AWS::Region
              awslogs-stream-prefix: task-nginx-container

        # Together with nginx-container the task's Memory values sum to 662,
        # which each container instance must have free to place one task.
        - Name: kestrel-container
          Image: some-image
          Essential: "false"
          Cpu: "940"
          Memory: "512"
          PortMappings:
            - ContainerPort: 5443
          LogConfiguration:
            LogDriver: awslogs
            Options:
              awslogs-group: !Ref CloudwatchLogsGroup
              awslogs-region: !Ref AWS::Region
              awslogs-stream-prefix: task-kestrel-container
1

1 Answers

1
votes

It seems you have confused service auto scaling with cluster auto scaling. What you have configured above auto-scales the service (the number of tasks) based on the CPU usage of the service's own containers.

What you want is to auto-scale the ECS cluster itself, adding a new EC2 instance when the cluster's overall memory reservation reaches a threshold.

Below is a snippet showing how to configure auto scaling at the cluster level when memory reaches 80%. I can't share the entire CloudFormation template.

ECSInstanceAutoScalingGroup:
    # Auto Scaling group for the cluster's EC2 container instances, spread
    # over three private subnets exported by the VPC stack.
    Type: AWS::AutoScaling::AutoScalingGroup
    Properties:
      LaunchConfigurationName: !Ref ECSInstanceLaunchConfiguration
      VPCZoneIdentifier:
        - Fn::ImportValue: !Sub '${VPCStackName}-SubnetPrivateA'
        - Fn::ImportValue: !Sub '${VPCStackName}-SubnetPrivateB'
        - Fn::ImportValue: !Sub '${VPCStackName}-SubnetPrivateC'
      # Starts at ECSInstanceCount instances and may grow to 6.
      MinSize: !Ref ECSInstanceCount
      DesiredCapacity: !Ref ECSInstanceCount
      MaxSize: 6
      # Publish group metrics at one-minute granularity.
      MetricsCollection:
        - Granularity: 1Minute

ECSInstanceLaunchConfiguration:
  # Launch configuration for the cluster's EC2 container instances. The
  # cfn-init metadata writes an AWS CLI region config and the ECS agent
  # config that joins the instance to ECSCluster.
  # NOTE(review): no UserData is visible in this block, so nothing here
  # invokes cfn-init to apply the metadata - confirm it exists in the full
  # template.
  Type: AWS::AutoScaling::LaunchConfiguration
  Metadata:
    AWS::CloudFormation::Init:
      configSets:
        ConfigCluster:
        - Install
      Install:
        files:
          /home/ec2-user/.aws/config:
            content: !Sub |
              [default]
              region = ${AWS::Region}
            # Plain config file: readable, not executable (was '000755').
            mode: '000644'
            owner: ec2-user
            group: root
          /etc/ecs/ecs.config:
            content: !Sub |
              ECS_CLUSTER=${ECSCluster}
              ECS_ENABLE_CONTAINER_METADATA=true
              ECS_ENGINE_TASK_CLEANUP_WAIT_DURATION=20m
              ECS_DISABLE_IMAGE_CLEANUP=false
              ECS_IMAGE_CLEANUP_INTERVAL=10m
              ECS_IMAGE_MINIMUM_CLEANUP_AGE=20m
            # Plain config file: readable, not executable (was '000755').
            mode: '000644'
            owner: root
            group: root
  Properties:
    ImageId: !Ref ECSAMI
    InstanceType: !Ref 'ECSInstanceType'
    # Instances get no public IP address.
    AssociatePublicIpAddress: 'false'
    IamInstanceProfile: !Ref ECSClusterRoleInstance
    SecurityGroups:
    - !Ref 'ECSInstanceSecurityGroup'

ECSScalingPolicy:
  # Target-tracking policy on the instance Auto Scaling group: tracks the
  # cluster's AWS/ECS MemoryReservation metric against a target of 80
  # percent, with scale-in left enabled.
  Type: AWS::AutoScaling::ScalingPolicy
  Properties:
    PolicyType: TargetTrackingScaling
    AutoScalingGroupName: !Ref ECSInstanceAutoScalingGroup
    TargetTrackingConfiguration:
      TargetValue: 80
      DisableScaleIn: false
      CustomizedMetricSpecification:
        Namespace: AWS/ECS
        MetricName: MemoryReservation
        Statistic: Maximum
        Unit: Percent
        Dimensions:
          # NOTE(review): the cluster name is rebuilt from EnvName and
          # EnvNumber here, while the launch configuration uses
          # ${ECSCluster} - confirm both resolve to the same name.
          - Name: ClusterName
            Value: !Sub 'ecs-${EnvName}-${EnvNumber}'