0
votes

What is the best way to make sure a container task eventually runs to completion if it times out while waiting for an instance (i.e. while it is in the Provisioning state)?

My preferred solution would be that it never times out. Toward that end, I tried to increase the start timeout of the container as shown below (it is the line right below "image": "XXXXXXXXXX.dkr.ecr.us-east-1.amazonaws.com/hsforum:latest", which is one of the longer lines):

    "ipcMode": null,
    "executionRoleArn": "arn:aws:iam::XXXXXXXXXX:role/PICKLEStack-FHTask871D3AF0-C9GJ55PNKL4J",
    "containerDefinitions": [
        {
            "dnsSearchDomains": [],
            "environmentFiles": null,
            "logConfiguration": null,
            "entryPoint": [],
            "portMappings": [],
            "command": [
                "-c",
                "interactive",
                "-a",
                "PICKLE",
                "--tooltype",
                "false"
            ],
            "linuxParameters": null,
            "cpu": 2,
            "environment": [],
            "resourceRequirements": null,
            "ulimits": null,
            "dnsServers": [],
            "mountPoints": [],
            "workingDirectory": null,
            "secrets": null,
            "dockerSecurityOptions": [],
            "memory": 256,
            "memoryReservation": null,
            "volumesFrom": [],
            "stopTimeout": null,
            "image": "XXXXXXXXXX.dkr.ecr.us-east-1.amazonaws.com/hsforum:latest",
            "startTimeout": 54000,
            "firelensConfiguration": null,
            "dependsOn": null,
            "disableNetworking": null,
            "interactive": null,
            "healthCheck": null,
            "essential": true,
            "links": [],
            "hostname": null,
            "extraHosts": null,
            "pseudoTerminal": null,
            "user": null,
            "readonlyRootFilesystem": null,
            "dockerLabels": null,
            "systemControls": [],
            "privileged": null,
            "name": "HSPICKLE-Scheduled-Container"
        }
    ],
    "memory": null,
    "taskRoleArn": "arn:aws:iam::032209199189:role/PICKLEStack-FHTask871D3AF0-C9GJ55PNKL4J",
    "family": "PICKLEStackHSForumScheduledRun41E73875",
    "pidMode": null,
    "requiresCompatibilities": [
        "EC2"
    ],
    "networkMode": "bridge",
    "cpu": null,
    "inferenceAccelerators": [],
    "proxyConfiguration": null,
    "volumes": [],
    "placementConstraints": [],
    "tags": []
}```

If stopping it from timing out while provisioning is not possible, the only solution I can think of is to capture the event on EventBridge when the ECS task goes from "Provisioning" to "Stopping" and simply requeue it with a Lambda. Is that a reasonable approach?

I am hoping someone out there has had to solve this before.

Thanks!
1

1 Answers

0
votes

Well, it turns out that the best way I could solve this was to use the AWS SDK and wait synchronously on calls to RunTaskAsync. It was not really the solution I wanted, but it ends up working quite well for my case: you know immediately if a task is going to fail, and you can even associate the job with its task ARN, which you can review later to keep track of failures. My use case is creating a single task at a time for specific jobs, so you might need to handle the failure values differently.

// ECS client constructed against the USEast1 region endpoint; the snippet below uses it to call RunTaskAsync.
public AmazonECSClient client = new AmazonECSClient(Amazon.RegionEndpoint.USEast1);
    .
    .
    .
            RunTaskRequest req = new RunTaskRequest()
            {
                Cluster = clusterName,
                Count = 1,
                PlacementConstraints = placementConstraintList,
                LaunchType = LaunchType.EC2,
                TaskDefinition = "TASKDEF",
                Overrides = taskOverride,
                Group = $"id1:{tryJob.id1}__id2:{tryJob.id2}",
            };

            var task = client.RunTaskAsync(req);
            task.Wait(); //Wait so that we can know if our task actually succeeded or not
            if (task.IsCompletedSuccessfully)
            {

                var result = task.Result;
                if (result.Tasks.Count == 0)
                {

                    if (result.Failures.Count > 0)
                    {
                        //Increase Delay but cap delay at 10 seconds
                        if (_taskFailCountBackoff < 20)
                            _taskFailCountBackoff++;
                        var failure = result.Failures[0];
                        if (failure.Reason.ToLower().Contains("memberof placement"))
                        {
                            _lastFailedSourceId = tryJob.SourceId;
                            //Shuffle to the end 
                            _WaitingJobs.Remove(tryJob);
                            _WaitingJobs.Add(tryJob);
                        }
                        else if (failure.Reason.ToLower().Contains("resource:memory") || failure.Reason.ToLower().Contains("resource:memory"))
                        {
                            //fail silently
                        }
                        else
                        {

                            Program.Logger.Error($"Job Unable to be queued, retrying -- s: {tryJob.id1} f: {tryJob.id2}");
                            Program.Logger.Error($"Failure Reason: {result.Failures[0].Reason} -- {result.Failures[0].Detail} ");
                            System.Threading.Thread.Sleep(2000);
                            //Shuffle to the end 
                            _WaitingJobs.Remove(tryJob);
                            _WaitingJobs.Add(tryJob);
                        }

                    }
                    //We failed if we fail repeatedly don't spam the ecs agents
                    System.Threading.Thread.Sleep(500 * _taskFailCountBackoff);
                    return;
                }
                //we succeeded send quickly if possible
                _taskFailCountBackoff = 0;
                tryJob.TaskArn = result.Tasks[0].TaskArn;
                tryJob.containerInstanceARN = result.Tasks[0].ContainerInstanceArn;
                if (tryJob.containerInstanceARN != "")
                {
                    tryJob.ec2InstanceId = _containerInstances.Where(t => t.ContainerInstanceArn == tryJob.containerInstanceARN).FirstOrDefault().Ec2InstanceId;
                }
                _WaitingJobs.Remove(tryJob);
                _RunningJobs.Add(tryJob);
                Program.Logger.Info($"Added Job to Container s: {tryJob.id1} f: {tryJob.di2}");