I am backfilling a DynamoDB table using the Data Pipeline import feature. The table has a GSI, and I have set the write capacity units to 8K for both the table and the GSI.

However, when I run the Data Pipeline job to backfill the data from S3, the consumed write capacity peaks at roughly 4K for both the table and the GSI. When I tried the same import on a table without a GSI, the consumed capacity reached ~8K.
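
For context, provisioning 8K write capacity on both the base table and the GSI can be done with a boto3 UpdateTable call along these lines (a minimal sketch, not the exact call used here; the GSI name "my-gsi" and the read capacity values are placeholders):

import boto3

dynamodb = boto3.client("dynamodb", region_name="us-west-2")

dynamodb.update_table(
    TableName="KNEX_NODE",
    ProvisionedThroughput={
        "ReadCapacityUnits": 100,      # read side left low for the backfill
        "WriteCapacityUnits": 8000,    # 8K WCU on the base table
    },
    GlobalSecondaryIndexUpdates=[
        {
            "Update": {
                "IndexName": "my-gsi",             # placeholder GSI name
                "ProvisionedThroughput": {
                    "ReadCapacityUnits": 100,
                    "WriteCapacityUnits": 8000,    # 8K WCU on the GSI as well
                },
            }
        }
    ],
)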

Data Pipeline configuration:

{
  "objects": [
    {
      "failureAndRerunMode": "CASCADE",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://logger/logs/",
      "scheduleType": "ONDEMAND",
      "name": "Default",
      "id": "Default"
    },
    {
      "output": {
        "ref": "DDBDestinationTable"
      },
      "input": {
        "ref": "S3InputDataNode"
      },
      "maximumRetries": "2",
      "name": "TableLoadActivity",
      "step": "s3://dynamodb-emr-#{myDDBRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbImport,#{input.directoryPath},#{output.tableName},#{output.writeThroughputPercent}",
      "runsOn": {
        "ref": "EmrClusterForLoad"
      },
      "id": "TableLoadActivity",
      "type": "EmrActivity",
      "resizeClusterBeforeRunning": "false"
    },
    {
      "writeThroughputPercent": "#{myDDBWriteThroughputRatio}",
      "name": "DDBDestinationTable",
      "id": "DDBDestinationTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    },
    {
      "taskInstanceType": "m3.xlarge",
      "bootstrapAction": "s3://#{myDDBRegion}.elasticmapreduce/bootstrap-actions/configure-hadoop, --mapred-key-value,mapreduce.map.speculative=false",
      "taskInstanceCount": "5",
      "name": "EmrClusterForLoad",
      "coreInstanceCount": "1",
      "coreInstanceType": "m3.xlarge",
      "amiVersion": "3.8.0",
      "id": "EmrClusterForLoad",
      "masterInstanceType": "m3.xlarge",
      "region": "#{myDDBRegion}",
      "type": "EmrCluster"
    },
    {
      "directoryPath": "#{myInputS3Loc}",
      "name": "S3InputDataNode",
      "id": "S3InputDataNode",
      "type": "S3DataNode"
    }
  ],
  "parameters": [
    {
      "description": "Input S3 folder",
      "id": "myInputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "description": "Target DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    },
    {
      "default": "0.25",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB write throughput ratio",
      "id": "myDDBWriteThroughputRatio",
      "type": "Double"
    },
    {
      "default": "us-east-1",
      "watermark": "us-east-1",
      "description": "Region of the DynamoDB table",
      "id": "myDDBRegion",
      "type": "String"
    }
  ],
  "values": {
    "myDDBRegion": "us-west-2",
    "myDDBTableName": "KNEX_NODE",
    "myDDBWriteThroughputRatio": "1",
    "myInputS3Loc": "s3://DataToLoad/sampleData/"
  }
}
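
With myDDBWriteThroughputRatio set to 1, the job should be targeting the full provisioned write rate, not half of it. As a rough sanity check of what the configuration asks for (a sketch of the usual back-of-the-envelope calculation, assuming the EMR DynamoDB connector splits the write budget evenly across map tasks; this is not the connector's source code):

# Assumption: write budget is divided evenly across mappers.
provisioned_wcu = 8000
write_throughput_ratio = 1.0          # myDDBWriteThroughputRatio
num_mappers = 5                       # e.g. roughly one per task instance

target_total_writes_per_sec = provisioned_wcu * write_throughput_ratio
per_mapper_writes_per_sec = target_total_writes_per_sec / num_mappers

print(target_total_writes_per_sec)    # 8000.0 -> the job should aim for ~8K writes/sec
print(per_mapper_writes_per_sec)      # 1600.0 writes/sec per mapper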

Any suggestions for achieving maximum write throughput when writing to a table with a GSI?

1 Answer

One of the main reasons can be the distribution of your hash keys in the GSI. Your main table probably has only one partition (which is why you see it writing at 8K), while your GSI has two partitions (8K / 2 = 4K per partition).

What do your GSI hash key values look like? You may want to change your hash key format to get a better distribution (otherwise, as your table grows, your GSI partitions will split again and the problem will repeat).
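
To see whether the GSI hash key is skewed, something like the sketch below can count how many items fall under each GSI hash key value ("gsi_hash_attr" is a hypothetical attribute name; a full scan like this is only practical on a small table or a sampled segment):

from collections import Counter
import boto3

dynamodb = boto3.client("dynamodb", region_name="us-west-2")

counts = Counter()
kwargs = {"TableName": "KNEX_NODE", "ProjectionExpression": "gsi_hash_attr"}

while True:
    page = dynamodb.scan(**kwargs)
    for item in page.get("Items", []):
        if "gsi_hash_attr" not in item:
            continue
        # Attribute values come back as typed maps, e.g. {"S": "some-value"}
        value = list(item["gsi_hash_attr"].values())[0]
        counts[value] += 1
    if "LastEvaluatedKey" not in page:
        break
    kwargs["ExclusiveStartKey"] = page["LastEvaluatedKey"]

# A few counts that dwarf the rest suggest a hot GSI hash key.
print(counts.most_common(10))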