There are some good guides for working with MapReduce and DynamoDB. I followed this one the other day and got data exporting to S3 going reasonably painlessly. I think your best bet would be to create a Hive script that performs the backup task, save it in an S3 bucket, then use the AWS API for your language to programmatically spin up a new EMR job flow that runs the script and completes the backup. You could schedule this as a cron job.
Example of a hive script exporting data from Dynamo to S3:
-- Backup script: copies every row of the DynamoDB table "my_table" into
-- comma-delimited text files under an S3 prefix.

-- Hive-side view of the DynamoDB table. Every Hive column declared here is
-- listed in dynamodb.column.mapping as <hive_column>:<dynamodb_attribute>.
-- NOTE(review): assumes the DynamoDB attribute is also named company_id,
-- following the same-name pattern of the other columns -- confirm.
CREATE EXTERNAL TABLE my_table_dynamodb (
    company_id string
    ,id string
    ,name string
    ,city string
    ,state string
    ,postal_code string)
STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler'
TBLPROPERTIES ("dynamodb.table.name"="my_table","dynamodb.column.mapping" = "company_id:company_id,id:id,name:name,city:city,state:state,postal_code:postal_code");

-- Backup target: same columns, in the same order, as the DynamoDB-backed table.
CREATE EXTERNAL TABLE my_table_s3 (
    company_id string
    ,id string
    ,name string
    ,city string
    ,state string
    ,postal_code string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 's3://yourBucket/backup_path/dynamo/my_table';

-- Explicit column list so the export does not depend on the two tables
-- happening to declare their columns in the same order.
INSERT OVERWRITE TABLE my_table_s3
SELECT
    company_id
    ,id
    ,name
    ,city
    ,state
    ,postal_code
FROM my_table_dynamodb;
Here is an example of a PHP script that will spin up a new EMR job flow:
// Starts a transient EMR job flow that installs Hive and then runs the backup
// script stored in S3. The API response is left in $response for the caller
// to inspect.

// Locations and version shared by both Hive steps.
$hiveScript   = "s3://us-east-1.elasticmapreduce/libs/hive/hive-script";
$hiveBasePath = "s3://us-east-1.elasticmapreduce/libs/hive/";
$scriptRunner = "s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar";
$hiveVersion  = "0.7.1.3";

$emr = new AmazonEMR();
$response = $emr->run_job_flow(
    'My Test Job',
    // Instances configuration: one master and one core node.
    array(
        "TerminationProtected" => "false",
        "HadoopVersion" => "0.20.205",
        "Ec2KeyName" => "my-key",
        // "false" => the job flow is not kept alive once it has no steps left.
        "KeepJobFlowAliveWhenNoSteps" => "false",
        "InstanceGroups" => array(
            array(
                "Name" => "Master Instance Group",
                "Market" => "ON_DEMAND",
                "InstanceType" => "m1.small",
                "InstanceCount" => 1,
                "InstanceRole" => "MASTER",
            ),
            array(
                "Name" => "Core Instance Group",
                "Market" => "ON_DEMAND",
                "InstanceType" => "m1.small",
                "InstanceCount" => 1,
                "InstanceRole" => "CORE",
            ),
        ),
    ),
    // Optional parameters: AMI version, the steps to run, and the log location.
    array(
        "Name" => "My Test Job",
        "AmiVersion" => "latest",
        "Steps" => array(
            // Step 1: install Hive on the cluster.
            array(
                "HadoopJarStep" => array(
                    "Args" => array(
                        $hiveScript,
                        "--base-path",
                        $hiveBasePath,
                        "--install-hive",
                        "--hive-versions",
                        $hiveVersion,
                    ),
                    "Jar" => $scriptRunner,
                ),
                "Name" => "Setup Hive",
                "ActionOnFailure" => "TERMINATE_JOB_FLOW",
            ),
            // Step 2: run the Hive script from S3. Everything after "--args"
            // is handed to the script run; the "-d" NAME=value pairs here are
            // placeholder values.
            array(
                "HadoopJarStep" => array(
                    "Args" => array(
                        $hiveScript,
                        "--base-path",
                        $hiveBasePath,
                        "--hive-versions",
                        $hiveVersion,
                        "--run-hive-script",
                        "--args",
                        "-f",
                        "s3n://myBucket/hive_scripts/hive_script.hql",
                        "-d",
                        "INPUT=Var_Value1",
                        "-d",
                        "LIB=Var_Value2",
                        "-d",
                        "OUTPUT=Var_Value3",
                    ),
                    "Jar" => $scriptRunner,
                ),
                "Name" => "Run Hive Script",
                "ActionOnFailure" => "CANCEL_AND_WAIT",
            ),
        ),
        "LogUri" => "s3n://myBucket/logs",
    )
);