I'm starting with a collection of documents that look like this:
{
state: 'CA',
year: 2014,
accepted: true
}
{
state: 'AL',
year: 2012,
accepted: false
}
{
state: 'CA',
year: 2013,
accepted: false
}
...
I want to end up with a new aggregated collection in this format:
{
_id: 'CA',
value: {
submittedApplications2012: 34,
submittedApplications2013: 23,
submittedApplications2014: 72,
acceptedApplications2012: 12,
acceptedApplications2013: 7,
acceptedApplications2014: 5
}
}
{
_id: 'AL',
value: {
submittedApplications2012: 73,
submittedApplications2013: 67,
submittedApplications2014: 98,
acceptedApplications2012: 45,
acceptedApplications2013: 34,
acceptedApplications2014: 31
}
}
I have written a mapreduce that groups the documents by state name and loops through each state, incrementing the appropriate properties:
var map = function() {
var key = this.state;
var value = {
year: this.year,
accepted: this.accepted
};
emit(key, value);
};
var reduce = function(key, values) {
var reducedObject = {
submittedApplications2012: 0,
submittedApplications2013: 0,
submittedApplications2014: 0,
acceptedApplications2012: 0,
acceptedApplications2013: 0,
acceptedApplications2014: 0
};
values.forEach(function(v) {
switch (v.year) {
case 2014:
reducedObject.submittedApplications2014++;
if (v.accepted) {
reducedObject.acceptedApplications2014++;
}
break;
case 2013:
reducedObject.submittedApplications2013++;
if (v.accepted) {
reducedObject.acceptedApplications2013++;
}
break;
case 2012:
reducedObject.submittedApplications2012++;
if (v.accepted) {
reducedObject.acceptedApplications2012++;
}
break;
default:
}
});
return reducedObject;
};
db.test_collection.mapReduce(
map,
reduce,
{out: {inline: 1}}
)
Unfortunately, the results are inaccurate. Alabama ends up with 9, 8 and 3 for submitted2012, submitted2013 and submitted2014. The other states also end up with low numbers. With 10,000 records, the numbers should be a lot higher.
I think this is happening because the reduce function is being called several times (see Reduce is called several times with the same key in mongodb map-reduce) and the reducedObject object is being overwritten on subsequent passes.
How can this be prevented so that it accurately counts the number of submitted and accepted applications?
Here is some code to create a test collection in the original format:
// Generate a test collection with 10K documents for demo'ing purposes
var i = 10000,
states = ['AL', 'CA', 'FL', 'TN', 'OH'],
years = [2012, 2013, 2014];
db.test_collection.drop();
while (i--) {
db.test_collection.insert({
state: states[Math.floor(Math.random() * states.length)],
year: NumberInt(years[Math.floor(Math.random() * years.length)]),
accepted: Math.random() >= 0.5
});
}