1
votes

I'm starting with a collection of documents that look like this:

{
  state: 'CA',
  year: 2014,
  accepted: true
}
{
  state: 'AL',
  year: 2012,
  accepted: false
}
{
  state: 'CA',
  year: 2013,
  accepted: false
}
...

I want to end up with a new aggregated collection in this format:

{
  _id: 'CA',
  value: {
    submittedApplications2012: 34,
    submittedApplications2013: 23,
    submittedApplications2014: 72,
    acceptedApplications2012: 12,
    acceptedApplications2013: 7,
    acceptedApplications2014: 5
  }
}
{
  _id: 'AL',
  value: {
    submittedApplications2012: 73,
    submittedApplications2013: 67,
    submittedApplications2014: 98,
    acceptedApplications2012: 45,
    acceptedApplications2013: 34,
    acceptedApplications2014: 31
  }
}

I have written a mapreduce that groups the documents by state name and loops through each state, incrementing the appropriate properties:

var map = function() {
  var key = this.state;
  var value = {
    year: this.year,
    accepted: this.accepted
  };
  emit(key, value);
};

var reduce = function(key, values) {
  var reducedObject = {
    submittedApplications2012: 0,
    submittedApplications2013: 0,
    submittedApplications2014: 0,
    acceptedApplications2012: 0,
    acceptedApplications2013: 0,
    acceptedApplications2014: 0
  };

  values.forEach(function(v) {
    switch (v.year) {
      case 2014:
        reducedObject.submittedApplications2014++;
        if (v.accepted) {
          reducedObject.acceptedApplications2014++;
        }
        break;
      case 2013:
        reducedObject.submittedApplications2013++;
        if (v.accepted) {
          reducedObject.acceptedApplications2013++;
        }
        break;
      case 2012:
        reducedObject.submittedApplications2012++;
        if (v.accepted) {
          reducedObject.acceptedApplications2012++;
        }
        break;
      default:
    }
  });
  return reducedObject;
};

db.test_collection.mapReduce(
  map,
  reduce,
  {out: {inline: 1}}
)

Unfortunately, the results are inaccurate. Alabama ends up with 9, 8 and 3 for submitted2012, submitted2013 and submitted2014. The other states also end up with low numbers. With 10,000 records, the numbers should be a lot higher.

I think this is happening because the reduce function is being called several times (see Reduce is called several times with the same key in mongodb map-reduce) and the reducedObject object is being overwritten on subsequent passes.

How can this be prevented so that it accurately counts the number of submitted and accepted applications?

Here is some code to create a test collection in the original format:

// Generate a test collection with 10K documents for demo'ing purposes
var i = 10000,
    states = ['AL', 'CA', 'FL', 'TN', 'OH'],
    years = [2012, 2013, 2014];
db.test_collection.drop();
while (i--) {
  db.test_collection.insert({
    state: states[Math.floor(Math.random() * states.length)],
    year: NumberInt(years[Math.floor(Math.random() * years.length)]),
    accepted: Math.random() >= 0.5
  });
}
2

2 Answers

1
votes

I really don't think that mapReduce is the right choice for this. Peronally I would be using the aggregation framework as it will process much more quickly here as the operations are all in native code without JavaScript translation of code or objects.

Doing so is just a simple $group operation, with some treatment by $cond to convert the true/false values to numeric:

db.test_collection.aggregate([
    { "$group": {
        "_id": {
            "state": "$state",
            "year": "$year"
        },
        "submitted": { "$sum": 1 },
        "accepted": {
            "$sum": {
                "$cond": [
                    "$accepted",
                    1,
                    0
                ]
            }
        }
    }},
    { "$group": {
        "_id": "$_id.state",
        "values": {
            "$push": {
                "year": "$_id.year",
                "submitted": "$submitted",
                "accepted": "$accepted"
            }
        }
    }}
])

Which produces output like this ( just one state for brevity ):

{
    "_id" : "CA",
    "values" : [
        {
                "year" : 2014,
                "submitted" : 691,
                "accepted" : 360
        },
        {
                "year" : 2013,
                "submitted" : 653,
                "accepted" : 332
        },
        {
                "year" : 2012,
                "submitted" : 681,
                "accepted" : 350
        }
    ]
}

Or if you really must specify all the keys in output, then use the following form. It would be a trival matter to generate the structure in code as a "data structure" is all an aggregation pipeline or indeed any native MongoDB query actually is:

db.test_collection.aggregate([
    { "$group": {
        "_id": "$state",
        "submitted2012": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2012 ] },
                    1,
                    0
                ]
            }
        },
        "accepted2012": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2012 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        },
        "submitted2013": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2013 ] },
                    1,
                    0
                ]
            }
        },
        "accepted2013": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2013 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        },
        "submitted2014": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2014 ] },
                    1,
                    0
                ]
            }
        },
        "accepted2014": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2014 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        }
    }}
])

In fact it's actually as trivial as this:

var groupStage  = {
    "$group": {
        "_id": "$state"
    }
};

[2012,2013,2014].forEach(function(year) {
    groupStage["$group"]["submitted" + year] = {
        "$sum": {
            "$cond": [
                { "$eq": [ "$year", year ] },
                1,
                0
            ]
        }
    };
    groupStage["$group"]["accepted" + year] = {
        "$sum": {
            "$cond": [
                { "$and": [
                    { "$eq": [ "$year", year ] },
                    "$accepted"
                ]},
                1,
                0
            ]
        }
    };
});

db.test_collection.aggregate([groupStage])

And it's output:

{
    "_id" : "CA",
    "submitted2012" : 681,
    "accepted2012" : 350,
    "submitted2013" : 653,
    "accepted2013" : 332,
    "submitted2014" : 691,
    "accepted2014" : 360
}

Doing this with mapReduce is much slower, but the main considerations you need to make is to have the "mapper" emit the same output as the reducer itself is going to return. This is because the "reducer" does not actually process all grouped documents at once, but rather the output of a "reduce" can in turn come back as "input" alongside other emitted or "reduced" values for further reduction:

db.test_collection.mapReduce(
    function() {
        var obj = {};
        obj["submitted" + this.year] = 1,
        obj["accepted" + this.year] = (this.accepted) ? 1: 0;
        emit(this.state,obj);
    },
    function(key,values) {
        var obj = {};
        values.forEach(function(value) {
            Object.keys(value).forEach(function(key) {
                if ( !obj.hasOwnProperty(key) )
                    obj[key] = 0;
                obj[key] += value[key];
            });
        });
        return obj;
    },
    { "out": { "inline": 1 } }
)

With this sort of output:

{
    "_id" : "CA",
    "value" : {
            "submitted2014" : 691,
            "accepted2014" : 360,
            "submitted2013" : 653,
            "accepted2013" : 332,
            "submitted2012" : 681,
            "accepted2012" : 350
    }
}

For the record, output like the original aggregate example can be obtained like this:

db.test_collection.mapReduce(
    function() {
        var obj = {
            "year": this.year,
            "submitted": 1,
            "accepted": (this.accepted) ? 1 : 0
        };
        emit(this.state,{ "values": [obj] });
    },
    function(key,values) {
        var obj = { "values": [] };

        var accum = {};

        values.forEach(function(value) {
            value.values.forEach(function(data) {
                if ( !accum.hasOwnProperty(data.year) )
                    accum[data.year] = {
                        submitted: 0,
                        accepted: 0
                    };
                accum[data.year]["submitted"] += data.submitted;
                accum[data.year]["accepted"] += data.accepted;
            });
        });

        Object.keys(accum).forEach(function(key) {
            obj.values.push({
                "year": parseInt(key),
                "submitted": accum[key].submitted,
                "accepted": accum[key].accepted
            });
        });
        obj.values.sort(function(a,b){
            return a.year < b.year;
        });

        return obj;
    },
    { "out": { "inline": 1  } }
)

With output keys that follows the mapReduce rules:

{
    "_id" : "CA",
    "value" : {
        "values" : [
            {
                    "year" : 2014,
                    "submitted" : 691,
                    "accepted" : 360
            },
            {
                    "year" : 2013,
                    "submitted" : 653,
                    "accepted" : 332
            },
            {
                    "year" : 2012,
                    "submitted" : 681,
                    "accepted" : 350
            }
        ]
    }
}

So it's possible with mapReduce, but the aggregation framework is certainly the better option for this type of task.


Also your generation script could be a bit better for using Bulk operations:

var i = 10000,
    states = ['AL', 'CA', 'FL', 'TN', 'OH'],
    years = [2012, 2013, 2014],
    bulk = db.test_collection.initializeOrderedBulkOp();

db.test_collection.drop();
while (i--) {
  bulk.insert({
    state: states[Math.floor(Math.random() * states.length)],
    year: NumberInt(years[Math.floor(Math.random() * years.length)]),
    accepted: Math.random() >= 0.5
  });
  if ( i % 1000 == 0 ) {
    bulk.execute();
    bulk = db.test_collection.initializeOrderedBulkOp();
  }
}
0
votes

You can achieve with the help of Aggregation operator:

Lets create a below collection :

Collection :

db.flat.insert({state: "CA",year: 2014,accepted: true});
db.flat.insert({state: "AL",year: 2012,accepted: false});
db.flat.insert({state: "CA",year: 2013,accepted: false});
db.flat.insert({state: "AL",year: 2012,accepted: true});
db.flat.insert({state: "CA",year: 2011,accepted: false});
db.flat.insert({state: "AL",year: 2011,accepted: true});
db.flat.insert({state: "CA",year: 2013,accepted: false});
db.flat.insert({state: "AL",year: 2014,accepted: true});
db.flat.insert({state: "CA",year: 2014,accepted: false});
db.flat.insert({state: "AL",year: 2014,accepted: true});
db.flat.insert({state: "CA",year: 2014,accepted: false});
db.flat.insert({state: "AL",year: 2014,accepted: true});
db.flat.insert({state: "CA",year: 2014,accepted: false});

Below Query will fetch you the required output :

Query :

          db.flat.aggregate([
            { $group: { _id: { state:"$state",year:"$year" }, 
                       submitted:{ $sum : 1 }, 
                       accepted: { $sum : 
                              { 
                                $cond: { if: { $eq:[ "$accepted",true ] },
                                         then: 1 , 
                                         else: 0 
                                       } 
                              }
                                 } 
                      } 
            },
            { $project: { temp: { 
                               $concat: [ "submittedApplications",
                                 { $substr:[ "$_id.year", 0, 4 ] }," : ",
                                 { $substr:[ "$submitted",0,1 ] }, "  ,  " ,
                                { 
                               $concat:[ "acceptedApplications",
                                 { $substr: [ "$_id.year",0,4 ] }," : ", 
                                 { $substr: [ "$accepted",0,1 ] }
                                        ]
                                }
                                        ]
                                }
                        }
           },
           { $group: { _id:"$_id.state" , value : {$push:"$temp"} 
                     } 
           }
         ]).pretty();

OutPut :

{
        "_id" : "CA",
        "value" : [
                "submittedApplications2011 : 1  ,  acceptedApplications2011 : 0",
                "submittedApplications2013 : 2  ,  acceptedApplications2013 : 0",
                "submittedApplications2014 : 4  ,  acceptedApplications2014 : 1"
        ]
}
{
        "_id" : "AL",
        "value" : [
                "submittedApplications2011 : 1  ,  acceptedApplications2011 : 1",
                "submittedApplications2012 : 2  ,  acceptedApplications2012 : 1",
                "submittedApplications2014 : 3  ,  acceptedApplications2014 : 3"
        ]
}