$avg in mongodb aggregation - mongodb

Document looks like this:
{
"_id" : ObjectId("361de42f1938e89b179dda42"),
"user_id" : "u1",
"evaluator_id" : "e1",
"candidate_id" : ObjectId("54f65356294160421ead3ca1"),
"OVERALL_SCORE" : 150,
"SCORES" : [
{ "NAME" : "asd", "OBTAINED_SCORE" : 30}, { "NAME" : "acd", "OBTAINED_SCORE" : 36}
]
}
Aggregation function:
db.coll.aggregate([ {$unwind:"$SCORES"}, {$group : { _id : { user_id : "$user_id", evaluator_id : "$evaluator_id"}, AVG_SCORE : { $avg : "$SCORES.OBTAINED_SCORE" }}} ])
Suppose there are two documents with the same "user_id" (say u1) but different "evaluator_id" values (say e1 and e2).
For example:
1) Average will work like this ((30 + 20) / 2 = 25). This is working for me.
2) But for { evaluator_id : "e1" } document, score is 30 for { "NAME" : "asd" } and { evaluator_id : "e2" } document, score is 0 for { "NAME" : "asd" }. In this case, I want the AVG_SCORE to be 30 only (not (30 + 0) / 2 = 15).
Is it possible through aggregation?
Could anyone help me out?

It's possible by placing a $match stage between the $unwind and $group stages of the pipeline. The $match filters the unwound SCORES elements so that only those whose obtained score is not equal to 0 are included in the average computation, i.e. "SCORES.OBTAINED_SCORE" : { $ne : 0 }:
db.coll.aggregate([
    // Emit one document per element of the SCORES array.
    { "$unwind": "$SCORES" },
    // Discard zero scores so they are not counted in the average.
    { "$match": { "SCORES.OBTAINED_SCORE": { "$ne": 0 } } },
    // Average the remaining scores for each (user_id, evaluator_id) pair.
    { "$group": {
        "_id": { "user_id": "$user_id", "evaluator_id": "$evaluator_id" },
        "AVG_SCORE": { "$avg": "$SCORES.OBTAINED_SCORE" }
    } }
])
For example, the aggregation result for this document:
{
"_id" : ObjectId("5500aaeaa7ef65c7460fa3d9"),
"user_id" : "u1",
"evaluator_id" : "e1",
"candidate_id" : ObjectId("54f65356294160421ead3ca1"),
"OVERALL_SCORE" : 150,
"SCORES" : [
{
"NAME" : "asd",
"OBTAINED_SCORE" : 0
},
{
"NAME" : "acd",
"OBTAINED_SCORE" : 36
}
]
}
will yield:
{
"result" : [
{
"_id" : {
"user_id" : "u1",
"evaluator_id" : "e1"
},
"AVG_SCORE" : 36
}
],
"ok" : 1
}

Related

Find sum of fields inside array in MongoDB

I have a data as follows:
> db.PQRCorp.find().pretty()
{
"_id" : 0,
"name" : "Ancy",
"results" : [
{
"evaluation" : "term1",
"score" : 1.463179736705023
},
{
"evaluation" : "term2",
"score" : 11.78273309957772
},
{
"evaluation" : "term3",
"score" : 6.676176060654615
}
]
}
{
"_id" : 1,
"name" : "Mark",
"results" : [
{
"evaluation" : "term1",
"score" : 5.89772766299929
},
{
"evaluation" : "term2",
"score" : 12.7726680028769
},
{
"evaluation" : "term3",
"score" : 2.78092882672992
}
]
}
{
"_id" : 2,
"name" : "Jeff",
"results" : [
{
"evaluation" : "term1",
"score" : 36.78917882992872
},
{
"evaluation" : "term2",
"score" : 2.883687879200287
},
{
"evaluation" : "term3",
"score" : 9.882668212003763
}
]
}
What I want to achieve is: find employees who failed in aggregate (term1 + term2 + term3).
What I am doing and eventually getting is:
// Flattens "results" so each term score becomes its own document, then sums the scores per group key.
db.PQRCorp.aggregate([
{$unwind:"$results"},
// NOTE(review): the sample documents carry "_id" but no "id" field, so "$id" resolves to
// nothing and every document falls into the single _id: null group (see OUTPUT below).
{ $group: {_id: "$id",
'totalTermScore':{ $sum:"$results.score" }
}
}])
OUTPUT:{ "_id" : null, "totalTermScore" : 90.92894831067625 }
I am simply getting an output that is a flat sum of all scores. What I want is to sum terms 1, 2 and 3 separately for each employee.
Please can someone help me. I am new to MongoDB (quite evident though).
You do not need to use $unwind and $group here... A simple $project query can $sum your entire score...
db.PQRCorp.aggregate([
    {
        $project: {
            name: 1,
            // "$results.score" resolves to the array of all scores inside "results";
            // $sum adds that array up, giving one total per employee document.
            totalTermScore: { $sum: "$results.score" }
        }
    }
])

mongodb $unwind empty array

With this data:
{
"_id" : ObjectId("576948b4999274493425c08a"),
"virustotal" : {
"scan_id" : "4a6c3dfc6677a87aee84f4b629303c40bb9e1dda283a67236e49979f96864078-1465973544",
"sha1" : "fd177b8c50b457dbec7cba56aeb10e9e38ebf72f",
"resource" : "4a6c3dfc6677a87aee84f4b629303c40bb9e1dda283a67236e49979f96864078",
"response_code" : 1,
"scan_date" : "2016-06-15 06:52:24",
"results" : [
{
"sig" : "Gen:Variant.Mikey.29601",
"vendor" : "MicroWorld-eScan"
},
{
"sig" : null,
"vendor" : "nProtect"
},
{
"sig" : null,
"vendor" : "CAT-QuickHeal"
},
{
"sig" : "HEUR/QVM07.1.0000.Malware.Gen",
"vendor" : "Qihoo-360"
}
]
}
},
{
"_id" : ObjectId("5768f214999274362f714e8b"),
"virustotal" : {
"scan_id" : "3d283314da4f99f1a0b59af7dc1024df42c3139fd6d4d4fb4015524002b38391-1466529838",
"sha1" : "fb865b8f0227e9097321182324c959106fcd8c27",
"resource" : "3d283314da4f99f1a0b59af7dc1024df42c3139fd6d4d4fb4015524002b38391",
"response_code" : 1,
"scan_date" : "2016-06-21 17:23:58",
"results" : [
{
"sig" : null,
"vendor" : "Bkav"
},
{
"sig" : null,
"vendor" : "ahnlab"
},
{
"sig" : null,
"vendor" : "MicroWorld-eScan"
},
{
"sig" : "Mal/DrodZp-A",
"vendor" : "Qihoo-360"
}
]
}
}
I'm trying to group by and count the vendor when sig is not null in order to obtain something like:
{
"_id" : "Qihoo-360",
"count" : 2
},
{
"_id" : "MicroWorld-eScan",
"count" : 1
},
{
"_id" : "Bkav",
"count" : 0
},
{
"_id" : "CAT-QuickHeal",
"count" : 0
}
At the moment with this code:
// Counts unwound "virustotal.results" entries per vendor, most frequent vendor first.
db.analysis.aggregate([
{ $unwind: "$virustotal.results" },
{
$group : {
_id : "$virustotal.results.vendor",
// Adds 1 for every unwound result of the vendor, whether or not its "sig" is null.
count : { $sum : 1 }
}
},
{ $sort : { count : -1 } }
])
I'm getting everything:
{
"_id" : "Qihoo-360",
"count" : 2
},
{
"_id" : "MicroWorld-eScan",
"count" : 2
},
{
"_id" : "Bkav",
"count" : 1
},
{
"_id" : "CAT-QuickHeal",
"count" : 1
}
How can I count 0 if the sig is null?
You need a conditional expression in your $sum operator that checks whether the "$virustotal.results.sig" key is null. Use the comparison operator $gt against null: as specified in the documentation's BSON comparison order, null sorts below strings, so the comparison is true only for a non-null signature.
You can restructure your pipeline by adding this expression as follows:
db.analysis.aggregate([
    { $unwind: "$virustotal.results" },
    {
        $group: {
            _id: "$virustotal.results.vendor",
            // Contribute 1 only when "sig" compares greater than null, otherwise 0,
            // so vendors with no signature still appear with a count of 0.
            count: {
                $sum: { $cond: [ { $gt: [ "$virustotal.results.sig", null ] }, 1, 0 ] }
            }
        }
    },
    { $sort: { count: -1 } }
])
Sample Output
/* 1 */
{
"_id" : "Qihoo-360",
"count" : 2
}
/* 2 */
{
"_id" : "MicroWorld-eScan",
"count" : 1
}
/* 3 */
{
"_id" : "Bkav",
"count" : 0
}
/* 4 */
{
"_id" : "CAT-QuickHeal",
"count" : 0
}
/* 5 */
{
"_id" : "nProtect",
"count" : 0
}
/* 6 */
{
"_id" : "ahnlab",
"count" : 0
}
I changed the null with None and the numbers increased but seems not correct yet.
Basically doing the query in mongoshell I get like
{
"_id" : "Kaspersky",
"count" : 176.0
}
from python:
Kaspersky 64
one of these 2 is wrong :)
So I'm trying to investigate what part of the query in python is not correctly written compared to the mongo shell one.
I did a simple query:
In mongoshell:
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$ne": "null"} } }})
results: 176
db.analysis.count( { "virustotal.results" : { $elemMatch : { "vendor": "Kaspersky", "sig": {$gt: null} } }})
results: 0
Then I tried in python:
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$ne": "null"} } }})
results: 568
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$ne": "None"} } }})
results: 568
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$gt": "None"} } }})
results: 64
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$gt": "null"} } }})
results: 6
Hard to say which is the correct value! I suppose 176, but I am not able to reproduce it in Python...

Mongodb sort by sum of keys

I have a json document
{
{
"_id" : ObjectId("5715c4bbac530eb3018b456a"),
"content_id" : "5715c4bbac530eb3018b4569",
"views" : NumberLong(200),
"likes" : NumberLong(100),
"comments" : NumberLong(0)
},
{
"_id" : ObjectId("5715c4bbac530eb3018b4568"),
"content_id" : "5715c4bbac530eb3018b4567",
"views" : NumberLong(300),
"likes" : NumberLong(200),
"comments" : NumberLong(0)
},
{
"_id" : ObjectId("5715c502ac530ee5018b4956"),
"content_id" : "5715c502ac530ee5018b4955",
"views" : NumberLong(500),
"likes" : NumberLong(0),
"comments" : NumberLong(200)
}
}
How can we sort the document order by SUM("views", "likes", "comments")
something like in mysql
SELECT SUM(key1, key2, key3) AS key
FROM document
ORDER BY key
Thanks in advance.
First do a projection to obtain the sum of the likes, views and comments, then sort on that sum. The second snippet additionally groups by content_id, in case that is needed.
db.test.aggregate([
    {
        // Re-key each document by content_id and compute its combined total.
        $project: {
            _id: "$content_id",
            total: { $add: [ "$likes", "$views", "$comments" ] }
        }
    },
    // Ascending order: smallest total first.
    { $sort: { total: 1 } }
])
If content_id can be duplicated, add a $group stage to total the values per content_id:
db.test.aggregate([
    // Re-key each document by content_id and compute its combined total.
    { $project : { "_id" : "$content_id", "total" : { $add : [ "$likes", "$views", "$comments" ] } } },
    // Fold documents that share a content_id into one running total.
    { $group : { "_id" : "$_id", totalPerId : { $sum : "$total" } } },
    // After $group only "_id" and "totalPerId" exist, so the sort must target
    // "totalPerId"; sorting on "total" would leave the order undefined.
    // Ascending (1) lists the smallest total first; use -1 for largest first.
    { $sort : { "totalPerId" : 1 } }
])
Based on your test data, you will get:
{ "_id" : "5715c502ac530ee5018b4955", "totalPerId" : NumberLong(700) }
{ "_id" : "5715c4bbac530eb3018b4567", "totalPerId" : NumberLong(500) }
{ "_id" : "5715c4bbac530eb3018b4569", "totalPerId" : NumberLong(300) }

Mongodb distinct aggregation of 3 billion documents

I have a huge collection with 3 billion documents. Each document looks like the following:
"_id" : ObjectId("54c1a013715faf2cc0047c77"),
"service_type" : "JE",
"receiver_id" : NumberLong("865438083645"),
"time" : ISODate("2012-12-05T23:07:36Z"),
"duration" : 24,
"service_description" : "NQ",
"receiver_cell_id" : null,
"location_id" : "658_55525",
"caller_id" : NumberLong("475035504705")
I would like to get the list of distinct users (they should at least appear once as a caller 'caller_id'), their counts (how many times each user appeared in the collection as either caller or receiver) and the count of locations if they are callers (i.e., the count for each location_id per user).
I want to end up with the following:
"number_of_records" : 20,
"locations" : [{location_id: 658_55525, count:5}, {location_id: 840_5425, count:15}],
"user" : NumberLong("475035504705")
I tried the solution described here and here but they are not efficient enough (extremely slow). What would be an efficient way to achieve this?
Use aggregation for your result:
db.<collection>.aggregate([
{ $group : { _id : { user: "$caller_id", localtion: '$location_id'} , count : { $sum : 1} } },
{ $project : { _id : 0, _id : '$_id.user', location : '$_id.localtion', count : '$count' } },
{ $group : { _id : '$_id', 'locations' : { $push : { location_id : '$location', count : '$count' } }, number_of_records : {$sum : '$count'} } },
{ $project : { _id : 0, user : '$_id', locations : '$locations', number_of_records : '$number_of_records'} },
{ $out : 'outputCollection'},
])
The output will be:
{
"0" : {
"locations" : [
{
"location_id" : "840_5425",
"count" : 8
},
{
"location_id" : "658_55525",
"count" : 5
}
],
"number_of_records" : 13,
"user" : NumberLong(475035504705)
}
}
Update using allowDiskUse:
// Per-caller location counts, written to "outputCollection".
var pipe = [
    // Count calls per (caller, location) pair.
    { $group : { _id : { user : "$caller_id", location : "$location_id" }, count : { $sum : 1 } } },
    // Flatten the compound key ("_id" is listed once; a duplicate key would be overridden).
    { $project : { _id : "$_id.user", location : "$_id.location", count : "$count" } },
    // Collect the per-location counts of each caller and total them.
    { $group : { _id : "$_id", locations : { $push : { location_id : "$location", count : "$count" } }, number_of_records : { $sum : "$count" } } },
    { $project : { _id : 0, user : "$_id", locations : "$locations", number_of_records : "$number_of_records" } },
    { $out : "outputCollection" }
];
db.runCommand({
    aggregate : "collection",
    pipeline : pipe,
    // Current servers reject the aggregate command unless a cursor document
    // (or explain) is supplied; an empty document selects the default batch size.
    cursor : {},
    // Let the $group stages spill to temporary files instead of failing on the
    // per-stage memory limit.
    allowDiskUse : true
});
A map-reduce solution would be more suitable here rather than an aggregation pipeline, simply because it avoids two unwinds. If you could bring out an aggregation solution with a single unwind, that would be it. But the below map-reduce solution is one way to do it, though you would need to measure its running time against large data and see if it works for you.
The map function:
// Map step: for every call document, emit the caller id together with a
// one-element location list whose count starts at 1.
var map = function () {
    var singleLocation = { "location_id": this.location_id, "count": 1 };
    emit(this.caller_id, { locs: [singleLocation] });
};
The reduce function:
// Reduce step: merge the location lists emitted for one caller into a single
// list holding one entry per location_id with its accumulated count.
//
// key    - the caller id the values were emitted under (unused here).
// values - array of { locs: [ { location_id, count }, ... ] } objects; these may
//          be raw map output (count 1) or the output of an earlier reduce call.
// Returns { locs: [ { location_id, count }, ... ] } in the same shape as the input.
var reduce = function (key, values) {
    var totals = {};
    values.forEach(function (value) {
        value.locs.forEach(function (loc) {
            // Add the carried count rather than incrementing by one: MongoDB may
            // feed previously reduced values back in, and those counts exceed 1.
            totals[loc.location_id] = (totals[loc.location_id] || 0) + loc.count;
        });
    });
    var result = { locs: [] };
    Object.keys(totals).forEach(function (locationId) {
        result.locs.push({ "location_id": locationId, "count": totals[locationId] });
    });
    return result;
};
The finalize function:
// Finalize step: attach the grand total of all location counts to the
// reduced value; the location list itself is passed through unchanged.
var finalize = function (key, value) {
    var grandTotal = value.locs.reduce(function (runningSum, loc) {
        return runningSum + loc.count;
    }, 0);
    return { "total": grandTotal, "locs": value.locs };
};
Invoking map-reduce:
db.collection.mapReduce(map,reduce,{"out":"t1","finalize":finalize});
Aggregating the result once the map-reduce produces its output.
db.t1.aggregate([
{$project:{"_id":0,
"number_of_records":"$value.total",
"locations":"$value.locs","user":"$_id"}}
])
Sample o/p:
{
"number_of_records" : 3,
"locations" : [
{
"location_id" : "658_55525",
"count" : 1
},
{
"location_id" : "658_55525213",
"count" : 2
}
],
"user" : 2
}
{
"number_of_records" : 1,
"locations" : [
{
"location_id" : "658_55525",
"count" : 1
}
],
"user" : NumberLong("475035504705")
}
The map-reduce JavaScript code should be self-explanatory.

Mongodb Time Series operations and generation

I've got a Mongodb Collection with this kind of docs :
{
"_id" : ObjectId("53cb898bed4bd6c24ae07a9f"),
"account" : "C1",
"created_on" : ISODate("2014-10-01T01:23:00.000Z"),
"value" : 253
}
and
{
"_id" : ObjectId("52cb898bed4bd6c24ae06a9e"),
"account" : "C2",
"created_on" : ISODate("2014-10-01T01:23:00.000Z"),
"value" : 9381
}
There is a document every minutes for C1 and C2.
I would like to generate data for another account "C0" whose value will be equal to: (C2 - C1)*0.25
So the aim is to generate data for every minutes in the collection.
According to you, is it possible to do that in mongo shell ?
Thank you very much :)
The logic to solve this problem, is as below:
a) group all the records by created_on date.
b) get the value of both the documents in each group.
c) calculate the difference between the C2 and C1 documents for each group.
d) In case one of the documents is missing, the difference
would be the value of the existing document.
e) project a document with value as (difference*.25) in each group.
f) insert the projected document to the collection.
I would like to propose two solutions to this, the first one would be on your assumption,
There is a document every minutes for C1 and C2.
So for every created_on time, there would be only two documents, C1 and C2.
// Generates one "C0" document per created_on timestamp with
// value = (C2 - C1) * 0.25, assuming at most one C1 and one C2 document per timestamp.
db.time.aggregate([
    { $match : { "account" : { $in : [ "C1", "C2" ] } } },
    // $first/$last are only meaningful on ordered input. After this sort, within
    // every created_on group the C1 document comes first and the C2 document last.
    // NOTE(review): on a large collection this sort may need an index on
    // { created_on, account } or allowDiskUse - confirm against the data size.
    { $sort : { "created_on" : 1, "account" : 1 } },
    {
        $group : {
            "_id" : "$created_on",
            "first" : { $first : "$value" },    // C1 when both accounts are present
            "second" : { $last : "$value" },    // C2 when both accounts are present
            "count" : { $sum : 1 }
        }
    },
    {
        $project : {
            "_id" : 0,
            "value" : {
                $multiply : [ {
                    // Only one document in the group: use its value as the difference.
                    // Otherwise subtract C1 from C2, keeping the sign of the result.
                    $cond : [ { $lte : [ "$count", 1 ] }, "$first", { $subtract : [ "$second", "$first" ] } ]
                }, 0.25 ]
            },
            "created_on" : "$_id",
            "account" : { $literal : "C0" }
        }
    }
]).forEach(function (doc) {
    // No Math.abs here: the requested value is (C2 - C1) * 0.25, which is
    // negative whenever C2 is smaller than C1.
    db.time.insert(doc);
});
The second solution is based on real-time scenarios. For a particular created_on time, there can be 'n' number of C1 documents and 'm' number of C2 documents with different values, but we would need only one 'C0' document representing the differences, for that particular created_on time. You would need an extra $group pipeline operator as below:
// Generates one "C0" document per created_on timestamp with
// value = (sum of C2 values - sum of C1 values) * 0.25, allowing any number of
// C1 and C2 documents per timestamp.
db.time.aggregate([
    { $match : { "account" : { $in : [ "C1", "C2" ] } } },
    {
        // Collapse the documents of each (timestamp, account) pair into one sum.
        $group : {
            "_id" : { "created_on" : "$created_on", "account" : "$account" },
            "created_on" : { $first : "$created_on" },
            "values" : { $sum : "$value" }
        }
    },
    // $group emits its results in no particular order, so sort explicitly:
    // within each timestamp the C1 sum now precedes the C2 sum, which makes
    // $first/$last below deterministic.
    { $sort : { "created_on" : 1, "_id.account" : 1 } },
    {
        $group : {
            "_id" : "$created_on",
            "first" : { $first : "$values" },   // C1 sum when both accounts are present
            "second" : { $last : "$values" },   // C2 sum when both accounts are present
            "count" : { $sum : 1 }
        }
    },
    {
        $project : {
            "_id" : 0,
            "value" : {
                $multiply : [ {
                    // Only one account in the group: use its sum as the difference.
                    // Otherwise subtract the C1 sum from the C2 sum, keeping the sign.
                    $cond : [ { $lte : [ "$count", 1 ] }, "$first", { $subtract : [ "$second", "$first" ] } ]
                }, 0.25 ]
            },
            "created_on" : "$_id",
            "account" : { $literal : "C0" }
        }
    }
]).forEach(function (doc) {
    // No Math.abs here: the requested value is (C2 - C1) * 0.25, which is
    // negative whenever the C2 sum is smaller than the C1 sum.
    db.time.insert(doc);
});