mongodb $unwind empty array - mongodb

With this data:
{
"_id" : ObjectId("576948b4999274493425c08a"),
"virustotal" : {
"scan_id" : "4a6c3dfc6677a87aee84f4b629303c40bb9e1dda283a67236e49979f96864078-1465973544",
"sha1" : "fd177b8c50b457dbec7cba56aeb10e9e38ebf72f",
"resource" : "4a6c3dfc6677a87aee84f4b629303c40bb9e1dda283a67236e49979f96864078",
"response_code" : 1,
"scan_date" : "2016-06-15 06:52:24",
"results" : [
{
"sig" : "Gen:Variant.Mikey.29601",
"vendor" : "MicroWorld-eScan"
},
{
"sig" : null,
"vendor" : "nProtect"
},
{
"sig" : null,
"vendor" : "CAT-QuickHeal"
},
{
"sig" : "HEUR/QVM07.1.0000.Malware.Gen",
"vendor" : "Qihoo-360"
}
]
}
},
{
"_id" : ObjectId("5768f214999274362f714e8b"),
"virustotal" : {
"scan_id" : "3d283314da4f99f1a0b59af7dc1024df42c3139fd6d4d4fb4015524002b38391-1466529838",
"sha1" : "fb865b8f0227e9097321182324c959106fcd8c27",
"resource" : "3d283314da4f99f1a0b59af7dc1024df42c3139fd6d4d4fb4015524002b38391",
"response_code" : 1,
"scan_date" : "2016-06-21 17:23:58",
"results" : [
{
"sig" : null,
"vendor" : "Bkav"
},
{
"sig" : null,
"vendor" : "ahnlab"
},
{
"sig" : null,
"vendor" : "MicroWorld-eScan"
},
{
"sig" : "Mal/DrodZp-A",
"vendor" : "Qihoo-360"
}
]
}
}
I'm trying to group by and count the vendor when sig is not null in order to obtain something like:
{
"_id" : "Qihoo-360",
"count" : 2
},
{
"_id" : "MicroWorld-eScan",
"count" : 1
},
{
"_id" : "Bkav",
"count" : 0
},
{
"_id" : "CAT-QuickHeal",
"count" : 0
}
At the moment with this code:
db.analysis.aggregate([
{ $unwind: "$virustotal.results" },
{
$group : {
_id : "$virustotal.results.vendor",
count : { $sum : 1 }
}
},
{ $sort : { count : -1 } }
])
I'm getting everything:
{
"_id" : "Qihoo-360",
"count" : 2
},
{
"_id" : "MicroWorld-eScan",
"count" : 2
},
{
"_id" : "Bkav",
"count" : 1
},
{
"_id" : "CAT-QuickHeal",
"count" : 1
}
How can I count 0 if the sig is null?

You need a conditional expression in your $sum operator that will check if the "$virustotal.results.sig" key is null by using the comparison operator $gt (as specified in the documentation's BSON comparsion order)
You can restructure your pipeline by adding this expression as follows:
db.analysis.aggregate([
{ "$unwind": "$virustotal.results" },
{
"$group" : {
"_id": "$virustotal.results.vendor",
"count" : {
"$sum": {
"$cond": [
{ "$gt": [ "$virustotal.results.sig", null ] },
1, 0
]
}
}
}
},
{ "$sort" : { "count" : -1 } }
])
Sample Output
/* 1 */
{
"_id" : "Qihoo-360",
"count" : 2
}
/* 2 */
{
"_id" : "MicroWorld-eScan",
"count" : 1
}
/* 3 */
{
"_id" : "Bkav",
"count" : 0
}
/* 4 */
{
"_id" : "CAT-QuickHeal",
"count" : 0
}
/* 5 */
{
"_id" : "nProtect",
"count" : 0
}
/* 6 */
{
"_id" : "ahnlab",
"count" : 0
}

I changed the null with None and the numbers increased but seems not correct yet.
Basically doing the query in mongoshell I get like
{
"_id" : "Kaspersky",
"count" : 176.0
}
from python:
Kaspersky 64
one of these 2 is wrong :)
So I'm trying to investigate what part of the query in python is not correctly written compared to the mongo shell one.
I did a simple query:
In mongoshell:
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$ne": "null"} } }})
results: 176
db.analysis.count( { "virustotal.results" : { $elemMatch : { "vendor": "Kaspersky", "sig": {$gt: null} } }})
results: 0
Then I tried in python:
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$ne": "null"} } }})
results: 568
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$ne": "None"} } }})
results: 568
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$gt": "None"} } }})
results: 64
rtmp = results_db.analysis.count( { "virustotal.results" : { "$elemMatch" : { "vendor": "Kaspersky", "sig": {"$gt": "null"} } }})
results: 6
hard to says what is the correct value! I suppose 176 but not able to reproduce in python...

Related

MongoDB count distinct items

I have following query on a list with this fields : key,time,p,email
use app_db;
db.getCollection("app_log").aggregate(
[
{
"$match" : {
"key" : "login"
}
},
{
"$group" : {
"_id" : {
"$substr" : [
"$time",
0.0,
10.0
]
},
"total" : {
"$sum" : "$p"
},
"count" : {
"$sum" : 1.0
}
}
}
]
);
and the output is something like this :
{
"_id" : "2019-08-25",
"total" : NumberInt(623),
"count" : 400.0
}
{
"_id" : "2019-08-24",
"total" : NumberInt(2195),
"count" : 1963.0
}
{
"_id" : "2019-08-23",
"total" : NumberInt(1294),
"count" : 1706.0
}
{
"_id" : "2019-08-22",
"total" : NumberInt(53),
"count" : 1302.0
}
But I need the count to be distinctive on email field, which is count number of distinct email addresses who logged in per day and their p value is greater 0
You need $addToSet to get an array of unique email values per day and then you can use $size to count the number of items in that array:
db.getCollection("app_log").aggregate(
[
{
"$match" : {
"key" : "login"
}
},
{
"$group" : {
"_id" : {
"$substr" : [
"$time",
0.0,
10.0
]
},
"total" : {
"$sum" : "$p"
},
"emails" : {
"$addToSet": "$email"
}
}
},
{
$project: {
_id: 1,
total: 1,
countDistinct: { $size: "$emails" }
}
}
]
);

How to count from two fields in mongoDB

{
"_id" : ObjectId("56bd8e9de517259412a743ab"),
"user_token" : "mzXhdbCu",
"sender_details" : {
"location" : "XYZ",
"zipcode" : "610208"
},
"shipping_address" : {
"location" : "ABC",
"zipcode" : "602578
}
}
I have been trying to count the number of instances of each unique zipcode from both
$sender_details.zipcode
and
$shipping_address.zipcode
I tried to use the following code
db.ac_consignments.aggregate({
$group: {
_id: {
"zipcode":"$sender_details.zipcode",
"szipcode":"$shipping_address.zipcode"
},
count: {"$sum":1}
}
})
The output I receive is this
{
"result" : [
{
"_id" : {
"zipcode" : "610208",
"szipcode" : "602578"
},
"count" : 7
},
{
"_id" : {
"zipcode" : "602578",
"szipcode" : "678705"
},
"count" : 51
}
],
"ok" : 1
}
But what I require is the count of each zipcode present in $sender_details.zipcode and $shipping_address.zipcode totally. So an output like this
{
"result" : [
{
"_id" : {
"zipcode" : "610208",
},
"count" : 7
},
{
"_id" : {
"zipcode" : "602578"
},
"count" : 51
}
{
"_id" : {
"zipcode" : "678705"
},
"count" : 51
}
],
"ok" : 1
}
The following pipeline should work for you
db.getCollection('ac_consignments').aggregate([
{
$project: {
zipcode: [ "$sender_details.zipcode", "$shipping_address.zipcode" ]
}
},
{
$unwind: "$zipcode"
},
{
$group: {
_id: "$zipcode",
count: { $sum: 1 }
}
}
])
which produces output like this
/* 1 */
{
"_id" : "610208",
"count" : 1.0
}
/* 2 */
{
"_id" : "610209",
"count" : 2.0
}
/* 3 */
{
"_id" : "602578",
"count" : 1.0
}
/* 4 */
{
"_id" : "602579",
"count" : 2.0
}
when using the following as sample data
/* 1 */
{
"_id" : ObjectId("56bd8e9de517259412a743ab"),
"user_token" : "mzXhdbCu",
"sender_details" : {
"location" : "XYZ",
"zipcode" : "610208"
},
"shipping_address" : {
"location" : "ABC",
"zipcode" : "602578"
}
}
/* 2 */
{
"_id" : ObjectId("56bd8e9de517259412a743ac"),
"user_token" : "mzXhdbCu",
"sender_details" : {
"location" : "XYZ",
"zipcode" : "610209"
},
"shipping_address" : {
"location" : "ABC",
"zipcode" : "602579"
}
}
/* 3 */
{
"_id" : ObjectId("56bd8e9de517259412a753ac"),
"user_token" : "mzXhdbCu",
"sender_details" : {
"location" : "XYZ",
"zipcode" : "610209"
},
"shipping_address" : {
"location" : "ABC",
"zipcode" : "602579"
}
}
See the following GIF
Update for older versions
db.getCollection('ac_consignments').aggregate([
{
$project: {
sender_zip: "$sender_details.zipcode",
shipping_zip: "$shipping_address.zipcode",
party: { $literal: ["sender_zip", "shipping_zip"] }
}
},
{
$unwind: "$party"
},
{
$group: {
_id: "$_id",
zipcode: {
$push: {
$cond: [
{ $eq: ["$party", "sender_zip"] },
"$sender_zip",
"$shipping_zip"
]
}
}
}
},
{
$unwind: "$zipcode"
},
{
$group: {
_id: "$zipcode",
count: { $sum: 1 }
}
}
])

mongodb aggregation $group and then $push a object

this is my data :
> db.bookmarks.find({"userId" : "56b9b74bf976ab70ff6b9999"}).pretty()
{
"_id" : ObjectId("56c2210fee4a33579f4202dd"),
"userId" : "56b9b74bf976ab70ff6b9999",
"items" : [
{
"itemId" : "28",
"timestamp" : "2016-02-12T18:07:28Z"
},
{
"itemId" : "29",
"timestamp" : "2016-02-12T18:07:29Z"
},
{
"itemId" : "30",
"timestamp" : "2016-02-12T18:07:30Z"
},
{
"itemId" : "31",
"timestamp" : "2016-02-12T18:07:31Z"
},
{
"itemId" : "32",
"timestamp" : "2016-02-12T18:07:32Z"
},
{
"itemId" : "33",
"timestamp" : "2016-02-12T18:07:33Z"
},
{
"itemId" : "34",
"timestamp" : "2016-02-12T18:07:34Z"
}
]
}
I want to have something like (actually i hope the _id can become userId too) :
{
"_id" : "56b9b74bf976ab70ff6b9999",
"items" : [
{ "itemId": "32", "timestamp": "2016-02-12T18:07:32Z" },
{ "itemId": "31", "timestamp": "2016-02-12T18:07:31Z" },
{ "itemId": "30", "timestamp": "2016-02-12T18:07:30Z" }
]
}
What I have now :
> db.bookmarks.aggregate(
... { $match: { "userId" : "56b9b74bf976ab70ff6b9999" } },
... { $unwind: '$items' },
... { $sort: { 'items.timestamp': -1} },
... { $skip: 2 },
... { $limit: 3},
... { $group: { '_id': '$userId' , items: { $push: '$items.itemId' } } }
... ).pretty()
{ "_id" : "56b9b74bf976ab70ff6b9999", "items" : [ "32", "31", "30" ] }
i tried to read the document in mongo and find out i can $push, but somehow i cannot find a way to push such object, which is not defined anywhere in the whole object. I want to have the timestamp also.. but i don't know how should i modified the $group (or others??) to do so. thanks for helping!
This code, which I tested in the MongoDB 3.2.1 shell, should give you the output format that you want:
> db.bookmarks.aggregate(
{ "$match" : { "userId" : "Ursula" } },
{ "$unwind" : "$items" },
{ "$sort" : { "items.timestamp" : -1 } },
{ "$skip" : 2 },
{ "$limit" : 3 },
{ "$group" : { "_id" : "$userId", items: { "$push" : { "myPlace" : "$items.itemId", "myStamp" : "$items.timestamp" } } } } ).pretty()
Running the above will produce this output:
{
"_id" : "Ursula",
"items" : [
{
"myPlace" : "52",
"myStamp" : ISODate("2016-02-13T18:07:32Z")
},
{
"myPlace" : "51",
"myStamp" : ISODate("2016-02-13T18:07:31Z")
},
{
"myPlace" : "50",
"myStamp" : ISODate("2016-02-13T18:07:30Z")
}
]
}
In MongoDB version 3.2.x, you can also use the $out operator in the very last stage of the aggregation pipeline, and have the output of the aggregation query written to a collection. Here is the code I used:
> db.bookmarks.aggregate(
{ "$match" : { "userId" : "Ursula" } },
{ "$unwind" : "$items" },
{ "$sort" : { "items.timestamp" : -1 } },
{ "$skip" : 2 },
{ "$limit" : 3 },
{ "$group" : { "_id" : "$userId", items: { "$push" : { "myPlace" : "$items.itemId", "myStamp" : "$items.timestamp" } } } },
{ "$out" : "ursula" } )
This gives me a collection named "ursula":
> show collections
ursula
and I can query that collection:
> db.ursula.find().pretty()
{
"_id" : "Ursula",
"items" : [
{
"myPlace" : "52",
"myStamp" : ISODate("2016-02-13T18:07:32Z")
},
{
"myPlace" : "51",
"myStamp" : ISODate("2016-02-13T18:07:31Z")
},
{
"myPlace" : "50",
"myStamp" : ISODate("2016-02-13T18:07:30Z")
}
]
}
>
Last of all, this is the input document I used in the aggregation query. You can compare this document to how I coded the aggregation query to see how I built the new items array.
> db.bookmarks.find( { "userId" : "Ursula" } ).pretty()
{
"_id" : ObjectId("56c240ed55f2f6004dc3b25c"),
"userId" : "Ursula",
"items" : [
{
"itemId" : "48",
"timestamp" : ISODate("2016-02-13T18:07:28Z")
},
{
"itemId" : "49",
"timestamp" : ISODate("2016-02-13T18:07:29Z")
},
{
"itemId" : "50",
"timestamp" : ISODate("2016-02-13T18:07:30Z")
},
{
"itemId" : "51",
"timestamp" : ISODate("2016-02-13T18:07:31Z")
},
{
"itemId" : "52",
"timestamp" : ISODate("2016-02-13T18:07:32Z")
},
{
"itemId" : "53",
"timestamp" : ISODate("2016-02-13T18:07:33Z")
},
{
"itemId" : "54",
"timestamp" : ISODate("2016-02-13T18:07:34Z")
}
]
}

$avg in mongodb aggregation

Document looks like this:
{
"_id" : ObjectId("361de42f1938e89b179dda42"),
"user_id" : "u1",
"evaluator_id" : "e1",
"candidate_id" : ObjectId("54f65356294160421ead3ca1"),
"OVERALL_SCORE" : 150,
"SCORES" : [
{ "NAME" : "asd", "OBTAINED_SCORE" : 30}, { "NAME" : "acd", "OBTAINED_SCORE" : 36}
]
}
Aggregation function:
db.coll.aggregate([ {$unwind:"$SCORES"}, {$group : { _id : { user_id : "$user_id", evaluator_id : "$evaluator_id"}, AVG_SCORE : { $avg : "$SCORES.OBTAINED_SCORE" }}} ])
Suppose if there are two documents with same "user_id" (say u1) and different "evaluator_id" (say e1 and e2).
For example:
1) Average will work like this ((30 + 20) / 2 = 25). This is working for me.
2) But for { evaluator_id : "e1" } document, score is 30 for { "NAME" : "asd" } and { evaluator_id : "e2" } document, score is 0 for { "NAME" : "asd" }. In this case, I want the AVG_SCORE to be 30 only (not (30 + 0) / 2 = 15).
Is it possible through aggregation??
Could any one help me out.
It's possible by placing a $match between the $unwind and $group aggregation pipelines to first filter the arrays which match the specified condition to include in the average computation and that is, score array where the obtained score is not equal to 0 "SCORES.OBTAINED_SCORE" : { $ne : 0 }
db.coll.aggregate([
{
$unwind: "$SCORES"
},
{
$match : {
"SCORES.OBTAINED_SCORE" : { $ne : 0 }
}
},
{
$group : {
_id : {
user_id : "$user_id",
evaluator_id : "$evaluator_id"
},
AVG_SCORE : {
$avg : "$SCORES.OBTAINED_SCORE"
}
}
}
])
For example, the aggregation result for this document:
{
"_id" : ObjectId("5500aaeaa7ef65c7460fa3d9"),
"user_id" : "u1",
"evaluator_id" : "e1",
"candidate_id" : ObjectId("54f65356294160421ead3ca1"),
"OVERALL_SCORE" : 150,
"SCORES" : [
{
"NAME" : "asd",
"OBTAINED_SCORE" : 0
},
{
"NAME" : "acd",
"OBTAINED_SCORE" : 36
}
]
}
will yield:
{
"result" : [
{
"_id" : {
"user_id" : "u1",
"evaluator_id" : "e1"
},
"AVG_SCORE" : 36
}
],
"ok" : 1
}

Use field value as key

I am doing this query
db.analytics.aggregate([
{
$match: {"event":"USER_SENTIMENT"}
},
{ $group: {
_id: {brand:"$data.brandId",sentiment:"$data.sentiment"},
count: {$sum : 1}
}
},
{ $group: {
_id: "$_id.brand",
sentiments: {$addToSet : {sentiment:"$_id.sentiment", count:"$count"}}
}
}
])
Which generates that :
{
"result" : [
{
"_id" : 57,
"sentiments" : [
{
"sentiment" : "Meh",
"count" : 4
}
]
},
{
"_id" : 376,
"sentiments" : [
{
"sentiment" : "Meh",
"count" : 1
},
{
"sentiment" : "Happy",
"count" : 1
},
{
"sentiment" : "Confused",
"count" : 1
}
]
}
],
"ok" : 1
}
But What I want is that :
[
{
"_id" : 57,
"Meh" : 4
},
{
"_id" : 376,
"Meh" : 1,
"Happy" : 1,
"Confused" : 1
}
]
Any idea on how to transform that? The blocking point for me is to transform a value into a key.