MongoDB nested object aggregation counting

I have a highly nested MongoDB set of objects and I want to count the number of subdocuments in each document that match a given condition. For example:
{
    "_id": { "chr": "20", "pos": "14371", "ref": "A", "alt": "G" },
    "studies": [
        {
            "study_id": "Study1",
            "samples": [
                {
                    "sample_id": "NA00001",
                    "formatdata": [
                        { "GT": "1|0", "GQ": 48, "DP": 8, "HQ": [51, 51] }
                    ]
                },
                {
                    "sample_id": "NA00002",
                    "formatdata": [
                        { "GT": "0|0", "GQ": 48, "DP": 8, "HQ": [51, 51] }
                    ]
                }
            ]
        }
    ]
}
{
    "_id": { "chr": "20", "pos": "14372", "ref": "T", "alt": "AA" },
    "studies": [
        {
            "study_id": "Study3",
            "samples": [
                {
                    "sample_id": "SAMPLE1",
                    "formatdata": [
                        { "GT": "1|0", "GQ": 48, "DP": 8, "HQ": [51, 51] }
                    ]
                },
                {
                    "sample_id": "SAMPLE2",
                    "formatdata": [
                        { "GT": "1|0", "GQ": 48, "DP": 8, "HQ": [51, 51] }
                    ]
                }
            ]
        }
    ]
}
{
    "_id": { "chr": "20", "pos": "14373", "ref": "C", "alt": "A" },
    "studies": [
        {
            "study_id": "Study3",
            "samples": [
                {
                    "sample_id": "SAMPLE3",
                    "formatdata": [
                        { "GT": "0|0", "GQ": 48, "DP": 8, "HQ": [51, 51] }
                    ]
                },
                {
                    "sample_id": "SAMPLE7",
                    "formatdata": [
                        { "GT": "0|0", "GQ": 48, "DP": 8, "HQ": [51, 51] }
                    ]
                }
            ]
        }
    ]
}
I want to know how many subdocuments contain GT:"1|0", which in this case would be 1 in the first document, 2 in the second, and 0 in the third. I've tried the unwind and aggregate functions but I'm obviously not doing something correctly. When I try to count the subdocuments by the "GT" field, mongo complains:
db.collection.aggregate([{$group: {"$studies.samples.formatdata.GT":1,_id:0}}])
since my group field names cannot contain ".", yet if I leave them out:
db.collection.aggregate([{$group: {"$GT":1,_id:0}}])
it complains because "$GT cannot be an operator name"
Any ideas?

You need to process the arrays with $unwind, and here you need to do this three times, once per level of nesting:
db.collection.aggregate([
    // Unwind the arrays to get access to the elements for filtering
    { "$unwind": "$studies" },
    { "$unwind": "$studies.samples" },
    { "$unwind": "$studies.samples.formatdata" },
    // Group results to obtain the matched count per key
    { "$group": {
        "_id": "$studies.samples.formatdata.GT",
        "count": { "$sum": 1 }
    }}
])
Ideally you want to filter your input. You can do this with a $match both before and after the $unwind stages are processed, using a $regex to match documents where the value at that point begins with a "1".
db.collection.aggregate([
    // Match first to exclude documents where this is not present in any array member
    { "$match": { "studies.samples.formatdata.GT": /^1/ } },
    // Unwind the arrays to get access to the elements for filtering
    { "$unwind": "$studies" },
    { "$unwind": "$studies.samples" },
    { "$unwind": "$studies.samples.formatdata" },
    // Match again to filter the unwound elements
    { "$match": { "studies.samples.formatdata.GT": /^1/ } },
    // Group results to obtain the matched count per document and key
    { "$group": {
        "_id": {
            "_id": "$_id",
            "key": "$studies.samples.formatdata.GT"
        },
        "count": { "$sum": 1 }
    }}
])
Note that in all cases the "dollar $" prefixed entries are the "variables" referring to properties of the document. These are "values" used as input on the right-hand side. The left-hand side "keys" must be specified as plain strings. No variable can be used to name a key.
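Since the question asks for a count per document, a small variation of the second pipeline (a sketch, grouping on the original _id alone) gives that directly; note that documents with no matching entry simply produce no result rather than a count of 0:
db.collection.aggregate([
    { "$unwind": "$studies" },
    { "$unwind": "$studies.samples" },
    { "$unwind": "$studies.samples.formatdata" },
    // Keep only the unwound entries whose GT begins with "1"
    { "$match": { "studies.samples.formatdata.GT": /^1/ } },
    // One result per original document with the number of matching entries
    { "$group": { "_id": "$_id", "count": { "$sum": 1 } } }
])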

https://mongoplayground.net/p/DpX6cFhR_mm
db.collection.aggregate([
    { "$unwind": "$tags" },
    { "$match": {
        "$or": [
            { "tags.name": "Canada" },
            { "tags.name": "ABC" }
        ]
    }},
    { "$group": {
        "_id": null,
        "count": { "$sum": 1 }
    }}
])

Related

MongoDB document merge without a-priori knowledge of fields

I would like to merge several documents. Most of the fields have the same values but there might be one or two fields that have different values. These fields are unknown beforehand. Ideally I would like to merge all the documents keeping the fields that are the same as is but creating an array of values only for those fields that have some variation.
For my first approach I grouped by a field common to my documents and kept the first document; this however discards some information that varies in the other fields.
group_documents = {
    "$group": {
        "_id": "$0020000E.Value",
        "doc": {
            "$first": "$$ROOT"
        }
    }
}
merge_documents = {
    "$replaceRoot": {
        "newRoot": "$doc"
    }
}
write_collection = { "$out": { "db": "database", "coll": "records_nd" } }
pipeline = [group_documents, merge_documents, write_collection]
objects = coll.aggregate(pipeline)
If the fields that have different values were known, I would have done something like this:
merge_sol1
or
merge_sol2
or
merge_sol3
The third solution is actually very close to my desired output and I could tweak it a bit. But these answers assume a-priori knowledge of the fields to be merged.
You can first convert $$ROOT to an array of k-v tuples with $objectToArray. Then, $group all fields with $addToSet to put all distinct values for a field into an array. Then, check the size of the resulting array and conditionally pick the first item if the array size is 1 (i.e. the value is the same for every document for that field); otherwise, keep the result array. Finally, revert back to the original document form with $arrayToObject.
db.collection.aggregate([
    {
        $project: {
            _id: "$key",
            arr: { "$objectToArray": "$$ROOT" }
        }
    },
    { "$unwind": "$arr" },
    {
        $match: {
            "arr.k": { $nin: [ "key", "_id" ] }
        }
    },
    {
        $group: {
            _id: { id: "$_id", k: "$arr.k" },
            v: { "$addToSet": "$arr.v" }
        }
    },
    {
        $project: {
            _id: "$_id.id",
            arr: [
                {
                    k: "$_id.k",
                    v: {
                        "$cond": {
                            "if": { $gt: [ { $size: "$v" }, 1 ] },
                            "then": "$v",
                            "else": { $first: "$v" }
                        }
                    }
                }
            ]
        }
    },
    {
        "$project": {
            doc: { "$arrayToObject": "$arr" }
        }
    },
    {
        "$replaceRoot": {
            "newRoot": {
                "$mergeObjects": [ { _id: "$_id" }, "$doc" ]
            }
        }
    }
])
Mongo Playground

MongoDb Aggregate transform common objects in arrays

I'm stuck on an issue:
I need to transform:
[ {a:1 , b:2 , c:3} , {a:5, b:6, c:7} ]
Into:
[{a:[1,5], b:[2,6] , c: [3,7]}]
Just looking for common keys and grouping their values.
I'm not sure if I should use $project + $reduce or $group. Does someone have a tip?
To do this, we should change the object to an array first to be able to group by key. You can check it here.
{
    "$project": {
        "_id": 0 // First we have to eliminate the _id and all the other fields that we don't want to group
    }
},
{
    "$project": {
        "arr": { "$objectToArray": "$$ROOT" }
    }
},
Then we should unwind this array and group by the keys.
{
    "$unwind": "$arr"
},
{
    "$group": {
        "_id": "$arr.k",
        "field": { "$push": "$arr.v" }
    }
}
Finally we remap the information into the desired output.
{
    $replaceRoot: {
        newRoot: {
            $arrayToObject: [
                [
                    { k: "$_id", v: "$field" }
                ]
            ]
        }
    }
}
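Putting the three fragments together, the full pipeline might look like this (a sketch that simply assembles the stages shown above into one call):
db.collection.aggregate([
    // Drop _id (and any other fields you do not want grouped)
    { "$project": { "_id": 0 } },
    // Turn each document into an array of { k, v } pairs
    { "$project": { "arr": { "$objectToArray": "$$ROOT" } } },
    { "$unwind": "$arr" },
    // Collect every value seen under the same key
    { "$group": { "_id": "$arr.k", "field": { "$push": "$arr.v" } } },
    // Reshape each group back into { <key>: [ values... ] }
    { "$replaceRoot": { "newRoot": { "$arrayToObject": [ [ { "k": "$_id", "v": "$field" } ] ] } } }
])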

How to find null documents in mongodb?

I'm a complete beginner in MongoDB. I'm trying to find all the documents that contain null or nothing, for example documents like
{
    "_id" : "abc"
}
so that I can delete them from the collection.
But even after searching a lot of SO questions I couldn't find a solution. So, how can I do this? Sorry if I'm missing anything.
I don't know how to do it in a single operation, but you can try something like this:
db["collectionName"].find({ _id: { $exists: true } }).forEach(function(doc) {
    if (Object.keys(doc).length === 1) {
        // the only key is _id, so delete this document:
        // db["collectionName"].remove({ _id: doc._id })
    }
})
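As an aside (not from the answer above), on MongoDB 3.6+ it should also be possible to match such documents in a single query using $expr with $objectToArray to count the top-level keys; this is only a sketch:
// Find documents whose only top-level field is _id (assumes MongoDB 3.6+)
db["collectionName"].find({
    "$expr": { "$eq": [ { "$size": { "$objectToArray": "$$ROOT" } }, 1 ] }
})
// once the result looks right, the same filter can be passed to deleteMany()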
One possible solution is to get a list of the _id values of those null-field documents and then remove them. This can be significantly more efficient since you only execute two queries instead of looping through the whole collection (which can hurt your db performance, especially with large collections).
Consider running the following aggregate pipeline to get those ids:
var ids = db.collection.aggregate([
    { "$project": {
        "hashmaps": { "$objectToArray": "$$ROOT" }
    }},
    { "$project": {
        "keys": "$hashmaps.k"
    }},
    { "$redact": {
        "$cond": [
            {
                "$eq": [
                    { "$ifNull": [ { "$arrayElemAt": ["$keys", 1] }, 0 ] },
                    0
                ]
            },
            "$$KEEP",
            "$$PRUNE"
        ]
    }},
    { "$group": {
        "_id": null,
        "ids": { "$push": "$_id" }
    }}
]).toArray()[0]["ids"];
Removing the documents
db.collection.remove({ "_id": { "$in": ids } });
The other approach is similar in that you would need two queries: the first returns a list of all the top-level fields in the collection, and the second removes the documents which have none of those fields.
Consider running the following queries:
/*
Run an aggregate pipeline operation to get a list
of all the top-level fields in the collection
*/
var fields = db.collection.aggregate([
    { "$project": {
        "hashmaps": { "$objectToArray": "$$ROOT" }
    }},
    { "$project": {
        "keys": "$hashmaps.k"
    }},
    { "$group": {
        "_id": null,
        "fields": { "$addToSet": "$keys" }
    }},
    { "$project": {
        "fields": {
            "$setDifference": [
                {
                    "$reduce": {
                        "input": "$fields",
                        "initialValue": [],
                        "in": { "$setUnion": ["$$value", "$$this"] }
                    }
                },
                ["_id"]
            ]
        }
    }}
]).toArray()[0]["fields"];
The second query checks that none of the fields except _id exist. For example, suppose your collection has documents with the keys _id, a, b and c; the query
db.collection.find({
    "a": { "$exists": false },
    "b": { "$exists": false },
    "c": { "$exists": false }
});
matches documents that contain none of the three fields a, b and c.
So if you have a list of the top-level fields in your collection, then all you need to do is construct the above query document. Use the reduce method on the array for this:
// Construct the above query
var query = fields.reduce(function(acc, curr) {
    acc[curr] = { "$exists": false };
    return acc;
}, {});
Then use the query to remove the documents as
db.collection.remove(query);

Finding multiple documents with one query

I have a schema like so:
schema
{
owner: <id to other document type>
created: date
}
I have an array of owner ids: [owner_id_1, owner_id_2, ... owner_id_x]
I want to get a list of documents with these owners, but limited to just the latest for each. Doing the queries individually:
find_one({ owner: owner_id_1 }).sort({ created: -1 }).limit(1)
But I don't want to have to fire off x of these; I'd like a way to do it in one query if possible.
The .aggregate() method allows you to do this, along with matching the documents via the $in operator:
collection.aggregate([
    { "$match": { "owner": { "$in": [owner_id_1, owner_id_2, ... owner_id_x] } } },
    { "$group": {
        "_id": "$owner",
        "created": { "$max": "$created" }
    }}
])
This gets the maximum ($max) "created" value for each "owner" you asked for with $in, which takes an array of values to match against the field in the condition.
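Each output document then comes back in the shape { "_id": <owner id>, "created": <latest created date> }.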
If you want more data than just that one field, then use $sort before you $group:
collection.aggregate([
    { "$match": { "owner": { "$in": [owner_id_1, owner_id_2, ... owner_id_x] } } },
    { "$sort": { "owner": 1, "created": -1 } },
    { "$group": {
        "_id": "$owner",
        "created": { "$first": "$created" },
        "docId": { "$first": "$_id" },
        "something": { "$first": "$something" }
    }}
])
And $first takes the first value from each grouping boundary (the descending sort on "created" puts the latest document first).
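If you want the whole latest document per owner rather than hand-picked fields, a common variant (not part of the original answer, sketched here) keeps the entire document with $first on $$ROOT:
collection.aggregate([
    { "$match": { "owner": { "$in": [owner_id_1, owner_id_2, ... owner_id_x] } } },
    // newest first within each owner
    { "$sort": { "owner": 1, "created": -1 } },
    { "$group": {
        "_id": "$owner",
        // $$ROOT is the whole input document; $first keeps the newest per owner
        "doc": { "$first": "$$ROOT" }
    }}
])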

How to calculate difference between values of different documents using mongo aggregation?

Hi, my mongo structure is as below:
{
    "timemilliSec": 1414590255,
    "data": [
        { "x": 23, "y": 34, "name": "X" },
        { "x": 32, "y": 50, "name": "Y" }
    ]
},
{
    "timemilliSec": 1414590245,
    "data": [
        { "x": 20, "y": 13, "name": "X" },
        { "x": 20, "y": 30, "name": "Y" }
    ]
}
Now I want to calculate the difference between the first document and the second document (and the second to the third) in this way, so the calculation is as below:
diffX = ((data.x - data.x) / (data.y - data.y)), in our case for name "X": ((23 - 20) / (34 - 13))
diffY = ((data.x - data.x) / (data.y - data.y)), in our case for name "Y": ((32 - 20) / (50 - 30))
Tough question in principle, but I'm going to stay with the simplified case you present of two documents and base a solution around that. The concepts should generalize, but are more difficult for expanded cases. It is possible with the aggregation framework in general:
db.collection.aggregate([
    // Match the documents in a pair
    { "$match": {
        "timemilliSec": { "$in": [ 1414590255, 1414590245 ] }
    }},
    // Trivial, just keeping an order
    { "$sort": { "timemilliSec": -1 } },
    // Unwind the arrays
    { "$unwind": "$data" },
    // Group first and last
    { "$group": {
        "_id": "$data.name",
        "firstX": { "$first": "$data.x" },
        "lastX": { "$last": "$data.x" },
        "firstY": { "$first": "$data.y" },
        "lastY": { "$last": "$data.y" }
    }},
    // Difference on the keys
    { "$project": {
        "diff": {
            "$divide": [
                { "$subtract": [ "$firstX", "$lastX" ] },
                { "$subtract": [ "$firstY", "$lastY" ] }
            ]
        }
    }},
    // Not sure you want to take it this far
    { "$group": {
        "_id": null,
        "diffX": {
            "$min": {
                "$cond": [
                    { "$eq": [ "$_id", "X" ] },
                    "$diff",
                    false
                ]
            }
        },
        "diffY": {
            "$min": {
                "$cond": [
                    { "$eq": [ "$_id", "Y" ] },
                    "$diff",
                    false
                ]
            }
        }
    }}
])
Possibly overblown, not sure of the intent, but the output of this based on the sample would be:
{
    "_id" : null,
    "diffX" : 0.14285714285714285,
    "diffY" : 0.6
}
Which matches the calculations.
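That is, (23 - 20) / (34 - 13) = 3/21 ≈ 0.142857 for "X", and (32 - 20) / (50 - 30) = 12/20 = 0.6 for "Y".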
You can adapt to your case, but the general principle is as shown.
The last "pipeline" stage there is a little "extreme", as all it does is combine the results into a single document. Otherwise, the "X" and "Y" results are already obtained as two documents in the pipeline, mostly by the $group operation with the $first and $last operators picking the respective elements at the grouping boundaries.
The subsequent $project pipeline stage performs the required math to determine the distinct results. See the aggregation operators for more details, particularly $divide and $subtract.
Whatever you do, you follow this course: get a "start" and "end" pair for each of your two keys, then perform the calculations.