Get unique array elements in MongoDB by "master" element - mongodb

I have mongodb rows with array element which looks like this:
{"data" : [1, 111]}
{"data" : [222, 1]}
{"data" : [1, 333]}
{"data" : [2, 444]}
How to get unique array elements by "master" element. So for example "master" element is 1 I should get result: [111, 222, 333] and not 444, because that array does not contain 1. If master element would be 2, the result should be: [444]
I tried the aggregation below. Is it correct? Are there any performance issues? What indices should be on the collection to make it fast?
[
{$match: {"data": 1}},
{$project : {a : '$data'}},
{$unwind: '$a'},
{$group: {_id: 'a', items: {$addToSet: '$a'}}}
]

You can use Aggregation framework:
$match to filter all documents that have "master" key in the "data" array.
$group to concatenate "data" arrays of all documents in one property called "result" and $filter to filter out the "master" element from the "data" arrays. ("result" will be an array that will have all documents' "data" arrays as elements).
$reduce with $concatArrays to concatenate all "data" arrays inside "result" property.
// Collect every array value that co-occurs with the "master" element (1),
// across all documents, into a single "result" array.
db.collection.aggregate([
{
// Keep only documents whose "data" array contains the master element.
"$match": {
data: 1
}
},
{
// Collapse all matched documents into one; each contributes its "data"
// array with the master element filtered out. $addToSet also drops
// arrays that are exact duplicates of one another.
"$group": {
"_id": null,
result: {
$addToSet: {
"$filter": {
"input": "$data",
"cond": {
// Drop the master element itself.
"$ne": [
"$$this",
1
]
}
}
}
}
}
},
{
// Flatten the array-of-arrays into one flat "result" array.
"$project": {
result: {
$reduce: {
input: "$result",
initialValue: [],
in: {
$concatArrays: [
"$$value",
"$$this"
]
}
}
}
}
}
])
Be aware that the "master" element has to be dynamically populated in first stage for $match pipeline, as well as in the second stage when performing filtering with $filter operator.
Here is the working example: https://mongoplayground.net/p/EtYwOqAE-PE

I think this works also
Test code here
keeps only the arrays that contain the master key
unwind them
group by {"_id": 1} is like group by null — the constant is the same for all documents; it was added just to have the master key as _id (in the $group stage the $$REMOVE system variable is used so the master key is not added to the set)
Query (where you see 1 put your master key, or a variable)
db.collection.aggregate([
{
// Keep only documents whose "data" array contains the master key (1).
"$match": {
"data": 1
}
},
{
// Emit one document per element of the "data" array.
"$unwind": {
"path": "$data"
}
},
{
// Group everything into a single bucket (constant _id). $addToSet
// deduplicates the collected values; the master key itself is skipped
// via the $$REMOVE system variable so it never enters the set.
"$group": {
"_id": 1,
"members": {
"$addToSet": {
"$cond": [
{
"$ne": [
"$data",
1
]
},
"$data",
"$$REMOVE"
]
}
}
}
}
])

Related

"iterate" through all document fields in mongodb

I have a collection with documents in this form:
{
"fields_names": ["field1", "field2", "field3"]
"field1": 1,
"field2": [1, 2, 3]
"field3": "12345"
}
where field1, field2, field3 are "dynamic" for each document (I have for each document the fields names in the "fields_names" array)
I would like to test whether 2 documents are equals using the aggregation framework.
I used $lookup stage for getting another documents.
My issue is: how can I "iterate" through the whole fields for my collection?
db.collection.aggregate([
{
{$match: "my_id": "test_id"},
{$lookup:
from: "collection"
let: my_id: "$my_id", prev_id: "$_id"
pipeline: [
{$match: "my_id": "$$my_id", "_id": {$ne: "$$prev_id"}}
]
as: "lookup_test"
}
}])
and in the pipeline of the lookup, I would like to iterate the "fields_names" array for getting the names of the fields, and then access their value and compare between the "orig document" (not the $lookup) and the other documents ($lookup documents).
OR: just to iterate all fields (not include the "fields_names" array)
I would like to fill the "lookup_test" array with all documents which have the same field values.
You will have to compare the two "partial" parts of the document meaning you'll have to ( for each document ) do this in the $lookup, needless to say this is going to be a -very- expensive pipeline. With that said here's how I would do it:
db.collection.aggregate([
{
// Anchor document(s) to compare against.
$match: {
"my_id": "test_id"
}
},
{
"$lookup": {
"from": "collection",
"let": {
// _id of the outer document, so the join can exclude it below.
id: "$_id",
// The outer document reduced to an array of {k, v} pairs, keeping
// only the fields listed in its own "fields_names" array.
partialRoot: {
$filter: {
input: {
"$objectToArray": "$$ROOT"
},
as: "fieldObj",
cond: {
// Keep this {k, v} pair only if its key is in fields_names.
"$setIsSubset": [
[
"$$fieldObj.k"
],
"$fields_names"
]
}
}
}
},
pipeline: [
{
$match: {
$expr: {
$and: [
{
// Skip the outer document itself.
$ne: [
"$$id",
"$_id"
]
},
{
// Match when every {k, v} pair of the outer partial root
// also appears in the inner one: the set intersection must
// be as large as the outer partial root itself.
$eq: [
{
$size: "$$partialRoot"
},
{
$size: {
"$setIntersection": [
"$$partialRoot",
{
// Same field-name-filtered projection, built from
// the inner (joined) document.
$filter: {
input: {
"$objectToArray": "$$ROOT"
},
as: "fieldObj",
cond: {
"$setIsSubset": [
[
"$$fieldObj.k"
],
"$fields_names"
]
}
}
}
]
}
}
]
}
]
}
}
},
],
"as": "x"
}
}
])
Mongo Playground
If you could dynamically build the query through code you could make this much more efficient by using the same match query in the $lookup stage like so:
const query = { my_id: "test_id" };
db.collection.aggregate([
{
$match: query
},
{
$lookup: {
...
pipeline: [
{ $match: query },
... rest of pipeline ...
]
}
}
])
This way you're only matching documents who at least match the initial query, this should drastically improve query performance ( obviously dependant on field x value entropy )
One more caveat to note is that if x documents match you will get the same result x times, meaning you probably want to add a $limit: 1 stage to your pipeline.

MongoDb Aggregate nested documents with $add

I need to get sum value from nested documents.
DB document:
{
"_id": 123,
"products": [
{
"productId": 1,
"charges": [
{
"type": "che",
"amount": 100
}
]
}
]
}
i wanted to get sum value.
sumValue = products.charges.amount+20; where "products.productId" is 1 and "products.charges.type" is "che"
i tried below query but no hope:
db.getCollection('test').aggregate(
[
{"$match":{$and:[{"products.productId": 14117426}, {"products.charges.type":"che"}]},
{ $project: { "_id":0, total: { $add: [ "$products.charges.price", 20 ] } }}
]
)
please help me to solve this.
You have to take a look at $unwind operator which deconstructs an array to output a document for each element of array. Also take a look at add and project operators.
I assume your db query should look like this:
// Sum (amount + 20) over all charges of type "che" for the matching product.
db.test.aggregate([
{$unwind: '$products'}, // One document per element of the products array
{$match: {'products.productId' : 3}}, // Keep only the product we care about (substitute your own id)
{$unwind: '$products.charges'}, // One document per charge of that product
{$match: {'products.charges.type' : 'che'}}, // Keep only charges of type "che"
{$project: {'with20': {$add: ["$products.charges.amount", 20]}}}, // with20 = amount + 20
{$group: {_id : null, amount: { $sum: '$with20' }}} // Sum over all remaining documents
])
You can run $reduce twice to convert your arrays into scalar value. The outer condition could be applied as $filter, the inner one can be run as $cond:
// Compute total = 20 + sum of "amount" over charges of type "che" belonging
// to the product with productId 1, without unwinding any arrays.
db.collection.aggregate([
{
"$project": {
_id: 0,
total: {
$reduce: {
// BUG FIX: $filter's "cond" must be a boolean expression. The
// original passed a bare array [ "$$this.productId", 1 ], which is
// always truthy, so no products were ever filtered out. Wrap the
// comparison in $eq.
input: { $filter: { input: "$products", cond: { $eq: [ "$$this.productId", 1 ] } } },
// Start at 20 so the required "+20" is included exactly once.
initialValue: 20,
in: {
$add: [
"$$value",
{
// Sum the amounts of this product's "che" charges.
$reduce: {
input: "$$this.charges",
initialValue: 0,
// BUG FIX: the original returned the $cond value directly,
// discarding the accumulator on every step -- it kept only the
// last charge's contribution. Accumulate into $$value instead.
in: {
$add: [
"$$value",
{ $cond: [ { $eq: [ "$$this.type", "che" ] }, "$$this.amount", 0 ] }
]
}
}
}
]
}
}
}
}
}
])
Mongo Playground

How to limit the finds to unique values in an aggregate in MongoDB

Example dataset:
{
"source": "http://adress.com/",
"date": ISODate("2016-08-31T08:41:00.000Z"),
"author": "Some Guy",
"thread": NumberInt(115265),
"commentID": NumberInt(2693454),
"title": ["A", "title", "for", "a", "comment"],
"comment": ["This", "is", "a", "comment", "with", "a", "duplicate"]
}
The dataset I'm using is basically a comment from a user, with a unique commentID. The comment itself is held as an array of words. I've managed to unwind the array, match the buzzword and get back all finds.
My problem now is getting rid of duplicates, where buzzwords show up several times in a comment. I suppose I have to use a group, but can't find a way to do it.
The current pipeline is:
[
{"$unwind": "$comment"},
{"$match": {"comment": buzzword } }
]
Which does work just fine. But if I'm searching for the buzzword "a", in the above example it will find the comment twice, as the word "a" shows up twice.
What I need is a JSON for the pipeline to drop all duplicates past the first.
You could run a single pipeline without $unwind that takes advantage of the array operators $arrayElemAt and $filter. The former will give you the first element in a given array and this array will be a result of filtering elements using the latter, $filter.
Follow this example to get the desired result:
db.collection.aggregate([
// Keep only documents whose "comment" array contains the buzzword.
{ "$match": { "comment": buzzword } },
{
"$project": {
"source": 1,
"date": 1,
"author": 1,
"thread": 1,
"commentID": 1,
"title": 1,
"comment": 1,
// First element of the filtered array: a single occurrence of the
// buzzword, no matter how many times it appears in the comment.
"distinct_matched_comment": {
"$arrayElemAt": [
{
// All elements of "comment" equal to the buzzword.
"$filter": {
"input": "$comment",
"as": "word",
"cond": {
"$eq": ["$$word", buzzword]
}
}
}, 0
]
}
}
}
])
Explanations
In the above pipeline, the trick is to first filter the comment array by selecting just the elements which satisfy a given criteria. For example, to demonstrate this concept, run this pipeline:
// Demo: $filter returns only the array elements that satisfy the condition.
db.collection.aggregate([
{
"$project": {
"filtered_comment": {
"$filter": {
"input": ["This", "is", "a", "comment", "with", "a", "duplicate"], /* hardcoded input array for demo */
"as": "word", /* The variable name for the element in the input array.
The as expression accesses each element in the input array by this variable.*/
"cond": { /* this condition determines whether to include the element in the resulting array. */
"$eq": ["$$word", "a"] /* condition where the variable equals the buzzword "a" */
}
}
}
}
}
])
Output
{
"_id" : ObjectId("57dbd747be80cdcab63703dc"),
"filtered_comment" : [
"a",
"a"
]
}
As the $filter's input parameter accepts an expression that resolves to an array, you can use an array field instead.
Taking the result above further, we can show how the $arrayElemAt operator works:
// Demo: $arrayElemAt returns the element at the given index of an array.
db.collection.aggregate([
{
"$project": {
"distinct_matched_comment": {
"$arrayElemAt": [
["a", "a"], /* array produced by the above $filter expression */
0 /* the index position of the element we want to return, here being the first */
]
}
}
}
])
Output
{
"_id" : ObjectId("57dbd747be80cdcab63703dc"),
"distinct_matched_comment": "a"
}
Since the expression in the $arrayElemAt operator
{ "$arrayElemAt": [ <array>, <idx> ] }
can be any valid expression as long as it resolves to an array, you can combine the $filter expression from the beginning of this example as the array expression since it returns an array thus your final pipeline will look like:
// Final pipeline: $filter produces the buzzword occurrences and $arrayElemAt
// takes just the first one -- no $unwind, so each comment contributes at most
// one match regardless of duplicates.
db.collection.aggregate([
{
"$project": {
"distinct_matched_comment": {
"$arrayElemAt": [
{ /* expression that produces an array with elements that match a condition */
"$filter": {
"input": "$comment",
"as": "word",
"cond": {
"$eq": ["$$word", buzzword]
}
}
},
0 /* the index position of the element we want to return, here being the first */
]
}
}
}
])
One possible solution could be with $group like so
...
{ $unwind: "$comment"},
{ $match: {"comment": buzzword } },
{
$group: {
_id : "$_id",
source: { $first: "$source" },
date: { $first: "$date" },
author: { $first: "$author" },
thread: { $first: "$thread" },
commentID: { $first: "$commentID" },
title: { $first: "$title" }
}
}
...
Another way would be to use $project prior unwinding the array in order to get rid of the duplicate words like so
...
{
$project: {
source: 1,
date: 1,
author: 1,
thread: 1,
commentID: 1,
title: 1,
comment: { $setUnion: ["$comment"] }
}
},
{$unwind: "$comment"},
{$match: {"comment": buzzword } }
...
Update due to comment:
To retain the comment array you could project the array to another field and unwind that instead like so
...
{
$project: {
source: 1,
date: 1,
author: 1,
thread: 1,
commentID: 1,
title: 1,
comment: 1,
commentWord: { $setUnion: ["$comment"] }
}
},
{$unwind: "$commentWord"},
{$match: {"commentWord": buzzword } }
...
Hope that helps

mongodb $aggregate empty array and multiple documents

mongodb has below document:
> db.test.find({name:{$in:["abc","abc2"]}})
{ "_id" : 1, "name" : "abc", "scores" : [ ] }
{ "_id" : 2, "name" : "abc2", "scores" : [ 10, 20 ] }
I want get scores array length for each document, how should I do?
Tried below command:
db.test.aggregate({$match:{name:"abc2"}}, {$unwind: "$scores"}, {$group: {_id:null, count:{$sum:1}}} )
Result:
{ "_id" : null, "count" : 2 }
But below command:
db.test.aggregate({$match:{name:"abc"}}, {$unwind: "$scores"}, {$group: {_id:null, count:{$sum:1}}} )
Return Nothing. Question:
How should I get the length of scores in 2 or more documents in one
command?
Why the result of second command return nothing? and how
should I check if the array is empty?
So this is actually a common problem. The result of the $unwind phase in an aggregation pipeline where the array is "empty" is to "remove" the document from the pipeline results.
In order to return a count of "0" for such an "empty" array then you need to do something like the following.
In MongoDB 2.6 or greater, just use $size:
// MongoDB 2.6+: $size measures the array directly, so an empty "scores"
// correctly contributes 0 -- no $unwind needed.
db.test.aggregate([
{ "$match": { "name": "abc" } },
{ "$group": {
"_id": null,
// Sum the length of each document's "scores" array.
"count": { "$sum": { "$size": "$scores" } }
}}
])
In earlier versions you need to do this:
// Pre-2.6 workaround: $unwind removes documents whose array is empty, so an
// empty "scores" is first replaced with the sentinel array [false]; after
// unwinding, genuine values count as 1 and the sentinel counts as 0.
db.test.aggregate([
{ "$match": { "name": "abc" } },
{ "$project": {
"name": 1,
"scores": {
"$cond": [
// If scores is the empty array, substitute the literal [false].
{ "$eq": [ "$scores", [] ] },
{ "$const": [false] },
"$scores"
]
}
}},
{ "$unwind": "$scores" },
{ "$group": {
"_id": null,
"count": { "$sum": {
// Truthy score values add 1; the false sentinel adds 0.
// NOTE(review): a literal score of 0 would also count as 0 here.
"$cond": [
"$scores",
1,
0
]
}}
}}
])
The modern operation is simple since $size will just "measure" the array. In the latter case you need to "replace" the array with a single false value when it is empty to avoid $unwind "destroying" this for an "empty" statement.
So replacing with false allows the $cond "trinary" to choose whether to add 1 or 0 to the $sum of the overall statement.
That is how you get the length of "empty arrays".
To get the length of scores in 2 or more documents you just need to change the _id value in the $group pipeline which contains the distinct group by key, so in this case you need to group by the document _id.
Your second aggregation returns nothing because the $match query pipeline passed a document which had an empty scores array. To check if the array is empty, your match query should be
{'scores.0': {$exists: true}} or {scores: {$not: {$size: 0}}}
Overall, your aggregation should look like this:
// Per-document array lengths: group by the document _id and count one per
// unwound element. Documents with an empty "scores" array are excluded up
// front via the "scores.0" existence check.
db.test.aggregate([
{ "$match": {"scores.0": { "$exists": true } } },
{ "$unwind": "$scores" },
{
"$group": {
"_id": "$_id",
"count": { "$sum": 1 }
}
}
])

Mongodb aggregation, finding within an array of values

I have a schemea that creates documents using the following structure:
{
"_id" : "2014-07-16:52TEST",
"date" : ISODate("2014-07-16T23:52:59.811Z"),
"name" : "TEST"
"values" : [
[
1405471921000,
0.737121
],
[
1405471922000,
0.737142
],
[
1405471923000,
0.737142
],
[
1405471924000,
0.737142
]
]
}
In the values, the first index is a timestamp. What I'm trying to do is query a specific timestamp to find the closest value ($gte).
I've tried the following aggregate query:
[
{ "$match": {
"values": {
"$elemMatch": { "0": {"$gte": 1405471923000} }
},
"name" : 'TEST'
}},
{ "$project" : {
"name" : 1,
"values" : 1
}},
{ "$unwind": "$values" },
{ "$match": { "values.0": { "$gte": 1405471923000 } } },
{ "$limit" : 1 },
{ "$sort": { "values.0": -1 } },
{ "$group": {
"_id": "$name",
"values": { "$push": "$values" },
}}
]
This seems to work, but it doesn't pull the closest value. It seems to pull anything greater or equal to and the sort doesn't seem to get applied, so it will pull a timestamp that is far in the future.
Any suggestions would be great!
Thank you
There are a couple of things wrong with the approach here even though it is a fair effort. You are right that you need to $sort here, but the problem is that you cannot "sort" on an inner element with an array. In order to get a value that can be sorted you must $unwind the array first as it otherwise will not sort on an array position.
You also certainly do not want $limit in the pipeline. You might be testing this against a single document, but "limit" will actually act on the entire set of documents in the pipeline. So if more than one document was matching your condition then they would be thrown away.
The key thing you want to do here is use $first in your $group stage, which is applied once you have sorted to get the "closest" element that you want.
// For each matching document, keep only the single inner [timestamp, value]
// pair whose timestamp is the smallest one >= the requested value, then
// restore the original document shape.
db.collection.aggregate([
// Documents that have an array element matching the condition
{ "$match": {
"values": { "$elemMatch": { "0": {"$gte": 1405471923000 } } }
}},
// Unwind the top level array
{ "$unwind": "$values" },
// Filter just the elements that match the condition
{ "$match": { "values.0": { "$gte": 1405471923000 } } },
// Take a copy of the inner array
{ "$project": {
"date": 1,
"name": 1,
"values": 1,
"valCopy": "$values"
}},
// Unwind the inner array copy
{ "$unwind": "$valCopy" },
// Filter the inner elements (keeps the timestamps; drops smaller readings)
{ "$match": { "valCopy": { "$gte": 1405471923000 } }},
// Sort on the now "timestamp" values ascending for nearest
{ "$sort": { "valCopy": 1 } },
// Take the "first" values per document -- the nearest timestamp entry
{ "$group": {
"_id": "$_id",
"date": { "$first": "$date" },
"name": { "$first": "$name" },
"values": { "$first": "$values" },
}},
// Optionally push back to array to match the original structure
{ "$group": {
"_id": "$_id",
"date": { "$first": "$date" },
"name": { "$first": "$name" },
"values": { "$push": "$values" },
}}
])
And this produces your document with just the "nearest" timestamp value matching the original document form:
{
"_id" : "2014-07-16:52TEST",
"date" : ISODate("2014-07-16T23:52:59.811Z"),
"name" : "TEST",
"values" : [
[
1405471923000,
0.737142
]
]
}