MongoDB: How to get add filter main collection by second collection using $lookup [duplicate] - mongodb

How can I add a filter after an $lookup or is there any other method to do this?
My data collection test is:
{ "_id" : ObjectId("570557d4094a4514fc1291d6"), "id" : 100, "value" : "0", "contain" : [ ] }
{ "_id" : ObjectId("570557d4094a4514fc1291d7"), "id" : 110, "value" : "1", "contain" : [ 100 ] }
{ "_id" : ObjectId("570557d4094a4514fc1291d8"), "id" : 120, "value" : "1", "contain" : [ 100 ] }
{ "_id" : ObjectId("570557d4094a4514fc1291d9"), "id" : 121, "value" : "2", "contain" : [ 100, 120 ] }
I select id 100 and aggregate the childs:
db.test.aggregate([ {
$match : {
id: 100
}
}, {
$lookup : {
from : "test",
localField : "id",
foreignField : "contain",
as : "childs"
}
}]);
I get back:
{
"_id":ObjectId("570557d4094a4514fc1291d6"),
"id":100,
"value":"0",
"contain":[ ],
"childs":[ {
"_id":ObjectId("570557d4094a4514fc1291d7"),
"id":110,
"value":"1",
"contain":[ 100 ]
},
{
"_id":ObjectId("570557d4094a4514fc1291d8"),
"id":120,
"value":"1",
"contain":[ 100 ]
},
{
"_id":ObjectId("570557d4094a4514fc1291d9"),
"id":121,
"value":"2",
"contain":[ 100, 120 ]
}
]
}
But I want only childs that match with "value: 1"
At the end I expect this result:
{
"_id":ObjectId("570557d4094a4514fc1291d6"),
"id":100,
"value":"0",
"contain":[ ],
"childs":[ {
"_id":ObjectId("570557d4094a4514fc1291d7"),
"id":110,
"value":"1",
"contain":[ 100 ]
},
{
"_id":ObjectId("570557d4094a4514fc1291d8"),
"id":120,
"value":"1",
"contain":[ 100 ]
}
]
}

The question here is actually about something different and does not need $lookup at all. But for anyone arriving here purely from the title of "filtering after $lookup" then these are the techniques for you:
MongoDB 3.6 - Sub-pipeline
db.test.aggregate([
{ "$match": { "id": 100 } },
{ "$lookup": {
"from": "test",
"let": { "id": "$id" },
"pipeline": [
{ "$match": {
"value": "1",
"$expr": { "$in": [ "$$id", "$contain" ] }
}}
],
"as": "childs"
}}
])
Earlier - $lookup + $unwind + $match coalescence
db.test.aggregate([
{ "$match": { "id": 100 } },
{ "$lookup": {
"from": "test",
"localField": "id",
"foreignField": "contain",
"as": "childs"
}},
{ "$unwind": "$childs" },
{ "$match": { "childs.value": "1" } },
{ "$group": {
"_id": "$_id",
"id": { "$first": "$id" },
"value": { "$first": "$value" },
"contain": { "$first": "$contain" },
"childs": { "$push": "$childs" }
}}
])
If you question why you would $unwind as opposed to using $filter on the array, then read "Aggregate $lookup Total size of documents in matching pipeline exceeds maximum document size" for all the detail on why this is generally necessary and far more optimal.
For releases of MongoDB 3.6 and onwards, then the more expressive "sub-pipeline" is generally what you want to "filter" the results of the foreign collection before anything gets returned into the array at all.
Back to the answer though which actually describes why the question asked needs "no join" at all....
Original
Using $lookup like this is not the most "efficient" way to do what you want here. But more on this later.
As a basic concept, just use $filter on the resulting array:
db.test.aggregate([
{ "$match": { "id": 100 } },
{ "$lookup": {
"from": "test",
"localField": "id",
"foreignField": "contain",
"as": "childs"
}},
{ "$project": {
"id": 1,
"value": 1,
"contain": 1,
"childs": {
"$filter": {
"input": "$childs",
"as": "child",
"cond": { "$eq": [ "$$child.value", "1" ] }
}
}
}}
]);
Or use $redact instead:
db.test.aggregate([
{ "$match": { "id": 100 } },
{ "$lookup": {
"from": "test",
"localField": "id",
"foreignField": "contain",
"as": "childs"
}},
{ "$redact": {
"$cond": {
"if": {
"$or": [
{ "$eq": [ "$value", "0" ] },
{ "$eq": [ "$value", "1" ] }
]
},
"then": "$$DESCEND",
"else": "$$PRUNE"
}
}}
]);
Both get the same result:
{
"_id":ObjectId("570557d4094a4514fc1291d6"),
"id":100,
"value":"0",
"contain":[ ],
"childs":[ {
"_id":ObjectId("570557d4094a4514fc1291d7"),
"id":110,
"value":"1",
"contain":[ 100 ]
},
{
"_id":ObjectId("570557d4094a4514fc1291d8"),
"id":120,
"value":"1",
"contain":[ 100 ]
}
]
}
Bottom line is that $lookup itself cannot "yet" query to only select certain data. So all "filtering" needs to happen after the $lookup.
But really for this type of "self join" you are better off not using $lookup at all and avoiding the overhead of an additional read and "hash-merge" entirely. Just fetch the related items and $group instead:
db.test.aggregate([
{ "$match": {
"$or": [
{ "id": 100 },
{ "contain.0": 100, "value": "1" }
]
}},
{ "$group": {
"_id": {
"$cond": {
"if": { "$eq": [ "$value", "0" ] },
"then": "$id",
"else": { "$arrayElemAt": [ "$contain", 0 ] }
}
},
"value": { "$first": { "$literal": "0"} },
"childs": {
"$push": {
"$cond": {
"if": { "$ne": [ "$value", "0" ] },
"then": "$$ROOT",
"else": null
}
}
}
}},
{ "$project": {
"value": 1,
"childs": {
"$filter": {
"input": "$childs",
"as": "child",
"cond": { "$ne": [ "$$child", null ] }
}
}
}}
])
Which only comes out a little different because I deliberately removed the extraneous fields. Add them in yourself if you really want to:
{
"_id" : 100,
"value" : "0",
"childs" : [
{
"_id" : ObjectId("570557d4094a4514fc1291d7"),
"id" : 110,
"value" : "1",
"contain" : [ 100 ]
},
{
"_id" : ObjectId("570557d4094a4514fc1291d8"),
"id" : 120,
"value" : "1",
"contain" : [ 100 ]
}
]
}
So the only real issue here is "filtering" any null result from the array, created when the current document was the parent in processing items to $push.
What you also seem to be missing here is that the result you are looking for does not need aggregation or "sub-queries" at all. The structure that you have concluded or possibly found elsewhere is "designed" so that you can get a "node" and all of its "children" in a single query request.
That means just the "query" is all that is really needed, and the data collection ( which is all that is happening since no content is really being "reduced" ) is just a function of iterating the cursor result:
var result = {};
db.test.find({
"$or": [
{ "id": 100 },
{ "contain.0": 100, "value": "1" }
]
}).sort({ "contain.0": 1 }).forEach(function(doc) {
if ( doc.id == 100 ) {
result = doc;
result.childs = []
} else {
result.childs.push(doc)
}
})
printjson(result);
This does exactly the same thing:
{
"_id" : ObjectId("570557d4094a4514fc1291d6"),
"id" : 100,
"value" : "0",
"contain" : [ ],
"childs" : [
{
"_id" : ObjectId("570557d4094a4514fc1291d7"),
"id" : 110,
"value" : "1",
"contain" : [
100
]
},
{
"_id" : ObjectId("570557d4094a4514fc1291d8"),
"id" : 120,
"value" : "1",
"contain" : [
100
]
}
]
}
And serves as proof that all you really need to do here is issue the "single" query to select both the parent and children. The returned data is just the same, and all you are doing on either server or client is "massaging" into another collected format.
This is one of those cases where you can get "caught up" in thinking of how you did things in a "relational" database, and not realize that since the way the data is stored has "changed", you no longer need to use the same approach.
That is exactly the point of the structure in the documentation example "Model Tree Structures with Child References", which makes it easy to select parents and children within one query.

Related

Mongo lookup through array of ids

How do I correctly $lookup the product collection, which holds an array of price ids, when the query has to be run from the prices collection? I need to show each price record together with its looked-up product record.
so, I have prices record
{
"_id" : "813f02ff-882e-44f7-b2bc-2f067427daf6",
"unit_amount" : 333,
"currency" : "USD",
"interval" : "year",
"active" : true
}
and product
"_id" : "3c46f277-8953-4f96-baf1-bd871ee3301f",
"name" : "test",
"prices" : [
"813f02ff-882e-44f7-b2bc-2f067427daf6",
"f5c76122-6132-4e4b-a26b-41bbd6325acc",
"3e4be68e-fbed-47f7-b871-92de72cb00df"
]
and my query, I thought it should be like that
db.getCollection('price').aggregate([
{
"$lookup": {
"from": "product",
"let": { "prid": "$_id" },
"pipeline": [
{ "$match": { "$expr": { "$in": ["$$prid", '$prices'] } } }
],
"as": "product_tbl"
}
}
])
faced with
{
"ok" : 0,
"errmsg" : "PlanExecutor error during aggregation :: caused by :: $in requires an array as a second argument, found: missing",
"code" : 40081,
"codeName" : "Location40081"
}
but it doesn't work. How should it look?
It seems that some of the documents in your product collection are missing the prices key. You can try this:
db.prices.aggregate([
{
"$lookup": {
"from": "product",
"let": {
"prid": "$_id"
},
"pipeline": [
{
"$addFields": {
"prices": {
"$ifNull": [
"$prices",
[]
]
}
}
},
{
"$match": {
"$expr": {
"$in": [
"$$prid",
"$prices"
]
}
}
}
],
"as": "product_tbl"
}
}
])
Here, we recompute the prices and set it to empty array, if it's missing, before the $match. Playground link.

How to Join Arrays in the Same Document?

I would like to combine the data in one collection using the IDs of the two arrays.
An example is shown below.
{
"_id": ObjectId ("5976fd2eb0adec0a32fa9831"),
"People": [
{
"_id": 1, <--- ID
"Name": "jane"
},
{
"_id": 2, <--- ID
"Name": "Mark"
}
],
"Contents": [
{
"userID": 2, <--- People ID
"Text": "111"
},
{
"userID": 1, <--- People ID
"Text": "Hi"
}
]
}
I want to make the above document as below.
{
"_id": ObjectId ("5976fd2eb0adec0a32fa9831"),
"People": [
{
"_id": 1,
"Name" : "Jane"
},
{
"_id": 2,
"Name": "Mark"
}
],
"Contents": [
{
"userID": 2,
"Name": "Mark", <-- Adding
"Text": "111",
},
{
"userID": 1,
"Name": "Jane", <-- Adding
"Text": "Hi",
}
]
}
I have tried various things like $lookup or $unwind of .aggregate() but I cannot get the result.
You want $map and $indexOfArray ideally:
db.collection.aggregate([
{ "$addFields": {
"Contents": {
"$map": {
"input": "$Contents",
"as": "c",
"in": {
"userID": "$$c.userID",
"Name": {
"$arrayElemAt": [
"$People.Name",
{ "$indexOfArray": [ "$People._id", "$$c.userID" ] }
]
},
"Text": "$$c.Text"
}
}
}
}}
])
Which basically grabs the value from the other array via $arrayElemAt for the matching "index" returned by $indexOfArray.
If your MongoDB needs to fall back to a version without that operator, then you could use $filter instead:
db.collection.aggregate([
{ "$addFields": {
"Contents": {
"$map": {
"input": "$Contents",
"as": "c",
"in": {
"userID": "$$c.userID",
"Name": {
"$arrayElemAt": [
{ "$map": {
"input": {
"$filter": {
"input": "$People",
"as": "p",
"cond": { "$eq": [ "$$p._id", "$$c.userID" ] }
}
},
"as": "p",
"in": "$$p.Name"
}},
0
]
},
"Text": "$$c.Text"
}
}
}
}}
])
Where basically you $filter the results down of the other array in comparison and simply return the first matching element by the 0 index with $arrayElemAt.
In either case, there is no need to "self-join" using $lookup, and that's just really unnecessary overhead best avoided.
From the document in the question you get the following:
/* 1 */
{
"_id" : ObjectId("5976fd2eb0adec0a32fa9831"),
"People" : [
{
"_id" : 1.0,
"Name" : "jane"
},
{
"_id" : 2.0,
"Name" : "Mark"
}
],
"Contents" : [
{
"userID" : 2.0,
"Name" : "Mark",
"Text" : "111"
},
{
"userID" : 1.0,
"Name" : "jane",
"Text" : "Hi"
}
]
}
Generally speaking though, there is no real need for any aggregation operators at all, as this sort of operation is generally best left to post-processing in the cursor. In fact since you are actually "adding" data to the document to return, it's better to do modification after the document is sent over the network.
As a common idiom of the above shown as JavaScript for the shell:
db.collection.find().map( d =>
Object.assign(
d,
{
"Contents": d.Contents.map( c =>
Object.assign(c,
{ "Name": d.People.map(p => p.Name)[d.People.map(p => p._id).indexOf(c.userID)] }
)
)
}
)
)
This produces the exact same result, and is generally a bit easier on the eyes to read and interpret.

Retrieve specific element of a nested document

Just cannot figure this out. This is the document format from a MongoDB of jobs, which is derived from an XML file the layout of which I have no control over:
{
"reference" : [ "93417" ],
"Title" : [ "RN - Pediatric Director of Nursing" ],
"Description" : [ "...a paragraph or two..." ],
"Classifications" : [
{
"Classification" : [
{
"_" : "Nurse / Midwife",
"name" : [ "Category" ]
},
{
"_" : "FL - Jacksonville",
"name" : [ "Location" ],
},
{
"_" : "Permanent / Full Time",
"name" : [ "Work Type" ],
},
{
"_" : "Some Health Care Org",
"name" : [ "Company Name" ],
}
]
}
],
"Apply" : [
{
"EmailTo" : [ "jess#recruiting.co" ]
}
]
}
The intention is to pull a list of jobs from the DB, to include 'Location', which is buried down there as the second document at 'Classifications.Classification._'.
I've tried various 'aggregate' permutations of $project, $unwind, $match, $filter, $group… but I don't seem to be getting anywhere. Experimenting with just retrieving the company name, I was expecting this to work:
db.collection(JOBS_COLLECTION).aggregate([
{ "$project" : { "meta": "$Classifications.Classification" } },
{ "$project" : { "meta": 1, _id: 0 } },
{ "$unwind" : "$meta" },
{ "$match": { "meta.name" : "Company Name" } },
{ "$project" : { "Company" : "$meta._" } },
])
But that pulled everything for every record, thus:
[{
"Company":[
"Nurse / Midwife",
"TX - San Antonio",
"Permanent / Full Time",
"Some Health Care Org"
]
}, { etc etc }]
What am I missing, or misusing?
Ideally with MongoDB 3.4 available you would simply $project, and use the array operators of $map, $filter and $reduce. The latter to "compact" the arrays and the former to extract the relevant element and detail. Also $arrayElemAt takes just the "element" from the array(s):
db.collection(JOBS_COLLECTION).aggregate([
{ "$match": { "Classifications.Classification.name": "Location" } },
{ "$project": {
"_id": 0,
"output": {
"$arrayElemAt": [
{ "$map": {
"input": {
"$filter": {
"input": {
"$reduce": {
"input": "$Classifications.Classification",
"initialValue": [],
"in": {
"$concatArrays": [ "$$value", "$$this" ]
}
}
},
"as": "c",
"cond": { "$eq": [ "$$c.name", ["Location"] ] }
}
},
"as": "c",
"in": "$$c._"
}},
0
]
}
}}
])
Or even skip the $reduce which is merely applying the $concatArrays to "merge" and simply grab the "first" array index ( since there is only one ) using $arrayElemAt:
db.collection(JOBS_COLLECTION).aggregate([
{ "$match": { "Classifications.Classification.name": "Location" } },
{ "$project": {
"_id": 0,
"output": {
"$arrayElemAt": [
{ "$map": {
"input": {
"$filter": {
"input": { "$arrayElemAt": [ "$Classifications.Classification", 0 ] },
"as": "c",
"cond": { "$eq": [ "$$c.name", ["Location"] ] }
}
},
"as": "c",
"in": "$$c._"
}},
0
]
}
}}
])
That makes the operation compatible with MongoDB 3.2, which you "should" be running at least.
Which in turn allows you to consider alternate syntax for MongoDB 3.4 using $indexOfArray based on the initial input variable of the "first" array index using $let to somewhat shorten the syntax:
db.collection(JOBS_COLLECTION).aggregate([
{ "$match": { "Classifications.Classification.name": "Location" } },
{ "$project": {
"_id": 0,
"output": {
"$let": {
"vars": {
"meta": {
"$arrayElemAt": [
"$Classifications.Classification",
0
]
}
},
"in": {
"$arrayElemAt": [
"$$meta._",
{ "$indexOfArray": [
"$$meta.name", [ "Location" ]
]}
]
}
}
}
}}
])
If indeed you consider that to be "shorter", that is.
In the other sense though, much like above there is an "array inside an array", so in order to process it, you $unwind twice, which is effectively what the $concatArrays inside $reduce is countering in the ideal case:
db.collection(JOBS_COLLECTION).aggregate([
{ "$match": { "Classifications.Classification.name": "Location" } },
{ "$unwind": "$Classifications" },
{ "$unwind": "$Classifications.Classification" },
{ "$match": { "Classifications.Classification.name": "Location" } },
{ "$project": { "_id": 0, "output": "$Classifications.Classification._" } }
])
All statements actually produce:
{
"output" : "FL - Jacksonville"
}
Which is the matching value of "_" in the inner array element for the "Location" as selected by your original intent.
Keeping in mind of course that all statements really should be preceded with the relevant $match statement as shown:
{ "$match": { "Classifications.Classification.name": "Location" } },
Since without that you would be possibly processing documents unnecessarily, which did not actually contain an array element matching that condition. Of course this may not be the case due to the nature of the documents, but it's generally good practice to make sure the "initial" selection always matches the conditions of details you later intend to "extract".
All of that said, even if this is the result of a direct import from XML, the structure should be changed since it does not efficiently present itself for queries. MongoDB documents do not work how XPATH does in terms of issuing queries. Therefore anything "XML Like" is not going to be a good structure, and if the "import" process cannot be changed to a more accommodating format, then there should at least be a "post process" to manipulate this into a separate storage in a more usable form.

Querying mongoDB for some chart data - my pipeline seems convoluted

This is a long question. If you bother answering, I will be extra grateful.
I have some time series data that I am trying to query to create various charts. The data format isn't the most simple, but I think my aggregation pipeline is getting a bit out of hand. I am planning to use charts.js to visualise the data on the client.
I will post a sample of my data below as well as my pipeline, with the desired output.
My question is in two parts - answering either one could solve the problem.
Does charts.js accept data formats other than an array of numbers per row? This would mean my pipeline could try to do less.
My pipeline doesn't quite get to the result I need. Can you recommend any alterations to get the correct result from my pipeline? Is there a simpler way to get my desired output format?
Sample data
Here is a real data sample - a brand with one facebook account and one twitter account. There is some data for some dates in June. Lots of null day and month fields have been omitted.
Brand
[{
"_id": "5943f427e7c11ac3ad3652b0",
"name": "Brand1",
"facebookAccounts": [
"5943f427e7c11ac3ad3652ac",
],
"twitterAccounts": [
"5943f427e7c11ac3ad3652aa",
],
}]
FacebookAccounts
[
{
"_id" : "5943f427e7c11ac3ad3652ac",
"name": "Brand 1 Name",
"years": [
{
"date": "2017-01-01T00:00:00.000Z",
"months": [
{
"date": "2017-06-01T00:00:00.000Z",
"days": [
{
"date": "2017-06-16T00:00:00.000Z",
"likes": 904025,
},
{
"date": "2017-06-17T00:00:00.000Z",
"likes": null,
},
{
"date": "2017-06-18T00:00:00.000Z",
"likes": 904345,
},
],
},
],
}
]
}
]
Twitter accounts
[
{
"_id": "5943f427e7c11ac3ad3652aa",
"name": "Brand 1 Name",
"vendorId": "twitterhandle",
"years": [
{
"date": "2017-01-01T00:00:00.000Z",
"months": [
{
"date": "2017-06-01T00:00:00.000Z",
"days": [
{
"date": "2017-06-16T00:00:00.000Z",
"followers": 69390,
},
{
"date": "2017-06-17T00:00:00.000Z",
"followers": 69397,
},
{
"date": "2017-06-18T00:00:00.000Z",
"followers": 69428,
},
{
"date": "2017-06-19T00:00:00.000Z",
"followers": 69457,
},
]
},
],
}
]
}
]
The query
For this example, I want, for each brand, a daily sum of facebook likes and twitter followers between June 16th and June 18th. So here, the required format is:
{
brand: Brand1,
date: ["2017-06-16T00:00:00.000Z", "2017-06-17T00:00:00.000Z", "2017-06-18T00:00:00.000Z"],
stat: [973415, 69397, 973773]
}
The pipeline
The pipeline seems more convoluted due to the population, but I accept that complexity and it is necessary. Here are the steps:
db.getCollection('brands').aggregate([
{ $match: { _id: { $in: [ObjectId("5943f427e7c11ac3ad3652b0") ] } } },
// Unwind all relevant account types. Make one row per account
{ $project: {
accounts: { $setUnion: [ '$facebookAccounts', '$twitterAccounts' ] } ,
name: '$name'
}
},
{ $unwind: '$accounts' },
// populate the accounts.
// These transform the arrays of facebookAccount ObjectIds into the objects described above.
{ $lookup: { from: 'facebookaccounts', localField: 'accounts', foreignField: '_id', as: 'facebookAccounts' } },
{ $lookup: { from: 'twitteraccounts', localField: 'accounts', foreignField: '_id', as: 'twitterAccounts' } },
// unwind the populated accounts. Back to one record per account.
{ $unwind: { path: '$facebookAccounts', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$twitterAccounts', preserveNullAndEmptyArrays: true } },
// unwind to the granularity we want. Here it is one record per day per account per brand.
{ $unwind: { path: '$facebookAccounts.years', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$facebookAccounts.years.months', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$facebookAccounts.years.months.days', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$facebookAccounts.years.months.days', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$twitterAccounts.years', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$twitterAccounts.years.months', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$twitterAccounts.years.months.days', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$twitterAccounts.years.months.days', preserveNullAndEmptyArrays: true } },
// Filter each one between dates
{ $match: { $or: [
{ $and: [
{ 'facebookAccounts.years.months.days.date': { $gte: new Date('2017-06-16') } } ,
{ 'facebookAccounts.years.months.days.date': { $lte: new Date('2017-06-18') } }
]},
{ $and: [
{ 'twitterAccounts.years.months.days.date': { $gte: new Date('2017-06-16') } } ,
{ 'twitterAccounts.years.months.days.date': { $lte: new Date('2017-06-18') } }
]}
] }},
// Build stats and date arrays for each account
{ $group: {
_id: '$accounts',
brandId: { $first: '$_id' },
brandName: { $first: '$name' },
stat: {
$push: {
$sum: {
$add: [
{ $ifNull: ['$facebookAccounts.years.months.days.likes', 0] },
{ $ifNull: ['$twitterAccounts.years.months.days.followers', 0] }
]
}
}
},
date: { $push: { $ifNull: ['$facebookAccounts.years.months.days.date', '$twitterAccounts.years.months.days.date'] } } ,
}}
])
This gives me the output format
[{
_id: accountId, // facebook
brandName: 'Brand1'
date: ["2017-06-16T00:00:00.000Z", "2017-06-17T00:00:00.000Z", "2017-06-18T00:00:00.000Z"],
stat: [904025, null, 904345]
},
{
_id: accountId // twitter
brandName: 'Brand1',
date: ["2017-06-16T00:00:00.000Z", "2017-06-17T00:00:00.000Z", "2017-06-18T00:00:00.000Z"],
stat: [69457, 69390, 69397]
}]
So I now need to perform column-wise addition on my stat properties. And then I am stuck - I feel like there should be a more pipeline friendly way to sum these rather than column-wise addition.
Note I accept the extra work that the population required and am happy with that. Most of the repetition is done programmatically.
Thank you if you've gotten this far.
I can trim a lot of fat out of this and keep it compatible with MongoDB 3.2 ( which you must be using at least due to preserveNullAndEmptyArrays ) available operators with a few simple actions. Mostly by simply joining the arrays immediately after $lookup, which is the best place to do it:
Short Optimize
db.brands.aggregate([
{ "$lookup": {
"from": "facebookaccounts",
"localField": "facebookAccounts",
"foreignField": "_id",
"as": "facebookAccounts"
}},
{ "$lookup": {
"from": "twitteraccounts",
"localField": "twitterAccounts",
"foreignField": "_id",
"as": "twitterAccounts"
}},
{ "$project": {
"name": 1,
"all": {
"$concatArrays": [ "$facebookAccounts", "$twitterAccounts" ]
}
}},
{ "$match": {
"all.years.months.days.date": {
"$gte": new Date("2017-06-16"), "$lte": new Date("2017-06-18")
}
}},
{ "$unwind": "$all" },
{ "$unwind": "$all.years" },
{ "$unwind": "$all.years.months" },
{ "$unwind": "$all.years.months.days" },
{ "$match": {
"all.years.months.days.date": {
"$gte": new Date("2017-06-16"), "$lte": new Date("2017-06-18")
}
}},
{ "$group": {
"_id": {
"brand": "$name",
"date": "$all.years.months.days.date"
},
"total": {
"$sum": {
"$sum": [
{ "$ifNull": [ "$all.years.months.days.likes", 0 ] },
{ "$ifNull": [ "$all.years.months.days.followers", 0 ] }
]
}
}
}},
{ "$sort": { "_id": 1 } },
{ "$group": {
"_id": "$_id.brand",
"date": { "$push": "$_id.date" },
"stat": { "$push": "$total" }
}}
])
This gives the result:
{
"_id" : "Brand1",
"date" : [
ISODate("2017-06-16T00:00:00Z"),
ISODate("2017-06-17T00:00:00Z"),
ISODate("2017-06-18T00:00:00Z")
],
"stat" : [
973415,
69397,
973773
]
}
With MongoDB 3.4 we could probably speed it up a "little" more by filtering the arrays and breaking them down before we eventually $unwind to make this work across documents, or maybe even not worry about going across documents at all if the "name" from "brands" is unique. The pipeline operations to compact down the arrays "in place" though are quite cumbersome to code, if a "little" better on performance.
You seem to be doing this "per brand" or for a small sample, so it's likely of little consequence.
As for the chartjs data format, I don't seem to be able to get my hands on what I believe is a different data format to the array format here, but again this should have little bearing.
The main point I see addressed is we can easily move away from your previous output that separated the "facebook" and "twitter" data, and simply aggregate by date moving all the data together "before" the arrays are constructed.
That last point then obviates the need for further "convoluted" operations to attempt to "merge" those two documents and the arrays produced.
Alternate Optimize
As an alternate approach where this does in fact not aggregate across documents, then you can essentially do the "filter" on the array in place and then simply sum and reshape the received result in client code.
db.brands.aggregate([
{ "$lookup": {
"from": "facebookaccounts",
"localField": "facebookAccounts",
"foreignField": "_id",
"as": "facebookAccounts"
}},
{ "$lookup": {
"from": "twitteraccounts",
"localField": "twitterAccounts",
"foreignField": "_id",
"as": "twitterAccounts"
}},
{ "$project": {
"name": 1,
"all": {
"$map": {
"input": { "$concatArrays": [ "$facebookAccounts", "$twitterAccounts" ] },
"as": "all",
"in": {
"years": {
"$map": {
"input": "$$all.years",
"as": "year",
"in": {
"months": {
"$map": {
"input": "$$year.months",
"as": "month",
"in": {
"days": {
"$filter": {
"input": "$$month.days",
"as": "day",
"cond": {
"$and": [
{ "$gte": [ "$$day.date", new Date("2017-06-16") ] },
{ "$lte": [ "$$day.date", new Date("2017-06-18") ] }
]
}
}
}
}
}
}
}
}
}
}
}
}
}}
]).map(doc => {
doc.all = [].concat.apply([],[].concat.apply([],[].concat.apply([],doc.all.map(d => d.years)).map(d => d.months)).map(d => d.days));
doc.all = doc.all.reduce((a,b) => {
if ( a.findIndex( d => d.date.valueOf() == b.date.valueOf() ) != -1 ) {
a[a.findIndex( d => d.date.valueOf() == b.date.valueOf() )].stat += (b.hasOwnProperty('likes')) ? (b.likes || 0) : (b.followers || 0);
} else {
a = a.concat([{ date: b.date, stat: (b.hasOwnProperty('likes')) ? (b.likes || 0) : (b.followers || 0) }]);
}
return a;
},[]);
doc.date = doc.all.map(d => d.date);
doc.stat = doc.all.map(d => d.stat);
delete doc.all;
return doc;
})
This really leaves all the things that "need" to happen on the server, on the server. And it's then a fairly trivial task to "flatten" the array and process to "sum up" and reshape it. This would mean less load on the server, and the data returned is not really that much greater per document.
Gives the same result of course:
[
{
"_id" : ObjectId("5943f427e7c11ac3ad3652b0"),
"name" : "Brand1",
"date" : [
ISODate("2017-06-16T00:00:00Z"),
ISODate("2017-06-17T00:00:00Z"),
ISODate("2017-06-18T00:00:00Z")
],
"stat" : [
973415,
69397,
973773
]
}
]
Committing to the Diet
The biggest problem you really have is with the multiple collections and the heavily nested documents. Neither of these is doing you any favors here and will with larger results cause real performance problems.
The nesting in particular is completely unnecessary as well as not being very maintainable since there are limitations to "update" where you have nested arrays. See the positional $ operator documentation, as well as many posts about this.
Instead you really want a single collection with all those "days" entries in it. You can always work with that source easily for query as well as aggregation purposes and it should look something like this:
{
"_id" : ObjectId("5948cd5cd6eb0b7d6ac38097"),
"date" : ISODate("2017-06-16T00:00:00Z"),
"likes" : 904025,
"__t" : "Facebook",
"account" : ObjectId("5943f427e7c11ac3ad3652ac")
}
{
"_id" : ObjectId("5948cd5cd6eb0b7d6ac38098"),
"date" : ISODate("2017-06-17T00:00:00Z"),
"likes" : null,
"__t" : "Facebook",
"account" : ObjectId("5943f427e7c11ac3ad3652ac")
}
{
"_id" : ObjectId("5948cd5cd6eb0b7d6ac38099"),
"date" : ISODate("2017-06-18T00:00:00Z"),
"likes" : 904345,
"__t" : "Facebook",
"account" : ObjectId("5943f427e7c11ac3ad3652ac")
}
{
"_id" : ObjectId("5948cd5cd6eb0b7d6ac3809a"),
"date" : ISODate("2017-06-16T00:00:00Z"),
"followers" : 69390,
"__t" : "Twitter",
"account" : ObjectId("5943f427e7c11ac3ad3652aa")
}
{
"_id" : ObjectId("5948cd5cd6eb0b7d6ac3809b"),
"date" : ISODate("2017-06-17T00:00:00Z"),
"followers" : 69397,
"__t" : "Twitter",
"account" : ObjectId("5943f427e7c11ac3ad3652aa")
}
{
"_id" : ObjectId("5948cd5cd6eb0b7d6ac3809c"),
"date" : ISODate("2017-06-18T00:00:00Z"),
"followers" : 69428,
"__t" : "Twitter",
"account" : ObjectId("5943f427e7c11ac3ad3652aa")
}
{
"_id" : ObjectId("5948cd5cd6eb0b7d6ac3809d"),
"date" : ISODate("2017-06-19T00:00:00Z"),
"followers" : 69457,
"__t" : "Twitter",
"account" : ObjectId("5943f427e7c11ac3ad3652aa")
}
Combining those referenced in the brands collection as well:
{
"_id" : ObjectId("5943f427e7c11ac3ad3652b0"),
"name" : "Brand1",
"accounts" : [
ObjectId("5943f427e7c11ac3ad3652ac"),
ObjectId("5943f427e7c11ac3ad3652aa")
]
}
Then you simply aggregate like this:
db.brands.aggregate([
{ "$lookup": {
"from": "social",
"localField": "accounts",
"foreignField": "account",
"as": "accounts"
}},
{ "$unwind": "$accounts" },
{ "$match": {
"accounts.date": {
"$gte": new Date("2017-06-16"), "$lte": new Date("2017-06-18")
}
}},
{ "$group": {
"_id": {
"brand": "$name",
"date": "$accounts.date"
},
"stat": {
"$sum": {
"$sum": [
{ "$ifNull": [ "$accounts.likes", 0 ] },
{ "$ifNull": [ "$accounts.followers", 0 ] }
]
}
}
}},
{ "$sort": { "_id": 1 } },
{ "$group": {
"_id": "$_id.brand",
"date": { "$push": "$_id.date" },
"stat": { "$push": "$stat" }
}}
])
This is actually the most efficient thing you can do, and it's mostly because of what actually happens on the server. We need to look at the "explain" output to see what happens to the pipeline here:
{
"$lookup" : {
"from" : "social",
"as" : "accounts",
"localField" : "accounts",
"foreignField" : "account",
"unwinding" : {
"preserveNullAndEmptyArrays" : false
},
"matching" : {
"$and" : [
{
"date" : {
"$gte" : ISODate("2017-06-16T00:00:00Z")
}
},
{
"date" : {
"$lte" : ISODate("2017-06-18T00:00:00Z")
}
}
]
}
}
}
This is what happens when you send $lookup -> $unwind -> $match to the server as the latter two stages are "hoisted" into the $lookup itself. This reduces the results in the actual "query" run on the collection to be joined.
Without that sequence, then $lookup potentially pulls in "a lot of data" with no constraint, and would break the 16MB BSON limit under most normal loads.
So not only is the process a lot more simple in the altered form, it actually "scales" where the present structure will not. This is something that you seriously should consider.

MongoDB aggregate/grouping by key-value pairs

My data looks something like this:
{
"_id" : "9aa072e4-b706-47e6-9607-1a39e904a05a",
"customerId" : "2164289-4",
"channelStatuses" : {
"FOO" : {
"status" : "done"
},
"BAR" : {
"status" : "error"
}
},
"channel" : "BAR",
}
My aggregate/group looks like this:
{
"_id" : {
"customerId" : "$customerId",
"channel" : "$channel",
"status" : "$channelStatuses[$channel].status"
},
"count" : {
"$sum" : 1
}
}
So basically with the example data the group should give me a group grouped by:
{"customerId": "2164289-4", "channel": "BAR", "status": "error"}
But I cannot use []-indexing in a aggregate/group. What should I do instead?
You cannot get the result you want with the current structure using .aggregate(). You "could" change the structure to use an array rather than named keys, and the operation is actually quite simple.
So with a document like:
{
"_id" : "9aa072e4-b706-47e6-9607-1a39e904a05a",
"customerId" : "2164289-4",
"channelStatuses" : [
{
"channel": "FOO",
"status" : "done"
},
{
"channel": "BAR",
"status" : "error"
}
],
"channel" : "BAR",
}
You can then do in modern releases with $filter, $map and $arrayElemAt:
{ "$group": {
"_id": {
"customerId" : "$customerId",
"channel" : "$channel",
"status": {
"$arrayElemAt": [
{ "$map": {
"input": { "$filter": {
"input": "$channelStatuses",
"as": "el",
"cond": { "$eq": [ "$$el.channel", "$channel" ] }
}},
"as": "el",
"in": "$$el.status"
}},
0
]
}
},
"count": { "$sum": 1 }
}}
Older versions of MongoDB are going to require $unwind to access the matched array element.
In MongoDB 2.6 then you can still "pre-filter" the array before unwind:
[
{ "$project": {
"customerId": 1,
"channel": 1,
"status": {
"$setDifference": [
{ "$map": {
"input": "$channelStatuses",
"as": "el",
"in": {
"$cond": [
{ "$eq": [ "$$el.channel", "$channel" ] },
"$$el.status",
false
]
}
}},
[false]
]
}
}},
{ "$unwind": "$status" },
{ "$group": {
"_id": {
"customerId": "$customerId",
"channel": "$channel",
"status": "$status"
},
"count": { "$sum": 1 }
}}
]
And anything prior to that you "filter" after $unwind instead:
[
{ "$unwind": "$channelStatuses" },
{ "$project": {
"customerId": 1,
"channel": 1,
"status": "$channelStatuses.status",
"same": { "$eq": [ "$channelStatuses.channel", "$channel" ] }
}},
{ "$match": { "same": true } },
{ "$group": {
"_id": "$_id",
"customerId": { "$first": "$customerId" },
"channel": { "$first": "$channel" },
"status": { "$first": "$status" }
}},
{ "$group": {
"_id": {
"customerId": "$customerId",
"channel": "$channel",
"status": "$status"
},
"count": { "$sum": 1 }
}}
]
In a lesser version than MongoDB 2.6 you also need to $project the result of the equality test between the two fields and then $match on the result in a separate stage. You might also note the "two" $group stages, since the first one removes any possible duplicates of the "channel" values after the filter via the $first accumulators. The following $group is exactly the same as in the previous listing.
But if you cannot change the structure and need "flexible" matching of keys where you cannot supply every name, then you must use mapReduce:
db.collection.mapReduce(
function() {
emit({
"customerId": this.customerId,
"channel": this.channel,
"status": this.channelStatuses[this.channel].status
},1);
},
function(key,values) {
return Array.sum(values);
},
{ "out": { "inline": 1 } }
)
Where of course you can use that sort of notation