Sort working opposite - mongodb

I have a mongo collection:
/* 0 */
{
"_id" : ObjectId("51f1fcc08188d3117c6da351"),
"cust_id" : "abc123",
"ord_date" : ISODate("2012-10-03T18:30:00Z"),
"status" : "A",
"price" : 25,
"items" : [{
"sku" : "ggg",
"qty" : 7,
"price" : 2.5
}, {
"sku" : "ppp",
"qty" : 5,
"price" : 2.5
}]
}
/* 1 */
{
"_id" : ObjectId("51fa1c318188d305fcbf9f9b"),
"cust_id" : "abc123",
"ord_date" : ISODate("2012-10-03T18:30:00Z"),
"status" : "A",
"price" : 27,
"items" : [{
"sku" : "ggg",
"qty" : 7,
"price" : 2.5
}, {
"sku" : "ppp",
"qty" : 5,
"price" : 2.5
}]
}
When I am giving the aggregate query for sorting in ascending order:
db.orders.aggregate([{
"$unwind": "$items"
}, {
"$sort": {
"price": -1
}
}, {
"$match": {}
}, {
"$group": {
"price": {
"$first": "$price"
},
"items": {
"$push": {
"sku": "$items.sku"
}
},
"_id": "$_id"
}
}, {
"$project": {
"_id": 0,
"price": 1,
"items": 1
}
}])
I get result:
{
"result": [{
"price": 25,
"items": [{
"sku": "ggg"
}, {
"sku": "ppp"
}]
}, {
"price": 27,
"items": [{
"sku": "ggg"
}, {
"sku": "ppp"
}]
}]
}
i.e it is sorting in ascending order and vice versa.

Move the $sort after $group, since the previous sort will be lost after grouping.
db.orders.aggregate([{
"$unwind": "$items"
}, {
"$match": {}
}, {
"$group": {
"price": {
"$first": "$price"
},
"items": {
"$push": {
"sku": "$items.sku"
}
},
"_id": "$_id"
}
}, {
"$sort": {
"price": -1
}
}, {
"$project": {
"_id": 0,
"price": 1,
"items": 1
}
}])
For $natural operator, this is the quoted from the doc.
The $natural operator uses the following syntax to return documents in
the order they exist on disk
Long story short, that means the order you see is not necessarily consistent with the order it store in DB.

Related

Retrieve highest score for each game using aggregate in MongoDB

I am working on a database of various games and i want to design a query that returns top scorer from each game with specific player details.
The document structure is as follows:
db.gaming_system.insertMany(
[
{
"_id": "01",
"name": "GTA 5",
"high_scores": [
{
"hs_id": 1,
"name": "Harry",
"score": 6969
},
{
"hs_id": 2,
"name": "Simon",
"score": 8574
},
{
"hs_id": 3,
"name": "Ethan",
"score": 4261
}
]
},
{
"_id": "02",
"name": "Among Us",
"high_scores": [
{
"hs_id": 1,
"name": "Harry",
"score": 926
},
{
"hs_id": 2,
"name": "Simon",
"score": 741
},
{
"hs_id": 3,
"name": "Ethan",
"score": 841
}
]
}
]
)
I have created a query using aggregate which returns the name of game and the highest score for that game as follows
db.gaming_system.aggregate(
{ "$project": { "maximumscore": { "$max": "$high_scores.score" }, name:1 } },
{ "$group": { "_id": "$_id", Name: { $first: "$name" }, "Highest_Score": { "$max": "$maximumscore" } } },
{ "$sort" : { "_id":1 } }
)
The output from my query is as follows:
{ "_id" : "01", "Name" : "GTA 5", "Highest_Score" : 8574 }
{ "_id" : "02", "Name" : "Among Us", "Highest_Score" : 926 }
I want to generate output which also provides the name of player and "hs_id" of that player who has the highest score for each game as follows:
{ "_id" : "01", "Name" : "GTA 5", "Top_Scorer" : "Simon", "hs_id": 2, "Highest_Score" : 8574 }
{ "_id" : "02", "Name" : "Among Us", "Top_Scorer" : "Harry", "hs_id": 1, "Highest_Score" : 926 }
What should be added to my query using aggregate pipeline?
[
{
$unwind: "$high_scores" //unwind the high_scores, so you can then sort
},
{
$sort: {
"high_scores.score": -1 //sort the high_scores, irrelevant of game, because we are going to group in next stage
}
},
{
//now group them by _id, take the name and top scorer from $first (which is the first in that group as sorted by score in descending order
$group: {
_id: "$_id",
name: {
$first: "$name"
},
Top_Scorer: {
$first: "$high_scores"
}
}
}
]

Forming an array with aggregation in MongoDB

I have a document in MongoDB 3.4 with the following structure:
{
"_id" : ObjectId("5e3419e468d01013eadb83dc"),
"id_station" : "62",
"fiware_service" : null,
"fiware_servicepath" : null,
"id_fiware_name" : "CE_del_medio",
"attrName" : "15",
"attrType" : "float",
"attrValue" : 0.33,
"id_sensor_station_absolute" : "15_62",
"recvTimeTs" : 1580387045,
"recvTime" : "2020-01-30T12:24:05.00Z",
"id_fiware" : "15",
"sensor_type" : [
{
"name" : "id",
"type" : "String",
"value" : "15"
},
{
"name" : "img",
"type" : "String",
"value" : "assets/img/contrast.png"
},
{
"name" : "manufacturer",
"type" : "String",
"value" : "Hortisis"
},
{
"name" : "medida",
"type" : "String",
"value" : "mS/cm"
},
{
"name" : "name_comun",
"type" : "String",
"value" : "CE del medio"
},
{
"name" : "place",
"type" : "String",
"value" : "interior"
},
{
"name" : "timestamp",
"type" : "DateTime",
"value" : "2020-01-30T12:24:05.00Z"
},
{
"name" : "type",
"type" : "String",
"value" : "fertigation"
}
]
}
I need to convert the sensor_type field to an array with only one object, as follows:
{
"_id":"15_62",
"medidas":[
{
"_id":"5e3419e468d01013eadb83dc",
"marca":"Hortisis",
"modelo":"Estacion",
"fabricante":"Hortisis",
"id_station":"15",
"sensor_type":[
{
"name":"15",
"type":"fertigation",
"place":"interior",
"img":"assets/img/contrast.png",
"name_comun":"Temp. Suelo",
"medida":"mS/cm"
}
],
"attrName":"15",
"attrValue":0.33,
"recvTimeTs":1580387045,
"recvTime":"2020-01-30T12:24:05.00Z",
"id_sensor_station_absolute":"15_62"
}
]
}
As you can really see it is formatting the sensor_type field = name : value.
I'm working with NODEJS and mongoose.
This is my query: (first I search, sort, only show the first value and then with the project I give format, the problem is that I don't know how to tell the project to put that format if I put "sensor_type": "$latest.attributes.name") it only shows the names and I don't know how to put it in the mentioned format.
Datagreenhouse.aggregate([
{ "$match": { "id_sensor_station_absolute": { "$in": array3 } } }, // "id_station": { "$in": id_station },
{ "$sort": { "recvTime": -1 } },
{
"$group": {
"_id": "$id_sensor_station_absolute",
"latest": { "$first": "$$ROOT" },
}
},
{
"$project": {
"_id": 1,
"id_station": "$latest.id_station",
//"id_sensor_station_absolute": "$id_sensor_station_absolute",
"attrName": "$latest.attrName",
"attrValue": "$latest.attrValue",
"recvTimeTs": "$latest.recvTimeTs",
"recvTime": "$latest.recvTime",
"id_sensor_station_absolute": "$latest.id_sensor_station_absolute",
"sensor_type": "$latest.attributes",
"name": { $arrayElemAt: ["$latest.attributes", 0] },
"type": { $arrayElemAt: ["$latest.attributes", 1] },
"place": { $arrayElemAt: ["$latest.attributes", 2] },
"img": { $arrayElemAt: ["$latest.attributes", 1] },
"name_comun": { $arrayElemAt: ["$latest.attributes", 4] },
"medida": { $arrayElemAt: ["$latest.attributes", 3] },
"interfaz": { $arrayElemAt: ["$latest.attributes", 6] },
}
}
], (err, DatagreenhouseRecuperado) => {
if (err) return res.status(500).send({ message: 'Error al realizar la peticion' + err })
if (!DatagreenhouseRecuperado) return res.status(404).send({ message: 'Error el usuario no existe' })
res.status(200).send({ DatagreenhouseRecuperado })
})
Thank you for your help. Best regards.
Since version 3.4.4, MongoDB introduced a magnific operator: $arrayToObject
This operator allows us transmute array key:value pair into object.
Syntax
RAW DATA $map $arrayToObject
sensor_type : [ sensor_type : [ sensor_type : {
{ \ { \
"name" : "manufacturer", ----> k: "manufacturer", --->
"type" : "String", / v: "Hortisis" / "manufacturer" : "Hortisis"
"value" : "Hortisis"
} }
] ] }
db.datagreenhouses.aggregate([
{
"$match": {} // setup your match criteria
},
{
"$sort": {
"recvTime": -1
}
},
{
$group: {
_id: "$id_sensor_station_absolute",
medidas: {
$push: {
_id: "$_id",
"marca": "Hortisis", // don't know where you get this value
"modelo": "Estacion", // don't know where you get this value
"id_station": "$id_station",
"attrName": "$attrName",
"attrValue": "$attrValue",
"recvTimeTs": "$recvTimeTs",
"recvTime": "$recvTime",
"id_sensor_station_absolute": "$id_sensor_station_absolute",
"sensor_type": {
$arrayToObject: {
$map: {
input: "$sensor_type",
in: {
k: "$$this.name",
v: "$$this.value"
}
}
}
}
}
}
}
}
])
MongoPlayground
[
{
"_id": "15_62",
"medidas": [
{
"_id": ObjectId("5e3419e468d01013eadb83dc"),
"attrName": "15",
"attrValue": 0.33,
"id_sensor_station_absolute": "15_62",
"id_station": "62",
"marca": "Hortisis",
"modelo": "Estacion",
"recvTime": "2020-01-30T12:24:05.00Z",
"recvTimeTs": 1.580387045e+09,
"sensor_type": {
"id": "15",
"img": "assets/img/contrast.png",
"manufacturer": "Hortisis",
"medida": "mS/cm",
"name_comun": "CE del medio",
"place": "interior",
"timestamp": "2020-01-30T12:24:05.00Z",
"type": "fertigation"
}
}
]
}
]
All you need to do is transform data to the desired result with an easy to handle object ($unwind medidas field, transform and then $group again)
Note: If your MongoDB is earlier 3.4.4 version, follow update procedure:
Install MongoDB 3.4.4 or newer
Make mongodump with new version MongoBD
Stop old MongoBD
Remove /data directory (make backup)
Start new MongoDB and run mongorestore

mongo aggregation framework group by quarter/half year/year

I have a database with this schema structure :
{
"name" : "Carl",
"city" : "paris",
"time" : "1-2018",
"notes" : [
"A",
"A",
"B",
"C",
"D"
]
}
And this query using the aggregation framework :
db.getCollection('collection').aggregate(
[{
"$match": {
"$and": [{
"$or": [ {
"time": "1-2018"
}, {
"time": "2-2018"
} ]
}, {
"name": "Carl"
}, {
"city": "paris"
}]
}
}, {
"$unwind": "$notes"
}, {
"$group": {
"_id": {
"notes": "$notes",
"time": "$time"
},
"count": {
"$sum": 1
}
}
}
, {
"$group": {
"_id": "$_id.time",
"count": {
"$sum": 1
}
}
}, {
"$project": {
"_id": 0,
"time": "$_id",
"count": 1
}
}])
It working correcly and i'm getting these results these results :
{
"count" : 4.0,
"time" : "2-2018"
}
{
"count" : 4.0,
"time" : "1-2018"
}
My issue is that i'd like to keep the same match stage and i'd like to group by quarter.
Here the result i'd like to have :
{
"count" : 8.0,
"time" : "1-2018" // here quarter 1
}
Thanks

Is it recommended to use unwind in working with large amount of data with nested documents on MongoDB [duplicate]

I am new in mongodb and trying to work with nested documents.I have a query as below
db.EndpointData.aggregate([
{ "$group" : { "_id" : "$EndpointId", "RequestCount" : { "$sum" : 1 }, "FirstActivity" : { "$min" : "$DateTime" }, "LastActivity" : { "$max" : "$DateTime" }, "Tags" : { "$push" : "$Tags" } } },
{ "$unwind" : "$Tags" },
{ "$unwind" : "$Tags" },
{ "$group" : { "_id" : "$_id", "RequestCount" : { "$first" : "$RequestCount" }, "Tags" : { "$push" : "$Tags" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } },
{ "$unwind" : "$Tags" },
{ "$unwind" : "$Tags.Sensors" },
{ "$group" : { "_id" : { "EndpointId" : "$_id", "Uid" : "$Tags.Uid", "Type" : "$Tags.Sensors.Type" }, "RequestCount" : { "$first" : "$RequestCount" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } },
{ "$group" : { "_id" : { "EndpointId" : "$_id.EndpointId", "Uid" : "$_id.Uid" }, "count" : { "$sum" : 1 }, "RequestCount" : { "$first" : "$RequestCount" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } },
{ "$group" : { "_id" : "$_id.EndpointId", "TagCount" : { "$sum" : 1 }, "SensorCount" : { "$sum" : "$count" }, "RequestCount" : { "$first" : "$RequestCount" }, "FirstActivity" : { "$first" : "$FirstActivity" }, "LastActivity" : { "$first" : "$LastActivity" } } }])
and my data structure is as below
{
"_id": "6aef51dfaf42ea1b70d0c4db",
"EndpointId": "98799bcc-e86f-4c8a-b340-8b5ed53caf83",
"DateTime": "2018-05-06T19:05:02.666Z",
"Url": "test",
"Tags": [
{
"Uid": "C1:3D:CA:D4:45:11",
"Type": 1,
"DateTime": "2018-05-06T19:05:02.666Z",
"Sensors": [
{
"Type": 1,
"Value": { "$numberDecimal": "-95" }
},
{
"Type": 2,
"Value": { "$numberDecimal": "-59" }
},
{
"Type": 3,
"Value": { "$numberDecimal": "11.029802536740132" }
}
]
},
{
"Uid": "C1:3D:CA:D4:45:11",
"Type": 1,
"DateTime": "2018-05-06T19:05:02.666Z",
"Sensors": [
{
"Type": 1,
"Value": { "$numberDecimal": "-92" }
},
{
"Type": 2,
"Value": { "$numberDecimal": "-59" }
}
]
}
]
}
This query works fine and correct. I count Tags, Sensors and repeat times of each EdpointID. But the problem is when I work with large size of data (about 10,000,000 documents) I get memory problem. It seems having 4 levels of unwind make problem in this query. How can I reduce unwinds in this query?
As long as your data has unique sensor and tag readings per document, which to date what you have presented appears to, then you simply don't need $unwind at all.
In fact, all you really need is a single $group:
db.endpoints.aggregate([
// In reality you would $match to limit the selection of documents
{ "$match": {
"DateTime": { "$gte": new Date("2018-05-01"), "$lt": new Date("2018-06-01") }
}},
{ "$group": {
"_id": "$EndpointId",
"FirstActivity" : { "$min" : "$DateTime" },
"LastActivity" : { "$max" : "$DateTime" },
"RequestCount": { "$sum": 1 },
"TagCount": {
"$sum": {
"$size": { "$setUnion": ["$Tags.Uid",[]] }
}
},
"SensorCount": {
"$sum": {
"$sum": {
"$map": {
"input": { "$setUnion": ["$Tags.Uid",[]] },
"as": "tag",
"in": {
"$size": {
"$reduce": {
"input": {
"$filter": {
"input": {
"$map": {
"input": "$Tags",
"in": {
"Uid": "$$this.Uid",
"Type": "$$this.Sensors.Type"
}
}
},
"cond": { "$eq": [ "$$this.Uid", "$$tag" ] }
}
},
"initialValue": [],
"in": { "$setUnion": [ "$$value", "$$this.Type" ] }
}
}
}
}
}
}
}
}}
])
Or if you actually do need to accumulate those "unique" values of "Sensors" and "Tags" from across different documents, then you still need initial $unwind statements to get the right grouping, but nowhere near as much as you presently have:
db.endpoints.aggregate([
// In reality you would $match to limit the selection of documents
{ "$match": {
"DateTime": { "$gte": new Date("2018-05-01"), "$lt": new Date("2018-06-01") }
}},
{ "$unwind": "$Tags" },
{ "$unwind": "$Tags.Sensors" },
{ "$group": {
"_id": {
"EndpointId": "$EndpointId",
"Uid": "$Tags.Uid",
"Type": "$Tags.Sensors.Type"
},
"FirstActivity": { "$min": "$DateTime" },
"LastActivity": { "$max": "$DateTime" },
"RequestCount": { "$addToSet": "$_id" }
}},
{ "$group": {
"_id": {
"EndpointId": "$_id.EndpointId",
"Uid": "$_id.Uid",
},
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"count": { "$sum": 1 },
"RequestCount": { "$addToSet": "$RequestCount" }
}},
{ "$group": {
"_id": "$_id.EndpointId",
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"TagCount": { "$sum": 1 },
"SensorCount": { "$sum": "$count" },
"RequestCount": { "$addToSet": "$RequestCount" }
}},
{ "$addFields": {
"RequestCount": {
"$size": {
"$reduce": {
"input": {
"$reduce": {
"input": "$RequestCount",
"initialValue": [],
"in": { "$setUnion": [ "$$value", "$$this" ] }
}
},
"initialValue": [],
"in": { "$setUnion": [ "$$value", "$$this" ] }
}
}
}
}}
],{ "allowDiskUse": true })
And from MongoDB 4.0 you can use $toString on the ObjectId within _id and simply merge the unique keys for those in order to keep the RequestCount using $mergeObjects. This is cleaner and a bit more scalable than pushing nested array content and flattening it
db.endpoints.aggregate([
// In reality you would $match to limit the selection of documents
{ "$match": {
"DateTime": { "$gte": new Date("2018-05-01"), "$lt": new Date("2018-06-01") }
}},
{ "$unwind": "$Tags" },
{ "$unwind": "$Tags.Sensors" },
{ "$group": {
"_id": {
"EndpointId": "$EndpointId",
"Uid": "$Tags.Uid",
"Type": "$Tags.Sensors.Type"
},
"FirstActivity": { "$min": "$DateTime" },
"LastActivity": { "$max": "$DateTime" },
"RequestCount": {
"$mergeObjects": {
"$arrayToObject": [[{ "k": { "$toString": "$_id" }, "v": 1 }]]
}
}
}},
{ "$group": {
"_id": {
"EndpointId": "$_id.EndpointId",
"Uid": "$_id.Uid",
},
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"count": { "$sum": 1 },
"RequestCount": { "$mergeObjects": "$RequestCount" }
}},
{ "$group": {
"_id": "$_id.EndpointId",
"FirstActivity": { "$min": "$FirstActivity" },
"LastActivity": { "$max": "$LastActivity" },
"TagCount": { "$sum": 1 },
"SensorCount": { "$sum": "$count" },
"RequestCount": { "$mergeObjects": "$RequestCount" }
}},
{ "$addFields": {
"RequestCount": {
"$size": {
"$objectToArray": "$RequestCount"
}
}
}}
],{ "allowDiskUse": true })
Either form returns the same data, though the order of keys in the result may vary:
{
"_id" : "89799bcc-e86f-4c8a-b340-8b5ed53caf83",
"FirstActivity" : ISODate("2018-05-06T19:05:02.666Z"),
"LastActivity" : ISODate("2018-05-06T19:05:02.666Z"),
"RequestCount" : 2,
"TagCount" : 4,
"SensorCount" : 16
}
The result is obtained from these sample documents which you originally gave as a sample source in the original question on the topic:
{
"_id" : ObjectId("5aef51dfaf42ea1b70d0c4db"),
"EndpointId" : "89799bcc-e86f-4c8a-b340-8b5ed53caf83",
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Url" : "test",
"Tags" : [
{
"Uid" : "C1:3D:CA:D4:45:11",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-95")
},
{
"Type" : 2,
"Value" : NumberDecimal("-59")
},
{
"Type" : 3,
"Value" : NumberDecimal("11.029802536740132")
},
{
"Type" : 4,
"Value" : NumberDecimal("27.25")
},
{
"Type" : 6,
"Value" : NumberDecimal("2924")
}
]
},
{
"Uid" : "C1:3D:CA:D4:45:11",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-95")
},
{
"Type" : 2,
"Value" : NumberDecimal("-59")
},
{
"Type" : 3,
"Value" : NumberDecimal("11.413037961112279")
},
{
"Type" : 4,
"Value" : NumberDecimal("27.25")
},
{
"Type" : 6,
"Value" : NumberDecimal("2924")
}
]
},
{
"Uid" : "E5:FA:2A:35:AF:DD",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:02.666Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-97")
},
{
"Type" : 2,
"Value" : NumberDecimal("-58")
},
{
"Type" : 3,
"Value" : NumberDecimal("10.171658037099185")
}
]
}
]
}
/* 2 */
{
"_id" : ObjectId("5aef51e0af42ea1b70d0c4dc"),
"EndpointId" : "89799bcc-e86f-4c8a-b340-8b5ed53caf83",
"Url" : "test",
"Tags" : [
{
"Uid" : "E2:02:00:18:DA:40",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:04.574Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-98")
},
{
"Type" : 2,
"Value" : NumberDecimal("-65")
},
{
"Type" : 3,
"Value" : NumberDecimal("7.845424441900629")
},
{
"Type" : 4,
"Value" : NumberDecimal("0.0")
},
{
"Type" : 6,
"Value" : NumberDecimal("3012")
}
]
},
{
"Uid" : "12:3B:6A:1A:B7:F9",
"Type" : 1,
"DateTime" : ISODate("2018-05-06T19:05:04.574Z"),
"Sensors" : [
{
"Type" : 1,
"Value" : NumberDecimal("-95")
},
{
"Type" : 2,
"Value" : NumberDecimal("-59")
},
{
"Type" : 3,
"Value" : NumberDecimal("12.939770381907275")
}
]
}
]
}
Bottom line is that you can either use the first given form here which will accumulate "within each document" and then "accumulate per endpoint" within a single stage and is the most optimal, or you actually require to identify things like the "Uid" on the tags or the "Type" on the sensor where those values occur more than once over any combination of documents grouping by the endpoint.
Your sample data supplied to date only shows that these values are "unique within each document", therefore the first given form would be most optimal if this is the case for all remaining data.
In the event that it is not, then "unwinding" the two nested arrays in order to "aggregate the detail across documents" is the only way to approach this. You can limit the date range or other criteria as most "queries" typically have some bounds and do not actually work on the "whole" collection data, but the main fact remains that arrays would be "unwound" creating essentially a document copy for every array member.
The point on optimization means that you only need to do this "twice" as there are only two arrays. Doing successive $group to $unwind to $group is always a sure sign you a doing something really wrong. Once you "take something apart" you should only ever need to "put it back together" once. In a series of graded steps as demonstrated here is the once approach which optimizes.
Outside of the scope of your question still remains:
Add other realistic constraints to the query to reduce the documents processed, maybe even do so in "batches" and combine results
Add the allowDiskUse option to the pipeline to let temporary storage be used. ( actually demonstrated on the commands )
Consider that "nested arrays" are probably not the best storage method for the analysis you want to do. It's always more efficient when you know you need to $unwind to simply write the data in that "unwound" form directly into a collection.
If you're dealing with data on the order of 10,000,000 documents, you're going to run into aggregation pipeline size limits easily. Specifically, according to the MongoDB documentation, there is a pipeline RAM use limit of 100MB. If each document has at least 10 bytes of data, then that's enough to hit that limit, and your documents would absolutely exceed that amount.
There are a few options available to you to resolve this problem:
1) You can use the allowDiskUse option as noted in the documentation.
2) You can project your documents further between unwind stages to limit document size (very unlikely to be enough on its own).
3) You can periodically generate summary documents on subsets of your data, and then perform your aggregations on those summary documents. If, for example, you run summary documents on subsets of size 1,000, you can reduce the number of documents in your pipelines from 10,000,000 to just 10,000.
4) You can look into sharding your collection and running these aggregate operations on a cluster to reduce the load on any single server.
Options 1 and 2 are both very short-term solutions. They're easy to implement, but won't help much in the long run. Options 3 and 4, however, are far more involved and trickier to implement, but will provide the greatest amount of scalability and are more likely to continue meeting your needs long-term.
Do be warned, however, that if you plan to approach option 4, you need to be very prepared. A sharded collection cannot be unsharded, and messing up can cause potentially irreparable data loss. Having a dedicated DBA with experience with MongoDB clusters is recommended.

Find Total based on group by of two mongo field

i have collection data like this -
{
"user_id" : "1",
"branch_id" : "1",
"total" : 100,
},
{
"user_id" : "1",
"branch_id" : "1",
"total" : 200
},
{
"user_id" : "1",
"branch_id" : "3",
"total" : 1400
},
{
"user_id" : "2",
"branch_id" : "1",
"total" : 100
},
{
"user_id" : "2",
"branch_id" : "1",
"total" : 100
},
I am looking to get output in the below format -
[
{
"user_id":"1",
"branch_id":"1",
"grand_total":"300"
},
{
"user_id":"1",
"branch_id":"3",
"grand_total":"1400"
},
{
"user_id":"2",
"branch_id":"1",
"grand_total":"200"
}
]
I have tried a mongo aggregate query, but the query gives output as undefined.
Basically I need to get per user wise per branch wise the total points he has earned.
Here is what I have tried but not working -
Collection.aggregate(
{
"$group": {
"_id": "$user_id",
"nameCount": { "$sum": 1 },
"branch_id": {
"$sum": {
"$cond": [ {"$branch_id":{"$ne":null}} ]
}
}
}
},
{
"$project": {
"_id": 0,
"name": "$_id",
"nameCount": 1,
"branch_id":1
}
}
);
Please help.
Your aggregation pipeline has to look like this:
{
"$group": {
"_id": {
user_id: "$user_id",
branch_id: "$branch_id"
},
"grand_total": {
"$sum": "$total"
},
}
}, {
"$project": {
"_id": 0,
"user_id": "$_id.user_id",
"branch_id": "$_id.branch_id",
"total": "$grand_total"
}
}
Inside your _id field in your "$group" pipeline you add the fields that you want to group your documents by. If you only want to group by one field you can write it as follows:
{"$group": {
"_id": "$user_id"
}
}
If you have multiple fields you want to group by (like it seems in your case) then you write it as follows:
{"$group": {
"_id": {
user_id: "$user_id",
branch_id: "$branch_id"
}
}
}
Every aggregation pipeline changes your document. So, in your $group if you call the sum of all totals like that "grand_total"
"grand_total": {
"$sum": "$total"
}
then in your $project pipeline that field total doesn't exist anymore. But instead we created a new field (grand_total) that is the sum.