Does MongoDB support aggregate queries on the result of the aggregate query? - mongodb

I have an aggregate query that returns the count of records a property has.
// Count how many documents exist for each propertyId, smallest counts first.
db.collection.aggregate(
  [
    // One output document per distinct propertyId, tallying its documents.
    { "$group": { "_id": "$propertyId", "count": { "$sum": 1 } } },
    // Ascending by the per-property tally.
    { "$sort": { "count": 1 } }
  ],
  { "allowDiskUse": true }
);
This gives me a result that looks like this.
{ "_id" : 1234, "count" : 1 }
{ "_id" : 1235, "count" : 1 }
{ "_id" : 1236, "count" : 2 }
{ "_id" : 1237, "count" : 3 }
{ "_id" : 1238, "count" : 3 }
Now I want to count the counts. So the above result would turn into this.
{ "_id" : 1, "count" : 2 }
{ "_id" : 2, "count" : 1 }
{ "_id" : 3, "count" : 2 }
Is this possible to do with a query, or do I need to write some code to get this done?

I updated the query to have another "step" that counts the counts. This is how it looks.
// Two-level grouping: first tally documents per propertyId, then tally how
// many properties share each of those per-property tallies.
db.collection.aggregate(
  [
    // Stage 1: number of documents for every propertyId.
    { "$group": { "_id": "$propertyId", "count": { "$sum": 1 } } },
    // Stage 2: number of properties that ended up with the same count.
    { "$group": { "_id": "$count", "countOfCounts": { "$sum": 1 } } },
    // Stage 3: ascending by how often each count occurs.
    { "$sort": { "countOfCounts": 1 } }
  ],
  { "allowDiskUse": true }
);

Related

Mongodb: Deduplicate collection

I'm working with mongo and node. I have a collection with a large number of records an unknown number of which are duplicates. I'm trying to remove dups following Remove duplicate records from mongodb 4.0 and https://docs.mongodb.com/manual/aggregation/ .
I am using the mongodb compass tool. I am able to run the code in the mongodb Shell at the bottom of this tool.
So far I have:
db.hayes.aggregate([
... {"$group" : {_id:"$PropertyId", count:{$sum:1}}}
... ]
... );
{ "_id" : "R135418", "count" : 10 }
{ "_id" : "R47410", "count" : 17 }
{ "_id" : "R130794", "count" : 10 }
{ "_id" : "R92923", "count" : 18 }
{ "_id" : "R107811", "count" : 11 }
{ "_id" : "R91389", "count" : 15 }
{ "_id" : "R22047", "count" : 12 }
{ "_id" : "R103664", "count" : 10 }
{ "_id" : "R121349", "count" : 12 }
{ "_id" : "R143168", "count" : 8 }
{ "_id" : "R85918", "count" : 13 }
{ "_id" : "R41641", "count" : 13 }
{ "_id" : "R160910", "count" : 11 }
{ "_id" : "R48919", "count" : 11 }
{ "_id" : "M119387", "count" : 10 }
{ "_id" : "R161734", "count" : 12 }
{ "_id" : "R41259", "count" : 13 }
{ "_id" : "R156538", "count" : 7 }
{ "_id" : "R60868", "count" : 10 }
How do I now select one document from each of the groups to avoid duplicates? (I can see that loading the result into a new collection will likely involve: {$out: "theCollectionWithoutDuplicates"})
edit:
The output from db.hayes.aggregate([
{
$group: {
_id: "$PropertyId",
count: {
$sum: 1
},
ids: {
$addToSet: "$_id"
}
}
},
{
$match: {
count: {
$gt: 1
}
}
}
]) gives output looking like:
{ "_id" : "M118975", "count" : 8, "ids" : [ ObjectId("60147f84e9fdd41da73272d6"), ObjectId("601427ac432deb152a70b8fd"), ObjectId("6014639be210571a70d1118f"), ObjectId("60145e9ae210571a70d0062f"), ObjectId("60145545b6f7a917817e9519"), ObjectId("6014619be210571a70d0a091"), ObjectId("60145dc3d5a2811a459b4e07"), ObjectId("60146641e210571a70d1a3cd") ] }
{ "_id" : "R88986", "count" : 10, "ids" : [ ObjectId("60131752de3d3a09bc1eb04b"), ObjectId("6013348385dcda0eb5b8d40c"), ObjectId("60145297b6f7a917817e1928"), ObjectId("601458eeb08c4919df85f63d"), ObjectId("601462f4e210571a70d0e961"), ObjectId("60142ad9c0db1716068a612e"), ObjectId("601425263df18a145b2fd0a8"), ObjectId("60145be5d5a2811a459aea7e"), ObjectId("6014634ce210571a70d0fe5c"), ObjectId("60131a1ab7335806a1816b95") ] }
{ "_id" : "P119977", "count" : 11, "ids" : [ ObjectId("601468b9597abd1bfd0798a4"), ObjectId("60144b7dbfa28016887b0e8f"), ObjectId("60147094c4bca31cfdb12d1d"), ObjectId("60144de7bfa28016887b698b"), ObjectId("60135aa63674d90dffec3759"), ObjectId("60135f552441920e97e858a3"), ObjectId("601428b3432deb152a70f32e"), ObjectId("60141b222ac11f13055725a5"), ObjectId("60145326b6f7a917817e38b6"), ObjectId("6014882c5322582035e83f63"), ObjectId("6014741ae9fdd41da7313a44") ] }
However when I run the foreach loop it runs for minutes and crashes
Originally the database mydb had 0.173 GB but now 0.368 GB
any idea what is wrong?
edit 2:
I rebooted, then reran your entire script. This time it completed in 3-4 minutes with no errors.
> show dbs
admin 0.000GB
config 0.000GB
local 0.000GB
myNewDatabase 0.000GB
mydb 0.396GB
> db.hayes.aggregate([ {"$group" : {_id:"$PropertyId", count:{$sum:1}}},{$count:"total"} ]);
{ "total" : 103296 }
> db.hayes.aggregate([ {"$group" : {_id:"$PropertyId", count:{$sum:1}}} ]);
{ "_id" : "R96274", "count" : 1 }
{ "_id" : "R106186", "count" : 1 }
{ "_id" : "R169417", "count" : 1 }
{ "_id" : "R140542", "count" : 1 }
So it looks like it worked this time, but why is 'mydb' getting larger?
Here is how to keep single document from every duplicated list and remove the rest:
// Remove duplicate documents, keeping exactly one document per PropertyId.
// Safe to re-run: once no duplicates remain, the $match stage yields no
// groups and nothing is deleted.
db.test.aggregate(
  [
    // Collect the _id of every document that shares a PropertyId.
    {
      $group: {
        _id: "$PropertyId",
        count: { $sum: 1 },
        ids: { $addToSet: "$_id" }
      }
    },
    // Only groups holding more than one document contain duplicates.
    { $match: { count: { $gt: 1 } } }
  ],
  // Grouping a large collection can exceed the in-memory limit of a
  // pipeline stage; let the server spill to disk instead of failing.
  { allowDiskUse: true }
).forEach(function (group) {
  // The first _id is the document to keep; every other _id is a duplicate.
  var duplicateIds = group.ids.slice(1);
  printjson(duplicateIds);
  // deleteMany replaces the deprecated remove() helper.
  db.test.deleteMany({ _id: { $in: duplicateIds } });
});
Explained:
You group by PropertyId and preserve every document _id in the ids array.
You keep only the groups that have more than 1 document (the duplicates).
You loop over all groups, drop the 1st _id from each ids array (that document is the one that is kept), and delete the documents whose _id is still in the array (the duplicates).
You can execute this multiple times; if there are no duplicates, no deletion will be executed.

How can I split a MongoDB collection into 3 and assign a new field?

I have a json collection with 300 records like this:
{
salesNumber: 23839,
batch: null
},
{
salesNumber: 389230,
batch: null
}
...etc.
I need to divide this collection into 3 different batches. So, when sorted by salesNumber, the first 100 would be in batch 1, the next 100 would be batch 2, and the last 100 would be batch 3. How do I do this?
I wrote a script to select the first 100, but when I tried to turn it into an array to use in an update, the result was 0 records.
var firstBatchCompleteRecords = db.properties.find(
{
"auction": ObjectId("50")
}
).sort("saleNumber").limit(100);
// This returned 174 records as expected with all the fields
var firstBatch = firstBatchCompleteRecords.distinct( "saleNumber", {});
// This returned 0 records
I was going to take the results of that last query and use it in an update statement:
db.properties.update(
{
"saleNumber":
{
"$in": firstBatch
}
}
,
{
$set:
{
batch: "1"
}
}
,
{
multi: true
}
);
...then I would have created an array using distinct of the next 100 and update those, but I never got that far.
It is possible to get these results using the aggregation framework and store them in a new collection; you can then use this answer to iterate over that collection and update the fields in the source collection.
Have fun!
// Split the collection, ordered by salesNumber, into three consecutive
// batches of 100 documents, tag each document with its batch number and
// write the result to a new collection.
db.sn.aggregate([{
        // Order first so that the slices below follow salesNumber order.
        $sort : {
            salesNumber : 1
        }
    }, {
        // Gather every document into one array so it can be sliced by position.
        $group : {
            _id : null,
            arrayOfData : {
                $push : "$$ROOT"
            },
        }
    }, {
        // $slice takes [array, startIndex, length] with a zero-based start,
        // so the batches start at 0, 100 and 200 (not 99 / 199, which would
        // put the boundary documents into two batches).
        $project : {
            _id : 0,
            firstHundred : {
                $slice : ["$arrayOfData", 0, 100]
            },
            secondHundred : {
                $slice : ["$arrayOfData", 100, 100]
            },
            thirdHundred : {
                $slice : ["$arrayOfData", 200, 100]
            },
        }
    }, {
        // Stamp the batch number onto every element of each slice.
        $project : {
            "firstHundred.batch" : {
                $literal : 1
            },
            "firstHundred.salesNumber" : 1,
            "firstHundred._id" : 1,
            "secondHundred.batch" : {
                $literal : 2
            },
            "secondHundred.salesNumber" : 1,
            "secondHundred._id" : 1,
            "thirdHundred.batch" : {
                $literal : 3
            },
            "thirdHundred.salesNumber" : 1,
            "thirdHundred._id" : 1,
        }
    }, {
        // Join the three slices back together; $concatArrays keeps element
        // order, whereas $setUnion leaves the order unspecified.
        $project : {
            allValues : {
                $concatArrays : ["$firstHundred", "$secondHundred", "$thirdHundred"]
            }
        }
    }, {
        $unwind : "$allValues"
    }, {
        // Promote each array element back to a top-level document.
        $project : {
            _id : "$allValues._id",
            salesNumber : "$allValues.salesNumber",
            batch : "$allValues.batch",
        }
    }, {
        $out : "collectionName"
    }
])
db.collectionName.find()
and output generated for 6 document divided by 2:
{
"_id" : ObjectId("5733ade7eeeccba2bd546121"),
"salesNumber" : 389230,
"batch" : 2
}, {
"_id" : ObjectId("5733ade7eeeccba2bd546120"),
"salesNumber" : 23839,
"batch" : 1
}, {
"_id" : ObjectId("5733ade7eeeccba2bd546122"),
"salesNumber" : 43839,
"batch" : 1
}, {
"_id" : ObjectId("5733ade7eeeccba2bd546124"),
"salesNumber" : 63839,
"batch" : 2
}, {
"_id" : ObjectId("5733ade7eeeccba2bd546123"),
"salesNumber" : 589230,
"batch" : 3
}, {
"_id" : ObjectId("5733ade7eeeccba2bd546125"),
"salesNumber" : 789230,
"batch" : 3
}
Any comments welcome!

How to Group mongodb - mapReduce output?

I have a question regarding the mapReduce framework in MongoDB. I have a result of key-value pairs from a mapReduce function, and now I want to run a query on this mapReduce output.
So i am using mapReduce to find out the stats of user like this
db.order.mapReduce(function() { emit (this.customer,{count:1,orderDate:this.orderDate.interval_start}) },
function(key,values){
var sum =0 ; var lastOrderDate;
values.forEach(function(value) {
if(value['orderDate']){
lastOrderDate=value['orderDate'];
}
sum+=value['count'];
});
return {count:sum,lastOrderDate:lastOrderDate};
},
{ query:{status:"DELIVERED"},out:"order_total"}).find()
which give me output like this
{ "_id" : ObjectId("5443765ae4b05294c8944d5b"), "value" : { "count" : 1, "orderDate" : ISODate("2014-10-18T18:30:00Z") } }
{ "_id" : ObjectId("54561911e4b07a0a501276af"), "value" : { "count" : 2, "lastOrderDate" : ISODate("2015-03-14T18:30:00Z") } }
{ "_id" : ObjectId("54561b9ce4b07a0a501276b1"), "value" : { "count" : 1, "orderDate" : ISODate("2014-11-01T18:30:00Z") } }
{ "_id" : ObjectId("5458712ee4b07a0a501276c2"), "value" : { "count" : 2, "lastOrderDate" : ISODate("2014-11-03T18:30:00Z") } }
{ "_id" : ObjectId("545f64e7e4b07a0a501276db"), "value" : { "count" : 15, "lastOrderDate" : ISODate("2015-06-04T18:30:00Z") } }
{ "_id" : ObjectId("54690771e4b0070527c657ed"), "value" : { "count" : 6, "lastOrderDate" : ISODate("2015-06-03T18:30:00Z") } }
{ "_id" : ObjectId("54696c64e4b07f3c07010b4a"), "value" : { "count" : 1, "orderDate" : ISODate("2014-11-18T18:30:00Z") } }
{ "_id" : ObjectId("546980d1e4b07f3c07010b4d"), "value" : { "count" : 4, "lastOrderDate" : ISODate("2015-03-24T18:30:00Z") } }
{ "_id" : ObjectId("54699ac4e4b07f3c07010b51"), "value" : { "count" : 30, "lastOrderDate" : ISODate("2015-05-23T18:30:00Z") } }
{ "_id" : ObjectId("54699d0be4b07f3c07010b55"), "value" : { "count" : 1, "orderDate" : ISODate("2014-11-16T18:30:00Z") } }
{ "_id" : ObjectId("5469a1dce4b07f3c07010b59"), "value" : { "count" : 2, "lastOrderDate" : ISODate("2015-04-29T18:30:00Z") } }
{ "_id" : ObjectId("5469a96ce4b07f3c07010b5e"), "value" : { "count" : 1, "orderDate" : ISODate("2014-11-16T18:30:00Z") } }
{ "_id" : ObjectId("5469c1ece4b07f3c07010b64"), "value" : { "count" : 9, "lastOrderDate" : ISODate("2015-04-15T18:30:00Z") } }
{ "_id" : ObjectId("5469f422e4b0ce7d5ee021ad"), "value" : { "count" : 5, "lastOrderDate" : ISODate("2015-06-01T18:30:00Z") } }
......
Now I want to run a query that groups the users by count into different categories, e.g. users with a count of less than 5 in one group, 5-10 in another, etc.,
and I want output something like this
{userLessThan5: 9 }
{user5to10: 2 }
{user10to15: 1 }
{user15to20: 0 }
....
Try this,
// Per-customer order statistics for DELIVERED orders, written to the
// "order_total" collection, with each customer tagged by a count category.
db.order.mapReduce(
    // Map: one entry per order. The emitted value has the same shape as the
    // reduce output, so reduce can safely be re-applied to its own results.
    function () {
        emit(this.customer, { count: 1, lastOrderDate: this.orderDate.interval_start });
    },
    // Reduce: sum the counts and carry forward the last non-empty date seen.
    function (key, values) {
        var sum = 0;
        var lastOrderDate;
        values.forEach(function (value) {
            if (value.lastOrderDate) {
                lastOrderDate = value.lastOrderDate;
            }
            sum += value.count;
        });
        return { count: sum, lastOrderDate: lastOrderDate };
    },
    {
        query: { status: "DELIVERED" },
        out: "order_total",
        // Finalize runs once per key after all reducing is done. The category
        // is assigned here because reduce is not invoked for keys that have a
        // single value, so those customers would otherwise get no category.
        finalize: function (key, reduced) {
            var category;
            if (reduced.count < 5) {
                category = "userLessThan5";
            } else {
                // Half-open buckets of width 5: [5,10) -> user5to10,
                // [10,15) -> user10to15, [15,20) -> user15to20, and so on.
                var lower = Math.floor(reduced.count / 5) * 5;
                category = "user" + lower + "to" + (lower + 5);
            }
            reduced.category = category;
            return reduced;
        }
    }
).find()
db.order_total.aggregate([{ $group: { "_id": "$value.category", "users": { $sum: 1 } } }]);
You will get your desired result:
{userLessThan5: 9 }
{user5to10: 2 }
{user10to15: 1 }
{user15to20: 0 }
....
I wrote a query using your data with aggregation, to the best of my knowledge; there may be a better way to solve this problem.
// Count the users whose order count falls into each range. The ranges are
// half-open ([5,10), [10,15)) so that the boundary values 5 and 10 are
// counted exactly once instead of being dropped from every bucket.
var a = db.test.aggregate([
    { $match: { "value.count": { $lt: 5 } } },
    { $group: { _id: "less than 5", total: { $sum: 1 } } }
]);
var b = db.test.aggregate([
    { $match: { "value.count": { $gte: 5, $lt: 10 } } },
    { $group: { _id: "between 5 and 10", total: { $sum: 1 } } }
]);
var c = db.test.aggregate([
    { $match: { "value.count": { $gte: 10, $lt: 15 } } },
    { $group: { _id: "between 10 and 15", total: { $sum: 1 } } }
]);
insert a, b, c into another collection
You could try grouping the mapReduce output into count intervals of 5 with an aggregate like the one below:
// Bucket documents by value.count in steps of 5: the group key is the count
// rounded down to a multiple of 5 (count - count % 5).
db.data.aggregate([
    {
        $group: {
            _id: {
                $subtract: [
                    "$value.count",
                    { $mod: ["$value.count", 5] }
                ]
            },
            count: { $sum: 1 }
        }
    }
])
Also maybe here is one related question here.

How to get the list of docs that exist more than 1 time in mongoDB with the same field?

I would like to create a query that returns all the docs whose title exists more than once in MongoDB.
Let's say that these are my docs:
{
"itemID" : "AAN88998JJCA",
"itemTitle" : "AAAA"
}
{
"itemID" : "AAN8BB98JJCA",
"itemTitle" : "AAAA"
}
{
"itemID" : "A5N84998JJ3A",
"itemTitle" : "AACC"
}
{
"itemID" : "A2N81998JJC1",
"itemTitle" : "AACC"
}
{
"itemID" : "A2N81998JJC1",
"itemTitle" : "BBBB"
}
I would like to set a query that will produce a list of
{
"itemID" : "AAN88998JJCA",
"itemTitle" : "AAAA"
}
{
"itemID" : "AAN8BB98JJCA",
"itemTitle" : "AAAA"
}
{
"itemID" : "A5N84998JJ3A",
"itemTitle" : "AACC"
}
{
"itemID" : "A2N81998JJC1",
"itemTitle" : "AACC"
}
That is, every record whose title exists in the DB more than once.
I can do it in Java code, but it seems more reasonable to do it at the DB level.
Using aggregation framework, you can group itemTitle by the number of occurences:
// List every itemTitle that occurs more than once, with its occurrence count.
db.collection.aggregate([
    // Tally the documents per title.
    { "$group": { "_id": "$itemTitle", "total": { "$sum": 1 } } },
    // Keep only the titles seen at least twice.
    { "$match": { "total": { "$gt": 1 } } }
]);

mongodb aggregation find min value and other fields in nested array

Is it possible to find the max date in a nested array and show its price, together with a parent field such as the actual price?
The result I want it to show like this :
{
"_id" : ObjectId("5547e45c97d8b2c816c994c8"),
"actualPrice":19500,
"lastModifDate" :ISODate("2015-05-04T22:53:50.583Z"),
"price":"16000"
}
The data :
db.adds.findOne()
{
"_id" : ObjectId("5547e45c97d8b2c816c994c8"),
"addTitle" : "Clio pack luxe",
"actualPrice" : 19500,
"fistModificationDate" : ISODate("2015-05-03T22:00:00Z"),
"addID" : "1746540",
"history" : [
{
"price" : 18000,
"modifDate" : ISODate("2015-05-04T22:01:47.272Z"),
"_id" : ObjectId("5547ec4bfeb20b0414e8e51b")
},
{
"price" : 16000,
"modifDate" : ISODate("2015-05-04T22:53:50.583Z"),
"_id" : ObjectId("5547f87e83a1dae00bc033fa")
},
{
"price" : 19000,
"modifDate" : ISODate("2015-04-04T22:53:50.583Z"),
"_id" : ObjectId("5547f87e83a1dae00bc033fe")
}
],
"__v" : 1
}
my query
db.adds.aggregate(
[
{ $match:{addID:"1746540"}},
{ $unwind:"$history"},
{ $group:{
_id:0,
lastModifDate:{$max:"$historique.modifDate"}
}
}
])
I don't know how to include the other fields. I used $project but I get errors.
Thanks for helping.
You could try the following aggregation pipeline which does not need to make use of the $group operator stage as the $project operator takes care of the fields projection:
// For the ad with the given addID, return its actualPrice together with the
// price and date of its most recent history entry.
db.adds.aggregate([
    { $match: { addID: "1746540" } },
    // One document per history entry.
    { $unwind: "$history" },
    // Lift the entry's date and price next to the parent's actualPrice.
    {
        $project: {
            actualPrice: 1,
            lastModifDate: "$history.modifDate",
            price: "$history.price"
        }
    },
    // Newest entry first, then keep only that one.
    { $sort: { lastModifDate: -1 } },
    { $limit: 1 }
])
Output
/* 1 */
{
"result" : [
{
"_id" : ObjectId("5547e45c97d8b2c816c994c8"),
"actualPrice" : 19500,
"lastModifDate" : ISODate("2015-05-04T22:53:50.583Z"),
"price" : 16000
}
],
"ok" : 1
}