Optimize $group performance in MongoDB

I have a huge collection of about 2 million documents like this, where each account has many logs:
{
"_id" : 1,
"type" : "login",
"date" : "2057-03-28T02:59:41.176Z",
"link" : DBRef("accounts", ObjectId("5bd9683d4df859ad279b5649"))
},
{
"_id" : 2,
"type" : "login",
"date" : "2057-03-28T02:53:41.176Z",
"link" : DBRef("accounts", ObjectId("5bd9683d4df859ad279b5649"))
},
{
"_id" : 3,
"type" : "login",
"date" : "2057-03-28T02:49:41.176Z",
"link" : DBRef("accounts", ObjectId("5bd9683d4df859ad279b5643"))
}
I'm trying to get the latest log date per account this way, but it takes a couple of seconds to get a result.
db.logs.aggregate([
    {
        $match: {
            "link.$ref": "accounts",
            "link.$id": { $in: [ObjectId("5bd9683d4df859ad279b5649"), ObjectId("5bd9683d4df859ad279b5643")] } // array of account ids
        }
    },
    {
        $group: {
            _id: "$link",
            date: { $last: "$date" },
            type: { $last: "$type" }
        }
    }
])
I'm using MongoDB 3.4.
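Note that $last only reflects the order in which documents reach the $group stage, so without an explicit sort it is not guaranteed to pick the most recent date. A minimal sketch of the same pipeline with a $sort added (same collection and fields as above):
db.logs.aggregate([
    {
        $match: {
            "link.$ref": "accounts",
            "link.$id": { $in: [ObjectId("5bd9683d4df859ad279b5649"), ObjectId("5bd9683d4df859ad279b5643")] }
        }
    },
    { $sort: { date: 1 } }, // oldest first, so $last picks the latest date per account
    {
        $group: {
            _id: "$link",
            date: { $last: "$date" },
            type: { $last: "$type" }
        }
    }
])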

Related

MongoDB: get entries where an id exists multiple times, based on a count condition

I have a collection 'bookings' with the following example structure:
{
"_id" : ObjectId("1"),
"user" : ObjectId("1"),
"event" : ObjectId("1"),
},
{
"_id" : ObjectId("2"),
"user" : ObjectId("1"),
"event" : ObjectId("1"),
},
{
"_id" : ObjectId("3"),
"user" : ObjectId("2"),
"event" : ObjectId("1"),
},
{
"_id" : ObjectId("4"),
"user" : ObjectId("3"),
"event" : ObjectId("1"),
},
{
"_id" : ObjectId("5"),
"user" : ObjectId("4"),
"event" : ObjectId("2"),
},
{
"_id" : ObjectId("6"),
"user" : ObjectId("1"),
"event" : ObjectId("2"),
}
I can't figure out a query that shows all "event" ids in which the same "user" id appears multiple times. Something like this:
{
"event": 1,
"user": 1,
"count": 2
}
It does not have to be this exact output; in other words, I just want a query that returns all events for which the same "user" id has more than one entry in this "bookings" collection.
Any suggestions? Thanks!
You just need to do grouping and filtering.
In SQL it would be as simple as:
SELECT count(*) AS cc, user, event FROM t1 GROUP BY user, event HAVING cc > 1
In MongoDB, you can use the aggregation framework to do the equivalent.
It does the same thing in three pipeline stages: group, filter, project.
db.mycollection.aggregate([
    { $group: { _id: { user: "$user", event: "$event" }, count: { $sum: 1 } } },
    { $match: { count: { $gt: 1 } } },
    { $project: {
        _id: 0,
        userId: "$_id.user",
        event: "$_id.event",
        count: 1
    }}
])
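With the six sample documents above, the only group that passes the count filter is user 1 on event 1, so the result would look roughly like this (reusing the placeholder ObjectIds from the question):
{ "count" : 2, "userId" : ObjectId("1"), "event" : ObjectId("1") }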
This documentation can help you understand it in more depth: https://www.mongodb.com/docs/manual/reference/sql-aggregation-comparison/

What's the alternative to $replaceRoot in MongoDB? $replaceRoot is incompatible with DocumentDB

The problem: I'm trying to run a query on MongoDB, but I'm using Amazon DocumentDB, where some operations are not supported. I want to find an alternative that gives the same result, if possible. Basically, I want to change the root of the result: instead of it being the top-level entity, I need it to be a merge of values from different levels of the document.
So, I have the following structure in my collection:
{
"_id" : ObjectId("5e598bf4d98f7c70f9aa3b58"),
"status" : "active",
"invoices" : [
{
"_id" : ObjectId("5e598bf13b24713f50600375"),
"value" : 1157.52,
"receivables" : [
{
"situation" : {
"status" : "active",
"reason" : []
},
"rec_code" : "001",
"_id" : ObjectId("5e598bf13b24713f50600374"),
"expiration_date" : ISODate("2020-03-25T00:00:00.000Z"),
"value" : 1157.52
}
],
"invoice_code" : 9773,
"buyer" : {
"legal_name" : "test name",
"buyer_code" : "223132165498797"
}
},
],
"seller" : {
"code" : "321654897986",
"name" : "test name 2"
}
}
What I want to achieve is to list all "receivables" like this, where the _id is the _id of the receivable:
[{
"_id" : ObjectId("5e598bf13b24713f50600374"),
"situation" : {
"status" : "active",
"reason" : []
},
"rec_code" : "001",
"expiration_date" : ISODate("2020-03-25T00:00:00.000Z"),
"value" : 1157.52,
"status" : "active",
"seller" : {
"cnpj" : "321654897986",
"name" : "test name 2"
},
"invoice_code" : 9773.0,
"buyer" : {
"legal_name" : "test name",
"cnpj" : "223132165498797"
}
}]
I can do this with $replaceRoot using the query below on MongoDB, but on DocumentDB I can't use $replaceRoot or $mergeObjects. Do you know how I can get the same result with other operators?
db.testCollection.aggregate([
{ $unwind: "$invoices" },
{ $replaceRoot: {
newRoot: {
$mergeObjects: ["$$ROOT","$invoices"]}
}
},
{$project: {"_id": 0, "value": 0, "created_at": 0, "situation": 0}},
{ $unwind: "$receivables" },
{ $replaceRoot: {
newRoot: {
$mergeObjects: ["$receivables", "$$ROOT"]
}
}
},
{$project:{"created_at": 0, "receivables": 0, "invoices": 0}}
])
After going through the MongoDB operators, I could get a similar result to what I wanted with the following query, without $replaceRoot. It turned out to be a better query, I think:
db.testCollection.aggregate([
{$unwind: "$invoices"},
{$project : {
created_at: 1,
seller: "$seller",
buyer: "$invoices.buyer",
invoice_code: "$invoices.invoice_code",
receivable: '$invoices.receivables'
}
},
{$unwind: "$receivable"},
{$project : {
_id: '$receivable._id',
seller: 1,
buyer: 1,
invoice_code: 1,
receivable: 1,
created_at: 1,
}
},
{$sort: {"created_at": -1}},
])
This query resulted in the following list structure:
[{
"created_at" : ISODate("2020-03-06T09:47:26.161Z"),
"seller" : {
"name" : "Test name",
"cnpj" : "21231232131232"
},
"buyer" : {
"cnpj" : "21322132164654",
"legal_name" : "Test name 2"
},
"invoice_code" : 66119,
"receivable" : {
"rec_code" : "001",
"_id" : ObjectId("5e601bb5efff82b92935bad4"),
"expiration_date" : ISODate("2020-03-17T00:00:00.000Z"),
"value" : 6540.7,
"situation" : {
"status" : "active",
"reason" : []
}
},
"_id" : ObjectId("5e601bb5efff82b92935bad4")
}]
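If the receivable fields need to sit at the root level, as in the originally desired output, one more $project appended to the pipeline can lift each field explicitly and stand in for $mergeObjects (a sketch, assuming the field names shown above):
{ $project: {
    situation: "$receivable.situation",
    rec_code: "$receivable.rec_code",
    expiration_date: "$receivable.expiration_date",
    value: "$receivable.value",
    seller: 1,
    buyer: 1,
    invoice_code: 1
}}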
Support for $replaceRoot was added to Amazon DocumentDB in January 2021.

MongoDB: aggregate by day and delete duplicate values

I'm trying to clean a huge database.
Sample document:
{
"_id" : ObjectId("59fc5249d5ab401d99f3de7f"),
"addedAt" : ISODate("2017-11-03T11:26:01.744Z"),
"__v" : 0,
"check" : 17602,
"lastCheck" : ISODate("2018-04-05T11:47:00.609Z"),
"tracking" : [
{
"timeCheck" : ISODate("2017-11-06T13:17:20.861Z"),
"_id" : ObjectId("5a0060e00f3c330012bafe39"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:22:31.254Z"),
"_id" : ObjectId("5a0062170f3c330012bafe77"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:27:40.551Z"),
"_id" : ObjectId("5a00634c0f3c330012bafebe"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-06T13:32:41.084Z"),
"_id" : ObjectId("5a0064790f3c330012baff03"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff32"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-07T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff34"),
"rank" : 2379,
}]
}
I have a lot of duplicate values, but I only need to deduplicate within each day.
For example, to obtain this:
{
"_id" : ObjectId("59fc5249d5ab401d99f3de7f"),
"addedAt" : ISODate("2017-11-03T11:26:01.744Z"),
"__v" : 0,
"check" : 17602,
"lastCheck" : ISODate("2018-04-05T11:47:00.609Z"),
"tracking" : [
{
"timeCheck" : ISODate("2017-11-06T13:17:20.861Z"),
"_id" : ObjectId("5a0060e00f3c330012bafe39"),
"rank" : 2395,
},
{
"timeCheck" : ISODate("2017-11-06T13:27:40.551Z"),
"_id" : ObjectId("5a00634c0f3c330012bafebe"),
"rank" : 2379,
},
{
"timeCheck" : ISODate("2017-11-07T13:37:51.012Z"),
"_id" : ObjectId("5a0065af0f3c330012baff34"),
"rank" : 2379,
}]
}
How can I aggregate by day and then delete the duplicate values within each day?
I need to keep the values per day even if they are identical to those of another day.
The aggregation framework cannot update data at this stage. However, you can use the following aggregation pipeline in order to get the desired output and then use e.g. a bulk replace to update all your documents:
db.collection.aggregate({
$unwind: "$tracking" // flatten the "tracking" array into separate documents
}, {
$sort: {
"tracking.timeCheck": 1 // sort by timeCheck to allow us to use the $first operator in the next stage reliably
}
}, {
$group: {
_id: { // group by
"_id": "$_id", // "_id" and
"rank": "$tracking.rank", // "rank" and
"date": { // the "date" part of the "timeCheck" field
$dateFromParts : {
year: { $year: "$tracking.timeCheck" },
month: { $month: "$tracking.timeCheck" },
day: { $dayOfMonth: "$tracking.timeCheck" } // day of month, so that grouping is per calendar day
}
}
},
"doc": { $first: "$$ROOT" } // only keep the first document per group
}
}, {
$sort: {
"doc.tracking.timeCheck": 1 // restore ascending sort order - may or may not be needed...
}
}, {
$group: {
_id: "$_id._id", // merge everything again per "_id"
"addedAt": { $first: "$doc.addedAt" },
"__v": { $first: "$doc.__v" },
"check": { $first: "$doc.check" },
"lastCheck": { $first: "$doc.lastCheck" },
"tracking": { $push: "$doc.tracking" } // in order to join the tracking values into an array again
}
})
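The bulk replace mentioned above could look roughly like this in the mongo shell (a sketch, assuming the pipeline keeps the original _id values and the collection is simply called collection):
var ops = [];
db.collection.aggregate([ /* the pipeline above */ ]).forEach(function (doc) {
    ops.push({ replaceOne: { filter: { _id: doc._id }, replacement: doc } });
    if (ops.length === 1000) { // flush in batches to keep each request small
        db.collection.bulkWrite(ops);
        ops = [];
    }
});
if (ops.length > 0) {
    db.collection.bulkWrite(ops);
}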

How to do this query in mongo: get newest messages for a list of users

I have a collection of messages with fields user_id, created_time, and content. Given a list of user_ids, I would like to get back a list of messages where, for each user_id, it contains the newest message for that user. I thought about using a distinct command together with sort, but that doesn't seem to be supported. Is there a way to do this in MongoDB with a single query?
MongoDB has the aggregation framework, which you can use for tasks that require some manipulation of the data in your collection.
Consider the following dataset
> db.messages.find().pretty()
{
"_id" : ObjectId("52ecb77486d35a12f3552aa1"),
"user_id" : "fred",
"create_date" : ISODate("1392-09-21T00:00:00Z")
}
{
"_id" : ObjectId("52ecb79286d35a12f3552aa2"),
"user_id" : "fred",
"create_date" : ISODate("1392-06-01T00:00:00Z")
}
{
"_id" : ObjectId("52ecb7a386d35a12f3552aa3"),
"user_id" : "marty",
"create_date" : ISODate("1393-04-06T00:00:00Z")
}
{
"_id" : ObjectId("52ecb7af86d35a12f3552aa4"),
"user_id" : "marty",
"create_date" : ISODate("1386-02-12T00:00:00Z")
}
So, passing this to aggregate, we want to group on user_id and get the most recent (maximum) create_date:
> db.messages.aggregate([
{ $group: { _id: { user_id: "$user_id" }, create_date: { $max: "$create_date" }} }
])
{
"result" : [
{
"_id" : {
"user_id" : "marty"
},
"create_date" : ISODate("1393-04-06T00:00:00Z")
},
{
"_id" : {
"user_id" : "fred"
},
"create_date" : ISODate("1392-09-21T00:00:00Z")
}
],
"ok" : 1
}
That's not bad, but you can clean it up with $project:
> db.messages.aggregate([
{ $group: { _id: { user_id: "$user_id" }, create_date: { $max: "$create_date" }} },
{ $project: { _id: 0, user_id: "$_id.user_id", create_date: 1} }
])
{
"result" : [
{
"create_date" : ISODate("1393-04-06T00:00:00Z"),
"user_id" : "marty"
},
{
"create_date" : ISODate("1392-09-21T00:00:00Z"),
"user_id" : "fred"
}
],
"ok" : 1
}
So that actually looks like a clean record to use. In the latest drivers, the value returned from aggregate is a cursor you can iterate over, so the results are just as easy to work with as those from find.
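If you need the whole newest message per user rather than just the date, one option on MongoDB 2.6+ (a sketch against the same collection) is to sort first and keep the first whole document per user:
> db.messages.aggregate([
    { $sort: { create_date: -1 } }, // newest first
    { $group: { _id: "$user_id", message: { $first: "$$ROOT" } } }
])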
Additional documentation on the operators used can be found in the MongoDB aggregation reference.

How to ensure grouping via two separate criteria

Expanded from How to average the summed up values in mongodb?
Using MongoDB 2.4.8,
I have the following records
{
"category" : "TOYS",
"price" : 12,
"status" : "online",
"_id" : "35043"
}
{
"category" : "TOYS",
"price" : 13,
"status" : "offline",
"_id" : "35044"
}
{
"category" : "TOYS",
"price" : 22,
"status" : "online",
"_id" : "35045"
}
{
"category" : "BOOKS",
"price" : 13,
"status" : "offline",
"_id" : "35046"
}
{
"category" : "BOOKS",
"price" : 17,
"status" : "online",
"_id" : "35047"
}
{
"category" : "TOYS",
"price" : 19,
"status" : "unavailable",
"_id" : "35048"
}
{
"category" : "BOOKS",
"price" : 10,
"status" : "unavailable",
"_id" : "35049"
}
{
"category" : "BOOKS",
"price" : 17,
"status" : "unavailable",
"_id" : "35050"
}
I want to find the average price across all categories whose status is online OR offline and whose total price within a category is more than 50.
Toys offline and Toys online are considered two separate categories.
I adapted the answer given there:
db.items.aggregate([
{$match:
{
$or: [
{status:"online"},
{status:"offline"}
]
}
},
{$group :
{
_id: "$category",
total_price: {$sum:"$price"},
}
},
{$match:
{
total_price:{$gt:50}
}
},
{$group :
{
_id: "1",
avg_price: {$avg:"$total_price"},
}
},
]);
But I believe the query I adapted groups categories of the same name together, which is not what I am looking for.
If online and offline are the only values for status, you can remove the initial $match step. If it is needed, it would be more appropriate to use the $in operator as these values could be found in the same index (if one existed).
I think the only step you are missing is that you can $group by multiple fields (i.e. category and status):
db.items.aggregate(
// If 'online' and 'offline' are the only possible status values, this may be unnecessary
{ $match: {
'status' : { $in: [ 'online', 'offline' ] }
}},
// Group by category & status
{ $group: {
_id: { category: "$category", status: "$status" },
total_price: { $sum: "$price" },
}},
// Only find groups where total_price is > 50
{ $match: {
total_price: { $gt:50 }
}},
// Find the average price for the group
{ $group : {
_id: null,
avg_price: {$avg:"$total_price"},
}}
)
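As a sanity check against the sample data above: the per-(category, status) totals are TOYS/online 12 + 22 = 34, TOYS/offline 13, BOOKS/online 17 and BOOKS/offline 13, so none of them exceed 50 and this particular dataset would return an empty result; it is the pipeline shape that matters.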