I've written a query using the aggregation framework in the latest version of MongoDB. The problem is that I can't get the result: the query takes too long and the process crashes.
Collections
Constructora, with 50,000 documents (including embedded documents and so on):
{
_id: 1,
edificio: [
{
_id:58,
pais: "Argentina",
provincia: "Buenos Aires",
ciudad: "Tandil",
direccion: "9 de Julio 845",
departamento: [
{
_id:45648651867,
nombre_depto: "a"
},
...
]
}
...
]
},
{
_id:2,
edificio: [...],
...
}
...
variable, with 400,000 documents (including embedded documents):
{
_id: 1,
medicion_departamento: [
{
_id:1,
valmax: 40,
id_departamento:6,
...
},
...
]
},
{
_id: 2,
medicion_departamento: [...]
},
...
Medicion, with 8,000,000 documents:
{
_id:1,
id_departamento: 6,
id_variable: 1,
valor: 6269
},
{
_id:2,
...
},
...
Query
I want to get the address of the departments (pais, provincia, ciudad, departamento.nombre) whose valor field in "medicion" exceeds five times the valmax value in "variable". My query is:
db.constructora.aggregate([
{$unwind:"$edificio"},
{$unwind:"$edificio.departamento"},
{$lookup:
{
from: "variable",
localField: "edificio.departamento._id",
foreignField: "medicion_departamento.id_departamento",
as: "var"
}
},
{$unwind: "$var"},
{$unwind: "$var.medicion_departamento"},
{$match: {"var.nombre":"electricidad"}},
{$lookup:
{
from: "medicion",
localField: "var.medicion_departamento.id_departamento",
foreignField: "id_departamento",
as: "med"
}
},
{$unwind:"$med"},
{$project:{"id_dto":"med.id_departamento", "consumo":"$med.valor","valorMult":{$multiply:["$var.valmax",5]}, "edificio.pais":1, "edificio.provincia":1, "edificio.direccion":1, "edificio.departamento.nombre_depto":1}},
{$match:{"consumo":{$gt:"valorMult"}}},
{$group:{_id:{"a": "edificio.pais", "b":"edificio.provincia", "c":"edificio.direccion", "d":"edificio.departamento.nombre_depto"}}}
]);
When I remove the last $match and $group from the pipeline, the query returns data in 0.08 s, but with them included it runs until the process crashes. What can I do to fix (or optimize) it?
Thanks!
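For reference, one way to shrink the intermediate results is to push the "electricidad" filter and the join condition into the first $lookup via let/pipeline, so far fewer documents reach the later stages. A sketch, assuming MongoDB 3.6+ and untested against the data above (an index on medicion.id_departamento also matters, since the second $lookup probes that field once per document):
db.constructora.aggregate([
  {$unwind: "$edificio"},
  {$unwind: "$edificio.departamento"},
  {$lookup: {
    from: "variable",
    let: {depto: "$edificio.departamento._id"},
    pipeline: [
      {$match: {nombre: "electricidad"}}, // filter before joining
      {$unwind: "$medicion_departamento"},
      {$match: {$expr: {$eq: ["$medicion_departamento.id_departamento", "$$depto"]}}}
    ],
    as: "var"
  }},
  {$unwind: "$var"},
  // ... second $lookup, $project, $match and $group as above
]);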
Related
Hope someone can help with this slow Mongo query - it runs fine against smaller collections, but once we test it against the larger production collections it fails with the message "Not enough disk space", even though we had limited the result set to 100.
I feel like there is an issue with the query structure and/or missing indexes.
Both collections are ~5 million records.
We need help making this query fast.
// divide these by 1000 because the ts field isn't javascript milliseconds
const startDate = (ISODate("2022-07-01T00:00:00.000Z").getTime()/1000)
const endDate = (ISODate("2022-08-10T00:00:00.000Z").getTime()/1000)
const clientId = xxxx
const ordersCollection = "orders"
const itemsCollection = "items"
db[ordersCollection].aggregate(
[
{
$lookup: {
from: itemsCollection,
localField: "data.id",
foreignField: "data.orders_id",
as: "item"
}
},
{
$unwind: "$item"
},
{
$match: {"data.client_id": clientId}
},
{
$match: {"item.ts": {$gt: startDate, $lt: endDate}}
},
{
$project: {
order_id: "$data.id",
parent_id: "$data.parent_id",
owner_id: "$data.owner_id",
client_id: "$data.client_id",
ts: "$item.ts",
status: {
$cond: {
if: {$eq: ["$item.data.status",10] },then: 3,
else: {
$cond: { if: { $eq: ["$item.data.status",4] },
then: 2,
else: "$item.data.status"
}
}
}
}
}
},
{$group: { _id: {"order_id": "$order_id", "status": "$status"},
order_id: {$first:"$order_id"},
parent_id: {$first:"$parent_id"},
owner_id: {$first:"$owner_id"},
client_id: {$first:"$client_id"},
ts: {$first:"$ts"},
status:{$first:"$status"}
}},
{$sort: {"ts": 1}}
]
).limit(100).allowDiskUse(true)
Try pulling the $match on the main collection up to the front of the pipeline.
This way you limit the number of documents you need to $lookup (otherwise we try to look up 5 million documents in another collection of 5 million documents).
Be sure to have an index on data.client_id.
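For example (a sketch; the matching index on the items side's foreignField helps the $lookup the same way, since it is probed once per order):
db[ordersCollection].createIndex({ "data.client_id": 1 })
db[itemsCollection].createIndex({ "data.orders_id": 1 })
The reordered pipeline: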
db[ordersCollection].aggregate(
[
{
$match: {"data.client_id": clientId}
},
{
$lookup: {
from: itemsCollection,
localField: "data.id",
foreignField: "data.orders_id",
as: "item"
}
},
{
$unwind: "$item"
},
{
$match: {"item.ts": {$gt: startDate, $lt: endDate}}
},
...
As a side note, limiting the result set to 100 does not help, as the heaviest part - the aggregation with lookups and grouping - cannot be limited from the cursor.
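If only 100 rows are needed, the limit can at least be expressed inside the pipeline, where a $sort followed by $limit lets the server keep just the top 100 while sorting (a sketch; this bounds the sort and the output, not the lookup and grouping work):
{$sort: {"ts": 1}},
{$limit: 100}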
DB Schema:
3 Collections - `Collection1`, `Collection2`, `Collection3`
Column Structure:
Collection1 - [ _id, rid, aid, timestamp, start_timestamp, end_timestamp ]
Collection2 - [ _id, rid, aid, task ]
Collection3 - [ _id, rid, event_type, timestamp ]
MONGO DB VERSION: 4.2.14
The problem statement: we need to join all three collections using rid. Collection1 is the parent source from which we need the analysis.
Collection2 contains the tasks for each rid in Collection1; we need a "count" per rid from this collection. Collection3 contains the event log for each rid. It is quite huge, so we only need to filter two events, EventA & EventB, for each rid found in the first stage.
I could come up with the query below, but it's not working: I am not getting the min date from Collection3 for each rid matched earlier in the pipeline.
Note: the min date of each event in the logs should be associated with the rid matched by the Collection1 filter.
Query:
db.getCollection("Collection1").aggregate([
{
$match: {
start_timestamp: {
$gte: new Date(ISODate().getTime() - 1000 * 60 * 15),
},
},
},
{
$lookup: {
from: "Collection2",
localField: "rid",
foreignField: "rid",
as: "tasks",
},
},
{
$lookup: {
from: "Collection3",
pipeline: [
{
$match: {
event: {
$in: ["EventA", "EventB"]
}
}
},
{
$group: {
_id: "$event",
timestamp: {
"$min": {
"updated": "$timestamp"
}
}
}
}
],
as: "eventlogs",
},
}
]);
Expected Output:
[
  {
    rid: "123",
    aid: "456",
    timestamp: ISODate("2022-06-03T09:46:39.609Z"),
    start_timestamp: ISODate("2022-06-03T09:46:39.657Z"),
    tasks: [
      { count: 5 }
    ],
    logs: [
      {
        "_id": "EventA",
        "timestamp": {
          "updated": ISODate("2022-04-27T06:10:44.323Z")
        }
      },
      {
        "_id": "EventB",
        "timestamp": {
          "updated": ISODate("2022-05-05T06:36:10.271Z")
        }
      }
    ]
  }
]
I need to write a highly optimized query that does the above in less time (assuming proper indexes are in place on the relevant columns of each collection). The query should not do a COLLSCAN over all of Collection3, since it's going to be quite huge.
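For reference, the second $lookup above has no join condition on rid, so it computes one global group and attaches it to every document (and scans all of Collection3). A correlated form with let/pipeline might look like the sketch below. It is untested; it assumes the field is named event_type as in the schema (the query above filters on event), and an index such as { rid: 1, event_type: 1 } on Collection3 (whether the $expr equality can use that index depends on the server version):
db.getCollection("Collection1").aggregate([
  { $match: { start_timestamp: { $gte: new Date(ISODate().getTime() - 1000 * 60 * 15) } } },
  { $lookup: { from: "Collection2", localField: "rid", foreignField: "rid", as: "tasks" } },
  // keep only the per-rid task count instead of the full task docs
  { $addFields: { tasks: [ { count: { $size: "$tasks" } } ] } },
  { $lookup: {
      from: "Collection3",
      let: { rid: "$rid" },
      pipeline: [
        { $match: {
            $expr: { $eq: [ "$rid", "$$rid" ] },   // correlate on rid
            event_type: { $in: [ "EventA", "EventB" ] }
        } },
        // min timestamp per event, kept in the { updated: ... } shape of the expected output
        { $group: { _id: "$event_type", timestamp: { $min: { updated: "$timestamp" } } } }
      ],
      as: "logs"
  } }
]);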
I have the following collections:
phones:
{"_id": {
"$oid": "61d376c0b9887d4e736e6acb"
},
"brand": "Nokia",
"name": "Nokia 3210",
"picture": "https://fdn2.gsmarena.com/vv/bigpic/no3210b.gif",
"phoneId": "1" }
reviews:
{"_id": {
"$oid": "61d333d0ac2d25f88d0bc8fa"
},
"phoneId": "1",
"rating": "3",
"dateOfReview": {
"$date": "2008-11-18T00:00:00.000Z"
},
"title": "Ok phone to tide me over",
"userId": "47599" }
I'm running the following aggregation both in MongoDB Compass and in the mongo shell, and it gives me the expected result:
db.phones.aggregate([{$lookup: {
from: 'reviews',
localField: 'phoneId',
foreignField: 'phoneId',
as: 'reviews'
}}])
{ _id: ObjectId("61d376c0b9887d4e736e6acb"),
brand: 'Nokia',
name: 'Nokia 3210',
picture: 'https://fdn2.gsmarena.com/vv/bigpic/no3210b.gif',
phoneId: '1',
reviews:
[ { _id: ObjectId("61d333d0ac2d25f88d0bc8fa"),
phoneId: '1',
rating: '3',
dateOfReview: 2008-11-18T00:00:00.000Z,
title: 'Ok phone to tide me over',
userId: '47599' } ] }
But when I check the collection there is no reviews field. How can I add it to the collection permanently? Since I have a lot of reviews for each phone, I would also like to add to the reviews array in phones only the 20 most recent ones that match the phoneId. Is that possible?
You can use the $merge and $out pipeline stages to write data back to your database.
Note that these stages have to be the last stage in your pipeline.
E.g.
db.phones.aggregate([
{
$lookup: {
from: 'reviews',
localField: 'phoneId',
foreignField: 'phoneId',
as: 'reviews'
}
},
{
$merge: {
into: 'phones', // collection-name
on: 'phoneId', // the identifier, used to identify the document to merge into
}
}
])
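Note that when $merge's on field is anything other than _id, the target collection must have a unique index on that field, so this likely also requires:
db.phones.createIndex({ phoneId: 1 }, { unique: true })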
Since the number of reviews per phone is much greater than 20, you might consider going after the reviews first and then doing a $lookup into phones. For a single known phone the following will work; it will not work for more than one phone, because $limit cannot reference data fields (i.e. phoneId):
db.Xreviews.aggregate([
{$match: {"phoneId":"2"}}
,{$sort: {"dateOfReview":-1}} // no getting around the desc sort..
,{$limit: 20} // but we now limit to ONLY 20 reviews.
// Put them "back together" as an array called "reviews"
,{$group: {_id:"$phoneId", reviews: {$push: "$$CURRENT"}}}
// ... and pull in the phone data:
,{$lookup: {from: "Xphones", localField: "_id", foreignField: "phoneId", as: "X" }}
]);
The following will work across one, several, or all phones, but the consideration is that the reviews array could become very large before being passed to the $slice operator to cut it back to 20:
db.Xreviews.aggregate([
// $match for anything or nothing here; then:
{$sort: {"dateOfReview":-1}}
// The order of the _id is not deterministic BUT the docs will be
// pushed onto the reviews array correctly in desc order:
,{$group: {_id:"$phoneId", reviews: {$push: "$$CURRENT"}}}
// Now simply overwrite the reviews array a shortened version:
,{$addFields: {reviews: {$slice: ["$reviews",20] }}}
,{$lookup: {from: "Xphones", localField: "_id", foreignField: "phoneId", as: "X" }}
]);
These two solutions end up with the phone details stored in field 'X', which is an array of one item. Since we know the phoneId mapping is 1:1, if we wish to get fancy, we can add this after the $lookup:
// Pull item[0] out and merge with reviews AND make that the new
// document:
,{$replaceRoot: { newRoot: {$mergeObjects: [ "$$CURRENT", {$arrayElemAt:["$X",0]} ]} }}
,{$unset: "X"} // drop X completely
I think I solved my problem by combining the two solutions proposed by Buzz and MarcRo:
db.reviews.aggregate([
{
$sort: {
"dateOfReview":-1
}
},
{
$group: {
_id: '$phoneId',
reviews: {
$push: '$$CURRENT'
}
}
}, {
$addFields: {
reviews: {
$slice: [
'$reviews',
20
]
}
}
}, {
$merge: {
into: 'phones',
on: '_id'
}
}])
I have a collection matches like the one below. I'm using a players object {key: ObjectId, key: ObjectId} instead of the classic array [ObjectId, ObjectId] to reference the players collection:
{
"_id": ObjectId("5eb93f8efd259cd7fbf49d55"),
"date": "01/01/2020",
"players": {
"home": ObjectId("5eb93f8efd259cd7fbf49d59"),
"away": ObjectId("5eb93f8efd259cd7fbf49d60")
}
},
{...}
And players collection:
{
"_id": ObjectId("5eb93f8efd259cd7fbf49d59"),
"name": "Roger Federer"
"country": "Suiza"
},
{
"_id": ObjectId("5eb93f8efd259cd7fbf49d60"),
"name": "Rafa Nadal"
"country": "España"
},
{...}
What's the best way to do the MongoDB lookup? Is something like this correct?
const rows = await db.collection('matches').aggregate([
{
$lookup: {
from: "players",
localField: "players.home",
foreignField: "_id",
as: "players.home"
}
},
{
$lookup: {
from: "players",
localField: "players.away",
foreignField: "_id",
as: "players.away"
}
},
{ $unwind: "$players.home" },
{ $unwind: "$players.away" }
]).toArray()
I want output like this:
{
_id: 5eb93f8efd259cd7fbf49d55,
date: "12/05/20",
players: {
home: {
_id: 5eb93f8efd259cd7fbf49d59,
name: "Roger Federer",
country: "Suiza"
},
away: {
_id: 5eb93f8efd259cd7fbf49d60,
name: "Rafa Nadal",
country: "España"
}
}
}
{...}
You can try the aggregation query below:
db.matches.aggregate([
{
$lookup: {
from: "players",
localField: "players.home",
foreignField: "_id",
as: "home"
}
},
{
$lookup: {
from: "players",
localField: "players.away",
foreignField: "_id",
as: "away"
}
},
/** If the output of the lookup is not an empty array `[]`, take its first doc & write it to the respective field; else keep the original value */
{
$project: {
date: 1,
"players.home": { $cond: [ { $eq: [ "$home", [] ] }, "$players.home", { $arrayElemAt: [ "$home", 0 ] } ] },
"players.away": { $cond: [ { $eq: [ "$away", [] ] }, "$players.away", { $arrayElemAt: [ "$away", 0 ] } ] }
}
}
])
Test : mongoplayground
Changes or issues with the current query:
1) You're using two $unwind stages one after the other. If either the home or the away field has no matching document in the players collection, the result won't even contain the actual match document. Why? Because $unwind on [] (which is what the lookup stage returns) removes the parent document from the result. To overcome this you need the preserveNullAndEmptyArrays option in the unwind stage (see the sketch after this list).
2) There is another way to do this without using $unwind at all. Do not use as: "players.home" or as: "players.away", because that writes back to the original field: if no matching document is found, an empty array [] is written to the field (and you would lose the actual ObjectId() value that was in the matches doc). Instead, write the output of the lookup to a new field.
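For completeness, the preserveNullAndEmptyArrays form mentioned in 1) looks like this (a sketch):
{ $unwind: { path: "$players.home", preserveNullAndEmptyArrays: true } },
{ $unwind: { path: "$players.away", preserveNullAndEmptyArrays: true } }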
Or, for an even more efficient approach, instead of two $lookup stages (each lookup has to go through the docs of the players collection again and again), you can try a single lookup with multiple join conditions:
db.matches.aggregate([
{
$lookup: {
from: "players",
let: { home: "$players.home", away: "$players.away" },
pipeline: [
{
$match: { $expr: { $or: [ { $eq: [ "$_id", "$$home" ] }, { $eq: [ "$_id", "$$away" ] } ] } }
}
],
as: "data"
}
}
])
Test : mongoplayground
Note: here all the matching docs from players, irrespective of whether they matched the away or the home field, are pushed to the data array. So, to keep the DB operation simple, you can get that array from the DB along with the actual matches document and offload some work to code: mapping the respective objects from the data array back to the players.home and players.away fields.
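That client-side mapping might look like this (a sketch for the Node.js driver, using the single-lookup pipeline above):
const rows = await db.collection('matches').aggregate([ /* pipeline above */ ]).toArray();
const result = rows.map(({ data, ...match }) => {
  // index the looked-up players by _id for O(1) access
  const byId = new Map(data.map(p => [String(p._id), p]));
  return {
    ...match,
    players: {
      home: byId.get(String(match.players.home)) || match.players.home,
      away: byId.get(String(match.players.away)) || match.players.away
    }
  };
});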
I have a Book collection which has Name, Description, a Publisher ObjectId field, an array of author IDs, an array of category IDs, etc. I need to search books by name, description, publisher name, author name, and category name with $regex.
To do that, in the aggregation pipeline I first populate authors, publisher, and categories with $lookup, and then use $match with the $or operator over those fields.
My query works, but it performs very slowly (approximately 11 s), even though the Book collection contains only 70 thousand documents.
What steps should I take in the collection model, indexing, or the query to get good performance?
Book Model:
{
"_id" : ObjectId("5a2934934410bf8b0e547989"),
"publisher" : ObjectId("5a27e7b68021772210b125d4"),
"is_enabled" : true,
"published_at" : ISODate("2017-12-07T12:31:15.166Z"),
"author" : [
ObjectId("5a27c5754b0efc477f37a131"),
ObjectId("5a27c5754b0efc47737a1512"),
ObjectId("5a27c5754b0efc477f37a145"),
],
"category" : [
ObjectId("5a27e22ffb6110b11c326cd7"),
ObjectId("5a27e22ffb6110b11c326ced"),
ObjectId("5a27e22ffb6110b11c326d2d"),
ObjectId("5a27e22ffb6110b11c326e45")
],
"published_year" : "2017"
}
Query I executed:
Book.aggregate(
[
{
$match: {
"is_enabled": { $eq: true },
}
},
{
$lookup:
{
from: "authors",
localField: "author",
foreignField: "_id",
as: "author"
}
},
{
$lookup:
{
from: "categories",
localField: "category",
foreignField: "_id",
as: "category"
}
},
{
$lookup:
{
from: "publishers",
localField: "publisher",
foreignField: "_id",
as: "publisher"
}
},
{
$match: {
$or: [
{ "author.name": new RegExp(params.expression, 'i') },
{ "category.name": new RegExp(params.expression, 'i') },
{ "publisher.name": new RegExp(params.expression, 'i') },
{ "description": new RegExp(params.expression, 'i') },
{ "name": new RegExp(params.expression, 'i') },
{ "published_year": params.terms }
]
}
},
{
$project: {
previous_price: "$previous_price",
price: "$price",
name: "$name",
seo_url: "$seo_url",
click_url: "book",
author: "$author",
authorObj: {
name: { $arrayElemAt: ["$author.name", 0] },
}
}
},
{ $sort: { name: 1 } }
]
)
.skip(8 * (params.pagenum - 1))
.limit(8)
.exec((err, product) => {
if (err)
reject(err);
else
resolve(product);
})
You can create indexes for the fields is_enabled, author, category, and publisher like below.
db.coll.createIndex( { is_enabled: 1 } )
db.coll.createIndex( { author: 1 } )
db.coll.createIndex( { category: 1 } )
db.coll.createIndex( { publisher: 1 } )
That will increase the performance of the first $match stage. (The $lookup stages match on _id in the foreign collections, which is indexed by default, so the lookups themselves are already index-assisted.)
You can also create indexes for name, description, and published_year, but I am not sure how they will affect the last $match stage, because you used an $or condition. As far as I know, MongoDB (<= 3.2) is still unable to optimize indexed queries that use $or or $in. You can try it. It would help if you used an $and condition instead; in that case you can also create a compound index on name, description, and published_year, like:
db.coll.createIndex( { name: 1, description: 1, published_year: 1 } )
and then you should use the same field order in the match condition:
{$match: { name: 'xx', description: 'yy', published_year: '2017'}}