Optimizing mongo query for better response - mongodb

I am trying to optimize a MongoDB query for a better response time:
db.myReports.find({
    "CheckInDate": {
        "$gte" : ISODate("2015-01-12T00:00:00Z"),
        "$lte" : ISODate("2015-03-31T00:00:00Z")
    },
    "SubscriberPropertyId": NumberLong(47984),
    "ChannelId": {
        "$in": [701, 8275]
    },
    "PropertyId": {
        "$in": [47984, 3159, 5148, 61436, 66251, 70108]
    },
    "LengthOfStay": 1
}, {
    "CheckInDate": 1,
    "SubscriberPropertyId": 1,
    "ChannelId": 1,
    "PropertyId": 1
});
Currently it is taking around 3 minutes just to find data from 3 million records.
One document from the collection:
{
    "_id" : ObjectId("54dba46c320caf5a08473074"),
    "OptimisationId" : NumberLong(1),
    "ScheduleLogId" : NumberLong(3),
    "ReportId" : NumberLong(4113235),
    "SubscriberPropertyId" : NumberLong(10038),
    "PropertyId" : NumberLong(18166),
    "ChannelId" : 701,
    "CheckInDate" : ISODate("2014-09-30T18:30:00Z"),
    "LengthOfStay" : 1,
    "OccupancyIndex" : 1.0,
    "CreatedDate" : ISODate("2014-09-11T06:31:08Z"),
    "ModifiedDate" : ISODate("2014-09-11T06:31:08Z")
}
The indexes created are:
db.myReports.getIndexes();
[
    {
        "v" : 1,
        "key" : {
            "_id" : 1
        },
        "name" : "_id_",
        "ns" : "db.myReports"
    },
    {
        "v" : 1,
        "key" : {
            "CheckInDate" : 1,
            "SubscriberPropertyId" : 1,
            "ReportId" : 1,
            "ChannelId" : 1,
            "PropertyId" : 1
        },
        "name" : "CheckInDate_1_SubscriberPropertyId_1_ReportId_1_ChannelId_1_PropertyId_1",
        "ns" : "db.myReports"
    },
    {
        "v" : 1,
        "key" : {
            "CheckInDate" : 1
        },
        "name" : "CheckInDate_1",
        "ns" : "db.myReports"
    }
]
I have created indexes on the fields that seemed like possible candidates.

Put equality queries first, then range queries:
db.myReports.find({
    "SubscriberPropertyId": NumberLong(47984),
    "ChannelId": {
        "$in": [701, 8275]
    },
    "PropertyId": {
        "$in": [47984, 3159, 5148, 61436, 66251, 70108]
    },
    "CheckInDate": {
        "$gte" : ISODate("2015-01-12T00:00:00Z"),
        "$lte" : ISODate("2015-03-31T00:00:00Z")
    },
    "LengthOfStay": 1 // low selectivity, move to the end
}, {
    "CheckInDate": 1,
    "SubscriberPropertyId": 1,
    "ChannelId": 1,
    "PropertyId": 1
});
Make sure the index fits, i.e., make the index SubscriberPropertyId, ChannelId, PropertyId, CheckInDate. LengthOfStay probably has too low selectivity to make sense in an index; that depends on your data.
That should reduce nscanned significantly, but getting 300k results will still take its time (actually reading them, I mean).
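A minimal sketch of that index, assuming the query shape above (the exact field order should be verified against your real workload):
// Equality fields first, then the range field. LengthOfStay is left out
// because of its presumably low selectivity -- check this against your data.
db.myReports.createIndex({
    "SubscriberPropertyId": 1,
    "ChannelId": 1,
    "PropertyId": 1,
    "CheckInDate": 1
});
Re-run the query with .explain("executionStats") afterwards; nscanned (totalKeysExamined in newer versions) should drop sharply.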

Related

What's the meaning of mongo $minKey?

This page: https://docs.mongodb.com/manual/reference/operator/query/type/ contains this sample document:
{ "date": new Date(1393804800000), "grade": MaxKey(), "score": 2 },
When I inspect MaxKey() in the mongo shell:
MaxKey().help
The MaxKey BSON Class.
For more information on usage: https://mongodb.github.io/node-mongodb-native/3.6/api/MaxKey.html
How can I understand it?
Should I compare it with "$lt" or "$gt", like this?
db.test.find({"grades.grade": {"$gt":"a"}})
MinKey and MaxKey are MongoDB internal types. Their purpose is to represent the theoretical extremes.
MinKey is less than any value, and MaxKey is greater than any value, regardless of type.
See Comparison/Sort Order
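You can see this cross-type ordering directly in the shell. A minimal sketch, using a hypothetical scratch collection named sandbox:
// Insert values of different BSON types plus the two extremes
db.sandbox.insertMany([
    { v: MaxKey() },
    { v: "a string" },
    { v: 42 },
    { v: MinKey() }
])
// Sorting ascending puts MinKey first and MaxKey last,
// even though the other values have different BSON types
db.sandbox.find().sort({ v: 1 })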
I think MinKey() or MaxKey() is just a special value, which can only be matched with { $type : "maxKey" }.
If the data looks like this:
{
    "_id" : 2,
    "grades" : [
        {
            "date" : ISODate("2014-03-03T00:00:00.000Z"),
            "grade" : { "$maxKey" : 1 },
            "score" : 2
        },
        {
            "date" : ISODate("2013-01-24T00:00:00.000Z"),
            "grade" : { "$maxKey" : 1 },
            "score" : 3
        }
    ]
}
then
db.test.find({"grades.grade": {"$gt":"A"}})
will return nothing, but
db.test.find({"grades.grade" : { $type : "maxKey" }})
will return:
{
    "_id" : 2,
    "grades" : [
        {
            "date" : ISODate("2014-03-03T00:00:00.000Z"),
            "grade" : { "$maxKey" : 1 },
            "score" : 2
        },
        {
            "date" : ISODate("2013-01-24T00:00:00.000Z"),
            "grade" : { "$maxKey" : 1 },
            "score" : 3
        }
    ]
}

Optimise MongoDB aggregate query

I have a collection with millions of documents, where each document represents an event: {_id, product, timestamp}.
In my query, I need to group by product and take, for example, the top 10:
"aggregate" : "product_events",
"pipeline" : [
{
"$match" : {
"timeEvent" : {
"$gt" : ISODate("2017-07-17T00:00:00Z")
}
}
},
{
"$group" : {
"_id" : "$product",
"count" : {
"$sum" : 1
}
}
},
{
"$sort" : {
"count" : -1
}
},
{
"$limit" : 10
}
]
My query is very slow now (10 seconds); I am wondering if there is a way to store the data differently to optimise this query.
db.product_events.explain("executionStats").aggregate([
    { "$match" : { "timeEvent" : { "$gt" : ISODate("2017-07-17T00:00:00Z") } } },
    { "$group" : { "_id" : "$product", "count" : { "$sum" : 1 } } },
    { "$project" : { "_id" : 1, "count" : 1 } },
    { "$sort" : { "count" : -1 } },
    { "$limit" : 500 }
], { "allowDiskUse" : true })
{
    "stages" : [
        {
            "$cursor" : {
                "query" : {
                    "timeEvent" : {
                        "$gt" : ISODate("2017-07-17T00:00:00Z")
                    }
                },
                "fields" : {
                    "product" : 1,
                    "_id" : 0
                },
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "mydb.product_events",
                    "indexFilterSet" : false,
                    "parsedQuery" : {
                        "timeEvent" : {
                            "$gt" : ISODate("2017-07-17T00:00:00Z")
                        }
                    },
                    "winningPlan" : {
                        "stage" : "COLLSCAN",
                        "filter" : {
                            "timeEvent" : {
                                "$gt" : ISODate("2017-07-17T00:00:00Z")
                            }
                        },
                        "direction" : "forward"
                    },
                    "rejectedPlans" : [ ]
                },
                "executionStats" : {
                    "executionSuccess" : true,
                    "nReturned" : 2127315,
                    "executionTimeMillis" : 940,
                    "totalKeysExamined" : 0,
                    "totalDocsExamined" : 2127315,
                    "executionStages" : {
                        "stage" : "COLLSCAN",
                        "filter" : {
                            "timeEvent" : {
                                "$gt" : ISODate("2017-07-17T00:00:00Z")
                            }
                        },
                        "nReturned" : 2127315,
                        "executionTimeMillisEstimate" : 810,
                        "works" : 2127317,
                        "advanced" : 2127315,
                        "needTime" : 1,
                        "needYield" : 0,
                        "saveState" : 16620,
                        "restoreState" : 16620,
                        "isEOF" : 1,
                        "invalidates" : 0,
                        "direction" : "forward",
                        "docsExamined" : 2127315
                    }
                }
            }
        },
        {
            "$group" : {
                "_id" : "$product",
                "count" : {
                    "$sum" : {
                        "$const" : 1
                    }
                }
            }
        },
        {
            "$project" : {
                "_id" : true,
                "count" : true
            }
        },
        {
            "$sort" : {
                "sortKey" : {
                    "count" : -1
                },
                "limit" : NumberLong(500)
            }
        }
    ],
    "ok" : 1
}
Below are my indexes:
db.product_events.getIndexes()
[
    {
        "v" : 2,
        "key" : {
            "_id" : 1
        },
        "name" : "_id_",
        "ns" : "mydb.product_events"
    },
    {
        "v" : 2,
        "key" : {
            "product" : 1,
            "timeEvent" : -1
        },
        "name" : "product_1_timeEvent_-1",
        "ns" : "mydb.product_events"
    }
]
Creating indexes on a collection's fields helps optimise data retrieval.
Indexes are generally created on the fields by which data is filtered according to specific criteria.
The data in an indexed field is kept in sorted order, so a query can scan just the matching range of the index instead of every document, which makes fetching data faster.
Based on the description in the question above, to optimise the aggregate query's performance, try creating an index on the timeEvent field, since timeEvent is used as the filter expression in the $match stage of the aggregation pipeline.
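A minimal sketch of that index (descending here to mirror the existing compound index; an ascending index would serve the $gt range just as well):
db.product_events.createIndex({ "timeEvent" : -1 })
After creating it, re-run the explain; the winning plan should switch from COLLSCAN to IXSCAN.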
The documentation on compound indexes states the following.
db.products.createIndex( { "item": 1, "stock": 1 } )
The order of the fields listed in a compound index is important. The index will contain references to documents sorted first by the values of the item field and, within each value of the item field, sorted by values of the stock field.
In addition to supporting queries that match on all the index fields, compound indexes can support queries that match on the prefix of the index fields. That is, the index supports queries on the item field as well as both item and stock fields.
Your product_1_timeEvent_-1 index looks like this:
{
    "product" : 1,
    "timeEvent" : -1
}
which is why it cannot be used to support a query that only filters on timeEvent.
Options you have to get that sorted:
- Flip the order of the fields in your index.
- Remove the product field from your index.
- Create an additional index with only the timeEvent field in it.
- Include some additional filter on the product field so the existing index gets used (see the sketch below).
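To illustrate that last option, here is a sketch with hypothetical product values (the $in list is made up; substitute the products you actually care about):
db.product_events.aggregate([
    { "$match" : {
        // an equality/$in filter on product lets the product_1_timeEvent_-1 prefix be used
        "product" : { "$in" : ["productA", "productB"] },  // hypothetical values
        "timeEvent" : { "$gt" : ISODate("2017-07-17T00:00:00Z") }
    } },
    { "$group" : { "_id" : "$product", "count" : { "$sum" : 1 } } },
    { "$sort" : { "count" : -1 } },
    { "$limit" : 10 }
])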
And keep in mind that any creation/deletion/modification of an index may impact other queries, too. So make sure you test your changes properly.

MongoDB performs slow query on sum() based on monthly groups

This is what I have tried so far with an aggregate query:
db.getCollection('storage').aggregate([
    {
        "$match": {
            "user_id": 2
        }
    },
    {
        "$project": {
            "formattedDate": {
                "$dateToString": { "format": "%Y-%m", "date": "$created_on" }
            },
            "size": "$size"
        }
    },
    {
        "$group": {
            "_id" : "$formattedDate",
            "size" : { "$sum": "$size" }
        }
    }
])
This is the result:
/* 1 */
{
    "_id" : "2018-02",
    "size" : NumberLong(10860595386)
}

/* 2 */
{
    "_id" : "2017-12",
    "size" : NumberLong(524288)
}

/* 3 */
{
    "_id" : "2018-01",
    "size" : NumberLong(21587971)
}
And this is the document structure:
{
    "_id" : ObjectId("5a59efedd006b9036159e708"),
    "user_id" : NumberLong(2),
    "is_transferred" : false,
    "is_active" : false,
    "process_id" : NumberLong(0),
    "ratio" : 0.000125759169459343,
    "type_id" : 201,
    "size" : NumberLong(1687911),
    "is_processed" : false,
    "created_on" : ISODate("2018-01-13T11:39:25.000Z"),
    "processed_on" : ISODate("1970-01-01T00:00:00.000Z")
}
And last, the explain result:
/* 1 */
{
    "stages" : [
        {
            "$cursor" : {
                "query" : {
                    "user_id" : 2.0
                },
                "fields" : {
                    "created_on" : 1,
                    "size" : 1,
                    "_id" : 1
                },
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "data.storage",
                    "indexFilterSet" : false,
                    "parsedQuery" : {
                        "user_id" : {
                            "$eq" : 2.0
                        }
                    },
                    "winningPlan" : {
                        "stage" : "FETCH",
                        "inputStage" : {
                            "stage" : "IXSCAN",
                            "keyPattern" : {
                                "user_id" : 1
                            },
                            "indexName" : "user_id",
                            "isMultiKey" : false,
                            "multiKeyPaths" : {
                                "user_id" : []
                            },
                            "isUnique" : false,
                            "isSparse" : false,
                            "isPartial" : false,
                            "indexVersion" : 2,
                            "direction" : "forward",
                            "indexBounds" : {
                                "user_id" : [
                                    "[2.0, 2.0]"
                                ]
                            }
                        }
                    },
                    "rejectedPlans" : []
                }
            }
        },
        {
            "$project" : {
                "_id" : true,
                "formattedDate" : {
                    "$dateToString" : {
                        "format" : "%Y-%m",
                        "date" : "$created_on"
                    }
                },
                "size" : "$size"
            }
        },
        {
            "$group" : {
                "_id" : "$formattedDate",
                "size" : {
                    "$sum" : "$size"
                }
            }
        }
    ],
    "ok" : 1.0
}
The problem:
I can navigate and get all results almost instantly, in about 0.002 sec. However, when I specify user_id and sum by grouping on each month, the result takes between 0.300 s and 0.560 s. When I do several similar tasks in one request, it takes more than a second to finish.
What I tried so far:
I've added an index for user_id
I've added an index for created_on
I used more $match conditions; however, this made it even worse.
This collection currently has almost 200,000 documents, and approximately 150,000 of them belong to user_id = 2.
How can I minimize the response time for this query?
Note: MongoDB 3.4.10 used.
Pratha,
try adding a sort on the "created_on" and "size" fields as the first stage in the aggregation pipeline:
db.getCollection('storage').aggregate([
    {
        "$sort": {
            "created_on": 1, "size": 1
        }
    }, ....
Before that, add a compound key index:
db.getCollection('storage').createIndex({created_on:1,size:1})
If you sort the data before the $group stage, it will improve the efficiency of accumulating the totals.
Note about sort aggregation stage:
The $sort stage has a limit of 100 megabytes of RAM. By default, if the stage exceeds this limit, $sort will produce an error. To allow for the handling of large datasets, set the allowDiskUse option to true to enable $sort operations to write to temporary files.
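If you do hit that limit, the option goes in the second argument of aggregate(); a minimal sketch, where pipeline is a placeholder for the stages shown above:
db.getCollection('storage').aggregate(pipeline, { allowDiskUse: true })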
P.S. To test performance, get rid of the $match stage on user_id, or also add user_id to the compound key.
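For the second variant, a sketch of the extended compound index (assuming user_id stays in the $match; verify the plan with explain()):
// equality field first, then the fields consumed by $project/$group
db.getCollection('storage').createIndex({ user_id: 1, created_on: 1, size: 1 })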

Query to group distinct values and show sum of array values in mongodb

I want to group by cart.name and find the sum of cart.qty in MongoDB. Below is a sample document:
{
    "_id" : ObjectId("581323379ae5e607645cb485"),
    "cust" : {
        "name" : "Customer 1",
        "dob" : "09/04/1989",
        "mob" : 999999999,
        "loc" : "Karimangalam",
        "aadhar" : {
        }
    },
    "cart" : [
        {
            "name" : "Casual Shirt",
            "qty" : 1,
            "mrp" : 585,
            "discperc" : 10,
            "fit" : null,
            "size" : "L"
        },
        {
            "name" : "Casual Shirt",
            "qty" : 1,
            "mrp" : 500,
            "discperc" : 0,
            "fit" : null,
            "size" : "L"
        },
        {
            "name" : "Cotton Pant",
            "qty" : 1,
            "mrp" : 850,
            "discperc" : 0,
            "fit" : null,
            "size" : "34"
        },
        {
            "name" : "Cotton Pant",
            "qty" : 1,
            "mrp" : 1051,
            "discperc" : 10,
            "fit" : null,
            "size" : "34"
        }
    ],
    "summary" : {
        "bill" : 2822.4,
        "qty" : 4,
        "mrp" : 2986,
        "received" : "2800",
        "balance" : -22.40000000000009
    },
    "createdAt" : ISODate("2016-10-28T10:06:47.367Z"),
    "updatedAt" : ISODate("2016-10-28T10:06:47.367Z")
}
There are many documents like this. I want the output as below: each distinct product name (cart.name) and its total qty:
{Casual Shirt , 30},
{Cotton Pant , 10},
{T-Shirt , 15},
{Lower , 12}
Here is my query, trying to group by cart.name and sum the qty:
db.order.aggregate([
    { $unwind: "$cart" },
    { $group: {
        _id: "$cart.name",
        totalQTY: { $sum: "$cart.qty" },
        count: { $sum: 1 }
    } }
])
but it displays wrong totalQTY values for each product name. I checked manually.
Please give me the correct query.
> db.collection.aggregate([
... { $unwind: "$cart" },
... { $group: { "_id": "$cart.name", totalQTY: { $sum: "$cart.qty" }, count: { $sum: 1 } } }
... ])
I get the following result:
{ "_id" : "Cotton Pant", "totalQTY" : 2, "count" : 2 }
{ "_id" : "Casual Shirt", "totalQTY" : 11, "count" : 2 }
I'm not sure what you're looking for; your aggregation pipeline looks correct. (Note I changed the Casual Shirt quantities to 10 and 1 respectively.)
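If you also want the output keyed as name/qty, as in your desired sample, you could append a $project stage; a sketch:
db.order.aggregate([
    { $unwind: "$cart" },
    { $group: { _id: "$cart.name", totalQTY: { $sum: "$cart.qty" } } },
    // reshape { _id, totalQTY } into { name, qty }
    { $project: { _id: 0, name: "$_id", qty: "$totalQTY" } }
])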

mongodb $near query is slow

A MongoDB collection:
{
    "_id" : ObjectId("574bbae4d009b5364abaebe5"),
    "cityid" : 406,
    "location" : {
        "type" : "Point",
        "coordinates" : [
            118.602355,
            24.89083
        ]
    },
    "shopid" : "a"
}
with about 50,000 rows;
and indexes:
[
    {
        "v" : 1,
        "key" : {
            "_id" : 1
        },
        "name" : "_id_",
        "ns" : "pingan-test.shop_actinfo_collection_0530"
    },
    {
        "v" : 1,
        "key" : {
            "location" : "2dsphere"
        },
        "name" : "location_2dsphere",
        "ns" : "pingan-test.shop_actinfo_collection_0530",
        "2dsphereIndexVersion" : 3
    },
    {
        "v" : 1,
        "key" : {
            "shopid" : 1,
            "cityid" : 1
        },
        "name" : "shopid_1_cityid_1",
        "ns" : "pingan-test.shop_actinfo_collection_0530"
    }
]
I query this collection like:
body = {
    'cityid': 2,
    'location': {'$near': {'$geometry': {'type': 'Point', 'coordinates': [122.0, 31.0]}}},
    'shopid': {'$in': ['a', 'b']}
}
results = collection.find(body, {'shopid': 1, '_id': 0}).batch_size(20).limit(20)
shops = list(results)
The question is that it runs in about 400 ms, but it takes only 30 ms if we drop the location condition.
Why, and how can I fix it?
You have an index on shopid and cityid, but you search by cityid. Since the index is ordered by shopid first, it cannot be used to search by cityid alone. If you change the index to cityid: 1, shopid: 1, you will see a performance improvement, because your query will be able to use the index.
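A sketch of that reordering (the $near clause still relies on the separate 2dsphere index):
// equality on cityid first, then the $in on shopid
db.shop_actinfo_collection_0530.createIndex({ "cityid" : 1, "shopid" : 1 })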
After all, I got it.
I just created an index on cityid: 1, shopid: 1, "location": "2dsphere", and then, world peace.
And thanks #tiramisu again.