mongodb - group by on ~ 1.3 million records - mongodb

{"messageId": "123124", "writtenAt":"2017-04-26T15:16:36.200Z", "updatedAt":"2999-12-31T23:59:59.999Z"}
{"messageId": "123124", "writtenAt":"2017-04-26T15:21:30.230Z", "updatedAt":"2999-12-31T23:59:59.999Z"}
The structure of the collection is above. Aside from the mongo id, it has an id called 'messageId', in a collection we can have multiple entries with same 'messageId' but has different 'writtenAt' field value. Have a compound index: messageId (desc), writtenAt (desc).
Now, I want to do a group by on messageId so I would only get the latest entry per messageId (the one with the max writtenAt value). I have the following query but it's taking very long — I haven't even gotten a result yet (I stopped it after more than 10 mins; the collection has over 1.3 million records):
db.messages.aggregate(
[{ "$match": { "updatedAt": { "$gte": { "$date": "2021-02-26T06:59:51.738Z" } } } },
{ "$sort": { "messageId": -1, "writtenAt": -1 } },
{ "$group": { "_id": "$messageId", "doc": { "$first": "$$ROOT" } } },
{ "$replaceRoot" : { "newRoot" : "$doc"}}
], {allowDiskUse: true});
If I add an explain with executionStats, I can see it's picking up the index:
[
{
"$cursor" : {
"query" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"sort" : {
"messageId" : -1.0,
"writtenAt" : -1.0
},
"queryPlanner" : {
"plannerVersion" : 1.0,
"namespace" : "db.messages",
"indexFilterSet" : false,
"parsedQuery" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"queryHash" : "3141BBC5",
"planCacheKey" : "6858F892",
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"updatedAt" : {
"$gte" : ISODate("2021-02-26T06:59:51.738+0000")
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"messageId" : -1.0,
"writtenAt" : -1.0
},
"indexName" : "idx_messageId",
"isMultiKey" : false,
"multiKeyPaths" : {
"messageId" : [
],
"writtenAt" : [
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2.0,
"direction" : "forward",
"indexBounds" : {
"messageId" : [
"[MaxKey, MinKey]"
],
"writtenAt" : [
"[MaxKey, MinKey]"
]
}
}
},
"rejectedPlans" : [
]
}
}
},
{
"$group" : {
"_id" : "$messageId",
"doc" : {
"$first" : "$$ROOT"
}
}
},
{
"$replaceRoot" : {
"newRoot" : "$doc"
}
}
]
Any idea how I can improve this? After retrieving the latest message per messageId, I'm planning to do some pagination slicing.
***Removed

Related

MongoDB first query taking time in Java Spring Boot application

I have following peace of java spring mongoDB code:
startTime = System.currentTimeMillis();
AggregationResults<MyClass> list = mongoTemplate.aggregate(Aggregation.newAggregation(operations),
"Post", MyClass.class);
System.out.println("Time taken for query execution -> "
+ (System.currentTimeMillis() - startTime));
When I test this code using JMeter, the first execution shows:
Time taken for query execution -> 3275 ('list' has 16 records)
On the 2nd and subsequent requests it's like:
Time taken for query execution -> 355 ('list' has 16 records)
The time difference is huge. How can I improve it on the first call?
When I do Aggregation.newAggregation(operations).toString() I get the following query output. Running the following aggregation query in the shell always takes around 0.350 sec.
{
"aggregate": "__collection__",
"pipeline": [
{
"$match": {
"$and": [
{
"postType": "AUTOMATIC"
}
]
}
},
{
"$project": {
"orders.id": 1,
"postedTotals": 1
}
},
{
"$unwind": "$orders"
},
{
"$group": {
"_id": "$orders.userId",
"ae": {
"$addToSet": "$orders.userId"
}
}
},
{
"$sort": {
"ae": 1
}
}
]
}
.explain().aggregate( shows following:
/* 1 */
{
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"postType" : "AUTOMATIC"
}
]
},
"fields" : {
"headerPostedTotals" : 1,
"orders.UserId" : 1,
"_id" : 1
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "post",
"indexFilterSet" : false,
"parsedQuery" : {
"postType" : {
"$eq" : "AUTOMATIC"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"postType" : 1,
"orders.UserId" : 1,
"orders.flightStartDateForQuery" : 1,
"orders.flightEndDateForQuery" : 1,
"postRunDate" : -1
},
"indexName" : "default_filter_index",
"isMultiKey" : true,
"multiKeyPaths" : {
"postType" : [],
"orders.UserId" : [
"orders"
],
"orders.flightStartDateForQuery" : [
"orders"
],
"orders.flightEndDateForQuery" : [
"orders"
],
"postRunDate" : []
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"postType" : [
"[\"AUTOMATIC\", \"AUTOMATIC\"]"
],
"orders.UserId" : [
"[MinKey, MaxKey]"
],
"orders.flightStartDateForQuery" : [
"[MinKey, MaxKey]"
],
"orders.flightEndDateForQuery" : [
"[MinKey, MaxKey]"
],
"postRunDate" : [
"[MaxKey, MinKey]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$project" : {
"_id" : true,
"headerPostedTotals" : true,
"orders" : {
"UserId" : true
}
}
},
{
"$unwind" : {
"path" : "$orders"
}
},
{
"$group" : {
"_id" : "$orders.UserId",
"aes" : {
"$addToSet" : "$orders.UserId"
}
}
},
{
"$sort" : {
"sortKey" : {
"aes" : 1
}
}
}
],
"ok" : 1.0
}

MongoDB (text search with the relevant field) aggregation problem

I have the MongoDB aggregation query
db.data.aggregate([{ "$match" : { "$text" : { "$search" : "STORAGE TYPE" } } },
{ "$group" :
{ "_id" :{"doc_type": "$doc_type" ,"title" : "$title", "player_name" : "$player_name", "player_type" : "INSTITUTION", "country_code" :"$country_code" },
"number_records" : { "$sum" : 1}
}
},
{"$match" : {"doc_type": "PATENT"} },
{"$sort":{"number_records" : -1}},
{"$limit" : 10}],
{"allowDiskuse" : true}
)
When I tried to execute the above code, it keeps on buffering for a long time, I am not getting any output. Can anyone help me?
When I used command explain(), it shows the following code:
{
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"$text" : {
"$search" : "STORAGE TYPE"
}
},
{
"doc_type" : "PATENT"
}
]
},
"fields" : {
"country_code" : 1,
"doc_type" : 1,
"player_name" : 1,
"title" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "datadocuments.data",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"doc_type" : {
"$eq" : "PATENT"
}
},
{
"$text" : {
"$search" : "STORAGE TYPE",
"$language" : "english",
"$caseSensitive" : false,
"$diacriticSensitive" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"doc_type" : {
"$eq" : "PATENT"
}
},
"inputStage" : {
"stage" : "TEXT",
"indexPrefix" : {
},
"indexName" : "title",
"parsedTextQuery" : {
"terms" : [
"storag",
"type"
],
"negatedTerms" : [ ],
"phrases" : [ ],
"negatedPhrases" : [ ]
},
"textIndexVersion" : 3,
"inputStage" : {
"stage" : "TEXT_MATCH",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "OR",
"inputStages" : [
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : 1
},
"indexName" : "title",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
}
},
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : 1
},
"indexName" : "title",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
}
}
]
}
}
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"doc_type" : "$doc_type",
"title" : "$title",
"player_name" : "$player_name",
"player_type" : {
"$const" : "INSTITUTION"
},
"country_code" : "$country_code"
},
"number_records" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$sort" : {
"sortKey" : {
"number_records" : -1
},
"limit" : NumberLong("10")
}
}
],
"ok" : 1
}
I couldn't figure out the mistake; is there any problem in aggregation, if not, how to increase the performance?
Your error comes from your second $match stage: at that point, doc_type no longer exists — it is _id.doc_type instead. But you had better merge that stage with the first one, to improve performance by reducing the number of documents passed to the $group stage.
Your improved query will be :
db.data.aggregate([
{"$match" : { "$text" : { "$search" : "STORAGE TYPE" }, "doc_type": "PATENT" } },
{ "$group" :
{ "_id" :{"doc_type": "$doc_type" ,"title" : "$title", "player_name" : "$player_name", "player_type" : "INSTITUTION", "country_code" :"$country_code" },
"number_records" : { "$sum" : 1}
}
},
{"$sort":{"number_records" : -1}},
{"$limit" : 10}],
{"allowDiskUse" : true}
)

Mongo aggregation performance

I am new to mongo and below query performs really slow with record set over 2 Million records
Query
db.testCollection.aggregate({
$match: {
active: {
$ne: false
}
}
}, {
$group: {
_id: {
productName: "$productName",
model: "$model",
version: "$version",
uid: "$uid"
},
total: {
$sum: 1
}
}
}, {
$project: {
total: 1,
model: "$_id.model",
version: "$_id.version",
uid: "$_id.uid",
productName: "$_id.productName"
}
}, {
$sort: {
model: 1
}
})
explain()
{
"stages" : [
{
"$cursor" : {
"query" : {
"active" : {
"$ne" : false
}
},
"fields" : {
"version" : 1,
"productName" : 1,
"model" : 1,
"uid" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "fms2.device",
"indexFilterSet" : false,
"parsedQuery" : {
"$nor" : [
{
"active" : {
"$eq" : false
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"active" : 1
},
"indexName" : "active",
"isMultiKey" : false,
"multiKeyPaths" : {
"active" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"active" : [
"[MinKey, false)",
"(false, MaxKey]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : {
"productName" : "$productName",
"model" : "$model",
"version" : "$version",
"uid" : "$uid"
},
"total" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : true,
"total" : true,
"model" : "$_id.model",
"version" : "$_id.version",
"uid" : "$_id.uid",
"productName" : "$_id.productName"
}
},
{
"$sort" : {
"sortKey" : {
"model" : 1
}
}
}
],
"ok" : 1
}
Is there a way to optimize this query more ? I had a look into https://docs.mongodb.com/manual/core/aggregation-pipeline-optimization/ as well but most of the stated suggestions are not applicable for this query.
Not sure if it matters, result of this aggregation ends up with only 20-30 records.

Tune Up Mongo Query

I am new to Mongo and was trying to get a distinct count of users. The fields Id and Status are not individually indexed, but there exists a composite index on both fields. My current query is something like the following, where the match conditions change depending on the requirements.
DBQuery.shellBatchSize = 1000000;
db.getCollection('username').aggregate([
{$match:
{ Status: "A"
} },
{"$group" : {_id:"$Id", count:{$sum:1}}}
]);
Is there anyway we can optimize this query more or add parallel runs on collection so that we can achieve results faster ?
Regards
You can tune your aggregation pipelines by passing in an option of explain=true in the aggregate method.
db.getCollection('username').aggregate([
{$match: { Status: "A" } },
{"$group" : {_id:"$Id", count:{$sum:1}}}],
{ explain: true });
This will then output the following to work with
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "EOF"
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
So to speed up our query we need a index to help the match part of the pipeline, so let's create a index on Status
> db.usernames.createIndex({Status:1})
{
"createdCollectionAutomatically" : true,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
If we now run the explain again we'll get the following results
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"Status" : 1
},
"indexName" : "Status_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"Status" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"Status" : [
"[\"A\", \"A\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
We can now see straight away this is using a index.
https://docs.mongodb.com/manual/reference/explain-results/

Slow Aggregation MongoDB

My aggregation is pretty slow. I've already made it a little faster (from 3000 ms to 200 ms) by using the $match stage before the $unwind stage. Is there any other way to improve my aggregation? In the end there'll be just one result (the latest one based on timestamp). The $unwind stage is the longest operation, if I'm right, yet I really do need it.
db.CpuInfo.aggregate([
{"$match":
{
"timestamp": {"$gte":1464764400},
'hostname': 'baklap4'
}
},
{ "$unwind": "$cpuList" },
{ "$group":
{ "_id":
{ "interval":
{ "$subtract": [
"$timestamp",
{ "$mod": [ "$timestamp", 60 * 5 ] }
]}
},
"avgCPULoad": { "$avg": "$cpuList.load" },
"timestamp": { "$max": "$timestamp" }
}
},
{ "$project": { "_id": 0, "avgCPULoad": 1, "timestamp": 1 } },
{$sort: {'timestamp': -1}},
{$limit: 1}
])
The items in my collection are all simular to this:
{
"_id": ObjectId("574d6175da461e77030041b7"),
"hostname": "VPS",
"timestamp": NumberLong(1460040691),
"cpuCores": NumberLong(2),
"cpuList": [
{
"name": "cpu1",
"load": 3.4
},
{
"name": "cpu2",
"load": 0.7
}
]
}
I've added the explain option to my aggregation and this is the result:
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"timestamp" : {
"$gte" : 1464732000
},
"hostname" : "baklap4"
},
"fields" : {
"cpuList" : 1,
"timestamp" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "prototyping.CpuInfo",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"hostname" : {
"$eq" : "baklap4"
}
},
{
"timestamp" : {
"$gte" : 1464732000
}
}
]
},
"winningPlan" : {
"stage" : "FETCH",
"filter" : {
"hostname" : {
"$eq" : "baklap4"
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"timestamp" : NumberLong(1)
},
"indexName" : "timestamp_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"timestamp" : [
"[1464732000.0, inf.0]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$unwind" : {
"path" : "$cpuList"
}
},
{
"$group" : {
"_id" : {
"interval" : {
"$subtract" : [
"$timestamp",
{
"$mod" : [
"$timestamp",
{
"$const" : 300
}
]
}
]
}
},
"avgCPULoad" : {
"$avg" : "$cpuList.load"
},
"timestamp" : {
"$max" : "$timestamp"
}
}
},
{
"$project" : {
"_id" : false,
"timestamp" : true,
"avgCPULoad" : true
}
},
{
"$sort" : {
"sortKey" : {
"timestamp" : -1
},
"limit" : NumberLong(1)
}
}
],
"ok" : 1
}
When I look at my collection's indexes, I see that timestamp and _id are indexed:
db.CpuInfo.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "prototyping.CpuInfo"
},
{
"v" : 1,
"key" : {
"timestamp" : NumberLong(1)
},
"name" : "timestamp_1",
"ns" : "prototyping.CpuInfo",
"sparse" : false
}
]