MongoDB Aggregate Slow Performance When Using Sort

I have a collection (TV show episodes) with more than 1,200,000 documents.
Here is my schema:
var episodeSchema = new Schema({
    imdbId: { type: String },
    showId: { type: String },
    episodeId: { type: String },
    episodeIdNumber: { type: Number },
    episodeTitle: { type: String },
    showTitle: { type: String },
    seasonNumber: { type: Number },
    episodeNumber: { type: Number },
    airDate: { type: String },
    summary: { type: String }
});
I created indexes for episodeTitle, episodeIdNumber, seasonNumber, episodeNumber, episodeId, and showId.
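For reference, a minimal sketch of how those single-field indexes might be declared in Mongoose; the field list comes from the sentence above, but the exact declarations are an assumption:
// Hypothetical single-field index declarations on the schema above.
episodeSchema.index({ showId: 1 });
episodeSchema.index({ episodeId: 1 });
episodeSchema.index({ episodeIdNumber: 1 });
episodeSchema.index({ episodeTitle: 1 });
episodeSchema.index({ seasonNumber: 1 });
episodeSchema.index({ episodeNumber: 1 });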
Now I used a MongoDB aggregate $group to get every episode of a TV show.
Here is the aggregate query I used:
episode.aggregate([
    { $match: { showId: "scorpion" } },
    { $sort: { "episodeNumber": -1 } },
    { $group: {
        _id: "$seasonNumber", count: { $sum: 1 },
        episodes: { $push: { episodeId: "$episodeId", episodeTitle: "$episodeTitle", episodeNumber: "$episodeNumber", seasonNumber: "$seasonNumber", airDate: "$airDate" } }
    } },
    { $sort: { _id: -1 } }
])
Now when I run this query it takes more than 2605.907 ms. After some digging I found out why it is slow: it was because of the {$sort:{"episodeNumber":-1}} stage. Without {$sort:{"episodeNumber":-1}} it takes around 19.178 ms to run.
As I mentioned above, I created an index for the episodeNumber field, and based on MongoDB Aggregation Pipeline Optimization I placed the $sort after the $match, so in principle everything was fine and I didn't do anything wrong.
After this I thought something was wrong with my indexes, so I dropped the episodeNumber index and reindexed, but the time was the same; nothing changed.
In the end I tried running the aggregate $group query without episodeNumber indexed, and surprisingly it was faster! It takes around 20.118 ms.
I want to know why this happened. Aren't indexes supposed to make queries faster?
Update
Query explain output:
{
    "waitedMS" : NumberLong(0),
    "stages" : [
        {
            "$cursor" : {
                "query" : {
                    "showId" : "scorpion"
                },
                "sort" : {
                    "episodeNumber" : -1
                },
                "fields" : {
                    "airDate" : 1,
                    "episodeId" : 1,
                    "episodeNumber" : 1,
                    "episodeTitle" : 1,
                    "seasonNumber" : 1,
                    "_id" : 0
                },
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "test.episodes",
                    "indexFilterSet" : false,
                    "parsedQuery" : {
                        "showId" : {
                            "$eq" : "scorpion"
                        }
                    },
                    "winningPlan" : {
                        "stage" : "EOF"
                    },
                    "rejectedPlans" : [ ]
                }
            }
        },
        {
            "$group" : {
                "_id" : "$seasonNumber",
                "count" : {
                    "$sum" : {
                        "$const" : 1
                    }
                },
                "episodes" : {
                    "$push" : {
                        "episodeId" : "$episodeId",
                        "episodeTitle" : "$episodeTitle",
                        "episodeNumber" : "$episodeNumber",
                        "seasonNumber" : "$seasonNumber",
                        "airDate" : "$airDate"
                    }
                }
            }
        },
        {
            "$sort" : {
                "sortKey" : {
                    "_id" : -1
                }
            }
        }
    ],
    "ok" : 1
}
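Two hedged observations on the explain output above. First, the winningPlan stage is "EOF", which the planner typically reports when the queried namespace (here test.episodes) is empty or does not exist, so this particular explain may not have run against the populated collection. Second, a single-field index on episodeNumber cannot serve a sort that runs after an equality match on a different field (showId); the usual fix is a compound index with the equality field first and the sort field second. A minimal sketch, with an assumed index ordering rather than anything confirmed above:
// Hypothetical compound index: the $match on showId uses the index prefix,
// and the $sort on episodeNumber can then read entries in index order,
// avoiding an in-memory sort of all matched documents.
db.episodes.createIndex({ showId: 1, episodeNumber: -1 })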

Related

MongoDB aggregate count is too slow

I have around 60 thousand documents in a users collection, and have the following query:
db.getCollection('users').aggregate([
    { "$match": { "userType": "employer" } },
    { "$lookup": { "from": "companies", "localField": "_id", "foreignField": "owner.id", "as": "company" } },
    { "$unwind": "$company" },
    { "$lookup": { "from": "companytypes", "localField": "company.type.id", "foreignField": "_id", "as": "companyType" } },
    { "$unwind": "$companyType" },
    { $group: { _id: null, count: { $sum: 1 } } }
])
It takes around 12 seconds to count, even though I call the count function before the list function, and my list function with limit: 10 responds faster than the count.
The following is the explain result:
{
    "stages" : [
        {
            "$cursor" : {
                "query" : {
                    "userType" : "employer"
                },
                "fields" : {
                    "company" : 1,
                    "_id" : 1
                },
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "jobs.users",
                    "indexFilterSet" : false,
                    "parsedQuery" : {
                        "userType" : {
                            "$eq" : "employer"
                        }
                    },
                    "winningPlan" : {
                        "stage" : "COLLSCAN",
                        "filter" : {
                            "userType" : {
                                "$eq" : "employer"
                            }
                        },
                        "direction" : "forward"
                    },
                    "rejectedPlans" : []
                }
            }
        },
        {
            "$lookup" : {
                "from" : "companies",
                "as" : "company",
                "localField" : "_id",
                "foreignField" : "owner.id",
                "unwinding" : {
                    "preserveNullAndEmptyArrays" : false
                }
            }
        },
        {
            "$match" : {
                "$nor" : [
                    {
                        "company" : {
                            "$eq" : []
                        }
                    }
                ]
            }
        },
        {
            "$group" : {
                "_id" : {
                    "$const" : null
                },
                "total" : {
                    "$sum" : {
                        "$const" : 1
                    }
                }
            }
        },
        {
            "$project" : {
                "_id" : false,
                "total" : true
            }
        }
    ],
    "ok" : 1.0
}
$lookup operations are slow since they mimic left-join behavior; from the docs:
$lookup performs an equality match on the localField to the
foreignField from the documents of the from collection
Hence, if there are no indexes on the fields used for joining the collections, MongoDB is forced to do a collection scan.
Adding an index for the foreignField attributes should prevent a collection scan and increase performance, possibly by an order of magnitude.
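A minimal sketch of that suggestion, using the collection and field names from the pipeline above (companytypes is joined on _id, which always has an index, so only the first $lookup needs a new one):
// Index the foreignField of the first $lookup so each join probe
// becomes an index lookup instead of a scan of "companies".
db.companies.createIndex({ "owner.id": 1 })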

Why are my mongodb queries so slow (on Swisscom cloud)?

I am using a (small, 256 MB) MongoDB 3.2.9 service instance through Swisscom CloudFoundry. As long as our entire DB fits into the available RAM, we see somewhat acceptable query performance.
However, we are experiencing very long query times on aggregation operations when our DB does not fit into RAM. We have created indexes for the accessed fields, but as far as I can tell it doesn't help.
Example document entry:
_id: 5a31...
description: Object
    location: "XYZ"
    name: "ABC"
    status: "A"
    m_nr: null
    k_nr: null
    city: "QWE"
    high_value: 17
    right_value: 71
more_data: Object
    number: 101
    interval: 1
    next_date: "2016-01-16T00:00:00Z"
    last_date: null
    status: null
classification: Object
    priority_value: "?"
    redundancy_value: "?"
    active_value: "0"
Example Query:
db.getCollection('a').aggregate(
    [
        { $sort: { "description.location": 1 } },
        { $group: { _id: "$description.location" } }
    ],
    { explain: true }
)
This query takes 25 seconds on a DB that has only 20k entries and produces 1k output documents.
The explain info for this query:
db.getCollection('a').aggregate([{ $group: {_id: "$description.location"} }], { explain: true }):
{
    "waitedMS" : NumberLong(0),
    "stages" : [
        {
            "$cursor" : {
                "query" : {},
                "fields" : {
                    "description.location" : 1,
                    "_id" : 0
                },
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "Z.a",
                    "indexFilterSet" : false,
                    "parsedQuery" : {
                        "$and" : []
                    },
                    "winningPlan" : {
                        "stage" : "COLLSCAN",
                        "filter" : {
                            "$and" : []
                        },
                        "direction" : "forward"
                    },
                    "rejectedPlans" : []
                }
            }
        },
        {
            "$group" : {
                "_id" : "$description.location"
            }
        }
    ],
    "ok" : 1.0
}
[UPDATE] Output of db.a.getIndexes():
/* 1 */
[
    {
        "v" : 1,
        "key" : {
            "_id" : 1
        },
        "name" : "_id_",
        "ns" : "db.a"
    },
    {
        "v" : 1,
        "key" : {
            "description.location" : 1.0
        },
        "name" : "description.location_1",
        "ns" : "db.a"
    }
]
It looks like it's doing a collection scan. Have you tried adding an index on description.location?
db.a.createIndex({"description.location" : 1});
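One caveat worth noting: the explain output above was captured for the $group-only pipeline, without the leading $sort, and $group by itself cannot use an index, so a COLLSCAN is expected there. A sketch of a follow-up check, assuming the same collection, is to explain the pipeline with the $sort included and see whether the winning plan becomes an IXSCAN on description.location_1:
// Explain the full pipeline: a leading $sort can use the
// description.location index, while $group alone cannot.
db.getCollection('a').aggregate(
    [
        { $sort: { "description.location": 1 } },
        { $group: { _id: "$description.location" } }
    ],
    { explain: true }
)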

Mongodb merge chunk with $maxkey value on sharded cluster

I have the shard key {thread_id: 1, _id: 1} on the collection "post",
and I want to merge the following 2 chunks:
{
    "_id" : "forum.post-thread_id_\"547dc7c2de2cf22b688b4572\"_id_ObjectId('549c519660e24b65118b456c')",
    "lastmod" : Timestamp(3012, 3),
    "lastmodEpoch" : ObjectId("50829c0e172de38a3398f72c"),
    "ns" : "forum.post",
    "min" : {
        "thread_id" : "547dc7c2de2cf22b688b4572",
        "_id" : ObjectId("549c519660e24b65118b456c")
    },
    "max" : {
        "thread_id" : ObjectId("50901d4e1dd7198161000063"),
        "_id" : ObjectId("50901d4e1dd7198161000068")
    },
    "shard" : "shard3"
}
{
    "_id" : "forum.post-thread_id_ObjectId('50901d4e1dd7198161000063')_id_ObjectId('50901d4e1dd7198161000068')",
    "lastmod" : Timestamp(604, 0),
    "lastmodEpoch" : ObjectId("50829c0e172de38a3398f72c"),
    "ns" : "forum.post",
    "min" : {
        "thread_id" : ObjectId("50901d4e1dd7198161000063"),
        "_id" : ObjectId("50901d4e1dd7198161000068")
    },
    "max" : {
        "thread_id" : {
            "$maxKey" : 1
        },
        "_id" : {
            "$maxKey" : 1
        }
    },
    "shard" : "shard3"
}
They need to be merged because thread_id is supposed to be a string; in the current state the 1st chunk holds all the new data (string -> ObjectId()), and the 2nd chunk only holds documents whose thread_id is an ObjectId().
I have tried this command (reference):
db.runCommand({
    mergeChunks : 'forum.post',
    bounds : [{
        thread_id : "547dc7c2de2cf22b688b4572",
        _id : ObjectId("549c519660e24b65118b456c")
    }, {
        thread_id : { $type : 127 },
        _id : { $type : 127 }
    }]
})
And I got this error:
{
    "ok" : 0,
    "errmsg" : "shard key bounds [{ thread_id: \"547dc7c2de2cf22b688b4572\", _id: ObjectId('549c519660e24b65118b456c') },{ thread_id: { $type: 127 }, _id: { $type: 127 } }) are not valid for shard key pattern { thread_id: 1.0, _id: 1.0 }"
}
Does anyone know how to fix this?
MongoDB version: 2.4.9
It appears that the "mergeChunks" command documented above is for later versions (2.6.x+).
I solved my problem using this command:
db.runCommand({
    mergeChunks : 'forum.post',
    bounds : [{
        thread_id : "547dc7c2de2cf22b688b4572",
        _id : ObjectId("549c519660e24b65118b456c")
    }, {
        thread_id : MaxKey,
        _id : MaxKey
    }]
})
It appears that for writes we should use:
db.foo.insert({ id : MaxKey });
and for queries we should use:
db.foo.find({ id : { $type : 127 } });

MongoDB: How should I get the original JSON structure back after filtering the records based on a requirement?

I am new to MongoDB.
I have a JSON document in a collection like:
{
    "_id" : ObjectId("55abf32f358e3aca807f0e6a"),
    "usercbid" : 1995492.0000000000000000,
    "defaultnotifytype" : {
        "status" : true,
        "alert" : true,
        "action" : true
    },
    "calendar" : {
        "alert" : 2468.0000000000000000,
        "action" : 13579.0000000000000000,
        "status" : 123456.0000000000000000
    },
    "assignment" : [
        {
            "orgid" : {
                "service" : "AVPN",
                "adminemail" : "pl9129#att.com",
                "notifytype" : {
                    "status" : true,
                    "alert" : true
                },
                "keytype" : "MCN",
                "KeyValue" : "SK1383"
            }
        },
        {
            "orgid" : {
                "KeyValue" : "DD3342",
                "service" : "<all>",
                "keytype" : "MCN"
            }
        },
        {
            "orgid" : {
                "notifytype" : {
                    "optout" : true
                },
                "keytype" : "MCN",
                "keyvalue" : "<all>",
                "service" : "MVPN"
            }
        },
        {
            "order" : {
                "date" : "2015-03-15",
                "adminemail" : "abc.com",
                "notifytype" : {
                    "alert" : true
                },
                "id" : 123456.0000000000000000
            }
        },
        {
            "order" : {
                "id" : 135246.0000000000000000,
                "date" : "2015-03-17",
                "adminemail" : "abc.com"
            }
        }
    ]
}
I would like to filter the above JSON document with the following condition:
var result = db.subscription.aggregate(
    [
        { $unwind: "$assignment" },
        { $match: { $or: [
            { "assignment.order.id": 123456 },
            {
                "assignment.orgid.keytype": { $in: ["MCN"] },
                "assignment.orgid.KeyValue": { $in: ["<all>", "SK1383"] },
                "assignment.orgid.service": { $in: ["<all>", "AVPN"] }
            }
        ] } },
        { $group: { _id: "$_id", assignment: { $push: "$assignment" } } }
        // ,{ $project: { usercbid: $usercbid, defaultnotifytype: 1, calendar: 1, assignment: 1 } }
    ]
)
printjson(result);
The result of the above query is:
{
    "result" : [
        {
            "_id" : ObjectId("55abf32f358e3aca807f0e6a"),
            "assignment" : [
                {
                    "orgid" : {
                        "service" : "AVPN",
                        "adminemail" : "pl9129#att.com",
                        "notifytype" : {
                            "status" : true,
                            "alert" : true
                        },
                        "keytype" : "MCN",
                        "KeyValue" : "SK1383"
                    }
                },
                {
                    "order" : {
                        "date" : "2015-03-15",
                        "adminemail" : "pl9129#att.com",
                        "notifytype" : {
                            "alert" : true
                        },
                        "id" : 123456
                    }
                }
            ]
        }
    ],
    "ok" : 1
}
But my final result has lost the following original content:
"usercbid" : 1995492.0000000000000000,
"defaultnotifytype" : {
"status" : true,
"alert" : true,
"action" : true
},
"calendar" : {
"alert" : 2468.0000000000000000,
"action" : 13579.0000000000000000,
"status" : 123456.0000000000000000
},
How should I include the above original content along with the filtered records?
Thanks,
$first is the operator that helps you get the required output.
When you do a $group, the result of the $group pipeline stage contains only those fields which are specified inside the $group stage.
So, from your query we can see that you are grouping on "_id" and selecting only the "assignment" field, so the output of this $group stage will contain only those 2 fields ("_id" and "assignment").
To make sure the other left-out fields (usercbid, defaultnotifytype, calendar) are part of the $group output, we need to mention them explicitly in the $group stage using $first, as below:
{ $group: { _id: "$_id", assignment: { $push: "$assignment" },
            usercbid: { $first: "$usercbid" },  // the "$" prefix makes this a field path, not the literal string "usercbid"
            defaultnotifytype: { $first: "$defaultnotifytype" },
            calendar: { $first: "$calendar" }
} }
$first returns the value that results from applying an expression to the first document in a group of documents that share the same group-by key.
Please check the query below; it will help you fetch the required output:
var result = db.subscription.aggregate(
    [
        { $unwind: "$assignment" },
        { $match: { $or: [
            { "assignment.order.id": 123456 },
            {
                "assignment.orgid.keytype": { $in: ["MCN"] },
                "assignment.orgid.KeyValue": { $in: ["<all>", "SK1383"] },
                "assignment.orgid.service": { $in: ["<all>", "AVPN"] }
            }
        ] } },
        { $group: { _id: "$_id", assignment: { $push: "$assignment" },
                    usercbid: { $first: "$usercbid" },
                    defaultnotifytype: { $first: "$defaultnotifytype" },
                    calendar: { $first: "$calendar" }
        } }
    ]
).pretty();

Mongodb distinct aggregation of 3 billion documents

I have a huge collection with 3 billion documents. Each document looks like the following:
"_id" : ObjectId("54c1a013715faf2cc0047c77"),
"service_type" : "JE",
"receiver_id" : NumberLong("865438083645"),
"time" : ISODate("2012-12-05T23:07:36Z"),
"duration" : 24,
"service_description" : "NQ",
"receiver_cell_id" : null,
"location_id" : "658_55525",
"caller_id" : NumberLong("475035504705")
I would like to get the list of distinct users (each should appear at least once as a caller, i.e., in 'caller_id'), their counts (how many times each user appears in the collection as either caller or receiver), and the count of locations when they are callers (i.e., the count for each location_id per user).
I want to end up with the following:
"number_of_records" : 20,
"locations" : [{location_id: 658_55525, count:5}, {location_id: 840_5425, count:15}],
"user" : NumberLong("475035504705")
I tried the solutions described here and here, but they are not efficient enough (extremely slow). What would be an efficient way to achieve this?
Use aggregation for your result:
db.<collection>.aggregate([
    { $group : { _id : { user: "$caller_id", location: "$location_id" }, count : { $sum : 1 } } },
    { $project : { _id : "$_id.user", location : "$_id.location", count : "$count" } },
    { $group : { _id : "$_id", locations : { $push : { location_id : "$location", count : "$count" } }, number_of_records : { $sum : "$count" } } },
    { $project : { _id : 0, user : "$_id", locations : "$locations", number_of_records : "$number_of_records" } },
    { $out : "outputCollection" }
])
The output will be:
{
    "0" : {
        "locations" : [
            {
                "location_id" : "840_5425",
                "count" : 8
            },
            {
                "location_id" : "658_55525",
                "count" : 5
            }
        ],
        "number_of_records" : 13,
        "user" : NumberLong(475035504705)
    }
}
Update using allowDiskUse:
var pipe = [
    { $group : { _id : { user: "$caller_id", location: "$location_id" }, count : { $sum : 1 } } },
    { $project : { _id : "$_id.user", location : "$_id.location", count : "$count" } },
    { $group : { _id : "$_id", locations : { $push : { location_id : "$location", count : "$count" } }, number_of_records : { $sum : "$count" } } },
    { $project : { _id : 0, user : "$_id", locations : "$locations", number_of_records : "$number_of_records" } },
    { $out : "outputCollection" }
];
db.runCommand({
    aggregate: "collection",
    pipeline: pipe,
    allowDiskUse: true
})
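As a side note (this assumes a shell and server from MongoDB 2.6 or later, which is not stated above), the same option can be passed directly to the aggregate() helper instead of going through runCommand:
// Equivalent invocation through the collection helper.
db.getCollection('collection').aggregate(pipe, { allowDiskUse: true })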
A map-reduce solution would be more suitable here than an aggregation pipeline, simply because it avoids two unwinds. If you could come up with an aggregation solution with a single unwind, that would be the one to use. But the map-reduce solution below is one way to do it, though you would need to measure its running time against large data and see if it works for you.
The map function:
var map = function(){
    emit(this.caller_id,
         { locs: [ { "location_id": this.location_id, "count": 1 } ] });
}
The reduce function:
var reduce = function(key, values){
    var result = { locs: [] };
    var locations = {};
    values.forEach(function(value){
        value.locs.forEach(function(loc){
            if(!locations[loc.location_id]){
                locations[loc.location_id] = loc.count;
            }
            else{
                // Accumulate the incoming count rather than incrementing by 1:
                // "values" can contain partially reduced results (re-reduce),
                // where loc.count may already be greater than 1.
                locations[loc.location_id] += loc.count;
            }
        })
    })
    Object.keys(locations).forEach(function(k){
        result.locs.push({ "location_id": k, "count": locations[k] });
    })
    return result;
}
The finalize function:
var finalize = function(key, value){
    var total = 0;
    value.locs.forEach(function(loc){
        total += loc.count;
    })
    return { "total": total, "locs": value.locs };
}
Invoking map-reduce:
db.collection.mapReduce(map,reduce,{"out":"t1","finalize":finalize});
Aggregate the result once the map-reduce produces its output:
db.t1.aggregate([
    { $project: { "_id": 0,
                  "number_of_records": "$value.total",
                  "locations": "$value.locs",
                  "user": "$_id" } }
])
Sample output:
{
    "number_of_records" : 3,
    "locations" : [
        {
            "location_id" : "658_55525",
            "count" : 1
        },
        {
            "location_id" : "658_55525213",
            "count" : 2
        }
    ],
    "user" : 2
}
{
    "number_of_records" : 1,
    "locations" : [
        {
            "location_id" : "658_55525",
            "count" : 1
        }
    ],
    "user" : NumberLong("475035504705")
}
The map-reduce JavaScript code should be self-explanatory.