Preserving fields collapsed by $group - MongoDB

I want to summarize a set of documents by counting them per value of a field named code. How can I summarize my data while preserving details from the original documents?
The pipeline input contains the documents below.
{
"_id" : ObjectId("5ff38e0eb09dec2cbce14760"),
"code" : "U",
"date" : ISODate("2021-04-09T00:00:00.000+0000"),
"full_day" : false,
"remote" : false,
"student_id" : 9441
}
{
"_id" : ObjectId("5ff38e0eb09dec2cbce14807"),
"code" : "E",
"date" : ISODate("2020-11-02T00:00:00.000+0000"),
"full_day" : false,
"remote" : false,
"student_id" : 9441
}
{
"_id" : ObjectId("5ff39854b09dec2cbce1494c"),
"code" : "E",
"date" : ISODate("2020-11-03T08:00:00.000+0000"),
"full_day" : true,
"remote" : false,
"student_id" : 9441
}
The desired output groups by code, promotes student_id to the root level, and nests the other details in a details array:
{
"code" : "U",
"student_id": 9441,
"count" : 1.0,
"details" : [
{
"date" : ISODate("2021-04-09T00:00:00.000+0000"),
"full_day" : false,
"remote" : false,
}
]
}
{
"code" : "E",
"student_id": 9441,
"count" : 2.0,
"details" : [
{
"date" : ISODate("2020-11-02T00:00:00.000+0000"),
"full_day" : false,
"remote" : false,
},
{
"date" : ISODate("2020-11-03T08:00:00.000+0000"),
"full_day" : true,
"remote" : false,
}
]
}
Combining $group and $push, I've only been able to produce:
{
"_id" : "U",
"count" : 1.0,
"details" : [
{
"date" : ISODate("2021-04-09T00:00:00.000+0000"),
"full_day" : false,
"remote" : false,
"student_id" : 9441
}
]
}
{
"_id" : "E",
"count" : 2.0,
"details" : [
{
"date" : ISODate("2020-11-02T00:00:00.000+0000"),
"full_day" : false,
"remote" : false,
"student_id" : 9441
},
{
"date" : ISODate("2020-11-03T08:00:00.000+0000"),
"full_day" : true,
"remote" : false,
"student_id" : 9441.0
}
]
}
The results above were achieved with this pipeline:
[
{
"$match" : {
"student_id" : 9441.0
}
},
{
"$group" : {
"_id" : "$code",
"count" : {
"$sum" : 1.0
},
"details" : {
"$push" : {
"date" : "$date",
"full_day" : "$full_day",
"remote" : "$remote",
"student_id" : "$student_id"
}
}
}
},
{
"$addFields" : {
"student_id" : "$student_id"
}
}
]

If you expect all of the input documents to have the same value for a field, and want that field included in the $group output, use the $first accumulator:
{
"$group" : {
"_id" : "$code",
"student_id" : {$first: "$student_id"},
"count" : {
"$sum" : 1.0
},
"details" : {
"$push" : {
"date" : "$date",
"full_day" : "$full_day",
"remote" : "$remote"
}
}
}
}
If you need to rename _id back to code, use a $project stage after the group.
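For example, a final stage along these lines (a sketch using the field names from the desired output above) would produce that shape:
{
    "$project" : {
        "_id" : 0,
        "code" : "$_id",
        "student_id" : 1,
        "count" : 1,
        "details" : 1
    }
}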

Related

Tune Up Mongo Query

I am new to Mongo and am trying to get a distinct count of users. The fields Id and Status are not individually indexed, but there is a composite index on both fields. My current query is something like this, where the match conditions change depending on the requirements.
DBQuery.shellBatchSize = 1000000;
db.getCollection('username').aggregate([
{$match:
{ Status: "A"
} },
{"$group" : {_id:"$Id", count:{$sum:1}}}
]);
Is there any way to optimize this query further, or to run it in parallel against the collection, so that we can get results faster?
Regards
You can tune your aggregation pipelines by passing the option { explain: true } to the aggregate method.
db.getCollection('username').aggregate([
{$match: { Status: "A" } },
{"$group" : {_id:"$Id", count:{$sum:1}}}],
{ explain: true });
This will then output the following to work with:
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "EOF"
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
To speed up the query we need an index to help the $match part of the pipeline, so let's create an index on Status:
> db.usernames.createIndex({Status:1})
{
"createdCollectionAutomatically" : true,
"numIndexesBefore" : 1,
"numIndexesAfter" : 2,
"ok" : 1
}
If we now run the explain again, we'll get the following results:
{
"stages" : [
{
"$cursor" : {
"query" : {
"Status" : "A"
},
"fields" : {
"Id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "test.usernames",
"indexFilterSet" : false,
"parsedQuery" : {
"Status" : {
"$eq" : "A"
}
},
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"Status" : 1
},
"indexName" : "Status_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"Status" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"Status" : [
"[\"A\", \"A\"]"
]
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$group" : {
"_id" : "$Id",
"count" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
We can now see straight away that this is using an index.
https://docs.mongodb.com/manual/reference/explain-results/
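Since the question mentions an existing composite index on Id and Status, its key order is also worth checking. As a sketch (the { Status: 1, Id: 1 } order here is an assumption, not something taken from the explain output above): if Status is the leading field of a compound index, the $match can use that index directly, and because the pipeline only needs Status and Id, the plan may even be covered by the index without fetching documents.
// Hypothetical compound index with Status as the leading key
db.getCollection('username').createIndex({ Status: 1, Id: 1 })

// Re-run the explain to check whether the winning plan uses this index
// (ideally an IXSCAN without a FETCH stage)
db.getCollection('username').aggregate([
    {$match: { Status: "A" } },
    {"$group" : {_id:"$Id", count:{$sum:1}}}],
    { explain: true });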

How to delete duplicates from 2nd level of array in Mongo DB

I have duplicates in the 2nd level of an array.
In the 2nd-level array elements, all the fields contain the same data except _id. In some cases _id is also the same.
Please suggest how to delete duplicates from the 2nd-level array while keeping one element.
In the example below, AssessmentName can be treated as the unique field in the Bhra array.
Mongo Version : 3.2.8
Example:
{
"_id" : ObjectId("592415c434810eeb63afe029"),
"Encounter" : [
{
"_id" : ObjectId("5846c6361489b54e402d76f6"),
"Bhra" : [
{
"_id" : "15445853419048538e0ba2cd",
"AssessmentId" : 1,
"AssessmentName" : "Major Depressive Episode",
"AssessmentOrder" : 1,
"IsContinue" : true,
"IsNotAssessed" : false,
"TotalScore" : 9,
"DepressionSeverity" : "Mild depression",
"AssessmentResult" : "Negative",
"Notes" : ""
},
{
"_id" : "15445853419048538e0ba2aa",
"AssessmentId" : 1,
"AssessmentName" : "Major Depressive Episode",
"AssessmentOrder" : 1,
"IsContinue" : true,
"IsNotAssessed" : false,
"TotalScore" : 9,
"DepressionSeverity" : "Mild depression",
"AssessmentResult" : "Negative",
"Notes" : ""
},
{
"_id" : "15445853419048538e0ba2aa",
"AssessmentId" : 1,
"AssessmentName" : "Major Depressive Episode",
"AssessmentOrder" : 1,
"IsContinue" : true,
"IsNotAssessed" : false,
"TotalScore" : 9,
"DepressionSeverity" : "Mild depression",
"AssessmentResult" : "Negative",
"Notes" : ""
}
]
}
]
}
Output should be as follows.
{
"_id" : ObjectId("592415c434810eeb63afe029"),
"Encounter" : [
{
"_id" : ObjectId("5846c6361489b54e402d76f6"),
"Bhra" : [
{
"_id" : "15445853419048538e0ba2cd",// any _id is fine
"AssessmentId" : 1,
"AssessmentName" : "Major Depressive Episode",
"AssessmentOrder" : 1,
"IsContinue" : true,
"IsNotAssessed" : false,
"TotalScore" : 9,
"DepressionSeverity" : "Mild depression",
"AssessmentResult" : "Negative",
"Notes" : ""
}
]
}
]
}
Thanks,
Rao
You can use aggregation. First, $unwind twice in order to reach the 2nd-level array, then use $group to keep only the distinct items. Finally, $project into the required JSON format.
db.your_collection.aggregate([
{"$unwind":"$Encounter"},
{"$unwind":"$Encounter.Bhra"},
{"$group":{
_id: {
_id:"$_id",
"Encounter_id":"$Encounter._id",
"Encounter_Bhra_AssessmentId":"$Encounter.Bhra.AssessmentId",
"Encounter_Bhra_AssessmentName":"$Encounter.Bhra.AssessmentName",
"Encounter_Bhra_AssessmentOrder":"$Encounter.Bhra.AssessmentOrder",
"Encounter_Bhra_IsContinue":"$Encounter.Bhra.IsContinue",
"Encounter_Bhra_IsNotAssessed":"$Encounter.Bhra.IsNotAssessed",
"Encounter_Bhra_TotalScore":"$Encounter.Bhra.TotalScore",
"Encounter_Bhra_DepressionSeverity":"$Encounter.Bhra.DepressionSeverity",
"Encounter_Bhra_AssessmentResult":"$Encounter.Bhra.AssessmentResult",
"Encounter_Bhra_Notes":"$Encounter.Bhra.Notes"
},
"Encounter_Bhra_id":{"$min":"$Encounter.Bhra._id"}
}
},
{"$project":{
_id:"$_id._id",
Encounter : {
_id:"$_id.Encounter_id",
Bhra : {
_id : "$Encounter_Bhra_id",
AssessmentId : "$_id.Encounter_Bhra_AssessmentId",
AssessmentName : "$_id.Encounter_Bhra_AssessmentName",
AssessmentOrder : "$_id.Encounter_Bhra_AssessmentOrder",
IsContinue : "$_id.Encounter_Bhra_IsContinue",
IsNotAssessed : "$_id.Encounter_Bhra_IsNotAssessed",
TotalScore : "$_id.Encounter_Bhra_TotalScore",
DepressionSeverity : "$_id.Encounter_Bhra_DepressionSeverity",
AssessmentResult : "$_id.Encounter_Bhra_AssessmentResult",
Notes : "$_id.Encounter_Bhra_Notes"
}
}
}
},
{$out: "your_new_coll_name"}
])
Output:
{
"_id" : ObjectId("592415c434810eeb63afe029"),
"Encounter" : {
"_id" : ObjectId("5846c6361489b54e402d76f6"),
"Bhra" : {
"_id" : "15445853419048538e0ba2aa",
"AssessmentId" : 1,
"AssessmentName" : "Major Depressive Episode",
"AssessmentOrder" : 1,
"IsContinue" : true,
"IsNotAssessed" : false,
"TotalScore" : 9,
"DepressionSeverity" : "Mild depression",
"AssessmentResult" : "Negative",
"Notes" : ""
}
}
}
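Note that in this output Encounter and Bhra end up as single embedded documents rather than arrays. If you need the nested array structure from the question, one option (a sketch, not verified on 3.2.8) is to replace the $project stage with two more $group stages that push the de-duplicated items back into arrays:
{"$group":{
    _id: { _id: "$_id._id", Encounter_id: "$_id.Encounter_id" },
    Bhra: {"$push": {
        _id: "$Encounter_Bhra_id",
        AssessmentId: "$_id.Encounter_Bhra_AssessmentId",
        AssessmentName: "$_id.Encounter_Bhra_AssessmentName",
        AssessmentOrder: "$_id.Encounter_Bhra_AssessmentOrder",
        IsContinue: "$_id.Encounter_Bhra_IsContinue",
        IsNotAssessed: "$_id.Encounter_Bhra_IsNotAssessed",
        TotalScore: "$_id.Encounter_Bhra_TotalScore",
        DepressionSeverity: "$_id.Encounter_Bhra_DepressionSeverity",
        AssessmentResult: "$_id.Encounter_Bhra_AssessmentResult",
        Notes: "$_id.Encounter_Bhra_Notes"
    }}
}},
{"$group":{
    _id: "$_id._id",
    Encounter: {"$push": { _id: "$_id.Encounter_id", Bhra: "$Bhra" }}
}}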

MongoDB between date range query - nested property

How do I write a query that returns objects between two dates? The query should search a nested property. I use $gte and $lte, but it doesn't seem to work as I expected. I want to return the 'task' objects that have a history.startTime between the two dates.
db.tasks.find({'history.startTime' : { '$gte': ISODate("2017-02-04T00:00:00.000Z"), '$lt': ISODate("2017-02-05T23:00:00.000Z")} }).pretty()
{
"_id" : ObjectId("588f53c5d00baa2558fd56ae"),
"desc" : "test3",
"category" : "Category1",
"project" : "Project1",
"_creator" : "582afb3800c1bc1f203edf39",
"history" : [
{
"startTime" : ISODate("2017-02-06T11:49:42.570Z"),
"stopTime" : ISODate("2017-02-06T11:49:45.725Z"),
"_id" : ObjectId("589862d9449b4629f8dbaba7"),
"dt" : 3.155
},
{
"startTime" : ISODate("2017-02-06T08:53:53.086Z"),
"stopTime" : ISODate("2017-02-06T11:47:58.098Z"),
"_id" : ObjectId("5898626e449b4629f8dbaba6"),
"dt" : 10445.012
},
{
"startTime" : ISODate("2017-01-30T15:30:46.287Z"),
"stopTime" : ISODate("2017-01-30T15:32:52.979Z"),
"_id" : ObjectId("588f5c2cd00baa2558fd56b0"),
"dt" : 126.692
},
{
"startTime" : ISODate("2017-01-30T13:55:09.738Z"),
"stopTime" : ISODate("2017-01-30T14:55:13.974Z"),
"_id" : ObjectId("588f53d1d00baa2558fd56af"),
"dt" : 3604.236
}
],
"isCompleted" : false,
"isPerforming" : false,
"duration" : 14179.095000000001,
"updated" : ISODate("2017-02-06T11:49:45.725Z"),
"creationDate" : ISODate("2017-01-30T14:55:01.045Z"),
"__v" : 4
}
history is an array, so your query won't work as intended: each condition can be satisfied by a different array element. In your sample document, one history element satisfies the $gte bound and a different element satisfies the upper bound, so the whole document matches even though no single startTime falls inside the range. Use $elemMatch so that one element must satisfy both bounds:
db.tasks.find({
'history': {
$elemMatch: {
startTime: {
$gte: ISODate("2017-02-04T00:00:00.000Z"),
$lte: ISODate("2017-02-05T23:00:00.000Z")
}
}
}
});

Mongodb pull data from subarray

Hi I have below mongodb collection
{
"_id" : ObjectId("53ce993639203f573671d3f5"),
"user_id" : NumberLong(51),
"buses" : [
{
"slot_id" : NumberLong(50),
"status" : NumberLong(3),
"bus_id" : NumberLong(8)
},
{
"slot_id" : NumberLong(67),
"status" : NumberLong(3),
"bus_id" : NumberLong(12)
}
]
}
I want to pull the sub-array element where bus_id = 8.
The final result I want is like this:
{
"_id" : ObjectId("53ce993639203f573671d3f5"),
"user_id" : NumberLong(51),
"buses" : [
{
"slot_id" : NumberLong(67),
"status" : NumberLong(3),
"bus_id" : NumberLong(12)
}
]
}
When I tried the query below,
db.collectionname.update({},{$pull: {buses: {bus_id:8}}},{multi: true})
I got the following error in the console:
Cannot apply $pull/$pullAll modifier to non-array
Can anyone please suggest how to achieve this? I also need the PHP MongoDB query.
Thanks in Advance
Worked fine for me for your sample document:
> db.bus.findOne()
{
"_id" : ObjectId("53ce993639203f573671d3f5"),
"user_id" : NumberLong(51),
"buses" : [
{
"slot_id" : NumberLong(50),
"status" : NumberLong(3),
"bus_id" : NumberLong(8)
},
{
"slot_id" : NumberLong(67),
"status" : NumberLong(3),
"bus_id" : NumberLong(12)
}
]
}
> db.bus.update({}, { "$pull" : { "buses" : { "bus_id" : 8 } } }, { "multi" : true })
WriteResult({ "nMatched" : 1, "nUpserted" : 0, "nModified" : 1 })
> db.bus.findOne()
{
"_id" : ObjectId("53ce993639203f573671d3f5"),
"user_id" : NumberLong(51),
"buses" : [
{
"slot_id" : NumberLong(67),
"status" : NumberLong(3),
"bus_id" : NumberLong(12)
}
]
}
The cause of the problem is that in some documents buses is not an array. What does the query
> db.bus.find({ "buses.0" : { "$exists" : 0}, "buses" : { "$ne" : [] } })
return? This query finds documents where there is no 0th element of the array and the array is not empty, so it should return documents where buses is not an array.
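If it does return documents, one possible repair (a sketch that assumes buses holds a single embedded document that should have been wrapped in an array; inspect the matched documents before running anything like this) is:
db.bus.find({ "buses" : { "$exists" : true, "$ne" : [] }, "buses.0" : { "$exists" : false } }).forEach(function (doc) {
    // Hypothetical fix: wrap the non-array value in a one-element array
    db.bus.update({ "_id" : doc._id }, { "$set" : { "buses" : [ doc.buses ] } });
});
After that, the $pull with { multi: true } should apply cleanly to every document.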

Aggregation framework performance on a 10M collection

I have a collection of 10M documents that is a pre-aggregation of daily events.
A simple $group took more than 8 seconds; is this performance normal?
Some data from the profiler:
{
"op" : "command",
"ns" : "analytics.$cmd",
"command" : {
"aggregate" : "aggregation",
"pipeline" : [
{
"$group" : {
"_id" : "",
"hits" : {
"$sum" : "$hits"
}
}
}
]
},
"ntoreturn" : 1,
"keyUpdates" : 0,
"numYield" : 15,
"lockStats" : {
"timeLockedMicros" : {
"r" : NumberLong(17169805),
"w" : NumberLong(0)
},
"timeAcquiringMicros" : {
"r" : NumberLong(8582619),
"w" : NumberLong(294)
}
},
"responseLength" : 78,
"millis" : 8594,
"ts" : ISODate("2013-12-04T15:57:38.217Z"),
"client" : "127.0.0.1",
"allUsers" : [ ],
"user" : ""
}
Here is one single document
{
"_id" : ObjectId("529e21ee67e807418500daeb"),
"date" : ISODate("2012-09-19T00:00:00Z"),
"hits" : 1,
"infos" : {
"sourceValue" : NumberLong(1),
"eventType" : "createUser",
"sourceType" : "user",
"instance" : "xxx",
"targetType" : "user",
"targetValue" : NumberLong(15)
}
}