How can i count total documents and also grouped counts simultanously in mongodb aggregation? - mongodb

I have a dataset in mongodb collection named visitorsSession like
{ip : 192.2.1.1,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.3.1.8,country : 'UK', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.5.1.4,country : 'UK', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.8.1.7,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.1.1.3,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'}
I am using this mongodb aggregation
[{$match: {
nsp : "/hrm.sbtjapan.com",
creationDate : {
$gte: "2019-12-15T00:00:00.359Z",
$lte: "2019-12-20T23:00:00.359Z"
},
type : "Visitors"
}}, {$group: {
_id : "$country",
totalSessions : {
$sum: 1
}
}}, {$project: {
_id : 0,
country : "$_id",
totalSessions : 1
}}, {$sort: {
country: -1
}}]
using above aggregation i am getting results like this
[{country : 'US',totalSessions : 3},{country : 'UK',totalSessions : 2}]
But i also total visitors also along with result like totalVisitors : 5
How can i do this in mongodb aggregation ?

You can use $facet aggregation stage to calculate total visitors as well as visitors by country in a single pass:
db.visitorsSession.aggregate( [
{
$match: {
nsp : "/hrm.sbtjapan.com",
creationDate : {
$gte: "2019-12-15T00:00:00.359Z",
$lte: "2019-12-20T23:00:00.359Z"
},
type : "Visitors"
}
},
{
$facet: {
totalVisitors: [
{
$count: "count"
}
],
countrySessions: [
{
$group: {
_id : "$country",
sessions : { $sum: 1 }
}
},
{
$project: {
country: "$_id",
_id: 0,
sessions: 1
}
}
],
}
},
{
$addFields: {
totalVisitors: { $arrayElemAt: [ "$totalVisitors.count" , 0 ] },
}
}
] )
The output:
{
"totalVisitors" : 5,
"countrySessions" : [
{
"sessions" : 2,
"country" : "UK"
},
{
"sessions" : 3,
"country" : "US"
}
]
}

You could be better off with two queries to do this.
To save the two db round trips following aggregation can be used which IMO is kinda verbose (and might be little expensive if documents are very large) to just count the documents.
Idea: Is to have a $group at the top to count documents and preserve the original documents using $push and $$ROOT. And then before other matches/filter ops $unwind the created array of original docs.
db.collection.aggregate([
{
$group: {
_id: null,
docsCount: {
$sum: 1
},
originals: {
$push: "$$ROOT"
}
}
},
{
$unwind: "$originals"
},
{ $match: "..." }, //and other stages on `originals` which contains the source documents
{
$group: {
_id: "$originals.country",
totalSessions: {
$sum: 1
},
totalVisitors: {
$first: "$docsCount"
}
}
}
]);
Sample O/P: Playground Link
[
{
"_id": "UK",
"totalSessions": 2,
"totalVisitors": 5
},
{
"_id": "US",
"totalSessions": 3,
"totalVisitors": 5
}
]

Related

MongoDB sum of fields inside objects inside an array that is inside of an object greater than x

//8. isbn numbers of books that sold at least X copies (you decide the value for X).
Book example
{
isbn: "0001",
title: "Book1",
pages: NumberInt("150"),
price: NumberDecimal("321.2"),
copies: NumberInt("3"),
language: "english",
author: ["Author1"],
category: ["Space Opera"],
genre: ["Genre-1", "Genre-2"],
character: ["Character-1", "Character-2"],
},
Order example
{
orderNo: "3",
customerNo: "0003",
date: {
day: NumberInt("25"),
month: NumberInt("02"),
year: NumberInt("2021"),
},
orderLine: [
{
isbn: "0006",
price: NumberDecimal("341.0"),
amount: NumberInt("2"),
},
{
isbn: "0007",
price: NumberDecimal("170.5"),
amount: NumberInt("1"),
},
],
},
My try
I believe I have a mistake inside the pipeline at the group stage. For now I need at least to have isbn along with the copies sold in one object.
db.books.aggregate([ // editing this
{ $match : {} },
{
$lookup :
{
from : "orders",
pipeline : [
{
$group :
{
_id: null,
amount_total : { $sum : "$orderLine.amount" }
}
},
{ $project : { _id : 0, amount_total : 1} }
],
as : "amount"
}
},
{ $project : { _id : 0, isbn : 1, amount : 1} }
])
No idea why all are 0's, I was expecting at least some different numbers.
{
"isbn": "0001",
"amount": [
{
"amount_total": 0
}
]
},
{
"isbn": "0002",
"amount": [
{
"amount_total": 0
}
]
},
{
"isbn": "0003",
"amount": [
{
"amount_total": 0
}
]
},// and so on
Apparently, this does what I wanted.
db.books.aggregate([
{
$lookup: {
from: "orders",
let: { isbn: "$isbn" }, // Pass this variable to pipeline for Joining condition.
pipeline: [
{ $unwind: "$orderLine" },
{
$match: {
// Join condition.
$expr: { $eq: ["$orderLine.isbn", "$$isbn"] }
}
},
{
$project: { _id: 0 , orderNo : 1, "orderLine.amount": 1}
}
],
as: "amount"
}
}, { $project : { _id : 0, isbn : 1, amount_total : { $sum : "$amount.orderLine.amount" } } }
])
In your query $lookup is performing a join operation without any condition instead try this query:
db.books.aggregate([
{
$lookup: {
from: "orders",
let: { isbn: "$isbn" },
pipeline: [
{ $unwind: "$orderLine" },
{
$match: {
$expr: { $eq: ["$orderLine.isbn", "$$isbn"] }
}
}
],
as: "amount"
}
},
{
$project: {
_id: 0,
isbn: 1,
amount_total: { $sum: "$amount.orderLine.amount" }
}
}
]);
Test data:
books collection:
/* 1 createdAt:3/12/2021, 10:41:13 AM*/
{
"_id" : ObjectId("604af7f14b5860176c2254b7"),
"isbn" : "0001",
"title" : "Book1"
},
/* 2 createdAt:3/12/2021, 10:41:13 AM*/
{
"_id" : ObjectId("604af7f14b5860176c2254b8"),
"isbn" : "0002",
"title" : "Book2"
}
orders collection:
/* 1 createdAt:3/12/2021, 11:10:51 AM*/
{
"_id" : ObjectId("604afee34b5860176c2254ce"),
"orderNo" : "1",
"customerNo" : "0001",
"orderLine" : [
{
"isbn" : "0001",
"price" : 341,
"amount" : 2
},
{
"isbn" : "0002",
"price" : 170.5,
"amount" : 1
},
{
"isbn" : "0003",
"price" : 190.5,
"amount" : 3
}
]
},
/* 2 createdAt:3/12/2021, 11:10:51 AM*/
{
"_id" : ObjectId("604afee34b5860176c2254cf"),
"orderNo" : "3",
"customerNo" : "0003",
"orderLine" : [
{
"isbn" : "0001",
"price" : 341,
"amount" : 2
},
{
"isbn" : "0002",
"price" : 170.5,
"amount" : 1
},
{
"isbn" : "0003",
"price" : 190.5,
"amount" : 3
}
]
}
Output:
/* 1 */
{
"isbn" : "0001",
"amount_total" : 4
},
/* 2 */
{
"isbn" : "0002",
"amount_total" : 2
}
The $sum inside $group stage will sum root and grouped fields but here orderLine field is an array, you need to sum that array of numbers before applying $sum, it means nested $sum operation,
{
$group: {
_id: null,
amount_total: {
$sum: {
$sum: "$orderLine.amount"
}
}
}
}
Playground
Try the final solution,
$match isbn array in orderLine.isbn using $in condition
$filter to iterate look of orderLine array, and match isbn, it will return filtered documents
$let declare a orders variable to hold above filtered documents of orderLine, sum the amount from filtered array using $sum
$project to show required fields, and get total sum of amount_total array
db.books.aggregate([
{
$lookup: {
from: "orders",
let: { isbn: "$isbn" },
pipeline: [
{ $match: { $expr: { $in: ["$$isbn", "$orderLine.isbn"] } } },
{
$project: {
_id: 0,
amount_total: {
$let: {
vars: {
orders: {
$filter: {
input: "$orderLine",
cond: { $eq: ["$$this.isbn", "$$isbn"] }
}
}
},
in: { $sum: "$$orders.amount" }
}
}
}
}
],
as: "amount"
}
},
{
$project: {
_id: 0,
isbn: 1,
amount_total: { $sum: "$amount.amount_total" }
}
}
])
Playground

mongodb aggregate multiple arrays

I am using MongoDB version v3.4. I have a documents collection and sample datas are like this:
{
"mlVoters" : [
{"email" : "a#b.com", "isApproved" : false}
],
"egVoters" : [
{"email" : "a#b.com", "isApproved" : false},
{"email" : "c#d.com", "isApproved" : true}
]
},{
"mlVoters" : [
{"email" : "a#b.com", "isApproved" : false},
{"email" : "e#f.com", "isApproved" : true}
],
"egVoters" : [
{"email" : "e#f.com", "isApproved" : true}
]
}
Now if i want the count of distinct email addresses for mlVoters:
db.documents.aggregate([
{$project: { mlVoters: 1 } },
{$unwind: "$mlVoters" },
{$group: { _id: "$mlVoters.email", mlCount: { $sum: 1 } }},
{$project: { _id: 0, email: "$_id", mlCount: 1 } },
{$sort: { mlCount: -1 } }
])
Result of the query is:
{"mlCount" : 2.0,"email" : "a#b.com"}
{"mlCount" : 1.0,"email" : "e#f.com"}
And if i want the count of distinct email addresses for egVoters i do the same for egVoters field. And the result of that query would be:
{"egCount" : 1.0,"email" : "a#b.com"}
{"egCount" : 1.0,"email" : "c#d.com"}
{"egCount" : 1.0,"email" : "e#f.com"}
So, I want to combine these two aggregation and get the result as following (sorted by totalCount):
{"email" : "a#b.com", "mlCount" : 2, "egCount" : 1, "totalCount":3}
{"email" : "e#f.com", "mlCount" : 1, "egCount" : 1, "totalCount":2}
{"email" : "c#d.com", "mlCount" : 0, "egCount" : 1, "totalCount":1}
How can I do this? How should the query be like? Thanks.
First you add a field voteType in each vote. This field indicates its type. Having this field, you don't need to keep the votes in two separate arrays mlVoters and egVoters; you can instead concatenate those arrays into a single array per document, and unwind afterwards.
At this point you have one document per vote, with a field that indicates which type it is. Now you simply need to group by email and, in the group stage, perform two conditional sums to count how many votes of each type there are for every email.
Finally you add a field totalCount as the sum of the other two counts.
db.documents.aggregate([
{
$addFields: {
mlVoters: {
$ifNull: [ "$mlVoters", []]
},
egVoters: {
$ifNull: [ "$egVoters", []]
}
}
},
{
$addFields: {
"mlVoters.voteType": "ml",
"egVoters.voteType": "eg"
}
},
{
$project: {
voters: { $concatArrays: ["$mlVoters", "$egVoters"] }
}
},
{
$unwind: "$voters"
},
{
$project: {
email: "$voters.email",
voteType: "$voters.voteType"
}
},
{
$group: {
_id: "$email",
mlCount: {
$sum: {
$cond: {
"if": { $eq: ["$voteType", "ml"] },
"then": 1,
"else": 0
}
}
},
egCount: {
$sum: {
$cond: {
"if": { $eq: ["$voteType", "eg"] },
"then": 1,
"else": 0
}
}
}
}
},
{
$addFields: {
totalCount: {
$sum: ["$mlCount", "$egCount"]
}
}
}
])

using mongo aggregation how to replace the fields names [duplicate]

I have large collection of documents which represent some kind of events. Collection contains events for different userId.
{
"_id" : ObjectId("57fd7d00e4b011cafdb90d22"),
"userId" : "123123123",
"userType" : "mobile",
"event_type" : "clicked_ok",
"country" : "US",
"timestamp" : ISODate("2016-10-12T00:00:00.308Z")
}
{
"_id" : ObjectId("57fd7d00e4b011cafdb90d22"),
"userId" : "123123123",
"userType" : "mobile",
"event_type" : "clicked_cancel",
"country" : "US",
"timestamp" : ISODate("2016-10-12T00:00:00.308Z")
}
At midnight I need to run aggregation for all documents for the previous day. Documents need to aggregated in the way so I could get number of different events for particular userId.
{
"userId" : "123123123",
"userType" : "mobile",
"country" : "US",
"clicked_ok" : 23,
"send_message" : 14,
"clicked_cancel" : 100,
"date" : "2016-11-24",
}
During aggregation I need to perform two things:
calculate number of events for particular userId
add "date" text fields with date
Any help is greatly appreciated! :)
you can do this with aggregation like this :
db.user.aggregate([
{
$match:{
$and:[
{
timestamp:{
$gte: ISODate("2016-10-12T00:00:00.000Z")
}
},
{
timestamp:{
$lt: ISODate("2016-10-13T00:00:00.000Z")
}
}
]
}
},
{
$group:{
_id:"$userId",
timestamp:{
$first:"$timestamp"
},
send_message:{
$sum:{
$cond:[
{
$eq:[
"$event_type",
"send_message"
]
},
1,
0
]
}
},
clicked_cancel:{
$sum:{
$cond:[
{
$eq:[
"$event_type",
"clicked_cancel"
]
},
1,
0
]
}
},
clicked_ok:{
$sum:{
$cond:[
{
$eq:[
"$event_type",
"clicked_ok"
]
},
1,
0
]
}
}
}
},
{
$project:{
date:{
$dateToString:{
format:"%Y-%m-%d",
date:"$timestamp"
}
},
userId:1,
clicked_cancel:1,
send_message:1,
clicked_ok:1
}
}
])
explanation:
keep only document for a specific day in $match stage
group doc by userId and count occurrences for each event in $group stage
finally format the timestamp field into yyyy_MM-dd format in $project stage
for the data you provided, this will output
{
"_id":"123123123",
"send_message":0,
"clicked_cancel":1,
"clicked_ok":1,
"date":"2016-10-12"
}
Check the following query
db.sandbox.aggregate([{
$group: {
_id: {
userId: "$userId",
date: {
$dateToString: { format: "%Y-%m-%d", date: "$timestamp" }}
},
send_message: {
$sum: {
$cond: { if: { $eq: ["$event_type", "send_message"] }, then: 1, else: 0 } }
},
clicked_cancel: {
$sum: {
$cond: { if: { $eq: ["$event_type", "clicked_cancel"] }, then: 1, else: 0 }
}
},
clicked_ok: {
$sum: {
$cond: { if: { $eq: ["$event_type", "clicked_ok"] }, then: 1, else: 0 }
}
}
}
}])

Group by inner array element after $unwind

I have the following time series data stored in mongodb
{
"_id" : ObjectId("59a46062e1aeb958a712490e"),
"channelName" : "ABC",
"rtData" : [
{
"ts" : ISODate("2017-08-28T18:26:42.837Z"),
"data" : [ 676.297664, 676.297664 ]
},
{
"ts" : ISODate("2017-08-28T18:27:42.837Z"),
"data" : [ 724.297664, 676.297664 ]
},
{
"ts" : ISODate("2017-08-28T18:29:42.837Z"),
"data" : [ 878.297, 676.297 ]
}
]
}
I want to group the data based on the ts field on hour and get the first element of rtData for that hour.
Here is what I have tried
db.channels.aggregate( [ {$match: {"channelName": "ABC"} }, { $unwind : "$rtData" }, { $group : {_id: { $hour: "$rtData.ts" }, ucast: { $sum: $rtData.data[0]} }
But the above code gives me the following output
{ "_id" : 28, "ucast" : 0 }
What I actually want is
{ "_id" : 28, "ucast" : 676.297664 }
You don't notate getting a first element of an array in an aggregation pipeline like that. You want $arrayElemAt which returns the array value by index:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $group : {
_id: { $hour: "$rtData.ts" },
ucast: { $sum: { $arrayElemAt: [ "$rtData.data", 0 ] } }
}}
])
If your MongoDB does not support $arrayElemAt ( prior to 3.2 ), then you can instead use $first in an additional $group on just the document key, done before you "accumulate" for the desired grouping key:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $group: {
_id: { _id: "$_id", ts: "$rtData.ts" },
data: { $first: "$rtData.data" }
}},
{ $group : {
_id: { $hour: "$_id.ts" },
ucast: { $sum: "$data" }
}}
])
In modern versions you can "double barrel" the $sum to both add up array elements as well as act as an accumulator if you wanted to "sum" all elements of the array:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $group : {
_id: { $hour: "$rtData.ts" },
ucast: { $sum: { $sum: "$rtData.data" } }
}}
])
And with older versions ( prior to 3.2 ) you would "double" $unwind for each array path instead:
db.channels.aggregate( [
{ $match: {"channelName": "ABC"} },
{ $unwind : "$rtData" },
{ $unwind : "$rtData.data" },
{ $group : {
_id: { $hour: "$rtData.ts" },
ucast: { $sum: "$rtData.data" }
}}
])
you need to use $first operator for that instead of $sum
db.channels.aggregate( [ {$match: {"channelName": "ABC"} }, { $unwind : "$rtData" }, { $group : {_id: { $hour: "$rtData.ts" }, ucast: { $first: $rtData.data} }
which will give you output like { "_id" : 28, "ucast" : [ 676.297664, 676.297664 ] }
if you want output like { "_id" : 28, "ucast" : 676.297664 } in next $project or $addFields stage use $arrayElemAt

Count Distinct Within Date Range

I have a MongoDB database with a collection of site-events. The documents look like:
{
"_id" : ObjectId("5785bb02eac0636f1dc07023"),
"referrer" : "https://example.com",
"_t" : ISODate("2016-07-12T18:10:17Z"),
"_p" : "ucd7+hvjpacuhtgbq1caps4rqepvwzuoxm=",
"_n" : "visited site",
"km screen resolution" : "1680x1050"
},
{
"_id" : ObjectId("5785bb02eac0636f1dc07047"),
"url" : "https://www.example.com/",
"referrer" : "Direct",
"_t" : ISODate("2016-07-12T18:10:49Z"),
"_p" : "txt6t1siuingcgo483aabmses2et5uqk0=",
"_n" : "visited site",
"km screen resolution" : "1366x768"
},
{
"_id" : ObjectId("5785bb02eac0636f1dc07053"),
"url" : "https://www.example.com/",
"referrer" : "Direct",
"_t" : ISODate("2016-07-12T18:10:56Z"),
"_p" : "gcama1az5jxa74wa6o9r4v/3k+zulciqiu=",
"_n" : "visited site",
"km screen resolution" : "1366x768"
}
I want to get a count of the unique persons within a date range. In SQL it would be
SELECT COUNT(DISTINCT(`_p`)) FROM collection WHERE `_t` > '<SOME DATE>' AND `_t` <= '<SOME OTHER DATE>'
So far, I've grouped the dates along using the aggregation pipeline:
db.siteEvents.aggregate(
[
{
$match : {"_n": "visited site"}
},
{
$group : {
_id: {
year : { $year : "$_t" },
month : { $month : "$_t" },
day : { $dayOfMonth : "$_t" },
_p : "$_p"
},
count: { $sum: 1 }
}
},
{
$group : {
_id : {
year : { $year : "$_id.year" },
month : { $month : "$_id.month" },
day : { $dayOfMonth : "$_id.day" }
},
count: { $sum: 1 }
}
}
]
);
But this gives errors - I believe because of the second grouping _id trying to grab an intermediate field. I'm currently just using the Mongo shell, but if I had to choose an alternative driver it would be PyMongo. I'd like to get this to work in the shell (so I can understand the process).
With an aggregation pipeline it could look like so
db.getCollection('siteEvents').aggregate([
{
$match: {
_t: {
$gt: ISODate("2016-07-11T08:10:17.000Z"),
$lt: ISODate("2016-07-12T14:10:17.000Z")
}
}
},
{
$group: {
_id: "$_p"
}
},
{
$group: {
_id: null,
distinctCount: { $sum: 1 }
}
}
])
If you know the resulting distinct values won't be large then you could use a simply query like so
db.getCollection('siteEvents').distinct(
'_p',
{
_t: {
$gt: ISODate("2016-07-11T08:10:17.000Z"),
$lt: ISODate("2016-07-12T14:10:17.000Z")
}
}).length
You can use the $addToSet operator in the $group stage to return an array of distinct "_p" value then $project the resulted document to return the size of the array which is nothing other than the distinct count.
db.siteEvents.aggregate(
[
{"$match": {"_n": "visited site", "_t": {"$gt": <SOME DATE>, "$lt": <SOME OTHER DATE>}}},
{"$group": {
"_id": None,
"_p_values": {"$addToSet": "$_p"}
}},
{"$project": {"_id": 0, "count": {"$size": "$_p_values"}}}
]
)
For small size collection you can simply use distinct but you need to pass in the query argument.
len(db.siteEvents.distinct("_p", {"_n": "visited site", "_t": {"$gt": <SOME DATE>, "$lt": <SOME OTHER DATE>}}))