Count Distinct Within Date Range - mongodb

I have a MongoDB database with a collection of site-events. The documents look like:
{
"_id" : ObjectId("5785bb02eac0636f1dc07023"),
"referrer" : "https://example.com",
"_t" : ISODate("2016-07-12T18:10:17Z"),
"_p" : "ucd7+hvjpacuhtgbq1caps4rqepvwzuoxm=",
"_n" : "visited site",
"km screen resolution" : "1680x1050"
},
{
"_id" : ObjectId("5785bb02eac0636f1dc07047"),
"url" : "https://www.example.com/",
"referrer" : "Direct",
"_t" : ISODate("2016-07-12T18:10:49Z"),
"_p" : "txt6t1siuingcgo483aabmses2et5uqk0=",
"_n" : "visited site",
"km screen resolution" : "1366x768"
},
{
"_id" : ObjectId("5785bb02eac0636f1dc07053"),
"url" : "https://www.example.com/",
"referrer" : "Direct",
"_t" : ISODate("2016-07-12T18:10:56Z"),
"_p" : "gcama1az5jxa74wa6o9r4v/3k+zulciqiu=",
"_n" : "visited site",
"km screen resolution" : "1366x768"
}
I want to get a count of the unique persons within a date range. In SQL it would be
SELECT COUNT(DISTINCT(`_p`)) FROM collection WHERE `_t` > '<SOME DATE>' AND `_t` <= '<SOME OTHER DATE>'
So far, I've grouped by the dates using the aggregation pipeline:
db.siteEvents.aggregate(
[
{
$match : {"_n": "visited site"}
},
{
$group : {
_id: {
year : { $year : "$_t" },
month : { $month : "$_t" },
day : { $dayOfMonth : "$_t" },
_p : "$_p"
},
count: { $sum: 1 }
}
},
{
$group : {
_id : {
year : { $year : "$_id.year" },
month : { $month : "$_id.month" },
day : { $dayOfMonth : "$_id.day" }
},
count: { $sum: 1 }
}
}
]
);
But this gives errors - I believe because of the second grouping _id trying to grab an intermediate field. I'm currently just using the Mongo shell, but if I had to choose an alternative driver it would be PyMongo. I'd like to get this to work in the shell (so I can understand the process).

With an aggregation pipeline it could look like so
db.getCollection('siteEvents').aggregate([
{
$match: {
_t: {
$gt: ISODate("2016-07-11T08:10:17.000Z"),
$lt: ISODate("2016-07-12T14:10:17.000Z")
}
}
},
{
$group: {
_id: "$_p"
}
},
{
$group: {
_id: null,
distinctCount: { $sum: 1 }
}
}
])
If you know the resulting distinct values won't be large then you could use a simple query like so
db.getCollection('siteEvents').distinct(
'_p',
{
_t: {
$gt: ISODate("2016-07-11T08:10:17.000Z"),
$lt: ISODate("2016-07-12T14:10:17.000Z")
}
}).length

You can use the $addToSet operator in the $group stage to return an array of distinct "_p" value then $project the resulted document to return the size of the array which is nothing other than the distinct count.
db.siteEvents.aggregate(
[
{"$match": {"_n": "visited site", "_t": {"$gt": <SOME DATE>, "$lt": <SOME OTHER DATE>}}},
{"$group": {
"_id": None,
"_p_values": {"$addToSet": "$_p"}
}},
{"$project": {"_id": 0, "count": {"$size": "$_p_values"}}}
]
)
For small size collection you can simply use distinct but you need to pass in the query argument.
len(db.siteEvents.distinct("_p", {"_n": "visited site", "_t": {"$gt": <SOME DATE>, "$lt": <SOME OTHER DATE>}}))

Related

Memory overflow error when using aggregate and $group queries

I have a collection named "allvoice" which has the following structure:
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf38"),
"subscriber_id" : "e3365edb9c781a561107242a81c1a92b4269ef9a",
"callednumber" : "559198e6f8814773551a457e53a614d603f9deab",
"originaltimestamp" : "20200113033803",
"duration" : "13",
"maincost" : "255.6",
"type" : "Internal",
"type_network" : "local_network",
"month" : "202001"
}
with the field "originaltimestamp" being the transaction timestamp of the data for the day. To make it easier to query large volumes of data by day, I use an "aggregate" query with a "group" by date to get the subscriber transactions per day (daily transactions), and then I write the output to a collection named "dailyvoice". From then on, to query transaction data by date, I query the "dailyvoice" collection directly. Here is my query.
db.getCollection('allvoice').aggregate(
[
{
"$project": {
"date": { "$toDate": "$originaltimestamp" },
"subscriber_id":1,
"callednumber":1,
"originaltimestamp":1,
"duration": 1,
"maincost": 1,
"type": 1,
"type_network": 1,
"month":1
}},
{ "$group": {
"_id": { "$dateToString": { "format": "%Y-%m-%d", "date": "$date" } },
"data": { $push: "$$ROOT" } ,
"count": { "$sum": 1 }
}},
{
'$out': 'dailyvoice'
}
], { allowDiskUse: true }
)
And the output of the "dailyvoice" collection is as follows:
{
"_id" : "2020-01-13",
"data" : [
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf38"),
"subscriber_id" : "e3365edb9c781a561107242a81c1a92b4269ef9a",
"callednumber" : "559198e6f8814773551a457e53a614d603f9deab",
"originaltimestamp" : "20200113033803",
"duration" : "13",
"maincost" : "255.6",
"type" : "trong nuoc",
"type_network" : "local_network",
"month" : "202001",
"date" : ISODate("2020-01-13T03:38:03.000Z")
},
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf39"),
"subscriber_id" : "6cf5d711bfa12160eefe62b8bc9c914370eebd70",
"callednumber" : "0241052d42e5491b0529733716fb6fb04804248f",
"originaltimestamp" : "20200113041608",
"duration" : "28",
"maincost" : "644.0",
"type" : "trong nuoc",
"type_network" : "global_network",
"month" : "202001",
"date" : ISODate("2020-01-13T04:16:08.000Z")
},
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf3a"),
"subscriber_id" : "3e554a5a920c469da9faf7375c5265c5cf6fb696",
"callednumber" : "307219a71c028931a4b74f8f5f014ffa16005ee9",
"originaltimestamp" : "20200113051416",
"duration" : "202",
"maincost" : "2961.4",
"type" : "trong nuoc",
"type_network" : "local_network",
"month" : "202001",
"date" : ISODate("2020-01-13T05:14:16.000Z")
}
],
"count" : 3.0
}
The problem I have here is that if the collection "allvoice" has a small data set, the query works fine, but when the collection "allvoice" has a large data set, about 114,513,872 records (documents), the query fails with an overflow ("PlanExecutor error during aggregation").
Is there a better solution than increasing the server configuration?
Please take a look and help me find a way to optimize the query! Thank you so much.
After I optimized the query to run month by month, I still get the error: "PlanExecutor error during aggregation :: caused by :: BSONObj size..."
db.getCollection('allvoice').aggregate(
[
{ $match: { month: "202001" } },
{
"$group": {
"_id": {
"$dateToString": {
"format": "%Y-%m-%d", "date": { "$toDate": "$originaltimestamp" }
}
},
"data": {
$push: {
"subscriber_id": "$subscriber_id",
"type": "$type",
// "originaltimestamp":"$originaltimestamp"
"date": { "$toDate": "$originaltimestamp" },
"month": "$month"
}
},
"count": { "$sum": 1 }
}
},
{
'$out': 'dailyvoice_202001'
}
], { allowDiskUse: true }
)
Some ideas:
You don't need the first $project stage in the query. And, you can include the { "$toDate": "$originaltimestamp" } within the $group stage's _id, as below:
"_id": {
"$dateToString": {
"format": "%Y-%m-%d", "date": { "$toDate": "$originaltimestamp" }
}
}
About the $push: "$$ROOT" - instead of the $$ROOT, capture only the fields you need most (or important). This is to reduce the memory usage. For example:
"data": {
$push: {
"subscriber_id": "$subscriber_id",
"type": "$type",
// other required fields...
}
}
Finally, you can think about restricting the query for a set of dates at time. This will require running the query more than once for different range of dates - but I think it may fare better overall. For example, a month at a time matching the month field. And, this month can be indexed for performance. This will require to include a $match stage at the beginning (the first stage) of the query, for example:
{ $match: { month: "202001" } }
And, this will query data for the month of January 2020.

How can I ensure my aggregation filters out subdocuments that are past expiry date in mongo?

I want to count the number of resetPassword (subdocument in Users schema) codes that are currently active. For a code to be active, its expiry date must be greater than the current date.
Here is my users schema. If someone requests to reset their password, we'll push a new { code: X, expiresAt, createdAt } Object to the array.
id: { type: String, unique: true },
resetPassword: [
{
code: String,
expiresAt: Date,
createdAt: Date,
},
],
I'm having an issue trying to $sum the total number of active reset codes. Here is the query I'm running that returns an empty array... note that if I were to remove the resetPassword.expiresAt: { $gt: nowDateInMilliseconds() } match section, it would return all the codes. I've tried moving this match statement out of the initial $match stage, then doing an $unwind and a $match on expiresAt, but this didn't work either.
[
{
$match: {
"id": userId,
'resetPassword.expiresAt': {
$gt: nowDateInMillisec(),
},
},
},
{
$group: {
_id: '$id',
totalValidResetCodes: {
$sum: {
$size: '$resetPassword',
},
},
},
},
]
This returns an empty array, even though I've got the expiry dates set to a date in the future.
I also tried the following with the same result (notice how I added $unwind and another $match to the pipeline)
[
{
$match: {
"id": userId,
},
},
{
$unwind: '$resetPassword',
},
{
$match: {
'resetPassword.expiresAt': {
$gt: nowDateInMillisec(),
},
}
},
{
$group: {
_id: '$id',
totalValidResetCodes: {
$sum: {
$size: '$resetPassword',
},
},
},
},
]
nowDateInMillisec() - This simply returns the current date in milliseconds from epoch.
What am I doing wrong?
You can try $reduce inside $project instead of your whole pipeline; note that nowDateInMillisec() needs to return an ISODate (a Date value), not a raw millisecond timestamp:
db.collection.aggregate([
{ $match: { id: 1 } },
{
$project: {
totalValidResetCodes: {
$reduce: {
input: "$resetPassword",
initialValue: 0,
in: {
$add: [
"$$value",
{
$cond: [
{ $gt: ["$$this.expiresAt", nowDateInMillisec()] },
// If you really want to pass timestamp then try below line
// { $gt: ["$$this.expiresAt", { $toDate: nowDateInMillisec() }] },
1,
0
]
}
]
}
}
}
}
}
])
Playground
Below is my research, date format when stored in mongodb format should work for milliseconds as well. The below test is for up to a minute.
> db.users13.find().pretty();
{
"_id" : ObjectId("5f4e03768379a4e3f957641d"),
"id" : "johnc",
"resetPassword" : [
{
"code" : "abc",
"expiresAt" : ISODate("2020-09-02T09:11:18.394Z"),
"createdAt" : ISODate("2020-09-01T08:11:18.394Z")
},
{
"code" : "mno",
"expiresAt" : ISODate("2020-08-25T09:26:18.394Z"),
"createdAt" : ISODate("2020-08-25T08:11:18.394Z")
}
]
}
{
"_id" : ObjectId("5f4e06938379a4e3f957641f"),
"id" : "katey",
"resetPassword" : [
{
"code" : "j2c",
"expiresAt" : ISODate("2020-09-02T08:48:18.394Z"),
"createdAt" : ISODate("2020-09-01T08:11:18.394Z")
},
{
"code" : "rml",
"expiresAt" : ISODate("2020-09-01T08:26:18.394Z"),
"createdAt" : ISODate("2020-09-01T08:11:18.394Z")
}
]
}
> db.users13.aggregate([
{$unwind:"$resetPassword"},
{$match:{"resetPassword.expiresAt":{$gt:ISODate()}}}
]).pretty();
{
"_id" : ObjectId("5f4e03768379a4e3f957641d"),
"id" : "johnc",
"resetPassword" : {
"code" : "abc",
"expiresAt" : ISODate("2020-09-02T09:11:18.394Z"),
"createdAt" : ISODate("2020-09-01T08:11:18.394Z")
}
}
{
"_id" : ObjectId("5f4e06938379a4e3f957641f"),
"id" : "katey",
"resetPassword" : {
"code" : "j2c",
"expiresAt" : ISODate("2020-09-02T08:48:18.394Z"),
"createdAt" : ISODate("2020-09-01T08:11:18.394Z")
}
}
> ISODate()
ISODate("2020-09-01T08:31:37.059Z")
>
#Joe answered my question in the comments. He hinted that using a milliseconds since epoch time in the match filter wouldn't work since I'm using a Date type in my Mongoose schema.
So instead of doing this: $gt: nowDateInMillisec(), I simply used a Date type like so: $gt: new Date(),

How can i count total documents and also grouped counts simultanously in mongodb aggregation?

I have a dataset in mongodb collection named visitorsSession like
{ip : 192.2.1.1,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.3.1.8,country : 'UK', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.5.1.4,country : 'UK', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.8.1.7,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'},
{ip : 192.1.1.3,country : 'US', type : 'Visitors',date : '2019-12-15T00:00:00.359Z'}
I am using this mongodb aggregation
[{$match: {
nsp : "/hrm.sbtjapan.com",
creationDate : {
$gte: "2019-12-15T00:00:00.359Z",
$lte: "2019-12-20T23:00:00.359Z"
},
type : "Visitors"
}}, {$group: {
_id : "$country",
totalSessions : {
$sum: 1
}
}}, {$project: {
_id : 0,
country : "$_id",
totalSessions : 1
}}, {$sort: {
country: -1
}}]
using above aggregation i am getting results like this
[{country : 'US',totalSessions : 3},{country : 'UK',totalSessions : 2}]
But I also want the total number of visitors along with the result, like totalVisitors : 5.
How can I do this in a MongoDB aggregation?
You can use $facet aggregation stage to calculate total visitors as well as visitors by country in a single pass:
db.visitorsSession.aggregate( [
{
$match: {
nsp : "/hrm.sbtjapan.com",
creationDate : {
$gte: "2019-12-15T00:00:00.359Z",
$lte: "2019-12-20T23:00:00.359Z"
},
type : "Visitors"
}
},
{
$facet: {
totalVisitors: [
{
$count: "count"
}
],
countrySessions: [
{
$group: {
_id : "$country",
sessions : { $sum: 1 }
}
},
{
$project: {
country: "$_id",
_id: 0,
sessions: 1
}
}
],
}
},
{
$addFields: {
totalVisitors: { $arrayElemAt: [ "$totalVisitors.count" , 0 ] },
}
}
] )
The output:
{
"totalVisitors" : 5,
"countrySessions" : [
{
"sessions" : 2,
"country" : "UK"
},
{
"sessions" : 3,
"country" : "US"
}
]
}
You could be better off with two queries to do this.
To save the two db round trips, the following aggregation can be used, which IMO is kind of verbose (and might be a little expensive if documents are very large) just to count the documents.
The idea is to have a $group at the top to count the documents and preserve the original documents using $push and $$ROOT, and then, before other match/filter ops, $unwind the created array of original docs.
db.collection.aggregate([
{
$group: {
_id: null,
docsCount: {
$sum: 1
},
originals: {
$push: "$$ROOT"
}
}
},
{
$unwind: "$originals"
},
{ $match: "..." }, //and other stages on `originals` which contains the source documents
{
$group: {
_id: "$originals.country",
totalSessions: {
$sum: 1
},
totalVisitors: {
$first: "$docsCount"
}
}
}
]);
Sample O/P: Playground Link
[
{
"_id": "UK",
"totalSessions": 2,
"totalVisitors": 5
},
{
"_id": "US",
"totalSessions": 3,
"totalVisitors": 5
}
]

using mongo aggregation how to replace the fields names [duplicate]

I have large collection of documents which represent some kind of events. Collection contains events for different userId.
{
"_id" : ObjectId("57fd7d00e4b011cafdb90d22"),
"userId" : "123123123",
"userType" : "mobile",
"event_type" : "clicked_ok",
"country" : "US",
"timestamp" : ISODate("2016-10-12T00:00:00.308Z")
}
{
"_id" : ObjectId("57fd7d00e4b011cafdb90d22"),
"userId" : "123123123",
"userType" : "mobile",
"event_type" : "clicked_cancel",
"country" : "US",
"timestamp" : ISODate("2016-10-12T00:00:00.308Z")
}
At midnight I need to run an aggregation over all documents for the previous day. Documents need to be aggregated in such a way that I can get the number of each kind of event for a particular userId.
{
"userId" : "123123123",
"userType" : "mobile",
"country" : "US",
"clicked_ok" : 23,
"send_message" : 14,
"clicked_cancel" : 100,
"date" : "2016-11-24",
}
During aggregation I need to perform two things:
calculate number of events for particular userId
add "date" text fields with date
Any help is greatly appreciated! :)
you can do this with aggregation like this :
db.user.aggregate([
{
$match:{
$and:[
{
timestamp:{
$gte: ISODate("2016-10-12T00:00:00.000Z")
}
},
{
timestamp:{
$lt: ISODate("2016-10-13T00:00:00.000Z")
}
}
]
}
},
{
$group:{
_id:"$userId",
timestamp:{
$first:"$timestamp"
},
send_message:{
$sum:{
$cond:[
{
$eq:[
"$event_type",
"send_message"
]
},
1,
0
]
}
},
clicked_cancel:{
$sum:{
$cond:[
{
$eq:[
"$event_type",
"clicked_cancel"
]
},
1,
0
]
}
},
clicked_ok:{
$sum:{
$cond:[
{
$eq:[
"$event_type",
"clicked_ok"
]
},
1,
0
]
}
}
}
},
{
$project:{
date:{
$dateToString:{
format:"%Y-%m-%d",
date:"$timestamp"
}
},
userId:1,
clicked_cancel:1,
send_message:1,
clicked_ok:1
}
}
])
explanation:
keep only document for a specific day in $match stage
group doc by userId and count occurrences for each event in $group stage
finally format the timestamp field into yyyy-MM-dd format in the $project stage
for the data you provided, this will output
{
"_id":"123123123",
"send_message":0,
"clicked_cancel":1,
"clicked_ok":1,
"date":"2016-10-12"
}
Check the following query
db.sandbox.aggregate([{
$group: {
_id: {
userId: "$userId",
date: {
$dateToString: { format: "%Y-%m-%d", date: "$timestamp" }}
},
send_message: {
$sum: {
$cond: { if: { $eq: ["$event_type", "send_message"] }, then: 1, else: 0 } }
},
clicked_cancel: {
$sum: {
$cond: { if: { $eq: ["$event_type", "clicked_cancel"] }, then: 1, else: 0 }
}
},
clicked_ok: {
$sum: {
$cond: { if: { $eq: ["$event_type", "clicked_ok"] }, then: 1, else: 0 }
}
}
}
}])

Mongodb multiple key aggregation by date (from timestamp)

I have a mongodb collection with millions of records regarding transactions. I would like to create a query aggregated by date and resolution.
My document look like:
{
"_id": "Dan",
"finish_date": "2017-01-02 15:23:45.234Z",
"resolution": "canceled"
}
{
"_id": "John",
"finish_date": "2017-01-02 18:54:19.090Z",
"resolution": "completed"
}
{
"_id": "Pete",
"finish_date": "2017-01-02 19:11:27.418Z",
"order_resolution": "completed"
}
I would like the query output to look something like:
{
"2017-01-02" : {
"canceled": 1,
"completed": 2,
}
}
{
"2017-01-03" : {
"completed": 5,
}
}
Is this even possible? Currently, my output looks like:
{
"_id" : {
"curDate" : "2017-01-02",
"reason" : "canceled"
},
"count" : 1.0
}
"_id" : {
"curDate" : "2017-01-02",
"reason" : "completed"
},
"count" : 2.0
}
{
"_id" : {
"curDate" : "2017-01-03",
"reason" : "completed"
},
"count" : 5.0
}
The query looks like:
db.collection.aggregate(
[
{
"$match": {
"finish_date": {
"$gt": new Date("2017-01-02"),
"$lt": new Date("2017-01-08")
}
}
},
{
"$group" : {
_id : {
curDate: {
$substr: ["$finish_date", 0, 10]
},
reason: "$resolution"
},
count: { "$sum": 1 }
}
},
{
"$sort": { "_id.curDate": 1, "_id.reason": 1 }
}
]
)
You can use the new operator $arrayToObject available in 3.4.4 version with below aggregation query.
$group by curDate and push the reason and count values into reasoncount array.
$zip the reason and count array values together followed by $arrayToObject to create reason and count structure.
Same logic to create a curDate structure while keeping the previous reason and count structure.
db.collection.aggregate(
[
{$match:{finish_date:{$gt: new Date("2017-01-02"),$lt: new Date("2017-01-08")}}},
{$group:{_id:{curDate:{$substr:["$finish_date",0,10]},reason:"$resolution"},count:{$sum: 1}}},
{$sort:{"_id.curDate":1,"_id.reason":1}},
{$group:{_id:"$_id.curDate", reasoncount:{$push:{reason:"$_id.reason",count:"$count"}}}},
{$addFields:{reasoncount: {$arrayToObject:{$zip:{inputs:["$reasoncount.reason", "$reasoncount.count"]}}}}},
{$group:{_id:null, result:{$push:{curDate:"$_id",reasoncount:"$reasoncount"}}}},
{$addFields:{result: {$arrayToObject:{$zip:{inputs:["$result.curDate", "$result.reasoncount"]}}}}}
]
)