Memory overflow error when using aggregate and $group queries - mongodb

I have a collection named "allvoice" which has the following structure:
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf38"),
"subscriber_id" : "e3365edb9c781a561107242a81c1a92b4269ef9a",
"callednumber" : "559198e6f8814773551a457e53a614d603f9deab",
"originaltimestamp" : "20200113033803",
"duration" : "13",
"maincost" : "255.6",
"type" : "Internal",
"type_network" : "local_network",
"month" : "202001"
}
with the field "originaltimestamp" being the transaction timestamp. To make it easier to query large volumes of data by day, I use an "aggregate" query that groups by date to collect the subscriber transactions for each day (daily transactions) and writes them out to a collection named "dailyvoice". From then on, to query transaction data by date, I query the "dailyvoice" collection directly. Here is my query.
db.getCollection('allvoice').aggregate(
[
{
"$project": {
"date": { "$toDate": "$originaltimestamp" },
"subscriber_id":1,
"callednumber":1,
"originaltimestamp":1,
"duration": 1,
"maincost": 1,
"type": 1,
"type_network": 1,
"month":1
}},
{ "$group": {
"_id": { "$dateToString": { "format": "%Y-%m-%d", "date": "$date" } },
"data": { $push: "$$ROOT" } ,
"count": { "$sum": 1 }
}},
{
'$out': 'dailyvoice'
}
], { allowDiskUse: true }
)
And the output of the "dailyvoice" collection is as follows:
{
"_id" : "2020-01-13",
"data" : [
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf38"),
"subscriber_id" : "e3365edb9c781a561107242a81c1a92b4269ef9a",
"callednumber" : "559198e6f8814773551a457e53a614d603f9deab",
"originaltimestamp" : "20200113033803",
"duration" : "13",
"maincost" : "255.6",
"type" : "trong nuoc",
"type_network" : "local_network",
"month" : "202001",
"date" : ISODate("2020-01-13T03:38:03.000Z")
},
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf39"),
"subscriber_id" : "6cf5d711bfa12160eefe62b8bc9c914370eebd70",
"callednumber" : "0241052d42e5491b0529733716fb6fb04804248f",
"originaltimestamp" : "20200113041608",
"duration" : "28",
"maincost" : "644.0",
"type" : "trong nuoc",
"type_network" : "global_network",
"month" : "202001",
"date" : ISODate("2020-01-13T04:16:08.000Z")
},
{
"_id" : ObjectId("612599bb1cff80e6fc5cbf3a"),
"subscriber_id" : "3e554a5a920c469da9faf7375c5265c5cf6fb696",
"callednumber" : "307219a71c028931a4b74f8f5f014ffa16005ee9",
"originaltimestamp" : "20200113051416",
"duration" : "202",
"maincost" : "2961.4",
"type" : "trong nuoc",
"type_network" : "local_network",
"month" : "202001",
"date" : ISODate("2020-01-13T05:14:16.000Z")
}
],
"count" : 3.0
}
The problem I have here is that when the collection "allvoice" has a small data set, the query works fine, but when it has a large data set, about 114,513,872 records (documents), the query fails with an overflow ("PlanExecutor error during aggregation").
Is there a better solution than increasing the server configuration?
Please take a look and help me optimize the query. Thank you so much!
After I optimized the query to run month by month, I still get the error: "PlanExecutor error during aggregation :: caused by :: BSONObj size.."
db.getCollection('allvoice').aggregate(
[
{ $match: { month: "202001" } },
{
"$group": {
"_id": {
"$dateToString": {
"format": "%Y-%m-%d", "date": { "$toDate": "$originaltimestamp" }
}
},
"data": {
$push: {
"subscriber_id": "$subscriber_id",
"type": "$type",
// "originaltimestamp":"$originaltimestamp"
"date": { "$toDate": "$originaltimestamp" },
"month": "$month"
}
},
"count": { "$sum": 1 }
}
},
{
'$out': 'dailyvoice_202001'
}
], { allowDiskUse: true }
)

Some ideas:
You don't need the first $project stage in the query. And, you can include the { "$toDate": "$originaltimestamp" } within the $group stage's _id, as below:
"_id": {
"$dateToString": {
"format": "%Y-%m-%d", "date": { "$toDate": "$originaltimestamp" }
}
}
About the $push: "$$ROOT" - instead of $$ROOT, capture only the fields you really need. Each grouped day becomes a single output document, and no document can exceed the 16 MB BSON limit - that is what the "BSONObj size" error is telling you - so keeping the pushed sub-documents small matters. For example:
"data": {
$push: {
"subscriber_id": "$subscriber_id",
"type": "$type",
// other required fields...
}
}
Finally, you can think about restricting the query to a set of dates at a time. This requires running the query more than once for different date ranges, but I think it may fare better overall. For example, process a month at a time by matching on the month field, which can be indexed for performance. This requires including a $match stage at the beginning (as the first stage) of the pipeline, for example:
{ $match: { month: "202001" } }
And, this will query data for the month of January 2020.
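If even one day's worth of records pushes a grouped document past that 16 MB limit, a further option is to avoid the per-day array entirely: stamp each record with its day and write the records out individually, then query the daily collection by that field. A sketch of this idea (assuming MongoDB 4.2+ for $merge; the collection and field names are the ones from the question):
// Index the month field so the $match stage can use it
db.allvoice.createIndex({ "month": 1 })
db.getCollection('allvoice').aggregate(
[
    { "$match": { "month": "202001" } },
    // Stamp each record with its day instead of pushing records into one big array
    { "$addFields": {
        "day": {
            "$dateToString": {
                "format": "%Y-%m-%d", "date": { "$toDate": "$originaltimestamp" }
            }
        }
    }},
    // Write the records out one by one; no per-day document can outgrow 16 MB
    { "$merge": { "into": "dailyvoice_202001" } }
], { allowDiskUse: true }
)
// Daily queries then filter on the (indexable) "day" field:
// db.dailyvoice_202001.createIndex({ "day": 1 })
// db.dailyvoice_202001.find({ "day": "2020-01-13" })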

Related

How to join collections on nosqlbooster so that I can run $avg query on it?

I have two collections in my database with field names in the documents that are the same. I need to join these collections and then sum the values of the common field names and finally find the average as my output.
This is an example of a document in the first collection
{
"_id" : ObjectId("63074885ff3acbe0d63d7686"),
"year" : "2020",
"energy_products" : "Other Energy Products",
"sub_products" : "Other Energy Products",
"value_ktoe" : "70.4"
},
This is an example of a document in the second collection
{
"_id" : ObjectId("63074882ff3acbe0d63c391a"),
"year" : "2020",
"energy_products" : "Petroleum Products",
"sub_products" : "Other Petroleum Products",
"value_ktoe" : "10633.7"
},
So I need to join the collections, sum up all the values for each energy_products group and each sub_products group, and then find the average.
The output needs to look something like this
/* 1 */
{
"_id" : {
"energy_products" : "Petroleum Products"
},
"avg" : 18312.05625
},
/* 2 */
{
"_id" : {
"sub_products" : "Jet Fuel Kerosene"
},
"avg" : 4253.884375
},
Perform a $unionWith to "merge" the 2 collections, then a simple $group to get the $avg you need.
db.coll1.aggregate([
{
"$group": {
"_id": {
"energy_products": "$energy_products"
},
"avg": {
"$avg": {
"$toDouble": "$value_ktoe"
}
}
}
},
{
"$unionWith": {
"coll": "coll2",
"pipeline": [
{
"$group": {
"_id": {
"sub_products": "$sub_products"
},
"avg": {
"$avg": {
"$toDouble": "$value_ktoe"
}
}
}
}
]
}
}
])
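Since value_ktoe is stored as a string, the $toDouble conversion has to run on every aggregation. If you prefer, you could instead convert the field to a number once with a pipeline update - a sketch, assuming MongoDB 4.2+ (which accepts an aggregation pipeline in updateMany):
// One-time cleanup: store value_ktoe as a double instead of a string
db.coll1.updateMany({}, [ { "$set": { "value_ktoe": { "$toDouble": "$value_ktoe" } } } ])
db.coll2.updateMany({}, [ { "$set": { "value_ktoe": { "$toDouble": "$value_ktoe" } } } ])
After that, the $toDouble wrappers in the pipeline above can be dropped.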

$match for filtering backtracking of dates

I am trying to create a $match stage that takes a single date parameter and checks it against the date ranges already stored in the data. What I am trying to achieve is to get all rooms that are occupied on the given date. Please see the scenario below.
Raw Data
{
"_id" : ObjectId("60d9681765077a71ae158625"),
"room": "301",
"startaccomodation":ISODate("2021-08-05T06:11:36.007Z"),
"endaccomodation":ISODate("2021-08-10T06:11:36.007Z")
},
{
"_id" : ObjectId("60dbf391e2759909d52d1917"),
"room": "302",
"startaccomodation":ISODate("2021-08-07T06:11:36.007Z"),
"endaccomodation":ISODate("2021-08-09T06:11:36.007Z")
},
{
"_id" : ObjectId("60dbf61d54b7c46bfa1b7954"),
"room": "303",
"startaccomodation":ISODate("2021-08-02T06:11:36.007Z"),
"endaccomodation":ISODate("2021-08-05T06:11:36.007Z")
},
{
"_id" : ObjectId("60dbf6ef6a9e0e09f9a4caa6"),
"room": "304",
"startaccomodation":ISODate("2021-08-06T06:11:36.007Z"),
"endaccomodation":ISODate("2021-08-08T06:11:36.007Z")
},
{
"_id" : ObjectId("60dbf6ef5805b16bf96286cc"),
"room": "305",
"startaccomodation":ISODate("2021-08-01T06:11:36.007Z"),
"endaccomodation":ISODate("2021-08-05T06:11:36.007Z")
},
{
"_id" : ObjectId("60dd0d7b1410931155f0bdd0"),
"room": "306",
"startaccomodation":ISODate("2021-08-02T06:11:36.007Z"),
"endaccomodation":null
},
{
"_id" : ObjectId("60dd0e04c02ff023ab091cd3"),
"room": "307",
"startaccomodation":ISODate("2021-08-06T06:11:36.007Z"),
"endaccomodation":null
}
If I want to get only the rooms that are occupied on August 3, 2021, the output will be:
{
"_id" : ObjectId("60dbf61d54b7c46bfa1b7954"),
"room": "303",
"startaccomodation":ISODate("2021-08-02T06:11:36.007Z"),
"endaccomodation":ISODate("2021-08-05T06:11:36.007Z")
},
{
"_id" : ObjectId("60dbf6ef5805b16bf96286cc"),
"room": "305",
"startaccomodation":ISODate("2021-08-01T06:11:36.007Z"),
"endaccomodation":ISODate("2021-08-05T06:11:36.007Z")
},
{
"_id" : ObjectId("60dd0d7b1410931155f0bdd0"),
"room": "306",
"startaccomodation":ISODate("2021-08-02T06:11:36.007Z"),
"enddate": null
},
If I use TODAY as the filter, it will give all the rooms occupied as of now. The output will be:
{
"_id" : ObjectId("60dd0d7b1410931155f0bdd0"),
"room": "306",
"startaccomodation":ISODate("2021-08-02T06:11:36.007Z"),
"endaccomodation":null
},
{
"_id" : ObjectId("60dd0e04c02ff023ab091cd3"),
"room": "307",
"startaccomodation":ISODate("2021-08-06T06:11:36.007Z"),
"endaccomodation":null
}
My $match is incomplete and outputs no data yet.
{
$match: {
$and: [
{
"startdate": {
"$gte": ISODate("2021-08-03T00:00:00.000Z"),
}
},
{
"enddate": {
"$lte": ISODate("2021-08-03T00:00:00.000Z"),
}
},
]
}
}
Thank you in advance.
I think you have reversed the date range logic.
It should be:
startaccomodation <= x <= endaccomodation
Since endaccomodation can be null, the second condition should be an $or that is fulfilled by either of these conditions:
endaccomodation is null
endaccomodation is $gte the input date.
db.collection.aggregate([
{
$match: {
$and: [
{
"startaccomodation": {
"$lte": ISODate("2021-08-03T00:00:00.000Z")
}
},
{
$or: [
{
"endaccomodation": {
$eq: null
}
},
{
"endaccomodation": {
"$gte": ISODate("2021-08-03T00:00:00.000Z")
}
}
]
}
]
}
}
])
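For the TODAY filter from the question, the same shape should work with the current timestamp substituted for the fixed date, for example:
var today = new Date(); // the current moment in the mongo shell
db.collection.aggregate([
{
    $match: {
        $and: [
            { "startaccomodation": { "$lte": today } },
            {
                $or: [
                    { "endaccomodation": { $eq: null } },
                    { "endaccomodation": { "$gte": today } }
                ]
            }
        ]
    }
}
])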

Mongo - Aggregate Group Lookup Values

I'm trying to sum up values that are listed within a lookup. I have 2 collections:
Subscriptions (Pseudocode)
{
_id,
purchaseDate: Date,
wholesalePrice: Number,
purchasePrice: Number,
retailPrice: Number,
userID: String,
}
Payments
{
_id,
statementMonth: String, // e.g. 2020-08 (same as grouped id in Subscriptions)
paymentDate: Date,
userID: String,
amountPaid
}
Users will need to transfer the profit margin value at the end of each month. Therefore, I want to create an output for Statements: all Subscriptions and Payments grouped into monthly records, with a summary of the data within them. I have managed to create everything up to the first $group; however, once I do my $lookup to get the Payment details, everything seems to fail.
This is my current pipeline
{
"$match": {
"userID": [provided UserID]
}
},
{
"$group": {
"_id": {
"$dateToString": {
"format": "%Y-%m",
"date": "$purchaseDate"
}
},
"totalWholesalePrice": {
"$sum": "$wholesalePrice"
},
"totalPurchasePrice": {
"$sum": "$purchasePrice"
},
"count": {
"$sum": 1.0
}
}
},
{
"$addFields": {
"totalAmountDue": {
"$subtract": [
"$totalPurchasePrice",
"$totalWholesalePrice"
]
}
}
},
{
"$lookup": {
"from": "transactions",
"localField": "_id",
"foreignField": "statementMonth",
"as": "transactions"
}
},
{
"$unwind": {
"path": "$transactions",
"preserveNullAndEmptyArrays": true
}
},
{
"$sort": {
"_id": -1.0
}
}
This returns 2 records when there are 2 transactions:
{
"_id" : "2020-08",
"totalWholesalePrice" : NumberInt(89),
"totalPurchasePrice" : 135.55,
"count" : 8.0,
"totalAmountDue" : 46.55,
"transactions" : {
"_id" : ObjectId("5f3faf2216d7a517bc51bfae"),
"date" : ISODate("2020-04-20T11:23:40.284+0000"),
"statementMonth" : "2020-08",
"merchant" : "M1268360",
"amountPaid" : "40"
}
}
{
"_id" : "2020-08",
"totalWholesalePrice" : NumberInt(89),
"totalPurchasePrice" : 135.55,
"count" : 8.0,
"totalAmountDue" : 46.55,
"transactions" : {
"_id" : ObjectId("5f3fc13f16d7a517bc51c047"),
"date" : ISODate("2020-04-20T11:23:40.284+0000"),
"statementMonth" : "2020-08",
"merchant" : "M1268360",
"amountPaid" : "2"
}
}
I would like the final JSON to be:
{
"_id" : "2020-08",
"totalWholesalePrice" : NumberInt(89),
"totalPurchasePrice" : 135.55,
"count" : 8.0,
"totalAmountDue" : 46.55,
"transactions" : [{
"_id" : ObjectId("5f3faf2216d7a517bc51bfae"),
"date" : ISODate("2020-04-20T11:23:40.284+0000"),
"statementMonth" : "2020-08",
"merchant" : "M1268360",
"amountPaid" : "40"
}
{
"_id" : ObjectId("5f3fc13f16d7a517bc51c047"),
"date" : ISODate("2020-04-20T11:23:40.284+0000"),
"statementMonth" : "2020-08",
"merchant" : "M1268360",
"amountPaid" : "2"
}],
"totalPaid" : 42,
"totalBalance" : 4.55,
}
You need to add one more pipeline stage to regroup the unwound documents:
db.collection.aggregate([
{
$group: {
"_id": "$id",
"transactions": { //Grouping transactions together
"$addToSet": "$transactions"
},
"totalWholesalePrice": {//As all values are unique getting the first
"$first": "$totalWholesalePrice"
}
//Similarly add the needed fields - use $first, $addToSet
}
}
])
To get the total:
db.collection.aggregate([
{
$group: {
"_id": "$id",
"transactions": {
"$addToSet": "$transactions"
},
"totalWholesalePrice": {
"$first": "$totalWholesalePrice"
},
"totalAmountPaid": {
$sum: {
$toInt: "$transactions.amountPaid" //you stored as string, convert to appropriate type
}
}
}
}
])
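To get all the way to the desired JSON, the regrouping stage can also carry the other totals through with $first and sum the payments in the same pass; totalBalance is then a simple subtraction. A sketch, assuming amountPaid stays a string as in your samples:
db.collection.aggregate([
{
    $group: {
        "_id": "$_id", //the statement month
        "transactions": { "$addToSet": "$transactions" },
        "totalWholesalePrice": { "$first": "$totalWholesalePrice" },
        "totalPurchasePrice": { "$first": "$totalPurchasePrice" },
        "count": { "$first": "$count" },
        "totalAmountDue": { "$first": "$totalAmountDue" },
        "totalPaid": { "$sum": { "$toDouble": "$transactions.amountPaid" } }
    }
},
{
    $addFields: {
        "totalBalance": { "$subtract": [ "$totalAmountDue", "$totalPaid" ] }
    }
}
])
With the sample numbers above this yields "totalPaid": 42 and "totalBalance": 4.55, matching the desired output.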

Group by day with Multiple Date Fields

I have documents stored in MongoDB like this:
{
"_id" : "XBpNKbdGSgGfnC2MJ",
"po" : 72134185,
"machine" : 40940,
"location" : "02A01",
"inDate" : ISODate("2017-07-19T06:10:13.059Z"),
"requestDate" : ISODate("2017-07-19T06:17:04.901Z"),
"outDate" : ISODate("2017-07-19T06:30:34Z")
}
And I want the count, by day, of both inDate and outDate.
I can retrieve, on one side, the count of documents per inDate day and, on the other side, the count per outDate day, but I would like both counts together for each day.
Currently, I use this pipeline:
$group: {
_id: {
yearA: { $year: '$inDate' },
monthA: { $month: '$inDate' },
dayA: { $dayOfMonth: '$inDate' },
},
count: { $sum: 1 },
},
and it gives me:
{ "_id" : { "year" : 2017, "month" : 7, "day" : 24 }, "count" : 1 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 21 }, "count" : 11 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 19 }, "count" : 20 }
But I would like, if it's possible:
{ "_id" : { "year" : 2017, "month" : 7, "day" : 24 }, "countIn" : 1, "countOut" : 4 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 21 }, "countIn" : 11, "countOut" : 23 }
{ "_id" : { "year" : 2017, "month" : 7, "day" : 19 }, "countIn" : 20, "countOut" : 18 }
Any idea?
Many thanks :-)
You can also split the documents at the source, by essentially combining each value into an array of entries by "type" for "in" and "out". You can do this simply using $map and $cond to select the fields, then $unwind the array and then determine which field to "count" again by inspecting with $cond:
collection.aggregate([
{ "$project": {
"dates": {
"$filter": {
"input": {
"$map": {
"input": [ "in", "out" ],
"as": "type",
"in": {
"type": "$$type",
"date": {
"$cond": {
"if": { "$eq": [ "$$type", "in" ] },
"then": "$inDate",
"else": "$outDate"
}
}
}
}
},
"as": "dates",
"cond": { "$ne": [ "$$dates.date", null ] }
}
}
}},
{ "$unwind": "$dates" },
{ "$group": {
"_id": {
"year": { "$year": "$dates.date" },
"month": { "$month": "$dates.date" },
"day": { "$dayOfMonth": "$dates.date" }
},
"countIn": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$dates.type", "in" ] },
"then": 1,
"else": 0
}
}
},
"countOut": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$dates.type", "out" ] },
"then": 1,
"else": 0
}
}
}
}}
])
That's a safe way to do this that does not risk breaking the BSON limit, no matter what size of data you send at it.
Personally I would rather run as separate processes and "combine" the aggregated results separately, but that would depend on the environment you are running in, which is not mentioned in the question.
For an example of "parallel" execution, you can structure in Meteor somewhere along these lines:
import { Meteor } from 'meteor/meteor';
import { Source } from '../imports/source';
import { Target } from '../imports/target';
Meteor.startup(async () => {
// code to run on server at startup
await Source.remove({});
await Target.remove({});
console.log('Removed');
Source.insert({
"_id" : "XBpNKbdGSgGfnC2MJ",
"po" : 72134185,
"machine" : 40940,
"location" : "02A01",
"inDate" : new Date("2017-07-19T06:10:13.059Z"),
"requestDate" : new Date("2017-07-19T06:17:04.901Z"),
"outDate" : new Date("2017-07-19T06:30:34Z")
});
console.log('Inserted');
await Promise.all(
["In","Out"].map( f => new Promise((resolve,reject) => {
let cursor = Source.rawCollection().aggregate([
{ "$match": { [`${f.toLowerCase()}Date`]: { "$exists": true } } },
{ "$group": {
"_id": {
"year": { "$year": `$${f.toLowerCase()}Date` },
"month": { "$month": `$${f.toLowerCase()}Date` },
"day": { "$dayOfYear": `$${f.toLowerCase()}Date` }
},
[`count${f}`]: { "$sum": 1 }
}}
]);
cursor.on('data', async (data) => {
cursor.pause();
data.date = data._id;
delete data._id;
await Target.upsert(
{ date: data.date },
{ "$set": data }
);
cursor.resume();
});
cursor.on('end', () => resolve('done'));
cursor.on('error', (err) => reject(err));
}))
);
console.log('Mapped');
let targets = await Target.find().fetch();
console.log(targets);
});
Which is essentially going to output to the target collection as was mentioned in comments like:
{
"_id" : "XdPGMkY24AcvTnKq7",
"date" : {
"year" : 2017,
"month" : 7,
"day" : 200
},
"countIn" : 1,
"countOut" : 1
}
Riiiight. I came up with the following query. Admittedly, I have seen simpler and nicer ones in my life but it certainly gets the job done:
db.getCollection('test').aggregate
(
{
$facet: // split aggregation into two pipelines
{
"in": [
{ "$match": { "inDate": { "$ne": null } } }, // get rid of null values
{ $group: { "_id": { "y": { "$year": "$inDate" }, "m": { "$month": "$inDate" }, "d": { "$dayOfMonth": "$inDate" } }, "cIn": { $sum : 1 } } }, // compute sum per inDate
],
"out": [
{ "$match": { "outDate": { "$ne": null } } }, // get rid of null values
{ $group: { "_id": { "y": { "$year": "$outDate" }, "m": { "$month": "$outDate" }, "d": { "$dayOfMonth": "$outDate" } }, "cOut": { $sum : 1 } } }, // compute sum per outDate
]
}
},
{ $project: { "result": { $setUnion: [ "$in", "$out" ] } } }, // merge results into new array
{ $unwind: "$result" }, // unwind array into individual documents
{ $replaceRoot: { newRoot: "$result" } }, // get rid of the additional field level
{ $group: { _id: { year: "$_id.y", "month": "$_id.m", "day": "$_id.d" }, "countIn": { $sum: "$cIn" }, "countOut": { $sum: "$cOut" } } } // group into final result
)
As always with MongoDB aggregations you can get an idea of what's going on by simply reducing the projection stages step by step starting from the end of the query.
EDIT:
As you can see in the comments below there was a bit of a discussion around document size limits and the general applicability of this solution.
So let's look at those aspects in greater detail and let's also compare the performance of the $facet based solution to the one based on $map (suggested by @NeilLunn to avoid potential document size issues).
I created 2 million test records that have random dates assigned to both the "inDate" and the "outDate" field:
{
"_id" : ObjectId("597857e0fa37b3f66959571a"),
"inDate" : ISODate("2016-07-29T22:00:00.000Z"),
"outDate" : ISODate("1988-07-14T22:00:00.000Z")
}
The data range covered was from 01.01.1970 all the way to 01.01.2050, that's a total of 29220 distinct days. Given the random distribution of the 2 million test records across this time range both queries can be expected to return the full 29220 possible results (which both did).
Then I ran both queries five times after restarting my single MongoDB instance freshly and the results in milliseconds I got looked like this:
$facet: 5663, 5400, 5380, 5460, 5520
$map: 9648, 9134, 9058, 9085, 9132
I also measured the size of the single document returned by the facet stage which was 3.19MB so reasonably far away from the MongoDB document size limit (16MB at the time of writing) which, however, only applies to the result document anyway and wouldn't be a problem during pipeline processing.
Bottom line: If you want performance, use the solution suggested here. Be careful about the document size limit, though, in particular if your use case is not the exact one described in the question above (e.g. when you need to collect even more/bigger data). Also, I am not sure if in a sharded scenario both solutions still expose the same performance characteristics...

Count Distinct Within Date Range

I have a MongoDB database with a collection of site-events. The documents look like:
{
"_id" : ObjectId("5785bb02eac0636f1dc07023"),
"referrer" : "https://example.com",
"_t" : ISODate("2016-07-12T18:10:17Z"),
"_p" : "ucd7+hvjpacuhtgbq1caps4rqepvwzuoxm=",
"_n" : "visited site",
"km screen resolution" : "1680x1050"
},
{
"_id" : ObjectId("5785bb02eac0636f1dc07047"),
"url" : "https://www.example.com/",
"referrer" : "Direct",
"_t" : ISODate("2016-07-12T18:10:49Z"),
"_p" : "txt6t1siuingcgo483aabmses2et5uqk0=",
"_n" : "visited site",
"km screen resolution" : "1366x768"
},
{
"_id" : ObjectId("5785bb02eac0636f1dc07053"),
"url" : "https://www.example.com/",
"referrer" : "Direct",
"_t" : ISODate("2016-07-12T18:10:56Z"),
"_p" : "gcama1az5jxa74wa6o9r4v/3k+zulciqiu=",
"_n" : "visited site",
"km screen resolution" : "1366x768"
}
I want to get a count of the unique persons within a date range. In SQL it would be
SELECT COUNT(DISTINCT(`_p`)) FROM collection WHERE `_t` > '<SOME DATE>' AND `_t` <= '<SOME OTHER DATE>'
So far, I've grouped by date using the aggregation pipeline:
db.siteEvents.aggregate(
[
{
$match : {"_n": "visited site"}
},
{
$group : {
_id: {
year : { $year : "$_t" },
month : { $month : "$_t" },
day : { $dayOfMonth : "$_t" },
_p : "$_p"
},
count: { $sum: 1 }
}
},
{
$group : {
_id : {
year : { $year : "$_id.year" },
month : { $month : "$_id.month" },
day : { $dayOfMonth : "$_id.day" }
},
count: { $sum: 1 }
}
}
]
);
But this gives errors - I believe because of the second grouping _id trying to grab an intermediate field. I'm currently just using the Mongo shell, but if I had to choose an alternative driver it would be PyMongo. I'd like to get this to work in the shell (so I can understand the process).
Your second $group fails because $year, $month and $dayOfMonth expect a date, but "$_id.year" and the other parts are already plain numbers at that point. For the distinct count itself, the aggregation pipeline could look like so:
db.getCollection('siteEvents').aggregate([
{
$match: {
_t: {
$gt: ISODate("2016-07-11T08:10:17.000Z"),
$lt: ISODate("2016-07-12T14:10:17.000Z")
}
}
},
{
$group: {
_id: "$_p"
}
},
{
$group: {
_id: null,
distinctCount: { $sum: 1 }
}
}
])
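And if you still want the per-day distinct counts your original pipeline was aiming for, the fix is to reference the parts of the first _id directly instead of applying the date operators a second time (same date range as above):
db.getCollection('siteEvents').aggregate([
{
    $match: {
        "_n": "visited site",
        "_t": {
            $gt: ISODate("2016-07-11T08:10:17.000Z"),
            $lt: ISODate("2016-07-12T14:10:17.000Z")
        }
    }
},
{
    // one document per (day, person) pair
    $group: {
        _id: {
            year: { $year: "$_t" },
            month: { $month: "$_t" },
            day: { $dayOfMonth: "$_t" },
            _p: "$_p"
        }
    }
},
{
    // count the pairs per day; the date parts are plain numbers here
    $group: {
        _id: {
            year: "$_id.year",
            month: "$_id.month",
            day: "$_id.day"
        },
        count: { $sum: 1 }
    }
}
])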
If you know the resulting set of distinct values won't be large, you could use a simple query like so (note that distinct returns its results inside a single BSON document, so it is subject to the 16MB limit):
db.getCollection('siteEvents').distinct(
'_p',
{
_t: {
$gt: ISODate("2016-07-11T08:10:17.000Z"),
$lt: ISODate("2016-07-12T14:10:17.000Z")
}
}).length
You can use the $addToSet operator in the $group stage to collect an array of distinct "_p" values, then $project the resulting document to return the size of that array, which is nothing other than the distinct count. In PyMongo that looks like:
db.siteEvents.aggregate(
[
{"$match": {"_n": "visited site", "_t": {"$gt": <SOME DATE>, "$lt": <SOME OTHER DATE>}}},
{"$group": {
"_id": None,
"_p_values": {"$addToSet": "$_p"}
}},
{"$project": {"_id": 0, "count": {"$size": "$_p_values"}}}
]
)
For a small collection you can simply use distinct, but you need to pass in the query argument:
len(db.siteEvents.distinct("_p", {"_n": "visited site", "_t": {"$gt": <SOME DATE>, "$lt": <SOME OTHER DATE>}}))