Hey, I'm having trouble getting my aggregation right.
I have this dataset, and the collection contains a few million other documents like this one:
{
"_id": ObjectId("5757c73344ce54ae1d8b456c"),
"hostname": "Baklap4",
"timestamp": NumberLong(1465370500),
"networkList": [
{
"name": "46.243.152.13",
"openConnections": NumberLong(3)
},
{
"name": "46.243.152.50",
"openConnections": NumberLong(4)
}
],
"webserver": "nginx",
"deviceList": [
{
"deviceName": "eth0",
"receive": NumberLong(183263),
"transmit": NumberLong(781595)
},
{
"deviceName": "wlan0",
"receive": NumberLong(0),
"transmit": NumberLong(0)
}
]
}
What I want:
I'd like to get a result set in which every numeric value is averaged over all documents that fall within the same 300-second time span.
// Aggregation pipeline (PHP array syntax): filter by host and start time,
// unwind both embedded lists, group into 300-second buckets, then project and sort.
[
[
'$match' => [
'timestamp' => ['$gte' => $todayMidnight],
'hostname' => $serverName
]
],
[
// One output document per networkList entry ...
'$unwind' => '$networkList'
],
[
// ... and then one per deviceList entry, so every source document is
// multiplied by (network entries x device entries) before grouping.
'$unwind' => '$deviceList'
],
[
'$group' => [
'_id' => [
// timestamp - (timestamp mod 300): the start of the 300-second bucket.
'interval' => [
'$subtract' => [
'$timestamp',
[
'$mod' => ['$timestamp', 300]
]
]
],
'network' => '$networkList.name',
// NOTE(review): the sample document calls this field "deviceName", not "name",
// so this path presumably resolves to nothing and devices are not separated — confirm.
'device' => '$deviceList.name',
],
'openConnections' => [
'$sum' => '$networkList.openConnections'
],
// NOTE(review): cpuLoad, bytesPerSecond and requestsPerSecond do not appear in the
// sample document shown; presumably they exist on other documents — confirm.
'cpuLoad' => [
'$avg' => '$cpuLoad'
],
'bytesPerSecond' => [
'$avg' => '$bytesPerSecond'
],
'requestsPerSecond' => [
'$avg' => '$requestsPerSecond'
],
'webserver' => [
'$last' => '$webserver'
],
'timestamp' => [
'$max' => '$timestamp'
]
// NOTE(review): no accumulator in this stage reads deviceList.receive or
// deviceList.transmit, so no per-device byte averages are computed.
]
],
[
'$project' => [
'_id' => 0,
'timestamp' => 1,
'cpuLoad' => 1,
'bytesPerSecond' => 1,
'requestsPerSecond' => 1,
'webserver' => 1,
'openConnections' => 1,
// NOTE(review): the $group stage above emits only _id and its accumulators, so
// '$networkList' no longer exists here; '$_id.network' is presumably what was meant.
'networkList' => '$networkList',
'deviceList' => '$_id.device',
]
],
[
// Newest bucket first.
'$sort' => [
'timestamp' => -1
]
]
];
Yet this doesn't give me a list of all devices with, per device, an average of received and transmitted bytes.
How would one get those?
For the given example I was able to get a result using this mongo shell query:
// Stage 1: pass the source fields through unchanged and derive `isoDate`,
// a date obtained by adding (timestamp * 1000) milliseconds to the Unix epoch,
// i.e. `timestamp` is treated as epoch seconds.
var projectTime = {
  "$project": {
    "_id": 1,
    "hostname": 1,
    "timestamp": 1,
    "networkList": 1,
    "webserver": 1,
    "deviceList": 1,
    "isoDate": {
      "$add": [
        new Date(0),
        { "$multiply": ["$timestamp", 1000] }
      ]
    }
  }
};
// Stage 4: bucket the unwound documents into 5-minute windows per
// hostname / device / network and aggregate the numeric fields.
// NOTE(review): receive/transmit are summed while openConnections is averaged;
// the question asked for averages — confirm that $sum is intended.
var group = (function () {
  // 1000 milliseconds * 60 seconds * 5 minutes
  var BUCKET_MS = 1000 * 60 * 5;

  // Expression yielding isoDate as milliseconds since the epoch.
  function millisSinceEpoch() {
    return { "$subtract": ["$isoDate", new Date(0)] };
  }

  // (ms - ms mod bucket) floors to the bucket start; adding the epoch date
  // turns the number back into a date.
  var bucketStart = {
    "$add": [
      {
        "$subtract": [
          millisSinceEpoch(),
          { "$mod": [millisSinceEpoch(), BUCKET_MS] }
        ]
      },
      new Date(0)
    ]
  };

  return {
    "$group": {
      "_id": {
        "time": bucketStart,
        "hostname": "$hostname",
        "deviceList_deviceName": "$deviceList.deviceName",
        "networkList_name": "$networkList.name"
      },
      "xreceive": { "$sum": "$deviceList.receive" },
      "xtransmit": { "$sum": "$deviceList.transmit" },
      "xopenConnections": { "$avg": "$networkList.openConnections" }
    }
  };
}());
// Stage 2: emit one document per element of networkList.
var unwindNetworkList = { "$unwind": "$networkList" };

// Stage 3: emit one document per element of deviceList.
// (The variable name's spelling is kept as-is because the aggregate call refers to it.)
var unwindSeviceList = { "$unwind": "$deviceList" };
// Stage 5: keep only the groups whose bucket time equals 2016-06-09T08:05:00Z.
var match = {
  "$match": {
    "_id.time": ISODate("2016-06-09T08:05:00.000Z")
  }
};
// Stage 6: flatten the compound group key into top-level fields, drop _id,
// and keep the three aggregated values.
var finalProject = {
  "$project": {
    "_id": 0,
    "timestamp": "$_id.time",
    "hostname": "$_id.hostname",
    "deviceList_deviceName": "$_id.deviceList_deviceName",
    "networkList_name": "$_id.networkList_name",
    "xreceive": 1,
    "xtransmit": 1,
    "xopenConnections": 1
  }
};
// Run the stages in order: derive the date, unwind both lists, group into
// 5-minute buckets, filter to one bucket, then flatten the result.
db.baklap.aggregate([
  projectTime,
  unwindNetworkList,
  unwindSeviceList,
  group,
  match,
  finalProject
]);

// Fetch a single raw document from the same collection.
db.baklap.findOne();
then output:
{
"xreceive" : NumberLong(0),
"xtransmit" : NumberLong(0),
"xopenConnections" : 4.0,
"timestamp" : ISODate("2016-06-09T08:05:00.000Z"),
"hostname" : "Baklap4",
"deviceList_deviceName" : "wlan0",
"networkList_name" : "46.243.152.50"
}
{
"xreceive" : NumberLong(183263),
"xtransmit" : NumberLong(781595),
"xopenConnections" : 4.0,
"timestamp" : ISODate("2016-06-09T08:05:00.000Z"),
"hostname" : "Baklap4",
"deviceList_deviceName" : "eth0",
"networkList_name" : "46.243.152.50"
}
{
"xreceive" : NumberLong(183263),
"xtransmit" : NumberLong(781595),
"xopenConnections" : 3.0,
"timestamp" : ISODate("2016-06-09T08:05:00.000Z"),
"hostname" : "Baklap4",
"deviceList_deviceName" : "eth0",
"networkList_name" : "46.243.152.13"
}
{
"xreceive" : NumberLong(0),
"xtransmit" : NumberLong(0),
"xopenConnections" : 3.0,
"timestamp" : ISODate("2016-06-09T08:05:00.000Z"),
"hostname" : "Baklap4",
"deviceList_deviceName" : "wlan0",
"networkList_name" : "46.243.152.13"
}
The main point: be aware that every time $unwind is processed, our data gets a bit polluted, because the rest of the document is repeated for each array element. This can have a side effect when summing data (an average is unaffected, since (2+2+3+3)/4 is the same as (2+3)/2).
To check that, you could add x:{$push:"$$ROOT"} to the group stage and inspect the values after the pipeline has executed, as you will then have all source documents for the given time period.
Related
I need a query that takes multiple companyIDs and returns the count for each company.
Currently this query only does this for one companyID, and it does not return the ID but just 'null', as shown below.
I understand that I can use the '$in' operator for multiple companyIDs, but I'm not sure how to make the query return the count for each of them.
// Counts the distinct testID values among company 11's registrations created
// inside the given [gte, lt) window.
db.getCollection('reg').aggregate([ {
'$match' : {
'$and' : [
{
'companyID' : 11
},
{
'created' : {
'$gte' : 1556726597
}
},
{
'created' : {
$lt : 1580572997
}
}
]
}
},
{
// Only testID is listed here, so companyID is not carried into the later stages.
'$project' : {
'testID' : 1,
}
},
{
// One document per distinct testID, with the number of registrations for it.
'$group' : {
'_id' : '$testID',
'registrationsCount' : {'$sum' : 1},
},
},
{
// Grouping on the constant null folds everything into a single document;
// this is why the result shows "_id" : null, and count is the number of
// documents produced by the previous stage.
$group: {
_id: null,
count: { $sum: 1 }
}
}
])
The result below
{
"_id" : null,
"count" : 10.0
}
Schema below
{
"_id" : NumberLong(1),
"appUserID" : NumberLong(4294967295),
"companyID" : NumberLong(5),
"created" : NumberLong(1372625588),
"testID" : NumberLong(11),
"isCheckIn" : true,
"lastModified" : NumberLong(1372625588),
"source" : "upload",
"timeArrived" : NumberLong(1343062512),
}
I think you need to do 2 things
first
'$match' : {
'$or': [
'$and' : [
{
'companyID' : 11
},
{
'created' : {
'$gte' : 1556726597
}
},
{
'created' : {
$lt : 1580572997
}
}
]
// ... add the other ids you need
]
}
And second
{
'$group' : {
'_id' : '$testID',
'registrationsCount' : {'$sum' : 1},
},
},
{
$group: {
_id: '$testID', // or '$_id' not sure
count: { $sum: 1 }
}
}
Hope this helps!
I want to return two types of group results in one query, but it doesn't work.
If you have an idea, please share it with me.
I have this collection:
[
{
_id: "ABC00001",
results: [
{
_id: "C0001",
status: {
_id: "stj001",
name: "status1"
},
test:{
profession: [
{
"level" : "Pregrado",
"institution" : {
"_id" : "inst006",
"name" : "University 3"
}
},
{
"level" : "Pregrado",
"institution" : {
"_id" : "inst002",
"name" : "University 2"
}
}
]
}
},
{
_id: "C0002",
status: {
_id: "stj002",
name: "status1"
},
test:{
profession: [
{
"level" : "Pregrado",
"institution" : {
"_id" : "inst006",
"name" : "University 3"
}
}
]
}
},
]
},
{
_id: "ABC00002",
results: [
{
_id: "C0001",
status: {
_id: "stj002",
name: "status1"
},
test:{
profession: [
{
"level" : "Pregrado",
"institution" : {
"_id" : "inst002",
"name" : "University 2"
}
},
{
"level" : "Pregrado",
"institution" : {
"_id" : "inst006",
"name" : "University 3"
}
}
]
}
},
{
_id: "C0002",
status: {
_id: "stj003",
name: "status1"
},
test:{
profession: [
{
"level" : "Pregrado",
"institution" : {
"_id" : "inst006",
"name" : "University 3"
}
}
]
}
},
]
},
]
I want to return only the distinct institutions and statuses in one group query, like this:
institution: [
{"_id" : "inst006","name" : "University 3"},
{"_id" : "inst002", "name" : "University 2"},
]
status: [
{_id: "stj002", name: "status1"},
{_id: "stj003", name: "status1"}
]
I tried this, but it doesn't work:
db.collection.aggregate(
[
{'$unwind' : '$results'},
{'$group' : { '_id' : { 'status' : {'_id'=>'$results.status._id', 'name' : '$results.status.name'}, 'count' : { '$sum' : 1 } } } },
{'$group' : { '_id' : { 'institution' : {'_id' :'$results.test.profession.institution._id', 'name':'$results.test.profession.institution.name'},
'count' : { '$sum' 1 } } }
]
)
If I use two separate queries, each with its own group, it works, but I need a single query that returns all values; I may add more groups later.
If I understand your requirements correctly then this might work.
// Groups documents by the combination of status id/name and institution id/name,
// then projects and sorts by status id.
// NOTE(review): there is no $unwind before the $group, and `results` and
// `profession` are arrays in the sample data, so each key component is presumably
// an array of values per document rather than a single value — confirm.
db.CollectionName.aggregate([
{"$group" : {_id : {statusid:"$results.status._id", statusname: "$results.status.name", instid:"$results.test.profession.institution._id", instname: "$results.test.profession.institution.name"}},
},
// NOTE(review): the $group stage above defines only _id, so the `results.*`
// paths listed below presumably no longer exist at this point — confirm.
{ "$project": {
"results.status._id": 1,
"results.status.name": 1,
"results.test.profession.institution._id": 1,
"results.test.profession.institution.name": 1
}
},
{ "$sort": { "_id.statusid": 1 }},
])
Note: The JSON data needs to be formatted to make this work.
I found the solution:
// Collects the distinct status / institution / profession / availability / skills
// sub-documents of one record in a single pipeline:
//   1. unwind every nested array so each row carries one value of each kind,
//   2. group on the full combination of values,
//   3. regroup everything under _id: null, using $addToSet to de-duplicate
//      each kind independently.
db.collection.aggregate([
{'$match' : {'_id' : "CA0001"} },
{'$unwind' : '$results'},
{'$unwind' : '$results.test'},
{'$unwind' : '$results.test.profession'},
{'$unwind' : '$results.test.skills'},
{'$group' : {
'_id' : {
// NOTE(review): this reads status.id while the other paths use ._id — confirm the field name.
"status" : {'_id':'$results.status.id', 'name':'$results.status.name'},
"institution" : {'_id':'$results.test.profession.institution._id', 'name':'$results.test.profession.institution.name'},
'profession' : {'_id':'$results.test.profession.education._id', 'description':'$results.test.profession.education.description'},
'availability' : {'_id':'$results.availability._id', 'name':'$results.availability.name'},
'skills' : {'_id':'$results.test.skills._id', 'description':'$results.test.skills.description'}
},
}
},
// The values needed below live under _id, which $project keeps by default.
{'$project' : { 'status' : 1, 'institution': 1, 'profession': 1, 'skills': 1, 'availability': 1} },
{
'$group' : {
'_id' : null,
'status' : {
'$addToSet' : '$_id.status'
},
'institution' : {
'$addToSet' : '$_id.institution'
},
'profession' : {
'$addToSet' : '$_id.profession'
},
'availability' : {
'$addToSet' : '$_id.availability'
},
'skills' : {
'$addToSet' : '$_id.skills'
}
}
}
]);
it returns:
{
"_id": null,
"status": [
{
"_id": 1,
"name": "evaluado"
}
],
"institution": [
{
"_id": "inst078",
"name": "Universidad Privada del Norte"
},
{
"_id": "inst079",
"name": "Universidad San Ignacio de Loyola"
}
],
"profession": [
{
"_id": "fa059",
"description": "Estadística"
},
{
"_id": "fa063",
"description": "Ingeniería Informática"
}
],
"availability": [
{
"_id": "wo001",
"name": "Inmediata"
}
],
"skills": [
{
"_id": "sk366",
"description": "Pentaho"
}
]
}
All results are distinct.
I reduced the time from 550 ms to 43 ms by doing this in the database query rather than in application code using collections.
Below is my query. I want the result from shp_tx_survey_with_index and,
for each document in the loop, name_1 and name_2 from the collection shp_counties_with_index, combined into one result. If I run the two queries separately I get results, but this combined query gives me nothing. I want a result like Range_Township, Survey, Section, abstract, centroid, name_1, name_2.
// Takes up to 5 survey documents that have a centroid, renames a few fields,
// and for each one looks up a county document near that centroid.
db.shp_tx_survey_with_index.aggregate(
[
{ $match: { "centroid": { "$ne": null } } },
{ $limit: 5 },
{
$project: {
// NOTE(review): Range_Township and Section both read "$l1surnam" — confirm this is intended.
Range_Township: "$l1surnam",
Survey: "$l4surnam",
Section: "$l1surnam",
abstract: "$abstract_",
centroid: "$centroid"
}
}
]
).forEach((obj) => {
// Only name_1 and name_2 (plus the default _id) are requested from the matching county.
var item = db.shp_counties_with_index.findOne({
geom_geojson: {
$nearSphere: {
$geometry: obj.centroid
}
}
}, { 'name_1': 1, 'name_2': 1 });
// `item` is never printed, returned, or merged into `obj`, so the loop
// produces no visible output.
});
shp_counties_with_index sample collection
{
"_id" : ObjectId("5846bf55834d5b761f00000a"),
"engtype_2" : "County",
"geom_geojson" : {
"type" : "MultiPolygon",
"coordinates" : [
[
[
[
-73.6516685561232,
34.2445059658098
],
[
-73.6516685623318,
34.2445059757618
],
[
-73.6516685538257,
34.244505973301
],
[
-73.6516685561232,
34.2445059658098
]
]
] ]
},
"name_0" : "United States",
"name_1" : "Michigan",
"name_2" : "Chippewa",
"shape_area" : "0.481851809544",
"shape_leng" : "9.37720288177",
"type_2" : "County",
"validfr_2" : "Unknown",
"validto_2" : "Unknown",
"centroid" : {
"coordinates" : [
-73.65166855807875,
34.244505970785795
],
"type" : "Point"
}
}
shp_tx_survey_with_index sample collection
{
"_id" : ObjectId("5846bf76834d5b761f013fa7"),
"abstract_" : "321.000000000",
"abstract_i" : "322.000000000",
"anum" : "443962",
"area" : "0.0000666764235294",
"geom" : "01060000000100000001030000000100000008000000EC90DE47A07659C0F062332AEA813E403471FBB0A17759C06082096CE6813E4034A2C2ABA17759C0700AAF2731823E40B49BADAAA17759C09092F09440823E401C588E90A17759C000B4279A6A823E400019834C677559C02026721261823E403073564B677559C080C77880E6813E40EC90DE47A07659C0F062332AEA813E40",
"geom_geojson" : {
"type" : "MultiPolygon",
"coordinates" : [
[
[
[
-73.6517272344497,
34.2444627902475
],
[
-73.6517271719931,
34.2444627964974
],
[
-73.6517271718375,
34.2444627914072
],
[
-73.6517272344497,
34.2444627902475
]
]
]
]
},
"geom_text" : "MULTIPOLYGON(((-73.6517272344497 34.2444627902475,-73.6517271719931 34.2444627964974,-73.6517271718375 34.2444627914072,-73.6517272344497 34.2444627902475)))",
"gid" : "271508",
"l1surnam" : "TEMPLETON, J S",
"l2block" : null,
"l3surnum" : "4",
"l4surnam" : null,
"perimeter" : "0.0735082380545",
"probflag" : "0",
"shape_area" : "0.0000666764230571",
"shape_leng" : "0.0735082374282",
"centroid" : {
"coordinates" : [
-73.6517272031436,
34.24446279337245
],
"type" : "Point"
}
}
Thanks in advance.
When you want to combine information from 2 collections in an aggregation pipeline you can use the $lookup operator.
This operator is available from MongoDB 3.2 and up.
I need to perform a sum with the following collection's schema:
{
"_id" : "20160530/108107/31",
"metadata" : {
"date" : "2016-05-30",
"offer" : "108107",
"adv" : 31,
"update" : ISODate("2016-05-30T15:27:20.240Z")
},
"daily_unique" : 4,
"daily_gross" : 4,
"hourly" : {
"17" : {
"unique" : 4,
"gross" : 4
}
},
"publisher" : {
"738" : {
"daily_unique" : 3,
"daily_gross" : 3,
"hourly" : {
"17" : {
"unique" : 3,
"gross" : 3
}
}
},
"43" : {
"daily_unique" : 1,
"daily_gross" : 1,
"hourly" : {
"17" : {
"unique" : 1,
"gross" : 1
}
}
}
}
},
{
"_id" : "20160530/78220/59",
"metadata" : {
"date" : "2016-05-30",
"offer" : "78220",
"adv" : 59,
"update" : ISODate("2016-05-30T15:24:49.900Z")
},
"daily_unique" : 2,
"daily_gross" : 2,
"hourly" : {
"17" : {
"unique" : 2,
"gross" : 2
}
},
"publisher" : {
"43" : {
"daily_unique" : 2,
"daily_gross" : 2,
"hourly" : {
"17" : {
"unique" : 2,
"gross" : 2
}
}
}
}
}
The first document has data from publishers 738 and 43, but the second has data only from 43.
So, when I want to sum all data for publisher 738, I need to sum daily_gross or daily_unique only where that publisher is present in the document, as in the first document.
I have tried some different approaches with $exists and $cond, but I am not getting results.
aggregate(
['$match' => ['metadata.date' => date('Y-m-d')]],
['$group' => [
'_id' => '$metadata.offer',
'daily_u' => ['$sum' => '$daily_unique']
],
])
which gives me
[
0 => [
'_id' => '108107'
'daily_u' => 4
]
1 => [
'_id' => '78220'
'daily_u' => 2
]
]
When I try to dive deep in publisher I cannot get the results I want:
// Attempt to aggregate per offer only where publisher 738 is present.
aggregate(
['$match' => ['metadata.date' => date('Y-m-d')]],
['$group' => [
'_id' => '$metadata.offer',
'daily_u' => [
'$sum' => [
'$cond' => [
// NOTE(review): 'then' and 'else' are nested inside the 'if' array here
// instead of being siblings of 'if' inside the '$cond' array.
// NOTE(review): $exists is documented as a query operator; verify whether
// it is accepted as a condition inside $cond.
'if' => [
'publisher.738' => ['$exists' => true],
// The branches yield the constants 1 / 0, so this would count documents
// rather than add up a daily_unique value.
'then' => 1,
'else' => 0
]
]
]
],
]]
)
But I cannot get the daily totals per publisher.
It gets even more complicated when I try to get hourly data.
Can anybody point me in the right direction?
Thanks in advance.
Please help me with indexes in MongoDB.
There is a collection with 800,000 documents.
There is a query that takes very long to run: about 5 seconds!
{
"$or":[
{
"performer":"534ba408f9cd0ecb51711673",
"$or":[
{
"performersRole":"534ba30bf9cd0ec151a69522"
},
{
"performersRole":{
"$exists":false
}
}
]
},
{
"performersRole":"534ba30bf9cd0ec151a69522",
"notShowInToDo":{
"$ne":true
}
}
],
"taskTime":{
"$gte":1409774400,
"$lt":1409860799
},
"$and":[
{
"$or":[
{
"department":{
"$in":[
"5356134ef9cd0e4805672a15",
"53561368f9cd0e4b05645f3f",
"53a0357ff9cd0e670537c4b7",
"53a03594f9cd0e6705389449"
]
}
},
{
"department":{
"$exists":false
}
}
]
},
{
"$or":[
{
"salon":"534f7b3bf9cd0e311e77896f"
},
{
"salon":{
"$exists":false
}
}
]
}
],
"isDone":{
"$ne":true
}
}
Which indexes should I add to optimize it? Thanks for any advice!
Almost all documents have this format:
{
"_id": "541da66cf535a4a8569dd0ed",
"title": "test task",
"taskTime": NumberLong(1411229292),
"client": "53f876b2f535a4187f9e1264",
"salon": "534f7c3cf9cd0e91206dd948",
"track": "541da66cf535a4a8569dd0ec",
"department": "53a0357ff9cd0e670537c4b7",
"type": "invitePBP",
"performersRole": [
"534ba30bf9cd0ec151a69522"
],
"notShowInToDo": true,
"#createTime": NumberLong(1411229292),
"#updateTime": NumberLong(1411229292)
}
Before creating the index, consider the following points:
1. Reduce the nesting depth of the query as much as you can;
2. Avoid using $and and $or if possible;
3. Avoid using $exists if possible, as it will access the collection even when there is an index on the field;
4. Design the index according to the order in which you want the conditions to be evaluated.
Assuming I have understood your requirements correctly, I restructured the query as below:
// Flattened form of the original filter: a time window, "not done", and
// null-or-value membership tests in place of the nested $or / $exists pairs.
var query = (function () {
  var ROLE_ID = "534ba30bf9cd0ec151a69522";

  // Builds { $in: [null, ...values] }.
  function nullOrAnyOf(values) {
    return { "$in": [null].concat(values) };
  }

  return {
    "taskTime": { "$gte": 1409774400, "$lt": 1409860799 },
    "isDone": { "$ne": true },
    "$and": [
      { "salon": nullOrAnyOf(["534f7b3bf9cd0e311e77896f"]) },
      {
        "department": nullOrAnyOf([
          "5356134ef9cd0e4805672a15",
          "53561368f9cd0e4b05645f3f",
          "53a0357ff9cd0e670537c4b7",
          "53a03594f9cd0e6705389449"
        ])
      }
    ],
    "$or": [
      {
        "performer": "534ba408f9cd0ecb51711673",
        "performersRole": nullOrAnyOf([ROLE_ID])
      },
      {
        "performersRole": ROLE_ID,
        "notShowInToDo": { "$ne": true }
      }
    ]
  };
}());
Be careful with null:
Note that {"salon" : {"$in" : [ null, "534f7b3bf9cd0e311e77896f" ]}} can be answered entirely from the index {salon:1} in v2.4 but will still access the collection in v2.6. I don't know the exact reason, but my guess is that the definition of null may have changed (to include the undefined type).
To avoid this issue in v2.6, an alternative is to initialize the field salon with a real value instead of leaving it unset.
You can try creating the index this way; your feedback is appreciated, since I don't have the real data to test with.
db.c.ensureIndex({taskTime:1, isDone:1, salon:1, department:1}, {name:"bigIndex"});
Adding my test result - 1,010,000 documents
// Test filter used for the explain() run: same shape as the rewritten query,
// with a wider time window and extra salon / department / role ids.
var a = (function () {
  var ROLE_ID = "5420ecdc218ba2fb5353ad5e";

  // Builds { $in: [null, ...values] }.
  function nullOrAnyOf(values) {
    return { "$in": [null].concat(values) };
  }

  return {
    "taskTime": { "$gte": 1410443932781, "$lt": 1412443932781 },
    "isDone": { "$ne": true },
    "$and": [
      {
        "salon": nullOrAnyOf([
          "534f7b3bf9cd0e311e77896f",
          "5420ecdc218ba2fb5353ad5b"
        ])
      },
      {
        "department": nullOrAnyOf([
          "5356134ef9cd0e4805672a15",
          "53561368f9cd0e4b05645f3f",
          "53a0357ff9cd0e670537c4b7",
          "5420ecdc218ba2fb5353ad5d",
          "53a03594f9cd0e6705389449"
        ])
      }
    ],
    "$or": [
      {
        "performer": "534ba408f9cd0ecb51711673",
        "performersRole": nullOrAnyOf([ROLE_ID])
      },
      {
        "performersRole": ROLE_ID,
        "notShowInToDo": { "$ne": true }
      }
    ]
  };
}());
db.c.find(a).explain();
{
"cursor" : "BtreeCursor bigIndex",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 54290,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 54290,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 425,
"nChunkSkips" : 0,
"millis" : 261,
"indexBounds" : {
"taskTime" : [
[
1410443932781,
1412443932781
]
],
"isDone" : [
[
{
"$minElement" : 1
},
true
],
[
true,
{
"$maxElement" : 1
}
]
],
"salon" : [
[
null,
null
],
[
"534f7b3bf9cd0e311e77896f",
"534f7b3bf9cd0e311e77896f"
],
[
"5420ecdc218ba2fb5353ad5b",
"5420ecdc218ba2fb5353ad5b"
]
],
"department" : [
[
null,
null
],
[
"5356134ef9cd0e4805672a15",
"5356134ef9cd0e4805672a15"
],
[
"53561368f9cd0e4b05645f3f",
"53561368f9cd0e4b05645f3f"
],
[
"53a0357ff9cd0e670537c4b7",
"53a0357ff9cd0e670537c4b7"
],
[
"53a03594f9cd0e6705389449",
"53a03594f9cd0e6705389449"
],
[
"5420ecdc218ba2fb5353ad5d",
"5420ecdc218ba2fb5353ad5d"
]
]
},
"server" : "Mars-PC:27017",
"filterSet" : false
}