Mongo aggregation $group conditional $eq for string field - mongodb

I use Mongo v2.2.0.
I wrote the query but the main issue is $arrayElemAt. Standard replacement with $unwind-$first doesn't work for me and I suppose that better solution exists. I have a restriction to run this aggregation pipeline as a single operation instead of running a query for positive and negative data and later merge results in a code. I need to apply a $sort, $limit and $skip for the resulting query to restrict count of words to be used for filtering records from other collection and combine data from both collections in Java code.
Aggregation query:
[
{
$match: {
"merchantId": ObjectId("59520e6ccc7a701fbed31f94"),
"date": {
"$gte": NumberLong(1389644800000),
"$lt": NumberLong(1502409599999)
},
"isbn": "a123",
}
},
{
$project: {
"word": 1,
"sentence": 1,
"type": 1,
"date": 1
}
},
{
$sort: {
"date": -1
}
},
{
$group: {
"_id": {
"word": "$word",
"type": "$type"
},
"date": {
$max: "$date"
},
"sentence": {
$first: "$sentence"
},
"sentenceCount": {
"$sum": 1
}
},
},
{
$group: {
"_id": "$_id.word",
"word": { $first: "$_id.word"},
"positiveCount": {$sum: {$cond: [{$eq: ["$_id.type", "positive"]}, "$sentenceCount", 0]}},
"count": {$sum: "$sentenceCount"},
"positiveSentence": {
"$push": {
"$cond": [{$eq: ["$_id.type", "positive"]}, "$sentence", "$noval"]
}
},
"negativeSentence": {
"$push": {
"$cond": [{$eq: ["$_id.type", "negative"]}, "$sentence", "$noval"]
}
}
}
},
{
$project: {
"_id": 0,
"word": 1,
"sentimentPercentage": {$cond: [{$eq: ["$count", 0]}, 0, {$multiply: [{$divide: ["$positiveCount", "$count"]}, 100]}]},
"positiveSentence": {$arrayElemAt: ["$positiveSentence", 0]},
"negativeSentence": {$arrayElemAt: ["$negativeSentence", 0]},
}
},
{
$sort: {
sentimentPercentage: -1
}
},
{
$limit: 50
}
]
Collection document "schema":
{
"_id" : ObjectId("59887424e4b099e00724aa44"),
"merchantId" : ObjectId("59520e6ccc7a701fbed31f94"),
"isbn" : "a123",
"sentence" : "Great, friendly service.",
"word" : "service",
"type" : "positive",
"date" : NumberLong(1466809200000),
}
Expected output:
{
"word" : "expectations",
"sentimentPercentage" : 100.0,
"positiveSentence" : "The service exceeded our expectations."
},
{
"word" : "representative",
"sentimentPercentage" : 87.5,
"positiveSentence" : "Excellent local representative, met the flight and gave us all the relevant information to ensure a great holiday.",
"negativeSentence" : "The representative at resort was poor."
},
{
"word" : "seats",
"sentimentPercentage" : 0.0,
"negativeSentence" : "Long delay and pre booked seats were lost ."
}
Please, could you advise me how to replace $arrayElemAt operator or even better how to optimise this query to the desired output using just features of Mongo <=2.2.0?

This appears to give me reasonable results. I think it will not work properly, though, in cases where you have no positive or no negative sentence because of the $unwind stage which does not support the preserveNullAndEmptyArrays parameter in v2.2...
db.getCollection('test').aggregate([
{
$project: {
"word": 1,
"sentence": 1,
"type": 1,
"date": 1
}
},
{
$sort: {
"date": -1
}
},
{
$group: {
"_id": {
"word": "$word",
"type": "$type"
},
"date": {
$max: "$date"
},
"sentence": {
$first: "$sentence"
},
"sentenceCount": {
"$sum": 1
}
},
},
{
$group: {
"_id": "$_id.word",
"word": { $first: "$_id.word"},
"positiveCount": {$sum: {$cond: [{$eq: ["$_id.type", "positive"]}, "$sentenceCount", 0]}},
"count": {$sum: "$sentenceCount"},
"positiveSentence": {
"$push": {
"$cond": [{$eq: ["$_id.type", "positive"]}, "$sentence", "$noval"]
}
},
"negativeSentence": {
"$push": {
"$cond": [{$eq: ["$_id.type", "negative"]}, "$sentence", "$noval"]
}
}
}
},
{ $unwind: "$positiveSentence" },
{ $group:
{
"_id": "$_id",
"word": { $first: "$word" },
"count": { $first: "$count" },
"positiveCount": { $first: "$positiveCount" },
"positiveSentence": { $first: "$positiveSentence" },
"negativeSentence": { $first: "$negativeSentence" },
}
},
{ $unwind: "$negativeSentence" },
{ $group:
{
"_id": "$_id",
"word": { $first: "$word" },
"count": { $first: "$count" },
"positiveCount": { $first: "$positiveCount" },
"positiveSentence": { $first: "$positiveSentence" },
"negativeSentence": { $first: "$negativeSentence" },
}
},
{
$project: {
"_id": 0,
"word": 1,
"sentimentPercentage": {$cond: [{$eq: ["$count", 0]}, 0, {$multiply: [{$divide: ["$positiveCount", "$count"]}, 100]}]},
"positiveSentence": 1,
"negativeSentence": 1
}
}
])
You might be able to simplify this further, e.g. get rid of the first projection and grouping stage. I can perhaps look into that in a few hours if you'd like me to.

Related

Reducing an array of values into object with their count using aggregation framework

We are using MongoDB to record statistics. The approach is to record each action for an object in its own document and later aggregate them on hourly basis and store them in different collection. Sample documents are below:
[{
"_id" : ObjectId("5e05de1e86029610dc2c6f9c"),
"object_type" : 1,
"object_id" : 1003,
"browser" : "chrome",
"os" : "osx",
"device" : "android",
"category" : 3,
"country" : "gb",
"action" : "impression",
"date_added" : ISODate("2019-12-26T19:00:00.000Z")
},{
"_id" : ObjectId("5e06226586029610db417b7a"),
"object_type" : 1,
"object_id" : 1003,
"browser" : "firefox",
"os" : "osx",
"device" : "lg_tv",
"category" : 1,
"country" : "pe",
"action" : "impression",
"date_added" : ISODate("2019-12-25T19:00:00.000Z")
},{
"_id" : ObjectId("5e06226586029610db417b7b"),
"object_type" : 1,
"object_id" : 1009,
"browser" : "uc_browser",
"os" : "osx",
"device" : "android",
"category" : 4,
"country" : "ru",
"action" : "view",
"date_added" : ISODate("2019-12-25T19:00:00.000Z")
}]
Output should be:
[{
"object_id": 1003,
"object_type": 1,
"action": "impression",
"total": 2,
"date": "2019-12-26 19:00:00",
"browsers": { "firefox": 1, "chrome": 1 },
"systems": { "osx": 2 },
"countries": { "gb": 1, "pe": 1 },
"devices": { "android": 1, "lg_tv": 1 },
"categories": { "3": 1, "1": 1 }
},
{
"object_id": 1009,
"object_type": 1,
"action": "view",
"total": 1,
"date": "2019-12-26 19:00:00",
"browsers": { "uc_browser": 1 },
"systems": { "osx": 1 },
"countries": { "ru": 1 },
"devices": { "android": 1 },
"categories": { "4": 1 }
}]
Aggregation pipeline:
[
{
"$match": {
"date_added": {
"$gte": {
"$date": {
"$numberLong": "1576820825000"
}
}
}
}
},
{
"$group": {
"_id": {
"object_id": "$object_id",
"object_type": "$object_type",
"action": "$action",
"date": {
"$dateToString": {
"format": "%Y-%m-%d %H-00-00",
"date": "$date_added"
}
}
},
"total": {
"$sum": 1
},
"countries": {
"$push": "$country"
}
}
},
{
"$project": {
"action": "$_id.action",
"object_id": "$_id.object_id",
"object_type": "$_id.object_type",
"date": "$_id.date",
"total": 1,
"countries": 1,
"systems": 1,
"devices": 1,
"categories": 1,
"_id": 0
}
},
{
"$sort": {
"total": -1
}
}
]
This pipeline provides total of an object for a certain action on given hour and push each country into countries array - for readability removed other indexes from $group.
I’m stuck at transforming countries array into desired object. Two question popped in my mind.
Is it possible with single aggregation pipeline?
Should I just return documents using above pipeline and process rest of the indexes with scripting?
It's possible, but a bit tedious...
You need to $group each new field in the next stage and acumulate previous fields.
ASSUMPTION
Your expected result for "object_id": 1003 with total:2, but date_added is 2019-12-26 and 2019-12-25. So, I've changed to 2019-12-26 both documents.
db.collection.aggregate([
{
"$match": {
"date_added": {
"$gte": {
"$date": {
"$numberLong": "1576820825000"
}
}
}
}
},
{
$group: {
_id: {
"object_id": "$object_id",
"object_type": "$object_type",
"action": "$action",
"date": {
"$dateToString": {
"format": "%Y-%m-%d %H-00-00",
"date": "$date_added",
timezone: "GMT"
}
}
},
data: {
"$push": "$$ROOT"
},
total: {
$sum: 1
}
}
},
{
$unwind: "$data"
},
{
$group: {
_id: {
_id: "$_id",
"tmp": "$data.category"
},
data: {
$push: "$data"
},
total: {
$first: "$total"
},
count: {
$sum: 1
}
}
},
{
$group: {
_id: "$_id._id",
data: {
$push: "$data"
},
total: {
$first: "$total"
},
categories: {
$push: {
k: {
$toString: "$_id.tmp"
},
v: "$count"
}
}
}
},
{
$unwind: "$data"
},
{
$unwind: "$data"
},
{
$group: {
_id: {
_id: "$_id",
"tmp": "$data.device"
},
categories: {
$first: "$categories"
},
data: {
$push: "$data"
},
total: {
$first: "$total"
},
count: {
$sum: 1
}
}
},
{
$group: {
_id: "$_id._id",
data: {
$push: "$data"
},
total: {
$first: "$total"
},
categories: {
$first: "$categories"
},
devices: {
$push: {
k: "$_id.tmp",
v: "$count"
}
}
}
},
{
$unwind: "$data"
},
{
$unwind: "$data"
},
{
$group: {
_id: {
_id: "$_id",
"tmp": "$data.country"
},
devices: {
$first: "$devices"
},
categories: {
$first: "$categories"
},
data: {
$push: "$data"
},
total: {
$first: "$total"
},
count: {
$sum: 1
}
}
},
{
$group: {
_id: "$_id._id",
data: {
$push: "$data"
},
total: {
$first: "$total"
},
devices: {
$first: "$devices"
},
categories: {
$first: "$categories"
},
countries: {
$push: {
k: "$_id.tmp",
v: "$count"
}
}
}
},
{
$unwind: "$data"
},
{
$unwind: "$data"
},
{
$group: {
_id: {
_id: "$_id",
"tmp": "$data.os"
},
countries: {
$first: "$countries"
},
devices: {
$first: "$devices"
},
categories: {
$first: "$categories"
},
data: {
$push: "$data"
},
total: {
$first: "$total"
},
count: {
$sum: 1
}
}
},
{
$group: {
_id: "$_id._id",
data: {
$push: "$data"
},
total: {
$first: "$total"
},
countries: {
$first: "$countries"
},
devices: {
$first: "$devices"
},
categories: {
$first: "$categories"
},
systems: {
$push: {
k: "$_id.tmp",
v: "$count"
}
}
}
},
{
$unwind: "$data"
},
{
$unwind: "$data"
},
{
$group: {
_id: {
_id: "$_id",
"tmp": "$data.browser"
},
systems: {
$first: "$systems"
},
countries: {
$first: "$countries"
},
devices: {
$first: "$devices"
},
categories: {
$first: "$categories"
},
data: {
$push: "$data"
},
total: {
$first: "$total"
},
count: {
$sum: 1
}
}
},
{
$group: {
_id: "$_id._id",
data: {
$push: "$data"
},
total: {
$first: "$total"
},
systems: {
$first: "$systems"
},
countries: {
$first: "$countries"
},
devices: {
$first: "$devices"
},
categories: {
$first: "$categories"
},
browsers: {
$push: {
k: "$_id.tmp",
v: "$count"
}
}
}
},
{
$project: {
_id: 0,
action: "$_id.action",
date: "$_id.date",
object_id: "$_id.object_id",
object_type: "$_id.object_type",
total: 1,
categories: {
$arrayToObject: "$categories"
},
countries: {
$arrayToObject: "$countries"
},
devices: {
$arrayToObject: "$devices"
},
systems: {
$arrayToObject: "$systems"
},
browsers: {
$arrayToObject: "$browsers"
}
}
},
{
$sort: {
object_id: 1,
date: 1
}
}
])
MongoPlayground
Note: Other approach was to use $facet and create fields separately and then merge them into final object, but MongoPlayground sometimes worked buggy (click Run button several times and you get different result)

Get last and minimal values from grouped documents

My document model looks like:
{
"model": "ABC123",
"date": "2018-12-24T23:00:00.000+0000",
"price": "2000" ,
}
I would like to retrive collection to get array of documents:
[
{ "_id" : "ABC123", "newestDate" : ISODate("2018-12-26T23:00:00Z"), "newestPrice" : 2801.33, "lowestPriceAtAll": 1300 },
{ "_id" : "ABC124", "newestDate" : ISODate("2018-12-26T23:00:00Z"), "newestPrice" : 2801.33, "lowestPriceAtAll": 990}
]
where _id is model field, newestPrice is price of newest document (grouped by model) and lowestPriceAtAll is lowest price in all documents with the same model.
I grilled two queries.
First is to find lowest price documents:
offers.aggregate([
{ $sort: { "model": 1, "price": 1 }},
{
$group: {
_id: "$model",
lowestPrice: { "$first": "$price" },
lowestPriceDate: { "$first": "$date"},
}
}
])
the second is to find newest documents:
offers.aggregate([
{ $sort: { "model": 1, "date": -1 }},
{
$group: {
_id: "$model",
newestDate: { "$first": "$date" },
newestPrice: { "$first": "$price"},
}
}
])
Is it possible to merge these two queries into one? (the most important thing is that documents have to be grouped by model field).
you can use $facet
db.offers.aggregate([
{$facet :{
lowest: [
{ $sort: { "model": 1, "price": 1 }},
{
$group: {
_id: "$model",
lowestPrice: { "$first": "$price" },
lowestPriceDate: { "$first": "$date"},
}
}
],
newest: [
{ $sort: { "model": 1, "date": -1 }},
{
$group: {
_id: "$model",
newestDate: { "$first": "$date" },
newestPrice: { "$first": "$price"},
}
}
]
}}
])

How to get count of multiple fields based on value in mongodb?

Collection exists as below:
[
{"currentLocation": "Chennai", "baseLocation": "Bengaluru"},
{"currentLocation": "Chennai", "baseLocation": "Bengaluru"},
{"currentLocation": "Delhi", "baseLocation": "Bengaluru"},
{"currentLocation": "Chennai", "baseLocation": "Chennai"}
]
Expected Output:
[
{"city": "Chennai", "currentLocationCount": 3, "baseLocationCount": 1},
{"city": "Bengaluru", "currentLocationCount": 0, "baseLocationCount": 3},
{"city": "Delhi", "currentLocationCount": 1, "baseLocationCount": 0}
]
What I have tried is:
db.getCollection('users').aggregate([{
$group: {
"_id": "$baselocation",
baseLocationCount: {
$sum: 1
}
},
}, {
$project: {
"_id": 0,
"city": "$_id",
"baseLocationCount": 1
}
}])
Got result as:
[
{"city": "Chennai", "baseLocationCount": 1},
{"city": "Bengaluru", "baseLocationCount": "3"}
]
I'm not familiar with mongo, so any help?
MongoDB Version - 3.4
Neil Lunn and myself had a lovely argument over this topic the other day which you can read all about here: Group by day with Multiple Date Fields.
Here are two solutions to your precise problem.
The first one uses the $facet stage. Bear in mind, though, that it may not be suitable for large collections because $facet produces a single (potentially huge) document that might be bigger than the current MongoDB document size limit of 16MB (which only applies to the result document and wouldn't be a problem during pipeline processing anyway):
collection.aggregate(
{
$facet:
{
"current":
[
{
$group:
{
"_id": "$currentLocation",
"currentLocationCount": { $sum: 1 }
}
}
],
"base":
[
{
$group:
{
"_id": "$baseLocation",
"baseLocationCount": { $sum: 1 }
}
}
]
}
},
{ $project: { "result": { $setUnion: [ "$current", "$base" ] } } }, // merge results into new array
{ $unwind: "$result" }, // unwind array into individual documents
{ $replaceRoot: { newRoot: "$result" } }, // get rid of the additional field level
{ $group: { "_id": "$_id", "currentLocationCount": { $sum: "$currentLocationCount" }, "baseLocationCount": { $sum: "$baseLocationCount" } } }, // group into final result)
{ $project: { "_id": 0, "city": "$_id", "currentLocationCount": 1, "baseLocationCount": 1 } } // group into final result
)
The second one works based on the $map stage instead:
collection.aggregate(
{
"$project": {
"city": {
"$map": {
"input": [ "current", "base" ],
"as": "type",
"in": {
"type": "$$type",
"name": {
"$cond": {
"if": { "$eq": [ "$$type", "current" ] },
"then": "$currentLocation",
"else": "$baseLocation"
}
}
}
}
}
}
},
{ "$unwind": "$city" },
{
"$group": {
"_id": "$city.name",
"currentLocationCount": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$city.type", "current" ] },
"then": 1,
"else": 0
}
}
},
"baseLocationCount": {
"$sum": {
"$cond": {
"if": { "$eq": [ "$city.type", "base" ] },
"then": 1,
"else": 0
}
}
}
}
}
)

MongoDB: error with $divide and $multiply

I'm creating a MongoDB aggregation pipeline and I'm stuck at this stage:
$group: {
_id: {checkType: "$_id.checkType", resultCode: "$_id.resultCode"},
count: { $sum: "$count" },
ctv: { $sum: "$ctv" },
perc:{$multiply:[{$divide:["$ctv","$count"]},100]},
weight: { $divide: [ "$ctv", "$count"] },
details: { $push: "$$ROOT" }
}
It gives the error "The $multiply accumulator is a unary operator". Similarly if I remove the line with $multiply I get "The $divide accumulator is a unary operator" on the subsequent line. I cannot find a description for this error on the Net. What's wrong in my sintax?
The arithmetic operators cannot be used as $group accumulators. Move them to another $project pipeline stage as:
db.collection.aggregate([
{ "$group": {
"_id": { "checkType": "$_id.checkType", "resultCode": "$_id.resultCode" },
"count": { "$sum": "$count" },
"ctv": { "$sum": "$ctv" },
"details": { "$push": "$$ROOT" }
} },
{ "$project": {
"count": 1,
"details": 1,
"ctv": 1,
"perc": { "$multiply": [ { "$divide": ["$ctv","$count"] }, 100 ] },
"weight": { "$divide": ["$ctv", "$count"] },
} }
])
or
if using MongoDB 3.4 and above, use $addFields instead of $project
db.collection.aggregate([
{ "$group": {
"_id": { "checkType": "$_id.checkType", "resultCode": "$_id.resultCode" },
"count": { "$sum": "$count" },
"ctv": { "$sum": "$ctv" },
"details": { "$push": "$$ROOT" }
} },
{ "$addFields": {
"perc": { "$multiply": [ { "$divide": ["$ctv","$count"] }, 100 ] },
"weight": { "$divide": ["$ctv", "$count"] },
} }
])

Match the field in mongodb

I have mongodb sample data result like this:
{
"_id" : {
"month" : 3,
"day" : 24,
"year" : 2017
},
"commodity" : [
{
"commodityId" : ObjectId("58d434c30da1364f1e2d682d"),
"commodityStock" : "88889s"
}
],
"totalStock" : 0,
"count" : 1.0 }
my question is, How can i get the result where month = 3 with $match?
below is my query:
db.orders.aggregate(
[
{ $match : {_id.month : 3}},
{
$group : {
_id : { month: { $month: "$createdAt" }, day: { $dayOfMonth: "$createdAt" }, year: { $year: "$createdAt" } },
commodity : {$push : {
'commodityId' : "$commodity",
'commodityStock' : "$stock",
}
},
totalStock: { $sum: "$stock" },
count: { $sum: 1 }
}
}
]
)
You could use a $redact pipeline which incorporates the functionality of $project and $match so that you can filter the documents in the collection by using a logical condition with the $cond operator and uses the special operations $$KEEP to "keep" the document where the logical condition is true or $$PRUNE to "remove" the document where the condition was false.
db.orders.aggregate([
{
"$redact": {
"$cond": [
{ "$eq": [{ "$month": "$createdAt" }, 3]},
"$$KEEP",
"$$PRUNE"
]
}
},
{
"$group": {
"_id": {
"month": { "$month": "$createdAt" },
"day": { "$dayOfMonth": "$createdAt" },
"year": { "$year": "$createdAt" }
},
"commodity": {
"$push": {
"commodityId": "$commodity",
"commodityStock": "$stock",
}
},
"totalStock": { "$sum": "$stock" },
"count": { "$sum": 1 }
}
}
])
Keep in mind that $redact does not use indexes, it performs a collection scan, but if you need to take advantage of indexes use the $project and $match pipeline stages as:
db.orders.aggregate([
{
"$project": {
"createdAt": 1,
"month": { "$month": "$createdAt" },
"day": { "$dayOfMonth": "$createdAt" },
"year": { "$year": "$createdAt" },
"commodity": 1,
"stock": 1
}
},
{ "$match": { "month": 3 } },
{
"$group": {
"_id": { "month": "$month", "day": "$day", "year": "$year" },
"commodity": {
"$push": {
"commodityId": "$commodity",
"commodityStock": "$stock",
}
},
"totalStock": { "$sum": "$stock" },
"count": { "$sum": 1 }
}
}
])