MongoDB Aggregation: grouping data within date groups

I have many documents in a MongoDB database which look like the following four documents (note the first 3 are Feb 2017 and the last one is March 2017):
{"_id": 0,
"date": ISODate("2017-02-01T00:00:00Z),
"item": "Basketball",
"category": "Sports"}
{"_id": 1,
"date": ISODate("2017-02-13T00:00:00Z),
"item": "Football",
"category": "Sports"}
{"_id": 2,
"date": ISODate("2017-02-22T00:00:00Z),
"item": "Massage",
"category": "Leisure"}
{"_id": 3,
"date": ISODate("2017-03-05T00:00:00Z),
"item": "Golf club",
"category": "Sports"}
I'm trying to group the items by MONTH/YEAR and within that, group the items by CATEGORY. So the aggregation pipeline should return something that looks like this for the four docs above:
{"_id": {
"month": 2,
"year": 2017
},
"data": [
{"category": "Sports",
"items": ["Basketball", "Football"]
},
{"category": "Leisure",
"items": ["Massage"]
}
]
},
{"_id": {
"month": 3,
"year": 2017
},
"data": [
{"category": "Sports",
"items": ["Golf Club"]
}
]
}
I also want the returned cursor to be in order with year as the primary sort and month as the secondary sort.

Figured it out. Here's the answer using the pymongo api:
from bson.son import SON
cursor = db.collection.aggregate([
    # First group: one document per (month, year, category), collecting the items.
    {'$group': {
        '_id': {'month': {'$month': '$date'},
                'year': {'$year': '$date'},
                'category': '$category'},
        'items': {'$push': '$item'}
    }},
    # Second group: regroup by (month, year) and push each category with its items.
    {'$group': {
        '_id': {'month': '$_id.month',
                'year': '$_id.year'},
        'data': {'$push': {
            'category': '$_id.category',
            'items': '$items'
        }}
    }},
    # SON preserves key order, so year is the primary sort and month the secondary.
    {'$sort': SON([('_id.year', 1), ('_id.month', 1)])}
])
my_data = list(cursor)
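A quick illustrative note on the result shape (not part of the original answer): SON is used in the $sort stage so the key order is kept intact, with year as the primary key and month as the secondary one, and the groups can then be consumed like any other cursor result:

# Sketch only: print each month/year group and its per-category items,
# using the field names produced by the pipeline above.
for group in my_data:
    print(f"{group['_id']['year']}-{group['_id']['month']:02d}")
    for entry in group['data']:
        print(f"  {entry['category']}: {entry['items']}")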

Related

Mongodb Aggregations - Group by date including condition

I have a series of documents gathered by aggregation grouping. This is the result for one document:
{
"_id": {
"ip": "79.xxx.xxx.117",
"myDate": "2022-10-19"
},
"date": "2022-10-19",
"allVisitedPages": [
{
"page": "/",
"time": {
"time": "2022-10-19T11:35:44.655Z",
"tz": "-120",
"_id": "634fe1100a011986b7137da0"
}
},
{
"page": "/2",
"time": {
"time": "2022-10-19T12:14:29.536Z",
"tz": "-120",
"_id": "634fea257acb264f23d421f1"
}
},
{
"page": "/",
"time": {
"time": "2022-10-19T15:37:30.002Z",
"tz": "-120",
"_id": "634fea266001ea364eeb38ea"
}
},
],
"visitedPages": 3,
"createdAt": "2022-10-19T11:35:44.920Z"
},
I want to get this (in this case 2 documents, as the time difference between array positions 2 and 3 is greater than 2 hours):
{
"_id": {
"ip": "79.xxx.xxx.117",
"myDate": "2022-10-19"
},
"date": "2022-10-19",
"allVisitedPages": [
{
"page": "/",
"durationInMinutes": "39",
"time": {
"time": "2022-10-19T11:35:44.655Z",
"tz": "-120",
"_id": "634fe1100a011986b7137da0"
}
},
{
"page": "/2",
"durationInMinutes": "2",
"time": {
"time": "2022-10-19T12:14:29.536Z",
"tz": "-120",
"_id": "634fea257acb264f23d421f1"
}
}
],
"visitedPages": 2,
},
{
"_id": {
"ip": "79.xxx.xxx.117",
"myDate": "2022-10-19"
},
"date": "2022-10-19",
"allVisitedPages": [
{
"page": "/",
"durationInMinutes": "2",
"time": {
"time": "2022-10-19T15:37:30.002Z",
"tz": "-120",
"_id": "634fea266001ea364eeb38ea"
}
},
],
"visitedPages": 1,
},
I want to start a new grouping document whenever the time between one array position and the following array position is greater than 2 hours. For the last array position, the duration should always show "2".
I tried $divide and $dateDiff, but this is not possible in the $group stage as it is a unary operator. Another approach I tried was to calculate the difference between the start and end times by dividing, but how do I execute this at the array level in the $group stage? Maybe someone could point me in the right direction, if this is possible at all?
You can group and then reduce, but another option is to use $setWindowFields to calculate your grouping index before grouping:
db.collection.aggregate([
{$setWindowFields: {
partitionBy: {$concat: ["$ip", "$date"]},
sortBy: {"time.time": 1},
output: {prevtime: {
$push: "$time.time",
window: {documents: [-1, "current"]}
}}
}},
{$addFields: {
minutesDiff: {
$toInt: {
$dateDiff: {
startDate: {$first: "$prevtime"},
endDate: {$last: "$prevtime"},
unit: "minute"
}
}
}
}},
{$addFields: {deltaIndex: {$cond: [{$gt: ["$minutesDiff", 120]}, 1, 0]}}},
{$setWindowFields: {
partitionBy: {$concat: ["$ip", "$date"]},
sortBy: {"time.time": 1},
output: {
groupIndex: {
$sum: "$deltaIndex",
window: {documents: ["unbounded", "current"]}
},
duration: {
$push: "$minutesDiff",
window: {documents: ["current", 1]}
}
}
}
},
{$set: {
duration: {
$cond: [
{$and: [
{$eq: [{$size: "$duration"}, 2]},
{$lte: [{$last: "$duration"}, 120]}
]},
{$last: "$duration"},
2
]
}
}},
{$group: {
_id: {ip: "$ip", myDate: "$date", groupIndex: "$groupIndex"},
date: {$first: "$date"},
allVisitedPages: {$push: {page: "$page", time: "$time", duration: "$duration"}},
visitedPages: {$sum: 1}
}},
{$unset: "_id.groupIndex"}
])
See how it works on the playground example
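Note that $setWindowFields and $dateDiff both require MongoDB 5.0 or newer. For anyone driving this from Python, here is a condensed, non-authoritative PyMongo sketch of the same core idea (a gap flag turned into a running sum that becomes the session index); it mirrors the shell pipeline above, omits the duration handling for brevity, and assumes a collection named visits whose time.time field is stored as a real BSON date:

# Sketch only: mirrors the shell pipeline above, minus the duration handling.
# A per-visit session index (sessionIdx) is incremented whenever the gap to the
# previous visit exceeds 120 minutes, then the visits are grouped by it.
pipeline = [
    {'$setWindowFields': {
        'partitionBy': {'$concat': ['$ip', '$date']},
        'sortBy': {'time.time': 1},
        'output': {'prevTimes': {
            '$push': '$time.time',
            'window': {'documents': [-1, 'current']}
        }}
    }},
    {'$addFields': {'gapMinutes': {'$dateDiff': {
        'startDate': {'$first': '$prevTimes'},
        'endDate': {'$last': '$prevTimes'},
        'unit': 'minute'
    }}}},
    {'$addFields': {'newSession': {'$cond': [{'$gt': ['$gapMinutes', 120]}, 1, 0]}}},
    {'$setWindowFields': {
        'partitionBy': {'$concat': ['$ip', '$date']},
        'sortBy': {'time.time': 1},
        'output': {'sessionIdx': {
            '$sum': '$newSession',
            'window': {'documents': ['unbounded', 'current']}
        }}
    }},
    {'$group': {
        '_id': {'ip': '$ip', 'myDate': '$date', 'sessionIdx': '$sessionIdx'},
        'date': {'$first': '$date'},
        'allVisitedPages': {'$push': {'page': '$page', 'time': '$time'}},
        'visitedPages': {'$sum': 1}
    }}
]
sessions = list(db.visits.aggregate(pipeline))  # collection name is an assumption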

Migrate to new document structure in mongo 3.6

I have to migrate data from a structure from
{
"_id": "some-id",
"_class": "org.some.class",
"number": 1015,
"timestamp": {"$date": "2020-09-05T12:08:02.809Z"},
"cost": 0.9200000166893005
}
to
{"_id": {
"productId": "some-id",
"countryCode": "DE"
},
"_class": "org.some.class",
"number": 1015,
"timestamp": {"$date": "2020-09-05T12:08:02.809Z"},
"cost": 0.9200000166893005
}
The change in the new documents is that the plain _id field is replaced by a compound _id object (productId: String, country: String).
The country field is to be filled in for the entire collection with one specific value: DE.
The collection has about 40 million records in the old format and 700k in the new format. I would like to bring these 40 million into the new form. I'm using Mongo 3.6, so I'm a bit limited and I'll probably have to use the aggregation framework to create a completely new collection, and then remove the old one.
I would be grateful for help on how to do this: what the query should look like, and how to keep the 700k documents that are already migrated.
What I have got so far:
db.productDetails.aggregate([
{$match: {_id: {$exists: true}}},
{$addFields: {"_id": {"productId": "$_id", "country": "DE"}}},
{$project: {_id: 1, _class: 1, number: 1, timestamp: 1, cost: 1}},
{$out: "productDetailsV2"}
])
but this solution would only work if I didn't have 700k documents in the new form.
Your query is in the right direction. You may want to modify the $match filter a bit to better catch the old-format documents.
db.collection.aggregate([
{
$match: {
"_id.country": {
$exists: false
}
}
},
{
$addFields: {
"_id": {
"productId": "$_id",
"country": "DE"
}
}
},
{
$project: {
"_id": 1,
"_class": 1,
"number": 1,
"timestamp": 1,
"cost": 1
}
},
{
$out: "productDetailsV2"
}
])
Mongo Playground
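One follow-up on keeping the 700k documents that are already in the new format: the $match above deliberately skips them, and $out writes a brand-new productDetailsV2, so they still have to be copied across once the migration pipeline has run. $merge would do this in one stage but needs MongoDB 4.2+, so on 3.6 a plain client-side copy is one option. A rough PyMongo sketch, where the database name and batch size are assumptions:

from pymongo import MongoClient, InsertOne

# Sketch only: after the aggregation with $out has produced productDetailsV2,
# copy the documents that were already in the new format into it as well.
db = MongoClient()['mydb']  # database name is an assumption
batch = []
for doc in db.productDetails.find({'_id.country': {'$exists': True}}):
    batch.append(InsertOne(doc))
    if len(batch) == 1000:  # arbitrary batch size
        db.productDetailsV2.bulk_write(batch, ordered=False)
        batch = []
if batch:
    db.productDetailsV2.bulk_write(batch, ordered=False)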

MongoDB aggregation, combining 2 arrays in round-robin fashion

I have data in a MongoDB collection that looks something like this:
[
{
"_id": 1,
"type": "big",
"fields": [11, 12, 13],
"items": [21, 22, 23]
},
{
"_id": 2,
"type": "small",
"fields": [14, 15],
"items": [24, 25]
},
{
"_id": 3,
"type": "small",
"fields": [],
"items": [41, 42]
},
{
"_id": 4,
"type": "small",
"fields": [31, 32, 33],
"items": []
}
]
I have been tasked with returning data according to a procedure like this:
For each document in the collection, obtain 1 value from its fields (if there are any) and 1 value from its items (if there are any). Concatenate all of the results into a single array.
One might summarize this as selecting data in "round robin" fashion from two arrays held in each document.
How would I achieve this in a MongoDB aggregation query? This logic is not hard to implement in the client that connects to the Mongo server, but I would like to let Mongo take care of pagination (with $skip and $limit). I am using MongoDB 4.4.
The resulting data would look something like this:
[
{
"value": 11,
"type": "field",
"fromId": 1
},
{
"value": 21,
"type": "item",
"fromId": 1
},
{
"value": 14,
"type": "field",
"fromId": 2
},
{
"value": 24,
"type": "item",
"fromId": 2
},
{
"value": 41,
"type": "item",
"fromId": 3
},
{
"value": 31,
"type": "field",
"fromId": 4
},
]
If I understand your question correctly, this should be a workable pipeline. To make the selection random instead, you would simply adjust the index passed to $arrayElemAt.
https://mongoplayground.net/p/PPXV6fTSwHP
db.collection.aggregate([
{$project: {
types: [
{type: "field", values: "$fields"},
{type: "item", values: "$items"}
]
}},
{$unwind: '$types'},
{$project: {
_id: 0,
value: {$arrayElemAt: ['$types.values', 0]},
type: '$types.type',
fromId: '$_id'
}},
{$match: {
value: {$exists: true}
}}
])
Randomizing would look something like this:
https://mongoplayground.net/p/qi1Ud53J6yv
db.collection.aggregate([
{$project: {
types: [
{type: "field", values: "$fields"},
{type: "item", values: "$items"}
]
}},
{$unwind: '$types'},
{$project: {
_id: 0,
value: {$arrayElemAt: [
'$types.values',
{$floor: {$multiply: [{$rand: {}}, {$size: '$types.values'}]}}
]},
type: '$types.type',
fromId: '$_id'
}},
{$match: {
value: {$exists: true}
}}
])
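Since the question mentions letting MongoDB handle pagination, a hedged PyMongo sketch of the first pipeline above with a deterministic $sort plus $skip/$limit appended might look like the following; the collection name follows the example, while page and page_size are illustrative assumptions:

from bson.son import SON

# Sketch only: same stages as the first pipeline above, with a deterministic
# sort plus $skip/$limit appended so MongoDB handles the pagination.
# "field" sorts before "item", which matches the desired interleaving.
page, page_size = 0, 4  # page size is an arbitrary assumption

cursor = db.collection.aggregate([
    {'$project': {'types': [
        {'type': 'field', 'values': '$fields'},
        {'type': 'item', 'values': '$items'}
    ]}},
    {'$unwind': '$types'},
    {'$project': {
        '_id': 0,
        'value': {'$arrayElemAt': ['$types.values', 0]},
        'type': '$types.type',
        'fromId': '$_id'
    }},
    {'$match': {'value': {'$exists': True}}},
    {'$sort': SON([('fromId', 1), ('type', 1)])},
    {'$skip': page * page_size},
    {'$limit': page_size}
])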

Sorting mongodb aggregation by time, date and month

I've got an aggregation pipeline that groups and sorts data from the last 24 hours. The JSON output generally looks something like the following:
[{"_id": {"day": 28, "hour": 23}, "count": 2},
However, because today is the first of the month, it got mixed up: the above JSON came out as the 'latest', while everything containing "day": 1 was sorted last.
I think I fixed it by adding 'month' to the query, but just to be sure: is this the correct way to sort my aggregation?
pipeline = [
{
"$project": {
"_id": 1,
"mac": 1,
"time": 1
}
},
{"$match": {"time": {"$gt": timenow-timedelta(hours=24)}}},
{"$group": {
"_id": {
'month': {"$month": {"date": '$time', "timezone": 'Africa/Johannesburg'}},
'day': {"$dayOfMonth": {"date": '$time', "timezone": 'Africa/Johannesburg'}},
'hour': {"$hour": {"date": '$time', "timezone": 'Africa/Johannesburg'}},
'mac': {'_id': "$mac"}
},
"count": {"$sum": 1}
}},
{
"$group": {
"_id": {
'month':"$_id.month",
'day': "$_id.day",
'hour': "$_id.hour",
},
"count": {"$sum": 1}
}
},
{"$sort": SON([("_id", -1)])}
]
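Not an authoritative answer, but two hedged notes: sorting descending on the whole _id sub-document compares its fields in the order they were written (month, then day, then hour here), so adding month does fix the day-wrap case; spelling the sort out per sub-field, and grouping on the year as well in case the 24-hour window ever crosses a year boundary, makes the intent explicit. A sketch of the tail of the pipeline under those assumptions (explicit_tail is a hypothetical name, and SON is assumed imported as in the pipeline above):

# Sketch only: assumes a 'year': {"$year": {...}} key was also added to the
# first $group stage; field names otherwise follow the asker's pipeline.
explicit_tail = [
    {"$group": {
        "_id": {
            "year": "$_id.year",
            "month": "$_id.month",
            "day": "$_id.day",
            "hour": "$_id.hour"
        },
        "count": {"$sum": 1}
    }},
    {"$sort": SON([
        ("_id.year", -1),
        ("_id.month", -1),
        ("_id.day", -1),
        ("_id.hour", -1)
    ])}
]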

Can I transform MongoDB Document to contain an Array inside an Array?

I am trying to transform documents that look something like this
{ "_id": 1,
"created_at": DateTimeObject,
"daily": [{ "impressions":13,
"clicks": 13,
"day": "2015-01-01" },
{ "impressions":15,
"clicks": 15,
"day": "2015-01-02" },
{ "impressions":20,
"clicks": 20,
"day": "2015-01-03" }]
}
to
{ "_id": 1,
"impressions": [["2015-01-01", 13],
["2015-01-02", 15],
["2015-01-03", 20]],
"clicks": [["2015-01-01", 13],
["2015-01-02", 15],
["2015-01-03", 20]] }
Currently, using $unwind and $group, I can get the data in the format below:
{ "_id": 1,
"impressions": [{ "date": "2015-01-01", "value":13],
{"date":"2015-01-02", "value":15},
{"date":"2015-01-03", "value":20}],
"clicks": [{"date": "2015-01-01", "value":13},
{"date": "2015-01-01", "value":15},
{"date": "2015-01-01", "value":20}] }
I don't want to do this outside the db because there are around 150 - 200 line charts that need to be generated from similar documents and would prefer to just have the transformation.
EDIT: Would this be possible with map-reduce?
MongoDB can't produce that nested inner-array format here; you need to build it with $unwind and $group only.