MongoDB Aggregation Pipeline - get oldest document, but debounced

I have documents that look like:
[
  {
    value: 'Apple',
    createdAt: '2021-12-09T20:15:26.421+00:00',
  },
  {
    value: 'Blueberry',
    createdAt: '2021-12-09T20:45:26.421+00:00',
  },
  {
    value: 'Cranberry',
    createdAt: '2021-12-09T21:30:26.421+00:00',
  },
  {
    value: 'Durian',
    createdAt: '2022-01-24T20:15:26.421+00:00',
  },
  {
    value: 'Elderberry',
    createdAt: '2022-01-24T20:45:26.421+00:00',
  },
]
I'd like to do an aggregation where I get the oldest document, with the caveat that if another document was created within an hour of it, it invalidates the first document and I actually want to grab that one instead. For example, in the above I would like to return Cranberry: initially Apple is picked, but since Blueberry comes within an hour, we move to that one, and since Cranberry comes within an hour of Blueberry, Cranberry is selected.

You can do the following in an aggregation pipeline:
$sort by createdAt
$limit to get the oldest document
$lookup to get all the documents with createdAt at or after the current document's
$reduce to loop through the result array; update the accumulator/result only if the current entry is within 1 hour of it
db.collection.aggregate([
  // oldest document first
  { $sort: { createdAt: 1 } },
  { $limit: 1 },
  {
    $lookup: {
      from: "collection",
      let: { current: "$createdAt" },
      pipeline: [
        { $match: { $expr: { $gte: ["$createdAt", "$$current"] } } },
        // keep the candidates in chronological order so the $reduce
        // below walks the debounce chain correctly
        { $sort: { createdAt: 1 } }
      ],
      as: "within"
    }
  },
  {
    $addFields: {
      within: {
        $reduce: {
          input: "$within",
          initialValue: null,
          in: {
            $cond: {
              // take this entry if it is the first one, or if it was created
              // within 1 hour (3600000 ms) of the current pick; note createdAt
              // must be a Date (not a string) for $subtract to yield milliseconds
              if: {
                $or: [
                  { $eq: ["$$value", null] },
                  {
                    $lte: [
                      { $subtract: ["$$this.createdAt", "$$value.createdAt"] },
                      3600000
                    ]
                  }
                ]
              },
              then: "$$this",
              else: "$$value"
            }
          }
        }
      }
    }
  }
])
Here is the Mongo playground for your reference.
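If you want the pipeline to return the selected document itself rather than the original document plus a within field, appending one more stage should do it (a sketch, not part of the original answer):
db.collection.aggregate([
  /* ...the stages above... */
  // promote the debounced pick to the root of the result
  { $replaceRoot: { newRoot: "$within" } }
])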

MongoDB Query on multiple field condition

I have a MongoDB model that is currently like this (this is the stripped version):
{
  title: String,
  type: {
    type: String,
    lowercase: true,
    enum: ['event', 'regular', 'project'],
  },
  project_start_time: Date,
  project_end_time: Date,
  regular_start_date: Date,
  regular_end_date: Date,
  events: [{
    id: Number,
    date: Date
  }]
}
Now, I want to query something like this:
Find data where the regular_end_date, project_end_time, or the events entry at the last index is earlier than the date provided
The catch is that not every document has all three fields above; which ones exist depends on the type (sorry for the messy data, it was already there). Below is an example:
If the data type is an event, then there are events
If the data type is regular, then there are regular_start_date and regular_end_date
If the data type is a project, then there are project_start_time and project_end_time
So far, I've tried to use this:
db.data.find({
  "$or": [
    {
      "project_end_time": { "$lt": ISODate("2022-12-27T10:09:49.753Z") }
    },
    {
      "regular_end_date": { "$lt": ISODate("2022-12-27T10:09:49.753Z") }
    },
    {
      "$expr": {
        "$lt": [
          { "$getField": { "field": "date", "input": { "$last": "$events" } } },
          ISODate("2022-12-27T10:09:49.753Z")
        ]
      }
    }
  ]
})
Also with an aggregation pipeline:
db.data.aggregate([
  {
    $match: {
      "$or": [
        {
          "project_end_time": { "$lt": ISODate("2022-12-27T10:09:49.753Z") }
        },
        {
          "regular_end_date": { "$lt": ISODate("2022-12-27T10:09:49.753Z") }
        },
        {
          "$expr": {
            "$lt": [
              { "$getField": { "field": "date", "input": { "$last": "$events" } } },
              ISODate("2022-12-27T10:09:49.753Z")
            ]
          }
        }
      ]
    }
  }
])
But it returns all the data, as if nothing was filtered by the criteria. Any idea where I went wrong?
FYI I am using MongoDB 5.0.2
One option is to check whether the relevant field exists before checking its value; otherwise its value is null, which compares as less than your requested date:
db.collection.find({
  $or: [
    { $and: [
      { project_end_time: { $exists: true } },
      { project_end_time: { $lt: ISODate("2022-12-27T10:09:49.753Z") } }
    ]},
    { $and: [
      { regular_end_date: { $exists: true } },
      { regular_end_date: { $lt: ISODate("2022-12-27T10:09:49.753Z") } }
    ]},
    { $and: [
      { "events.0": { $exists: true } },
      { $expr: {
        $lt: [
          { $last: "$events.date" },
          ISODate("2022-12-27T10:09:49.753Z")
        ]
      }}
    ]}
  ]
})
See how it works on the playground example

Optimise MongoDB aggregate query performance

I have the following DB structure:
Workspaces:

    Key        Index
PK  id         id
    content

Projects:

    Key        Index
PK  id         id
FK  workspace  workspace_1
    deleted    deleted_1
    content

Items:

    Key        Index
PK  id         id
FK  project    project_1
    type       type_1
    deleted    deleted_1
    content
I need to calculate the number of items of each type for each project in a workspace, e.g. expected output:
[
  { _id: 'projectId1', itemType1Count: 100, itemType2Count: 50, itemType3Count: 200 },
  { _id: 'projectId2', itemType1Count: 40, itemType2Count: 100, itemType3Count: 300 },
  ....
]
After a few attempts and some debugging I've created a query which provides the output I needed:
const pipeline = [
{ $match: { workspace: 'workspaceId1' } },
{
$lookup: {
from: 'items',
let: { id: '$_id' },
pipeline: [
{
$match: {
$expr: {
$eq: ['$project', '$$id'],
},
},
},
// project only fields necessary for later pipelines to not overload
// memory and to not get `exceeded memory limit for $group` error
{ $project: { _id: 1, type: 1, deleted: 1 } },
],
as: 'items',
},
},
// Use $unwind here to optimize aggregation pipeline, see:
// https://stackoverflow.com/questions/45724785/aggregate-lookup-total-size-of-documents-in-matching-pipeline-exceeds-maximum-d
// Without $unwind we may get a `matching pipeline exceeds maximum document size` error.
// The error doesn't appear on every request, which makes it really strange and hard to debug.
{ $unwind: '$items' },
{ $match: { 'items.deleted': { $eq: false } } },
{
$group: {
_id: '$_id',
items: { $push: '$items' },
},
},
{
$project: {
_id: 1,
// Note: I have only 3 possible item types, so it's OK that their names are hardcoded.
itemType1Count: {
$size: {
$filter: {
input: '$items',
cond: { $eq: ['$$this.type', 'type1'] },
},
},
},
itemType2Count: {
$size: {
$filter: {
input: '$items',
cond: { $eq: ['$$this.type', 'type2'] },
},
},
},
itemType3Count: {
$size: {
$filter: {
input: '$items',
cond: { $eq: ['$$this.type', 'type3'] },
},
},
},
},
},
]
const counts = await Project.aggregate(pipeline)
The query works as expected, but it is very slow... with about 1000 items in one workspace it takes about 8 seconds to complete. Any ideas on how to make it faster are appreciated.
Thanks.
Assuming your indexes are properly set up so that they contain the "correct" fields, we can still make some tweaks to the query itself.
Approach 1: keeping existing collection schema
db.projects.aggregate([
  {
    $match: {
      workspace: "workspaceId1"
    }
  },
  {
    $lookup: {
      from: "items",
      let: { id: "$_id" },
      pipeline: [
        {
          $match: {
            $expr: {
              $and: [
                { $eq: ["$project", "$$id"] },
                { $eq: ["$deleted", false] }
              ]
            }
          }
        },
        // project only fields necessary for later pipelines to not overload
        // memory and to not get `exceeded memory limit for $group` error
        {
          $project: {
            _id: 1,
            type: 1,
            deleted: 1
          }
        }
      ],
      as: "items"
    }
  },
  // Use $unwind here to optimize the aggregation pipeline, see:
  // https://stackoverflow.com/questions/45724785/aggregate-lookup-total-size-of-documents-in-matching-pipeline-exceeds-maximum-d
  // Without $unwind we may get a `matching pipeline exceeds maximum document size` error.
  // The error doesn't appear on every request, which makes it hard to debug.
  { $unwind: "$items" },
  {
    $group: {
      _id: "$_id",
      // count each type with a conditional sum
      itemType1Count: {
        $sum: {
          $cond: { if: { $eq: ["$items.type", "type1"] }, then: 1, else: 0 }
        }
      },
      itemType2Count: {
        $sum: {
          $cond: { if: { $eq: ["$items.type", "type2"] }, then: 1, else: 0 }
        }
      },
      itemType3Count: {
        $sum: {
          $cond: { if: { $eq: ["$items.type", "type3"] }, then: 1, else: 0 }
        }
      }
    }
  }
])
There are 2 major changes:
moving the items.deleted: false condition into the $lookup sub-pipeline, so that fewer items documents are looked up
skipping items: { $push: '$items' }; instead, doing a conditional $sum in the later $group stage
Here is the Mongo playground for your reference. (at least for the correctness of the new query)
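For the lookup to stay fast, it also helps if items has a compound index covering both conditions in the sub-pipeline; a minimal sketch (the compound index is my suggestion, not something from the question, which listed single-field indexes):
// one index serving the $lookup's equality matches on project and deleted
db.items.createIndex({ project: 1, deleted: 1 })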
Approach 2: if the collection schema can be modified, we can denormalize projects.workspace into the items collection like this:
{
  "_id": "i1",
  "project": "p1",
  "workspace": "workspaceId1",
  "type": "type1",
  "deleted": false
}
In this way, you can skip the $lookup. A simple $match and $group will suffice.
db.items.aggregate([
  {
    $match: {
      "deleted": false,
      "workspace": "workspaceId1"
    }
  },
  {
    $group: {
      _id: "$project",
      itemType1Count: {
        $sum: {
          "$cond": {
            "if": { $eq: ["$type", "type1"] },
            "then": 1,
            "else": 0
          }
        }
      },
      ...
Here is the Mongo playground with denormalized schema for your reference.
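If you go this route, the denormalized field needs a one-off backfill and must be kept in sync whenever a project moves between workspaces. A rough backfill sketch (assuming the collection names used above; not part of the original answer):
// copy each project's workspace onto all of its items
db.projects.find({}, { _id: 1, workspace: 1 }).forEach(p => {
  db.items.updateMany(
    { project: p._id },
    { $set: { workspace: p.workspace } }
  )
})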

mongodb aggregate apply a function to a field

As part of an aggregate I need to run this transformation:
let inheritances = await db.collection('inheritance').aggregate([
  { $match: { status: 1 } }, // inheritance active
  { $project: { "_id": 1, "name": 1, "time_trigger": 1, "signers": 1, "tree": 1, "creatorId": 1, "redeem": 1, "p2sh": 1 } },
  {
    $lookup: {
      from: "user",
      let: { creatorId: { $concat: ["secretkey", { $toString: "$creatorId" }] }, time_trigger: "$time_trigger" },
      pipeline: [
        {
          $match: {
            $expr: {
              $and: [
                { $eq: ["$_id", sha256({ $toString: "$$creatorId" })] },
                { $gt: [new Date(), { $add: [{ $multiply: ["$$time_trigger", 24*60*60*1000] }, "$last_access"] }] },
              ]
            }
          }
        },
      ],
      as: "user"
    },
  },
  { $unwind: "$user" }
]).toArray()
creatorId comes from a lookup, and in order to compare it to _id I first need to do a sha256.
How can I do it?
Thanks.
External functions will not work with the aggregation framework. Everything is parsed to BSON by default; it is all basically processed from BSON operators to native C++ implementations. This is by design, for performance.
In short, you can't do this. I recommend just storing the hashed value on every document as a new field; otherwise you'll have to do it in code just before the pipeline.
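Doing it in code just before the pipeline might look roughly like this (a sketch using Node.js crypto; the time_trigger condition is left out for brevity, and the variable names are illustrative, not from the original question):
const crypto = require('crypto')
const sha256 = (s) => crypto.createHash('sha256').update(s).digest('hex')

// fetch the inheritances first, then do the hashing per document in code
const inheritances = await db.collection('inheritance')
  .find({ status: 1 })
  .toArray()

for (const inh of inheritances) {
  // what the pipeline tried to express with sha256() inside $expr
  const hashedId = sha256('secretkey' + String(inh.creatorId))
  inh.user = await db.collection('user').findOne({ _id: hashedId })
}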

Returning a document with two fields from the same array in MongoDB

Given documents such as
{
  _id: 'abcd',
  userId: '12345',
  activities: [
    { status: 'login', timestamp: '10000001' },
    { status: 'logout', timestamp: '10000002' },
    { status: 'login', timestamp: '10000003' },
    { status: 'logout', timestamp: '10000004' },
  ]
}
I am trying to create a pipeline such that all users that have their latest login/logout activities recorded between two timestamps will be returned. For example, if the two timestamp values are between 10000002 and 10000003, the expected document should be
{
  _id: 'abcd',
  userId: '12345',
  login: '10000003',
  logout: '10000002'
}
Or if the two timestamp values are between -1 and 10000001, the expected document should be:
{
  _id: 'abcd',
  userId: '12345',
  login: '10000001',
  logout: null
}
Etc.
I know it has to do with aggregations, and that I need to $unwind, etc., but I'm not sure about the rest, namely evaluating two fields from the same document array.
You can try the below aggregation:
db.col.aggregate([
  { $unwind: "$activities" },
  {
    $match: {
      $and: [
        { "activities.timestamp": { $gte: "10000001" } },
        { "activities.timestamp": { $lte: "10000002" } }
      ]
    }
  },
  { $sort: { "activities.timestamp": -1 } },
  {
    $group: {
      _id: "$_id",
      userId: { $first: "$userId" },
      activities: { $push: "$activities" }
    }
  },
  {
    $addFields: {
      login: { $arrayElemAt: [{ $filter: { input: "$activities", as: "a", cond: { $eq: ["$$a.status", "login"] } } }, 0] },
      logout: { $arrayElemAt: [{ $filter: { input: "$activities", as: "a", cond: { $eq: ["$$a.status", "logout"] } } }, 0] }
    }
  },
  {
    $project: {
      _id: 1,
      userId: 1,
      login: { $ifNull: ["$login.timestamp", null] },
      logout: { $ifNull: ["$logout.timestamp", null] }
    }
  }
])
We need to use $unwind + $sort + $group to make sure that our activities will be sorted by timestamp. After $unwind you can use $match to apply the filtering condition. Then you can use $filter with $arrayElemAt to get the first (latest) value of the filtered array. In the last $project you can explicitly use $ifNull (otherwise the JSON key will be skipped if there's no value).
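For the sample document above and the 10000001–10000002 window used in the $match, the output should look something like this (worked out by hand from the pipeline, so treat it as a sketch):
{ _id: 'abcd', userId: '12345', login: '10000001', logout: '10000002' }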
You can use the below aggregation. Instead of $unwind, use $lte and $gte with the $filter aggregation.
db.collection.aggregate([
  { "$project": {
    "userId": 1,
    "login": {
      "$max": {
        "$filter": {
          "input": "$activities",
          "cond": {
            "$and": [
              { "$gte": ["$$this.timestamp", "10000001"] },
              { "$lte": ["$$this.timestamp", "10000004"] },
              { "$eq": ["$$this.status", "login"] }
            ]
          }
        }
      }
    },
    "logout": {
      "$max": {
        "$filter": {
          "input": "$activities",
          "cond": {
            "$and": [
              { "$gte": ["$$this.timestamp", "10000001"] },
              { "$lte": ["$$this.timestamp", "10000004"] },
              { "$eq": ["$$this.status", "logout"] }
            ]
          }
        }
      }
    }
  }}
])

Mongo Query to Return only a subset of SubDocuments

Using the example from the Mongo docs:
{ _id: 1, results: [ { product: "abc", score: 10 }, { product: "xyz", score: 5 } ] }
{ _id: 2, results: [ { product: "abc", score: 8 }, { product: "xyz", score: 7 } ] }
{ _id: 3, results: [ { product: "abc", score: 7 }, { product: "xyz", score: 8 } ] }
db.survey.find(
  { id: 12345, results: { $elemMatch: { product: "xyz", score: { $gte: 6 } } } }
)
How do I return survey 12345 (regardless of even if it HAS surveys or not) but only return surveys with a score greater than 6? In other words I don't want the document disqualified from the results based on the subdocument, I want the document but only a subset of subdocuments.
What you are asking for is not so much a "query" but is basically just a filtering of content from the array in each document.
You do this with .aggregate() and $project:
db.survey.aggregate([
  { "$project": {
    "results": {
      "$setDifference": [
        { "$map": {
          "input": "$results",
          "as": "el",
          "in": {
            "$cond": [
              { "$and": [
                { "$eq": ["$$el.product", "xyz"] },
                { "$gte": ["$$el.score", 6] }
              ]},
              "$$el",
              false
            ]
          }
        }},
        [false]
      ]
    }
  }}
])
So rather than "constrain" results to documents that have an array member matching the condition, all this is doing is "filtering" out the array members that do not match the condition, but returns the document with an empty array if need be.
The fastest present way to do this is with $map to inspect all elements and $setDifference to filter out any values of false returned from that inspection. The possible downside is that a "set" must contain unique elements, so this is fine as long as the elements themselves are unique.
Future releases will have a $filter method, which is similar to $map in structure, but directly removes non-matching results, whereas $map just returns them (via the $cond, either the matching element or false), and is therefore better suited.
Otherwise, if the elements are not unique or the MongoDB server version is less than 2.6, you do this using $unwind, in a non-performant way:
db.survey.aggregate([
  { "$unwind": "$results" },
  { "$group": {
    "_id": "$_id",
    "results": { "$push": "$results" },
    "matched": {
      "$sum": {
        "$cond": [
          { "$and": [
            { "$eq": ["$results.product", "xyz"] },
            { "$gte": ["$results.score", 6] }
          ]},
          1,
          0
        ]
      }
    }
  }},
  { "$unwind": "$results" },
  { "$match": {
    "$or": [
      {
        "results.product": "xyz",
        "results.score": { "$gte": 6 }
      },
      { "matched": 0 }
    ]
  }},
  { "$group": {
    "_id": "$_id",
    "results": { "$push": "$results" },
    "matched": { "$first": "$matched" }
  }},
  { "$project": {
    "results": {
      "$cond": [
        { "$ne": ["$matched", 0] },
        "$results",
        []
      ]
    }
  }}
])
Which is pretty horrible in both design and performance. As such, you are probably better off doing the filtering per document in client code instead.
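Client-side, that per-document filtering could be as simple as this (a sketch in shell JavaScript; the match on _id is illustrative, not from the original answer):
// fetch the document(s) and filter the array in application code
db.survey.find({ _id: 12345 }).forEach(doc => {
  doc.results = doc.results.filter(
    el => el.product === "xyz" && el.score >= 6
  )
  printjson(doc)
})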
You can use $filter in MongoDB 3.2:
db.survey.aggregate([{
  $match: { id: 12345 }
}, {
  $project: {
    results: {
      $filter: {
        input: "$results",
        as: "results",
        cond: { $gt: ["$$results.score", 6] }
      }
    }
  }
}]);
It will return all the subdocuments that have a score greater than 6. If you want to return only the first matched subdocument, then you can use the '$' operator.
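For that single-match case, a plain find with the positional projection would look like this (a sketch, not from the original answer); note that unlike the aggregation above, it only returns documents that actually contain a matching element:
// project only the first array element matching the $elemMatch condition
db.survey.find(
  { id: 12345, results: { $elemMatch: { product: "xyz", score: { $gt: 6 } } } },
  { "results.$": 1 }
)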
You can use $redact in this way:
db.survey.aggregate([
  { $match: { _id: 12345 } },
  { $redact: {
    $cond: {
      if: {
        $or: [
          { $eq: ["$_id", 12345] },
          { $and: [
            { $eq: ["$product", "xyz"] },
            { $gte: ["$score", 6] }
          ]}
        ]
      },
      then: "$$DESCEND",
      else: "$$PRUNE"
    }
  }}
])
It will $match by _id: 12345 first, and then it will "$$PRUNE" all the subdocuments that don't have "product": "xyz" or whose score is lower than 6. I added the condition ($cond) { $eq: [ "$_id", 12345 ] } so that it wouldn't prune the whole document before it reaches the subdocuments.