Combining data from 2 mongoDB collections into 1 document - mongodb

I want to filter 2 collections and return one document.
I have 2 MongoDB collections modelled as such
Analytics_Region
_id:5ecf3445365eca3e58ff57c0,
type:"city"
name:"Toronto"
CSD:"3520005"
CSDTYPE:"C"
PR:"35"
PRNAME:"Ontario"
geometry:Object
country:"CAN"
updatedAt:2021-04-23T18:25:50.774+00:00
province:"ON"
Analytics_Region_Custom
_id:5ecbe871d8ab4ab6845c5142
geometry:Object
name:"henry12"
user:5cbdd019b9d9170007d15990
__v:0
I want to output a single collection in alphabetical order by name,
{
_id: 5ecbe871d8ab4ab6845c5142,
name: "henry12",
type: "custom",
province: null
},
{
_id:5ecf3445365eca3e58ff57c0,
name:"Toronto"
type:"city"
province:"ON",
}
Things to note: In the output, we have added a type of "custom" for every document in Analytics_Region_custom. We also add a province of "null" for every document.
So far I looked into $lookup (to fetch results from another collection) but it does not seem to work for my needs since it adds an array onto every document

You can use $unionWith
Documents will be added to the pipeline(no check for duplicates), and from those documents we will project the fields
if type is missing => custom
if province missing => null
*Note: a default is applied only when the field is missing. If either field exists with a falsy value (false/0/null), the existing value is kept.
Test code here
db.coll1.aggregate([
{
"$unionWith": {
"coll": "coll2"
}
},
{
"$project": {
"_id": "$_id",
"name": "$name",
"type": {
"$cond": [
{
"$ne": [
{
"$type": "$type"
},
"missing"
]
},
"$type",
"custom"
]
},
"province": {
"$cond": [
{
"$ne": [
{
"$type": "$province"
},
"missing"
]
},
"$province",
null
]
}
}
},
{
"$sort": {
"name": 1
}
}
])

$unionWith to perform union of both collections
$project to project only fields that you want
sort to sort by name field
// Union both collections, fill in defaults for fields a document lacks,
// then sort the combined result alphabetically by name.
db.orders.aggregate([
  {
    // Append every document of the second collection to the pipeline
    // (no de-duplication is performed).
    $unionWith: "inventory"
  },
  {
    $project: {
      _id: 1,
      name: 1,
      // $ifNull substitutes the default only when the field is missing or
      // null. A truthiness test ($cond with if: "$field") would also replace
      // legitimate falsy values such as 0 or false.
      province: { $ifNull: ["$province", null] },
      type: { $ifNull: ["$type", "custom"] }
    }
  },
  {
    $sort: { name: 1 }
  }
])
Working example

Related

Get Data from another collection (string -> ObjectId)

Let's say I have these two collections:
// Members:
{
"_id":{
"$oid":"60dca71f0394f430c8ca296d"
},
"church":"60dbb265a75a610d90b45c6b",
"name":"Julio Verne Cerqueira"
},
{
"_id":{
"$oid":"60dca71f0394f430c8ca29a8"
},
"nome":"Ryan Steel Oliveira",
"church":"60dbb265a75a610d90b45c6c"
}
And
// Churches
{
"_id": {
"$oid": "60dbb265a75a610d90b45c6c"
},
"name": "Saint Antoine Hill",
"active": true
},
{
"_id": {
"$oid": "60dbb265a75a610d90b45c6b"
},
"name": "Jackeline Hill",
"active": true
}
And I want to query it and have a result like this:
// Member with Church names
{
"_id":{
"$oid":"60dca71f0394f430c8ca296d"
},
"church":"Jackeline Hill",
"name":"Julio Verne Cerqueira"
},
{
"_id":{
"$oid":"60dca71f0394f430c8ca29a8"
},
"church":"Saint Antoine Hill",
"nome":"Ryan Steel Oliveira"
}
If I try a Lookup, I have the following Result: (It is getting the entire churches collection).
How would I do the query, so it gives me only the one church that member is related to?
And, if possible, how to Sort the result in alphabetical order by church then by name?
Obs.: MongoDB Version: 4.4.10
There is matching error in the $lookup --> $pipeline --> $match.
It should be:
$match: {
$expr: {
$eq: [
"$_id",
"$$searchId"
]
}
}
From the provided documents, the members-to-churchies relationship is many-to-one (each member references exactly one church). Hence, when you join members with churchies via $lookup, the output church will be an array with only one churchies document.
Aggregation pipelines:
$lookup - Join members collection (by $$searchId) with churchies (by _id).
$unwind - Deconstruct church array field to multiple documents.
$project - Decorate output document.
$sort - Sort by church and name ascending.
db.members.aggregate([
{
"$lookup": {
"from": "churchies",
"let": {
searchId: {
"$toObjectId": "$church"
}
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$_id",
"$$searchId"
]
}
}
},
{
$project: {
name: 1
}
}
],
"as": "church"
}
},
{
"$unwind": "$church"
},
{
$project: {
_id: 1,
church: "$church.name",
name: 1
}
},
{
"$sort": {
"church": 1,
"name": 1
}
}
])
Sample Mongo Playground

How best to find duplicates and merge specific fields in a mongodb collection

I have a collection of entities and there are certain entities that have the same name which can be considered duplicates. The collection has a column called platforms and the value for that column is an array of objects.
What I'm trying to do using $aggregate is find all of the items with the same name and then merge together what is in their platforms columns, so that both entries have the combination of both platform arrays.
// Current entity structure in the collection
{ _id: 1, name: "Foo1", platforms: [{ _id: 1, name: "Bar" }] }
{ _id: 2, name: "Foo2", platforms: [{ _id: 7, name: "FooBar" }] }
// Required
{
_id: 1,
name: "Foo1",
platforms: [{ _id: 1, name: "Bar" }, { _id: 7, name: "FooBar" }]
}
// This gets me all of the duplicates but it only shows the name and count, i don't want to lose any data that isn't simply dropping the duplicate field after the platforms have been merged.
db.titles.aggregate([
{ "$group": { "_id": "$name", "count": { "$sum": 1 }}},
{ "$match": { "_id": { "$ne": null } , "count": { "$gt": 1 }}}
], { allowDiskUse: true })
This unwinds platforms and groups by name, with 2 accumulators: the array with all members and the array size. I think you don't want all members, so remove that accumulator if you don't need it.
You can add the $match also based on your needs.
Test code here
Query
db.collection.aggregate([
{
"$unwind": {
"path": "$platforms"
}
},
{
"$group": {
"_id": "$name",
"platforms": {
"$push": "$platforms"
},
"nplatforms": {
"$sum": 1
}
}
},
{
"$addFields": {
"name": "$_id"
}
},
{
"$project": {
"_id": 0
}
}
])

SpringData MongoDb, how to count distinct of a query?

I'm doing paginated search with mongoDb in my Springboot API.
For a customer search path, I'm building a query with a bunch of criteria depending on the user input.
I then do a count to display the total number of results (and the computed number of page associated)
Long total = mongoTemplate.count(query, MyEntity.class);
I then do the paginated query to return only current page results
query.with(PageRequest.of(pagination.getPage(), pagination.getPageSize()));
query.with(Sort.by(Sort.Direction.DESC, "creationDate"));
List<MyEntity> listResults = mongoTemplate.find(query, MyEntity.class);
It all works well.
Now, in my total results, I often have multiple results for the same user. I want to display those in the paginated list, but I also want to display a new counter with the total number of distinct users in that search.
I saw the findDistinct parameter
mongoTemplate.findDistinct(query, "userId", OnboardingItineraryEntity.class, String.class);
But I do not want to retrieve a huge list and do a count on it. Is there a way to easily do:
mongoTemplate.countDistinct(query, "userId", OnboardingItineraryEntity.class, String.class);
Because I have a huge number of criteria, I find it sad to have to rebuild an Aggregate object from scratch.
Bonus question: sometimes userId will be null. Is there an easy way to count the number of distinct (not null) values plus the number of nulls in one query?
Or do I need to do a query where I add an extra criterion on userId being null, do a count on that, and then do the count distinct on all and add them up manually in my code (minus one)?
MongoDB aggregation solves this problem in several ways.
Aggregate with $type operator:
db.myEntity.aggregate([
{$match:...}, //add here MatchOperation
{
"$group": {
"_id": {
"$type": "$userId"
},
"count": {
"$sum": 1
}
}
}
])
MongoPlayground
---Output---
[
{
"_id": "null", //null values
"count": 2
},
{
"_id": "missing", // if userId doesn't exists at all
"count": 1
},
{
"_id": "string", //not null values
"count": 4
}
]
Single document with null and NonNull fields
db.myEntity.aggregate([
{$match:...}, //add here MatchOperation
{
"$group": {
"_id": "",
"null": {
$sum: {
$cond: [
{
$ne: [{ "$type": "$userId"}, "string"]
},
1,
0
]
}
},
"nonNull": {
"$sum": {
$cond: [
{
$eq: [{ "$type": "$userId" }, "string"]
},
1,
0
]
}
}
}
}
])
MongoPlayground
---Output---
[
{
"_id": "",
"nonNull": 4,
"null": 3
}
]
Performing $facet operator
db.myEntity.aggregate([
{$match:...}, //add here MatchOperation
{
$facet: {
"null": [
{
$match: {
$or: [
{
userId: {
$exists: false
}
},
{
userId: null
}
]
}
},
{
$count: "count"
}
],
"nonNull": [
{
$match: {
$and: [
{
userId: {
$exists: true
}
},
{
userId: {
$ne: null
}
}
]
}
},
{
$count: "count"
}
]
}
},
{
$project: {
"null": {
$ifNull: [
{
$arrayElemAt: [
"$null.count",
0
]
},
0
]
},
"nonNull": {
$ifNull: [
{
$arrayElemAt: [
"$nonNull.count",
0
]
},
0
]
}
}
}
])
MongoPlayground
Note: Try any of these solutions and let me know if you have any problem creating the MongoDB aggregation.

Lookup and group from two fields in one aggregation

I have an aggregation that looks like this:
// Static model method: for users of the requesting user's organization, join
// the tickets referenced by `${type}Tickets`, keep those created between the
// start of the day four days ago and the end of today, and return, per
// assignee, a list of { groupDate, ticketCount } entries (newest date first).
userSchema.statics.getCounts = function (req, type) {
  // Restrict to users belonging to the caller's organization.
  const byOrganization = { $match: { organization: req.user.organization._id } };

  // Name of the joined array field and the path to each ticket's creation date.
  const joinedField = `${type}_tickets`;
  const createdAtPath = `$${joinedField}.createdAt`;

  // Join the referenced tickets, then flatten to one document per ticket.
  const joinTickets = {
    $lookup: {
      from: 'tickets',
      localField: `${type}Tickets`,
      foreignField: '_id',
      as: joinedField,
    },
  };
  const flattenTickets = { $unwind: `$${joinedField}` };

  // Date window: start of the day four days ago up to the end of today.
  const windowStart = new Date(moment().subtract(4, 'd').startOf('day').utc());
  const windowEnd = new Date(moment().endOf('day').utc());
  const withinWindow = {
    $match: { [`${joinedField}.createdAt`]: { $gte: windowStart, $lt: windowEnd } },
  };

  // Count tickets per (calendar day, assignee).
  const countPerDayAndUser = {
    $group: {
      _id: {
        groupDate: {
          $dateFromParts: {
            year: { $year: createdAtPath },
            month: { $month: createdAtPath },
            day: { $dayOfMonth: createdAtPath },
          },
        },
        userId: `$${joinedField}.assignee_id`,
      },
      ticketCount: { $sum: 1 },
    },
  };

  // Newest day first, then collapse to one document per user.
  const newestFirst = { $sort: { '_id.groupDate': -1 } };
  const collectPerUser = {
    $group: {
      _id: '$_id.userId',
      data: { $push: { groupDate: '$_id.groupDate', ticketCount: '$ticketCount' } },
    },
  };

  return this.aggregate([
    byOrganization,
    joinTickets,
    flattenTickets,
    withinWindow,
    countPerDayAndUser,
    newestFirst,
    collectPerUser,
  ]);
};
Which outputs data like this:
[
{
_id: 5aeb6b71709f43359e0888bb,
data: [
{ "groupDate": "2018-05-07T00:00:00.000Z", ticketCount: 4 }
]
}
]
Ideally though, I would have data like this:
[
{
_id: 5aeb6b71709f43359e0888bb,
data: [
{ "groupDate": "2018-05-07T00:00:00.000Z", assignedCount: 4, resolvedCount: 8 }
]
}
]
The difference being that the object for the user would output both the total number of assigned tickets and the total number of resolved tickets for each date.
My userSchema is like this:
const userSchema = new Schema({
firstName: String,
lastName: String,
assignedTickets: [
{
type: mongoose.Schema.ObjectId,
ref: 'Ticket',
index: true,
},
],
resolvedTickets: [
{
type: mongoose.Schema.ObjectId,
ref: 'Ticket',
index: true,
},
],
}, {
timestamps: true,
});
An example user doc is like this:
{
"_id": "5aeb6b71709f43359e0888bb",
"assignedTickets": ["5aeb6ba7709f43359e0888bd", "5aeb6bf3709f43359e0888c2", "5aec7e0adcdd76b57af9e889"],
"resolvedTickets": ["5aeb6bc2709f43359e0888be", "5aeb6bc2709f43359e0888bf"],
"firstName": "Name",
"lastName": "Surname",
}
An example ticket doc is like this:
{
"_id": "5aeb6ba7709f43359e0888bd",
"ticket_id": 120292,
"type": "assigned",
"status": "Pending",
"assignee_email": "email#gmail.com",
"assignee_id": "5aeb6b71709f43359e0888bb",
"createdAt": "2018-05-02T20:05:59.147Z",
"updatedAt": "2018-05-03T20:05:59.147Z",
}
I've tried adding multiple lookups and group stages, but I keep getting an empty array. If I only do one lookup and one group, I get the correct counts for the searched on field, but I'd like to have both fields in one query. Is it possible to have the query group on two lookups?
In short you seem to be coming to terms with setting up your models in mongoose and have gone overboard with references. In reality you really should not keep the arrays within the "User" documents. This is actually an "anti-pattern" which was just something mongoose used initially as a convention for keeping "references" for population where it did not understand how to translate the references from being kept in the "child" to the "parent" instead.
You actually have that data in each "Ticket" and the natural form of $lookup is to use that "foreignField" in reference to the detail from the local collection. In this case the "assignee_id" on the tickets will suffice for looking at matching back to the "_id" of the "User". Though you don't state it, your "status" should be an indicator of whether the data is actually either "assigned" as when in "Pending" state or "resolved" when it is not.
For the sake of simplicity we are going to consider the state "resolved" if it is anything other than "Pending" in value, but extending on the logic from the example for actual needs is not the problem here.
Basically then we resolve to a single $lookup operation by actually using the natural "foreign key" as opposed to keeping separate arrays.
MongoDB 3.6 and greater
Ideally you would use features from MongoDB 3.6 with sub-pipeline processing here:
// Better date calculations
const oneDay = (1000 * 60 * 60 * 24);
var now = Date.now(),
end = new Date((now - (now % oneDay)) + oneDay),
start = new Date(end.valueOf() - (4 * oneDay));
User.aggregate([
{ "$match": { "organization": req.user.organization._id } },
{ "$lookup": {
"from": Ticket.collection.name,
"let": { "id": "$_id" },
"pipeline": [
{ "$match": {
"createdAt": { "$gte": start, "$lt": end },
"$expr": {
"$eq": [ "$$id", "$assignee_id" ]
}
}},
{ "$group": {
"_id": {
"status": "$status",
"date": {
"$dateFromParts": {
"year": { "$year": "$createdAt" },
"month": { "$month": "$createdAt" },
"day": { "$dayOfMonth": "$createdAt" }
}
}
},
"count": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id.date",
"data": {
"$push": {
"k": {
"$cond": [
{ "$eq": ["$_id.status", "Pending"] },
"assignedCount",
"resolvedCount"
]
},
"v": "$count"
}
}
}},
{ "$sort": { "_id": -1 } },
{ "$replaceRoot": {
"newRoot": {
"$mergeObjects": [
{ "groupDate": "$_id", "assignedCount": 0, "resolvedCount": 0 },
{ "$arrayToObject": "$data" }
]
}
}}
],
"as": "data"
}},
{ "$project": { "data": 1 } }
])
From MongoDB 3.2 and upwards
Or where you lack those features we use a different pipeline process and a little data transformation after the results are returned from the server:
User.aggregate([
{ "$match": { "organization": req.user.organization._id } },
{ "$lookup": {
"from": Ticket.collection.name,
"localField": "_id",
"foreignField": "assignee_id",
"as": "data"
}},
{ "$unwind": "$data" },
{ "$match": {
"data.createdAt": { "$gte": start, "$lt": end }
}},
{ "$group": {
"_id": {
"userId": "$_id",
"date": {
"$add": [
{ "$subtract": [
{ "$subtract": [ "$data.createdAt", new Date(0) ] },
{ "$mod": [
{ "$subtract": [ "$data.createdAt", new Date(0) ] },
oneDay
]}
]},
new Date(0)
]
},
"status": "$data.status"
},
"count": { "$sum": 1 }
}},
{ "$group": {
"_id": {
"userId": "$_id.userId",
"date": "$_id.date"
},
"data": {
"$push": {
"k": {
"$cond": [
{ "$eq": [ "$_id.status", "Pending" ] },
"assignedCount",
"resolvedCount"
]
},
"v": "$count"
}
}
}},
{ "$sort": { "_id.userId": 1, "_id.date": -1 } },
{ "$group": {
"_id": "$_id.userId",
"data": {
"$push": {
"groupDate": "$_id.date",
"data": "$data"
}
}
}}
])
.then( results =>
results.map( ({ data, ...d }) =>
({
...d,
data: data.map(di =>
({
groupDate: di.groupDate,
assignedCount: 0,
resolvedCount: 0,
...di.data.reduce((acc,curr) => ({ ...acc, [curr.k]: curr.v }),{})
})
)
})
)
)
Which just really goes to show that even with the fancy features in modern releases, you really don't need them because there have pretty much always been ways to work around this. Even the JavaScript parts just had slightly longer winded versions before the current "object spread" syntax was available.
So that is really the direction you need to go in. What you certainly don't want is using "multiple" $lookup stages or even applying $filter conditions on what could potentially be large arrays. Also both forms here do their best to "filter down" the number of items "joined" from the foreign collection so as not to cause a breach of the BSON limit.
Particularly the "pre 3.6" version actually has a trick where $lookup + $unwind + $match occur in succession which you can see in the explain output. All stages actually combine into "one" stage there which solely returns only the items which match the conditions in the $match from the foreign collection. Keeping things "unwound" until we reduce further avoids BSON limit problems, as does the new form with MongoDB 3.6 where the "sub-pipeline" does all the document reduction and grouping before any results are returned.
Your one document sample would return like this:
{
"_id" : ObjectId("5aeb6b71709f43359e0888bb"),
"data" : [
{
"groupDate" : ISODate("2018-05-02T00:00:00Z"),
"assignedCount" : 1,
"resolvedCount" : 0
}
]
}
Once I expand the date selection to include that date, which of course the date selection can also be improved and corrected from your original form.
So it seems to make sense that your relationships are actually defined that way but it's just that you recorded them "twice". You don't need to and even if that's not the definition then you should actually instead record on the "child" rather than an array in the parent. We can juggle and merge the parent arrays, but that's counterproductive to actually establishing the data relations correctly and using them correctly as well.
How about something like this?
db.users.aggregate([
{
$lookup:{ // lookup assigned tickets
from:'tickets',
localField:'assignedTickets',
foreignField:'_id',
as:'assigned',
}
},
{
$lookup:{ // lookup resolved tickets
from:'tickets',
localField:'resolvedTickets',
foreignField:'_id',
as:'resolved',
}
},
{
$project:{
"tickets":{ // merge all tickets into one single array
$concatArrays:[
"$assigned",
"$resolved"
]
}
}
},
{
$unwind:'$tickets' // flatten the 'tickets' array into separate documents
},
{
$group:{ // group by 'createdAt' and 'assignee_id'
_id:{
groupDate:{
$dateFromParts:{
year:{ $year:'$tickets.createdAt' },
month:{ $month:'$tickets.createdAt' },
day:{ $dayOfMonth:'$tickets.createdAt' },
},
},
userId:'$tickets.assignee_id',
},
assignedCount:{ // get the count of assigned tickets
$sum:{
$cond:[
{ // by checking the 'type' field for a value of 'assigned'
$eq:[
'$tickets.type',
'assigned'
]
},
1, // if matching count 1
0 // else 0
]
}
},
resolvedCount:{
$sum:{
$cond:[
{ // by checking the 'type' field for a value of 'resolved'
$eq:[
'$tickets.type',
'resolved'
]
},
1, // if matching count 1
0 // else 0
]
}
},
},
},
{
$sort:{ // sort by 'groupDate' descending
'_id.groupDate':-1
},
},
{
$group:{
_id:'$_id.userId', // group again but only by userId
data:{
$push:{ // create an array
groupDate:'$_id.groupDate',
assignedCount:{
$sum:'$assignedCount'
},
resolvedCount:{
$sum:'$resolvedCount'
}
}
}
}
}
])

Use MongoDB projection for nesting whole documents?

I have a flat collection of documents, where some documents have a parent: ObjectId field, which points to another document from the same collection, i.e.:
{id: 1, metadata: {text: "I'm a parent"}}
{id: 2, metadata: {text: "I'm child 1", parent: 1}}
Now I'd like to retrieve all parents where metadata.text = "I'm a parent" plus its child elements. But I want that data in a nested format, so I can simply process it afterwards without having a look at metadata.parent. The output should look like:
{
id: 1,
metadata: {text: "I'm a parent"},
children: [
{id: 2, metadata: {text: "I'm child 1", parent: 1}}
]
}
(children could also be part of the parent's metadata object if that's easier)
Why don't I save the documents in a nested structure? I don't want to store the data in a nested format in DB, because those documents are part of GridFS.
The main problem is: How can I tell MongoDB to nest a whole document? Or do I have to use Mongo's aggregation framework for that task?
For the sort of "projection" you are asking for then the aggregation framework is the correct tool as this sort of "document re-shaping" is only really supported there.
The other case is the "parent/child" thing, where you again need to be "creative" when grouping using the aggregation framework. The full operations show what is essentially involved:
db.collection.aggregate([
// Group parent and children together with conditionals
{ "$group": {
"_id": { "$ifNull": [ "$metadata.parent", "$_id" ] },
"metadata": {
"$addToSet": {
"$cond": [
{ "$ifNull": [ "$metadata.parent", false ] },
false,
"$metadata"
]
}
},
"children": {
"$push": {
"$cond": [
{ "$ifNull": [ "$metadata.parent", false ] },
"$$ROOT",
false
]
}
}
}},
// Filter out "false" values
{ "$project": {
"metadata": { "$setDifference": [ "$metadata", [false] ] },
"children": { "$setDifference": [ "$children", [false] ] }
}},
// metadata is an array but should only have one item
{ "$unwind": "$metadata" },
// This is essentially sorting the children as "sets" are un-ordered
{ "$unwind": "$children" },
{ "$sort": { "_id": 1, "children._id": 1 } },
{ "$group": {
"_id": "$_id",
"metadata": { "$first": "$metadata" },
"children": { "$push": "$children" }
}}
])
The main thing here is the $ifNull operator used on the grouping _id. This will choose to $group on the "parent" field where present, otherwise using the general document _id.
Similar things are done with the $cond operator later where the evaluation is made of which data to add to the array or "set". In the following $project the false values are filtered out by use of the $setDifference operator.
If the final $sort and $group there seem confusing, the reason is that the operator used is a "set" operator, so the resulting "set" is considered to be un-ordered. So really that part is just there to make sure that the array contents appear in order of their own _id field.
Without the additional operators from MongoDB 2.6 this can still be done, but just a little differently.
db.collection.aggregate([
{ "$group": {
"_id": { "$ifNull": [ "$metadata.parent", "$_id" ] },
"metadata": {
"$addToSet": {
"$cond": [
{ "$ifNull": [ "$metadata.parent", false ] },
false,
"$metadata"
]
}
},
"children": {
"$push": {
"$cond": [
{ "$ifNull": [ "$metadata.parent", false ] },
{ "_id": "$_id","metadata": "$metadata" },
false
]
}
}
}},
{ "$unwind": "$metadata" },
{ "$match": { "metadata": { "$ne": false } } },
{ "$unwind": "$children" },
{ "$match": { "children": { "$ne": false } } },
{ "$sort": { "_id": 1, "children._id": 1 } },
{ "$group": {
"_id": "$_id",
"metadata": { "$first": "$metadata" },
"children": { "$push": "$children" }
}}
])
Essentially the same thing but without the newer operators introduced in MongoDB 2.6, so this would work in earlier versions as well.
This will all be fine as long as your relationships are a single level of parent and child. For nested levels you would need to invoke a mapReduce process instead.
I wanted a similar result to Neil Lunn's answer except I wanted to fetch all parents regardless of them having children or not. I also wanted to generalise it to work across any collection that had a single level of nested children.
Here's my query based on Neil Lunn's answer
db.collection.aggregate([
{
$group: {
_id: {
$ifNull: ["$parent", "$_id"]
},
parent: {
$addToSet: {
$cond: [
{
$ifNull: ["$parent", false]
}, false, "$$ROOT"
]
}
},
children: {
$push: {
$cond: [
{
$ifNull: ["$parent", false]
}, "$$ROOT", false
]
}
}
}
}, {
$project: {
parent: {
$setDifference: ["$parent", [false]]
},
children: {
$setDifference: ["$children", [false]]
}
}
}, {
$unwind: "$parent"
}
])
This results in every parent being returned where the parent field contains the whole parent document and the children field returning either an empty array if the parent has no children or an array of child documents.
{
_id: PARENT_ID
parent: PARENT_OBJECT
children: [CHILD_OBJECTS]
}