Only return hits in array from mongodb - mongodb

So I have a mongodb collection with some nested documents. An example document looks like this:
{
"_id": "5afa9472e937b7254a306ff6",
"import_date": "2018-05-15T08:04:02.813Z",
"some_more_things": "foo",
"meta": {
"participants": [{ "name": "Ben" }, { "name": "Mary" }],
"messages": [
{
"tokens": [
{ "token": "What" },
{ "token": "do" },
{ "token": "you" },
{ "token": "do" },
{ "token": "today" }
],
"time": "2018-05-09T08:38:19.000Z"
},
{
"tokens": [
{ "token": "Just" },
{ "token": "lying" },
{ "token": "around" }
],
"time": "2018-05-09T08:40:08.000Z"
},
{
"tokens": [
{ "token": "What" },
{ "token": "about" },
{ "token": "you" }
],
"time": "2018-05-09T08:40:11.000Z"
}
]
}
}
I'm now looking for an effective way to search for messages where a specific token is included. I'm doing this with following query:
db.conversations.find({'meta.messages.tokens.token': /^What$/i})
.projection({'import_date': 1, 'meta.messages': 1})
.sort({_id:-1})
.limit(100)
That way I find the docs I want but I get the complete messages array. Is there a way that I can get only the items of the messages array matching with my regexp? The result should look like this (so only the first and last item of my example doc).
{
"_id": "5afa9472e937b7254a306ff6",
"import_date": "2018-05-15T08:04:02.813Z",
"meta": {
"participants": [{ "name": "Ben" }, { "name": "Mary" }],
"messages": [
{
"tokens": [
{ "token": "What" },
{ "token": "do" },
{ "token": "you" },
{ "token": "do" },
{ "token": "today" }
],
"time": "2018-05-09T08:38:19.000Z"
},
{
"tokens": [
{ "token": "What" },
{ "token": "about" },
{ "token": "you" }
],
"time": "2018-05-09T08:40:11.000Z"
}
]
}
}

You can use $indexOfBytes to check if What exists in every string. You also need $map with $filter and $anyElementTrue to build your filtering condition for nested array:
db.collection.aggregate([
{
$addFields: {
"meta.messages": {
$filter: {
input: "$meta.messages",
as: "m",
cond: {
$anyElementTrue: {
$map: {
input: "$$m.tokens",
in: { $gte: [ { $indexOfBytes: [ "$$this.token", "What" ] }, 0 ] }
}
}
}
}
}
}
}
])
Mongo Playground
If you need Regex you can take a look at $regexMatch introduced in 4.2 and use it as replacement for $indexOfBytes

Related

How to filter array in nested documents in MongoDB?

I'm starting MongoDB and I have difficulties to understand how to filter some nested documents in an array. The objective if to keep only relevant data from a nested array.
Here is the data:
{
"_id": {
"$oid": "47bb"
},
"email": "myemail#gmail.com",
"orders": [
{
"orderNumber": "",
"products": [
{
"brand": "Brand 1",
"processing": {
"status": "pending"
}
}
],
"updated": {
"$date": {
"$numberLong": "1673031718883"
}
}
},
{
"orderNumber": "",
"products": [
{
"brand": "Brand 2",
"processing": {
"status": "pending"
}
}
],
"updated": {
"$date": {
"$numberLong": "1673031718883"
}
}
},
{
"orderNumber": "",
"products": [
{
"brand": "Brand 3",
"processing": {
"status": "processing"
}
}
],
"updated": {
"$date": {
"$numberLong": "1673031718883"
}
}
}
],
"privilege": {
"admin": false
},
"isVerified": {
"email": "true"
}
}
I want exactly the same data structure with 'orders.products.processing.status': 'pending'
The response from the database should be:
{
"_id": {
"$oid": "62b333644f70f94aa47bb4da"
},
"email": "myemail#gmail.com",
"orders": [
{
"orderNumber": "",
"products": [
{
"brand": "Brand 1",
"processing": {
"status": "pending"
}
}
],
"updated": {
"$date": {
"$numberLong": "1673031718883"
}
}
},
{
"orderNumber": "",
"products": [
{
"brand": "Brand 2",
"processing": {
"status": "pending"
}
}
],
"updated": {
"$date": {
"$numberLong": "1673031718883"
}
}
}
],
"privilege": {
"admin": false
},
"isVerified": {
"email": "true"
}
}
My closest attempt to a correct query is:
db.collection.aggregate([{
$unwind: '$orders'
},
{
$unwind: '$orders.products'
},
{
$match: {
"orders.products.processing.status": 'pending'
}
}, {
$group: {
_id: {
"_id": "$_id",
"email": "$email",
"orders": {
"orderNumber": "$orders.orderNumber",
"products": {
"processing": "$orders.products.processing.updated",
"brand": "$orders.products.brand",
}
},
},
products: {
$push: "$orders.products"
},
}
}, {
$project: {
products: 0,
}
}
])
The problem is that the result lose the grouping by _id and loosing the initial json structure. Thanks.
You can try this query:
First $match to get only documents which have orders.products.processing.status as pending (later will be filtered and maybe is redundant but using $map and $filter I prefer to avoid to do over all collection).
Then $project to get only desired values. Here the trick is to return in orders only the orders you want.
To accomplish that you can use $map to iterate over the array and return a new one with values that matches the filter (like a JS map).
And then the $filter. Here are filtered values whose status is not pending and returned to the map that output in the field orders.
And this without $unwind and $group :)
db.collection.aggregate([
{
"$match": {
"orders.products.processing.status": "pending"
}
},
{
"$project": {
"email": 1,
"isVerified": 1,
"privilege": 1,
"orders": {
"$map": {
"input": "$orders",
"as": "order",
"in": {
"orderNumber": "$$order.orderNumber",
"products": {
"$filter": {
"input": "$$order.products",
"cond": {
"$eq": [ "$$this.processing.status", "pending" ]
}
}
}
}
}
}
}
}
])
Example here
And also a bonus... check this example here I've added a one more $filter. It's so messy but if you can understand is quite easy. The $map from the first example return an array, so now I'm using a $filter ussing that array and filtering (not show) the objects where products is empty (i.e. where products.processing.status is not pending).

How can I merge two documents, get rid of duplicates and keep certain data?

I have the following data, which describes who is going to do what work.
Basically I want to replace the "workId" and "userId" with objects that contain all the data from their respective documents and retain the "when" data.
I am starting with this data:
{
"schedule": {
"WorkId": "4e51dc1069c27c015ede4e3e",
"daily": [
{
"when": 1,
"U_W": [
{
"workId": "3a60dc1069c27c015ede1111",
"userId": "5f60c3b7f93d8e00a1cdf414"
},
{
"workId": "3a60dc1069c27c015ede1122",
"userId": "5f60c3b7f93d8e00a1cdf415"
}
]
}
]
}
}
Here is the user table
"userSchema": [
{
_id: "5f60c3b7f93d8e00a1cdf414",
Name: "Bob"
},
{
_id: "5f60c3b7f93d8e00a1cdf415",
Name: "Joe"
}
],
Here is the work table
"workSchema": [
{
_id: "3a60dc1069c27c015ede1111",
Name: "shovel"
},
{
_id: "3a60dc1069c27c015ede1122",
Name: "hammer"
}
]
what I want to end up with is this
{
"schedule": {
"WorkId": "4e51dc1069c27c015ede4e3e",
"daily": [
{
"when": 1,
"U_W": [
{
"work": {
"id": "3a60dc1069c27c015ede1111",
"name": "shovel"
},
"user": {
"id": "5f60c3b7f93d8e00a1cdf414",
"name": "bob"
}
},
{
"work": {
"id": "3a60dc1069c27c015ede1122",
"name": "hammer"
},
"user": {
"id": "5f60c3b7f93d8e00a1cdf415",
"name": "joe"
}
}
]
}
]
}
}
Here is my first attempt:
I have it joining the the two documents
How can I get rid of the duplicates ( bob:hammer and joe:shovel ) ?
and how do I include the "when" ?
Here is the playground that provides the following :
[
{
"_id": ObjectId("5a934e000102030405000000"),
"user_info": {
"Name": "Bob",
"_id": "5f60c3b7f93d8e00a1cdf414"
},
"work_role": {
"Name": "shovel",
"_id": "3a60dc1069c27c015ede1111"
}
},
{
"_id": ObjectId("5a934e000102030405000000"),
"user_info": {
"Name": "Bob",
"_id": "5f60c3b7f93d8e00a1cdf414"
},
"work_role": {
"Name": "hammer",
"_id": "3a60dc1069c27c015ede1122"
}
},
{
"_id": ObjectId("5a934e000102030405000000"),
"user_info": {
"Name": "Joe",
"_id": "5f60c3b7f93d8e00a1cdf415"
},
"work_role": {
"Name": "shovel",
"_id": "3a60dc1069c27c015ede1111"
}
},
{
"_id": ObjectId("5a934e000102030405000000"),
"user_info": {
"Name": "Joe",
"_id": "5f60c3b7f93d8e00a1cdf415"
},
"work_role": {
"Name": "hammer",
"_id": "3a60dc1069c27c015ede1122"
}
}
]
After beating my head against the wall for some time...
I found a pretty cool feature of mongo "references"
eg:
REF_work: { type: Schema.Types.ObjectId, required: true, ref: 'work' },
REF_person: { type: Schema.Types.ObjectId, required: true, ref: 'users' },
then when I call it from my get function I add a populate to the find
assignments.find(query).populate('daily.cp.REF_person').populate('daily.cp.REF_work');
I get exactly what I want:
[
{
"_id": ObjectId("5a934e000102030405000000"),
"REF_person": {
"Name": "Bob",
"_id": "5f60c3b7f93d8e00a1cdf414"
},
"REF_work": {
"Name": "shovel",
"_id": "3a60dc1069c27c015ede1111"
}
},
{
"_id": ObjectId("5a934e000102030405000000"),
"REF_person": {
"Name": "Joe",
"_id": "5f60c3b7f93d8e00a1cdf415"
},
"REF_work": {
"Name": "hammer",
"_id": "3a60dc1069c27c015ede1122"
}
}
]

Group and aggregate on sub document in mongodb

OBS! Noob question probably :)
Given the following data, how can I query and return a summary for each index?
[
{
"title": "test",
"indexes":[
{ "id":1, "value": 0.5764860139860139860139860140 },
{ "id":2, "value": 0.3083479020979020979020979020 },
{ "id":3, "value": 0.1151660839160839160839160838 }
]
},
{
"title": "test",
"indexes":[
{ "id":1, "value": 0.5764860139860139860139860140 },
{ "id":2, "value": 0.3083479020979020979020979020 },
{ "id":3, "value": 0.1151660839160839160839160838 }
]
},
{
"title": "test",
"indexes":[
{ "id":1, "value": 0.5764860139860139860139860140 },
{ "id":2, "value": 0.3083479020979020979020979020 },
{ "id":3, "value": 0.1151660839160839160839160838 }
]
},
{
"title": "test",
"indexes":[
{ "id":1, "value": 0.5764860139860139860139860140 },
{ "id":2, "value": 0.3083479020979020979020979020 },
{ "id":3, "value": 0.1151660839160839160839160838 }
]
}
]
I.e. I want to produce something like this:
index.id:1, total: 2.305...
index.id:2, total: 1.233...
etc
db.collection.aggregate([
{
"$unwind": "$indexes"
},
{
$group: {
_id: "$indexes.id",
total: {
$sum: "$indexes.value"
}
}
}
])
try this query
you will get like this
[
{
"_id": 2,
"total": 1.2333916083916083
},
{
"_id": 1,
"total": 2.305944055944056
},
{
"_id": 3,
"total": 0.4606643356643357
}
]
db.collection.aggregate([
{
$unwind: "$indexes"
},
{
$group: {
_id: "$indexes.id",
total: {
$sum: "$indexes.value"
}
}
}
])
Working Mongo playground

Combining unique elements of arrays without $unwind

I would like to get the unique elements of all arrays in a collection. Consider the following collection
[
{
"collection": "collection",
"myArray": [
{
"name": "ABC",
"code": "AB"
},
{
"name": "DEF",
"code": "DE"
}
]
},
{
"collection": "collection",
"myArray": [
{
"name": "GHI",
"code": "GH"
},
{
"name": "DEF",
"code": "DE"
}
]
}
]
I can achieve this by using $unwind and $group like this:
db.collection.aggregate([
{
$unwind: "$myArray"
},
{
$group: {
_id: null,
data: {
$addToSet: "$myArray"
}
}
}
])
And get the output:
[
{
"_id": null,
"data": [
{
"code": "GH",
"name": "GHI"
},
{
"code": "DE",
"name": "DEF"
},
{
"code": "AB",
"name": "ABC"
}
]
}
]
However, the array "myArray" will have a lot of elements (about 6) and the number of documents passed into this stage of the pipeline will be about 600. So unwinding the array would give me a total of 3600 documents being processed. I would like to know if there's a way for me to achieve the same result without unwinding
You can use below aggregation
db.collection.aggregate([
{ "$group": {
"_id": null,
"data": { "$push": "$myArray" }
}},
{ "$project": {
"data": {
"$reduce": {
"input": "$data",
"initialValue": [],
"in": { "$setUnion": ["$$this", "$$value"] }
}
}
}}
])
Output
[
{
"_id": null,
"data": [
{
"code": "AB",
"name": "ABC"
},
{
"code": "DE",
"name": "DEF"
},
{
"code": "GH",
"name": "GHI"
}
]
}
]

Need JOLT spec file for transfer of complex JSON

I have a complex JSON object (I've simplified it for this example) that I cannot figure out the JOLT transform JSON for. Does anybody have any ideas of what the JOLT spec file should be?
Original JSON
[
{
"date": {
"isoDate": "2019-03-22"
},
"application": {
"name": "SiebelProject"
},
"applicationResults": [
{
"reference": {
"name": "Number of Code Lines"
},
"result": {
"value": 44501
}
},
{
"reference": {
"name": "Transferability"
},
"result": {
"grade": 3.1889542208002064
}
}
]
},
{
"date": {
"isoDate": "2019-03-21"
},
"application": {
"name": "SiebelProject"
},
"applicationResults": [
{
"reference": {
"name": "Number of Code Lines"
},
"result": {
"value": 45000
}
},
{
"reference": {
"name": "Transferability"
},
"result": {
"grade": 3.8
}
}
]
}
]
Desired JSON after transformation and sorting by "Name" ASC, "Date" DESC
[
{
"Name": "SiebelProject",
"Date": "2019-03-22",
"Number of Code Lines": 44501,
"Transferability" : 3.1889542208002064
},
{
"Name": "SiebelProject",
"Date": "2019-03-21",
"Number of Code Lines": 45000,
"Transferability" : 3.8
}
]
I couldn't find a way to do the sort (I'm not even sure you can sort descending in JOLT) but here's a spec to do the transform:
[
{
"operation": "shift",
"spec": {
"*": {
"date": {
"isoDate": "[#3].Date"
},
"application": {
"name": "[#3].Name"
},
"applicationResults": {
"*": {
"reference": {
"name": {
"Number of Code Lines": {
"#(3,result.value)": "[#7].Number of Code Lines"
},
"Transferability": {
"#(3,result.grade)": "[#7].Transferability"
}
}
}
}
}
}
}
}
]
After that there are some tools (like jq I think) that could do the sort.