In each document,
the records field is an array containing many duplicated objects,
and each buy_items array also contains many duplicated items.
How can I clean up the duplicated items?
Original documents:
{
"_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records": [
{
"DATE": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"5210 ",
"5210 "
]
},
{
"DATE": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"5210 ",
"5210 "
]
},
{
"DATE": new Date("2012-12-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"1234 ",
" "
]
}
]
}
Expected Output:
{
"_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records": [
{
"DATE": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 "
]
},
{
"DATE": new Date("2012-12-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"1234 ",
" "
]
}
]
}
With Michael's solution, the output might look like this:
{
"_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records": [
"date": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 "
"1234 ",
" "
]
]
}
You can remove duplicated objects using the aggregation framework
// Deduplicate buy_items per (document, date) and rewrite the collection in place.
db.collection.aggregate(
[
// Flatten each record, then each buy_items value
{ $unwind: "$records" },
{ $unwind: "$records.buy_items" },
// $addToSet keeps only the distinct buy_items per (document id, record date)
{ $group: { "_id": {id: "$_id", date: "$records.DATE" }, buy_items: { $addToSet: "$records.buy_items" }}},
// Rebuild one records array per document, then sort by the first record's date
{ $group: {"_id": "$_id.id", records: { $push: {"date": "$_id.date", "buy_items": "$buy_items" }}}}, { $sort: { "records.0.date": 1 }} ,
// $out replaces the named collection with the aggregation result
{ $out: "collection" }
]
)
The $out operator lets you write your aggregation result to a specified collection, or replace your existing collection.
Even better using "Bulk" operations
// Deduplicate "buy_items" per record date with the aggregation framework,
// then write the cleaned arrays back using Bulk operations batched at 500.
// Fix: the original read `var bulk = bulk = ...`, a typo that also created
// an implicit global.
var bulk = db.collection.initializeOrderedBulkOp(),
    count = 0;
db.collection.aggregate([
    { "$unwind": "$records" },
    { "$project": {
        "date": "$records.DATE",
        // $setIntersection with a single array argument yields its distinct elements
        "buy_items": { "$setIntersection": "$records.buy_items" }
    }},
    { "$unwind": "$buy_items" },
    { "$group": {
        "_id": { "id": "$_id", "date": "$date" },
        "buy_items": { "$addToSet": "$buy_items" }
    }},
    { "$group": {
        "_id": "$_id.id",
        "records": { "$push": {
            "date": "$_id.date",
            "buy_items": "$buy_items"
        }}
    }}
]).forEach(function(doc) {
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "records": doc.records }
    });
    count++;
    // Flush every 500 queued operations to keep the batch size bounded
    if (count % 500 === 0) {
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
});
// Flush any remaining queued operations
if (count % 500 !== 0)
    bulk.execute();
Result:
{
"_id" : "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records" : [
{
"date" : ISODate("2012-12-08T00:00:00Z"),
"buy_items" : [
" ",
"1234 ",
"5210 "
]
},
{
"date" : ISODate("1996-02-08T00:00:00Z"),
"buy_items" : [
"5210 "
]
}
]
}
If you want to update your current collection without creating a new collection and dropping the previous one, I tried this, but you will need to run two different update commands.
First update records with distinct like this :
db.collectionName.update({},{"$set":{"records":db.collectionName.distinct('records')}})
and second update for buy_items with distinct like this :
db.collectionName.update({},{"$set":{"records.0.buy_items":db.collectionName.distinct('records.buy_items')}})
If you want to avoid two update queries, then follow Michael's answer.
You could try using the forEach() method of the find() cursor to iterate over each document properties, check for uniqueness and filter distinct values as follows:
// Iterate every document, remove duplicate values inside each "buy_items"
// array, then drop duplicate record objects and save the document back.
// Fix: the original only compared a record's buy_items against the
// *previous* kept record (so non-adjacent duplicates survived) and ignored
// DATE (so records with equal buy_items but different dates were dropped).
// Key a seen-set on the whole record instead.
db.collection.find().forEach(function(doc){
    var records = [], seen = {};
    doc.records.forEach(function (item){
        // Keep only the first occurrence of each value in "buy_items"
        var uniqueBuyItems = item["buy_items"].filter(function(i, pos) {
            return item["buy_items"].indexOf(i) == pos;
        });
        item["buy_items"] = uniqueBuyItems;
        // Serialize DATE + buy_items so duplicates are detected anywhere
        // in the array, not just when adjacent
        var key = JSON.stringify(item["DATE"]) + JSON.stringify(item["buy_items"]);
        if (!seen[key]) {
            records.push(item);
            seen[key] = true;
        }
    });
    doc.records = records;
    db.collection.save(doc);
})
Related
There are 15,000 documents in collection
This is old collection
[
{
"_id" : ObjectId("611f0b9f9964fea718ccea5f"),
"quotationNO" : "Q-000001",
"note": "21-8-2021<->send to DC<->John<#>21-8-2021<->OK<->Bob"
}
{
"_id" : ObjectId("611f2afa9964fea718ccea9c"),
"quotationNO" : "Q-000002",
"note": "22-8-2021<->send to DC<->Bob"
}
]
This is the new collection. I want to modify the note field from a string to an array of objects like this. What is the best solution to do this?
[
{
"_id" : ObjectId("611f0b9f9964fea718ccea5f"),
"quotationNO" : "Q-000001",
"note": [
{
"data": "21-8-2021",
"message": "send to DC",
"user": "John"
},
{
"data": "21-8-2021",
"message": "OK",
"user": "Bob"
}
]
}
{
"_id" : ObjectId("611f2afa9964fea718ccea9c"),
"quotationNO" : "Q-000002",
"note": [
{
"data": "22-8-2021",
"message": "send to DC",
"user": "Bob"
}
]
}
]
Chain up $split and $map to split your note string and create the desired object. Finally do a $merge to upsert into new_collection.
// Convert the "<#>"/"<->" delimited note string into an array of
// { data, message, user } objects and upsert the result into new_collection.
db.collection.aggregate([
// Stage 1: split the note string on "<#>" into one entry per note
{
"$addFields": {
"note": {
"$split": [
"$note",
"<#>"
]
}
}
},
// Stage 2: split each entry on "<->" into [date, message, user] parts
{
"$addFields": {
"note": {
"$map": {
"input": "$note",
"as": "n",
"in": {
$split: [
"$$n",
"<->"
]
}
}
}
}
},
// Stage 3: reshape each [date, message, user] array into an object
{
"$addFields": {
"note": {
"$map": {
"input": "$note",
"as": "n",
"in": {
"data": {
"$arrayElemAt": [
"$$n",
0
]
},
"message": {
"$arrayElemAt": [
"$$n",
1
]
},
"user": {
"$arrayElemAt": [
"$$n",
2
]
}
}
}
}
}
},
// Stage 4: upsert the converted documents into the new collection by _id
{
"$merge": {
"into": "new_collection",
"on": "_id",
"whenMatched": "replace",
"whenNotMatched": "insert"
}
}
])
Here is the Mongo Playground for your reference.
You can try following these steps:
$project required fields and $split note by <#>
Afterwards using JS $function build from obtained arrays new objects by splitting elements by <-> separator and assign function result to new field note;
// Body of the $function stage below: receives the array produced by
// splitting the note on "<#>", splits each element on "<->", and returns
// an array of { data, message, user } objects.
function(new_note){
let result = [];
for(let i = 0; i < new_note.length; i++){
const nested = new_note[i].split('<->');
result.push( {data:nested[0], message:nested[1],user:nested[2]});
}
return result
}
Afterwards $project required fields
Use MongoDb $merge to save data in new collection.
// Same conversion using server-side JS via $function instead of pure
// aggregation operators, then $merge into new_collection.
db.collection.aggregate([
// Split the note on "<#>" into new_note, keeping quotationNO
{
$project: {
new_note: {
$split: [
"$note",
"<#>"
]
},
quotationNO: 1
}
},
// Run the JS parser over new_note to build the note objects
{
$addFields: {
note: {
$function: {
body: "function(new_note){let result = []; for(let i = 0; i < new_note.length; i++){ const nested = new_note[i].split('<->'); result.push( {data:nested[0], message:nested[1],user:nested[2]}); } return result}",
args: [
"$new_note"
],
lang: "js"
}
}
}
},
// Drop the intermediate new_note field
{
$project: {
note: 1,
quotationNO: 1
}
},
// Upsert into the new collection by _id
{
$merge: {
into: "new_collection",
on: "_id",
whenMatched: "replace",
whenNotMatched: "insert"
}
}
])
I have a collection structured like this:
_id name date
1 Dave 15.02.2014
2 Dave 24.01.2014
3 Dave 20.01.2014
...
I need to aggregate the first and the last date for every name, such that I end up with something like this:
_id name First_Date Last_Date
1 Dave 20.01.2014 15.02.2014
...
I was trying the following query, but it did not work out well — the fields did not come back as expected:
// NOTE(review): grouping on the constant 1 collapses ALL names into a
// single result — to get first/last per name, group on "$name" instead.
// Also "date" is a "dd.mm.yyyy" string here, so this sort is
// lexicographic, not chronological. Fix applied: the closing ")" of the
// aggregate call was missing.
db.users.aggregate([
    { "$sort": {
        "date": 1
    }},
    { "$group": {
        "_id": 1,
        "name": {"$first": "$name"},
        "First_Date": {"$first": "$date"},
        "Last_Date": {"$last": "$date"}
    }}
])
Question: How can this be achieved with a mongo query?
To do it how the data is presently formed, you need to re-order the string to a "lexical" or `"yyyymmdd" format to allow it to sort:
// Group by name; rebuild each "dd.mm.yyyy" string as "yyyymmdd" so that
// lexical $min/$max ordering matches chronological ordering.
db.users.aggregate([
{ "$group": {
"_id": "$name",
"first_date": {
"$min": {
// year + month + day => sortable "yyyymmdd"
"$concat": [
{ "$substrCP": [ "$date", 6, 4 ] },
{ "$substrCP": [ "$date", 3, 2 ] },
{ "$substrCP": [ "$date", 0, 2 ] }
]
}
},
"last_date": {
"$max": {
"$concat": [
{ "$substrCP": [ "$date", 6, 4 ] },
{ "$substrCP": [ "$date", 3, 2 ] },
{ "$substrCP": [ "$date", 0, 2 ] }
]
}
}
}}
])
Note that depending on your MongoDB version, that is $substrCP for modern releases or $substr for older releases.
You also want to $group on "name" instead and use the $min and $max accumulators for the values.
To fix your dates as you really should then you can run something like this:
// Convert string dates ($type 2) to real Date objects, writing in
// batches of 500 via bulkWrite.
let ops = [];
db.users.find({ "date": { "$type": 2 } }).forEach(doc => {
  // "dd.mm.yyyy" string -> Date via a "yyyy/mm/dd" literal
  let dt = new Date(
    `${doc.date.substr(6,4)}/${doc.date.substr(3,2)}/${doc.date.substr(0,2)}`
  );
  // push() instead of re-spreading the whole array on every document,
  // which made the original accumulation accidentally O(n^2)
  ops.push({
    "updateOne": {
      "filter": { "_id": doc._id },
      "update": { "$set": { "date": dt } }
    }
  });
  // Flush a full batch
  if ( ops.length >= 500 ) {
    db.users.bulkWrite(ops);
    ops = [];
  }
});
// Flush the final partial batch
if ( ops.length > 0 ) {
  db.users.bulkWrite(ops);
  ops = [];
}
I'm trying to count the number of votes per question for the following schema.
[
{
"_id": "564b9e13583087872176dbd2",
"question": "fav NFL team",
"choices": [
{
"text": "St. Louis Rams",
"_id": "564b9e13583087872176dbd7",
"votes": [
{
"ip": "::ffff:192.168.15.130",
"_id": "564b9e30583087872176dbd8"
},
{
"ip": "::ffff:192.168.1.1",
"_id": "564bb355e4e1b7200da92668"
}
]
},
{
"text": "Oakland Raiders",
"_id": "564b9e13583087872176dbd6",
"votes": [
{
"ip": "::ffff:192.168.1.135",
"_id": "564bb273e4e1b7200da92667"
}
]
},
{
"text": "Denver Broncos",
"_id": "564b9e13583087872176dbd5",
"votes": []
},
{
"text": "Kansas City Chiefs",
"_id": "564b9e13583087872176dbd4",
"votes": [
{
"ip": "::ffff:192.168.1.100",
"_id": "564bab48e4e1b7200da92666"
}
]
},
{
"text": "Detroit Lions",
"_id": "564b9e13583087872176dbd3",
"votes": [
{
"ip": "::ffff:192.168.15.1",
"_id": "564b9f41583087872176dbd9"
}
]
}
]
}
]
I'm assuming I am going to have to use aggregate and sum.
I was able to get the count for the choices array, but I'm not sure how to go deeper.
db.polls.aggregate([{$unwind: '$choices'}, {$group:{_id:'$_id', 'sum':{$sum:1}}}])
The vote count for "fav NFL team" would be 5.
Also, for reference here is my mongoose code that generated the schema
var mongoose = require('mongoose');
var voteSchema = new mongoose.Schema({
ip: 'String'
});
var choiceSchema = new mongoose.Schema({
text: String,
votes: [voteSchema]
});
exports.PollSchema = new mongoose.Schema({
question: {
type: String,
required: true
},
choices: [choiceSchema]
});
I figured out how to do it in mongo, I needed another unwind.
// Count all votes per poll: unwind the choices, then each choice's
// votes, so $sum:1 counts one per vote document.
db.polls.aggregate([
{$unwind: '$choices'},
{$unwind:'$choices.votes'},
{$group:{
_id:'$_id',
'sum':{
$sum:1
}
}}
])
And here it is in mongoose
// Same two-level unwind in mongoose; respond once the counts are ready.
Poll.aggregate([
    {$unwind: '$choices'},
    {$unwind: '$choices.votes'},
    {$group:{
        _id: '$_id',
        'sum': {
            $sum:1
        }
    }}
], function(err, result) {
    if (err) {
        console.log(err);
        // Fix: don't fall through and send an undefined result on failure
        return res.json({ error: 'aggregation failed' });
    }
    res.json(result);
});
I've the following model
// One comment inside a conversation
var messageSchema = new Schema({
creationDate: { type: Date, default: Date.now },
comment: { type: String },
author: { type: Schema.Types.ObjectId }
});
// A conversation between several members, embedding its comments
var conversationSchema = new Schema({
title: { type: String },
author: { type : Schema.Types.ObjectId },
members: [ { type: Schema.Types.ObjectId } ],
creationDate: { type: Date, default: Date.now },
lastUpdate: { type: Date, default: Date.now },
comments: [ messageSchema ]
});
I want to create two methods to get the comments generated after a date by user or by conversationId.
By User
I tried with the following method
// Find conversations with exactly these two members whose comments array
// has at least one comment created at/after "from"
var query = {
members : { $all : [ userId, otherUserId ], "$size" : 2 }
, comments : { $elemMatch : { creationDate : { $gte: from } } }
};
When there are no comments after the specified date (at from) the method returns [] or null
By conversationId
The same happen when I try to get by user id
// Same $elemMatch condition, but targeting a single conversation by _id
var query = { _id : conversationId
, comments : { $elemMatch : { creationDate : { $gte: from } } }
};
Is there any way to make the method returns the conversation information with an empty comments?
Thank you!
Sounds like a couple of problems here, but stepping through them all
In order to get more than a single match — or none — from an array, you need the aggregation framework or mapReduce to do this. You could try "projecting" with $elemMatch but this can only return the "first" match. i.e:
{ "a": [1,2,3] }
db.collection.find({ },{ "$elemMatch": { "$gte": 2 } })
{ "a": [2] }
So standard projection does not work for this. It can return an "empty" array but it can also only return the "first" element that is matched.
Moving along, you also have this in your code:
{ $all : [ userId, otherUserId ], "$site" : 2 }
Where $site is not a valid operator. I think you mean $size but there are actually "two" operators with that name and your intent may not be clear here.
If you mean that the array you are testing must have "only two" elements, then this is the operator for you. If you meant that the matched conversation between the two people had to be equal to both in the match, then $all does this anyway so the $size becomes redundant in either case unless you don't want anyone else in the conversation.
On to the aggregation problem. You need to "filter" the content of the array in a "non-destructive way" in order to get more than one match or an empty array.
The best approach for this is with modern MongoDB features available from 2.6, which allows the array content to be filtered without processing $unwind:
// Filter the comments array server-side (MongoDB 2.6+): $map tags each
// comment as itself or false, then $setDifference removes the false
// entries — leaving matched comments, or an empty array.
Model.aggregate(
[
{ "$match": {
"members": { "$all": [userId,otherUserId] }
}},
{ "$project": {
"title": 1,
"author": 1,
"members": 1,
"creationDate": 1,
"lastUpdate": 1,
"comments": {
"$setDifference": [
{ "$map": {
"input": "$comments",
"as": "c",
// Keep the comment when it is at/after "from", else emit false
"in": { "$cond": [
{ "$gte": [ "$$c.creationDate", from ] },
"$$c",
false
]}
}},
[false]
]
}
}}
],
function(err,result) {

}
);
That uses $map which can process an expression against each array element. In this case the values are tested under the $cond ternary to either return the array element where the condition is true or otherwise return false as the element.
These are then "filtered" by the $setDifference operator which essentially compares the resulting array of $map to the other array [false]. This removes any false values from the result array and only leaves matched elements or no elements at all.
An alternate may have been $redact but since your document contains "creationDate" at multiple levels, this messes with the logic used with its $$DESCEND operator. This rules that action out.
In earlier versions "not destroying" the array needs to be treated with care. So you need to do much the same "filter" of results in order to get the "empty" array you want:
// Pre-2.6 approach: $unwind the comments, tag each as itself-or-false
// with $cond while counting real matches, filter the false placeholders,
// regroup, and finally replace an all-false array with an empty one.
// Fix applied: the "$and" array in the first $match was never closed
// (missing "]").
Model.aggregate(
    [
        { "$match": {
            "$and": [
                { "members": userId },
                { "members": otherUserId }
            ]
        }},
        { "$unwind": "$comments" },
        { "$group": {
            "_id": "$_id",
            "title": { "$first": "$title" },
            "author": { "$first": "$author" },
            "members": { "$first": "$members" },
            "creationDate": { "$first": "$creationDate" },
            "lastUpdate": { "$first": "$lastUpdate" },
            // Matching comments are kept; non-matching ones collapse to false
            "comments": {
                "$addToSet": {
                    "$cond": [
                        { "$gte": [ "$comments.creationDate", from ] },
                        "$comments",
                        false
                    ]
                }
            },
            // Count how many comments actually matched the cutoff
            "matchedSize": {
                "$sum": {
                    "$cond": [
                        { "$gte": [ "$comments.creationDate", from ] },
                        1,
                        0
                    ]
                }
            }
        }},
        { "$unwind": "$comments" },
        // Drop the false placeholders, unless nothing matched at all
        { "$match": {
            "$or": [
                { "comments": { "$ne": false } },
                { "matchedSize": 0 }
            ]
        }},
        { "$group": {
            "_id": "$_id",
            "title": { "$first": "$title" },
            "author": { "$first": "$author" },
            "members": { "$first": "$members" },
            "creationDate": { "$first": "$creationDate" },
            "lastUpdate": { "$first": "$lastUpdate" },
            "comments": { "$push": "$comments" }
        }},
        // Replace a remaining [false] with an empty array
        { "$project": {
            "title": 1,
            "author": 1,
            "members": 1,
            "creationDate": 1,
            "lastUpdate": 1,
            "comments": {
                "$cond": [
                    { "$eq": [ "$comments", [false] ] },
                    { "$const": [] },
                    "$comments"
                ]
            }
        }}
    ],
    function(err,result) {

    }
)
This does much of the same things, but longer. In order to look at the array content you need to $unwind the content. When you $group back, you look at each element to see if it matches the condition to decide what to return, also keeping a count of the matches.
This is going to put some ( one with $addToSet ) false results in the array or only an array with the entry false where there are no matches. So you filter these out with $match but also test on the matched "count" to see if no matches were found. If no match was found then you don't throw away that item.
Instead you replace the [false] arrays with empty arrays in a final $project.
So depending on your MongoDB version this is either "fast/easy" or "slow/hard" to process. Compelling reasons to update a version already many years old.
Working example
// Self-contained demo: creates members and a conversation, pushes three
// comments, then runs the $setDifference/$map comment filter using the
// second comment's creationDate as the "from" cutoff.
var async = require('async'),
mongoose = require('mongoose'),
Schema = mongoose.Schema;
mongoose.connect('mongodb://localhost/aggtest');
var memberSchema = new Schema({
name: { type: String }
});
var messageSchema = new Schema({
creationDate: { type: Date, default: Date.now },
comment: { type: String },
});
var conversationSchema = new Schema({
members: [ { type: Schema.Types.ObjectId } ],
comments: [messageSchema]
});
var Member = mongoose.model( 'Member', memberSchema );
var Conversation = mongoose.model( 'Conversation', conversationSchema );
async.waterfall(
[
// Clean
function(callback) {
async.each([Member,Conversation],function(model,callback) {
model.remove({},callback);
},
function(err) {
callback(err);
});
},
// add some people
function(callback) {
async.map(["bill","ted","fred"],function(name,callback) {
Member.create({ "name": name },callback);
},callback);
},
// Create a conversation
function(names,callback) {
var conv = new Conversation();
names.forEach(function(el) {
conv.members.push(el._id);
});
conv.save(function(err,conv) {
callback(err,conv,names)
});
},
// add some comments
function(conv,names,callback) {
async.eachSeries(names,function(name,callback) {
Conversation.update(
{ "_id": conv._id },
{ "$push": { "comments": { "comment": name.name } } },
callback
);
},function(err) {
callback(err,names);
});
},
// use the second comment's creationDate as the cutoff for the filter
function(names,callback) {
Conversation.findOne({},function(err,conv) {
callback(err,names,conv.comments[1].creationDate);
});
},
// run the filtering aggregation
function(names,from,callback) {
var ids = names.map(function(el) {
return el._id
});
var pipeline = [
{ "$match": {
"$and": [
{ "members": ids[0] },
{ "members": ids[1] }
]
}},
{ "$project": {
"members": 1,
"comments": {
"$setDifference": [
{ "$map": {
"input": "$comments",
"as": "c",
"in": { "$cond": [
{ "$gte": [ "$$c.creationDate", from ] },
"$$c",
false
]}
}},
[false]
]
}
}}
];
//console.log(JSON.stringify(pipeline, undefined, 2 ));
Conversation.aggregate(
pipeline,
function(err,result) {
if(err) throw err;
console.log(JSON.stringify(result, undefined, 2 ));
callback(err);
}
)
}
],
function(err) {
if (err) throw err;
process.exit();
}
);
Which produces this output:
[
{
"_id": "55a63133dcbf671918b51a93",
"comments": [
{
"comment": "ted",
"_id": "55a63133dcbf671918b51a95",
"creationDate": "2015-07-15T10:08:51.217Z"
},
{
"comment": "fred",
"_id": "55a63133dcbf671918b51a96",
"creationDate": "2015-07-15T10:08:51.220Z"
}
],
"members": [
"55a63133dcbf671918b51a90",
"55a63133dcbf671918b51a91",
"55a63133dcbf671918b51a92"
]
}
]
Note the "comments" only contain the last two entries which are "greater than or equal" to the date which was used as input ( being the date from the second comment ).
I have a collection with documents like this:
[
{
"user_id": 1,
"prefs": [
"item1",
"item2",
"item3",
"item4"
]
},
{
"user_id": 2,
"prefs": [
"item2",
"item5",
"item3"
]
},
{
"user_id": 3,
"prefs": [
"item4",
"item3",
"item7"
]
}
]
What I want is to write an aggregation which will get a user_id and producer a list containing all users mapped to the number of same prefs in their lists. for example if I run the aggregation for user_id = 1, I have to get:
[
{
"user_id": 2,
"same": 1
},
{
"user_id": 3,
"same": 2
}
]
You cannot write any query here with input as simple as "user_id": 1, but you can retrieve the document for that user and then get a comparison of that data to the other documents you are retrieving:
// Fetch the reference user's document first, then compare every other
// user's prefs against it server-side with $setIntersection.
// Fix applied: the comma after "user_id": 1 in the $project was missing.
var doc = db.collection.findOne({ "user_id": 1 });
db.collection.aggregate([
    { "$match": { "user_id": { "$ne": 1 } } },
    { "$project": {
        "_id": 0,
        "user_id": 1,
        "same": { "$size": { "$setIntersection": [ "$prefs", doc.prefs ] } }
    }}
])
Which is one approach, but also not that much different to comparing each document in the client:
// Return the values common to both arrays. The result is drawn from the
// longer of the two arrays (or from `a` on a tie), matching the original
// swap-then-filter behavior.
function intersect(a, b) {
    var longer = b.length > a.length ? b : a;
    var shorter = b.length > a.length ? a : b;
    return longer.filter(function (value) {
        return shorter.indexOf(value) !== -1;
    });
}
// Client-side equivalent: compare every other user's prefs to user 1's
var doc = db.collection.findOne({ "user_id": 1 });
db.collection.find({ "user_id": { "$ne": 1 } }).forEach(function(mydoc) {
printjson({
"user_id": mydoc.user_id,
// number of prefs shared with user 1
"same": intersect(mydoc.prefs, doc.prefs).length
});
});
It's the same thing. You are not really "aggregating" anything here but just making comparisons of one documents content against the other. Of course you can ask the aggregation framework to do something like "filter" out anything that does not have a similar match:
// Same comparison, but filter out users with no common prefs afterwards.
// Fix applied: the comma after "user_id": 1 in the $project was missing.
var doc = db.collection.findOne({ "user_id": 1 });
db.collection.aggregate([
    { "$match": { "user_id": { "$ne": 1 } } },
    { "$project": {
        "_id": 0,
        "user_id": 1,
        "same": { "$size": { "$setIntersection": [ "$prefs", doc.prefs ] } }
    }},
    { "$match": { "same": { "$gt": 0 } }}
])
Though actually that would be more efficient to remove any documents with a zero count before doing the projection:
// More efficient variant: $redact prunes documents with zero common
// prefs before the projection runs.
// Fix applied: the comma after "user_id": 1 in the $project was missing.
var doc = db.collection.findOne({ "user_id": 1 });
db.collection.aggregate([
    { "$match": { "user_id": { "$ne": 1 } } },
    { "$redact": {
        "$cond": {
            "if": { "$gt": [
                { "$size": { "$setIntersection": [ "$prefs", doc.prefs ] } },
                0
            ]},
            "then": "$$KEEP",
            "else": "$$PRUNE"
        }
    }},
    { "$project": {
        "_id": 0,
        "user_id": 1,
        "same": { "$size": { "$setIntersection": [ "$prefs", doc.prefs ] } }
    }}
])
And at least then that would make some sense to do the server processing.
But otherwise, it's all pretty much the same, with possibly a "little" more overhead on the client working out the "intersection" here.