Mongo DB - map relational data to document structure - mongodb

I have a dataset containing 30 million rows in a mongo collection. An example set of records would be:
{"_id" : ObjectId("568bc0f2f7cd2653e163a9e4"),
"EmailAddress" : "1234#ab.com",
"FlightNumber" : 1043,
"FlightTime" : "10:00"},
{"_id" : ObjectId("568bc0f2f7cd2653e163a9e5"),
"EmailAddress" : "1234#ab.com",
"FlightNumber" : 1045,
"FlightTime" : "12:00"},
{"_id" : ObjectId("568bc0f2f7cd2653e163a9e6"),
"EmailAddress" : "5678#ab.com",
"FlightNumber" : 1045,
"FlightTime" : "12:00"},
This has been imported directly from SQL server, hence the relational'esque nature of the data.
How can I best map this data to another collection so that all the data is then grouped by EmailAddress with the FlightNumbers nested? An example of the output would then be:
{"_id" : ObjectId("can be new id"),
"EmailAddress" : "1234#ab.com",
"Flights" : [{"Number":1043, "Time":"10:00"},{"Number":1045, "Time":"12:00"}]},
{"_id" : ObjectId("can be new id"),
"EmailAddress" : "5678#ab.com",
"Flights" : [{"Number":1045, "Time":"12:00"}]},
I've been working on an import routine that iterates through each record in the source collection and then bulk inserts into the second collection. This is working fine; however, it doesn't allow me to group the data unless I back-process through the records, which adds a huge time overhead to the import routine.
The code for this would be:
// Copies every document of collectionSource.CRM into two collections of
// collectionDestination (Contacts and Identifiers), inserting in batches.
var sourceDb = db.getSiblingDB("collectionSource");
var destinationDb = db.getSiblingDB("collectionDestination");
var externalUsers = sourceDb.CRM.find();

// The progress log below reads startDate. It is not defined anywhere in this
// script, so fall back to "now" unless earlier (omitted) code already set it.
var startDate = (typeof startDate !== "undefined") ? startDate : new Date();

var BATCH_SIZE = 5000;      // documents per bulk insert
var PROGRESS_EVERY = 1000;  // print a progress line every N documents

var index = 0;
var contactArray = [];
var identifierArray = [];

// Inserts whatever is currently buffered and resets the buffers.
function flushBatches() {
    if (contactArray.length > 0) {
        destinationDb.Contacts.insert(contactArray);
        contactArray = [];
    }
    if (identifierArray.length > 0) {
        destinationDb.Identifiers.insert(identifierArray);
        identifierArray = [];
    }
}

externalUsers.forEach(function(doc) {
    //library code for NewGuid omitted
    var guid = NewGuid();
    //buildContact and buildIdentifier simply create 2 js objects based on the parameters
    contactArray.push(buildContact(guid, doc.EmailAddress, doc.FlightNumber));
    identifierArray.push(buildIdentifier(guid, doc.EmailAddress));
    index++;

    if (index % PROGRESS_EVERY == 0) {
        var elapsedSeconds = Math.abs((new Date().getTime() - startDate.getTime()) / 1000);
        print("Written " + index + " items (" + elapsedSeconds + "s from start)");
    }

    //bulk insert in batches
    if (index % BATCH_SIZE == 0) {
        flushBatches();
    }
});

// The loop only flushes on exact multiples of BATCH_SIZE; without this final
// flush the trailing (index % BATCH_SIZE) documents would never be written.
flushBatches();
Many thanks in advance

Hey there and welcome to MongoDB. In this situation you may want to consider using two different Collections -- one for users and one for flights.
User:
{
_id:
email:
}
Flight:
{
_id:
userId:
number: // if number is unique, you can actually specify _id as number
time:
}
In your forEach loop, you would first check to see if a user document with that specific email address already exists. If it doesn't, create it. Then use the User document's unique identifier to insert a new document into the Flights collection, storing the identifier under the field userId (or maybe passengerId?).

Related

Mongo Document Increment Sequence is skipping numbers

I'm currently having issues when querying for one of my Documents inside a Database through Meteor.
Using this line of code I'm trying to retrieve the next sequence number out of the DB. But it sometimes skips numbers randomly for some reason.
// Read the counter document selected by `type`.
var col = MyCounters.findOne(type);
// Increment the stored seq in a second, separate call.
// NOTE(review): the read above and this update are two independent operations;
// presumably concurrent callers can interleave between them — confirm.
MyCounters.update(col._id, {$inc: {seq: 1}});
// Returns the seq value that was read before the increment was applied.
return col.seq;
Not getting any kind of errors server side.
Does anybody know what the issue might be?
I'm on Meteor 1.4+
====================
Update
I also update another Collection with the new value obtained from MyCounters collection, so it would be something like this:
// Read the counter document selected by `type`.
var col = MyCounters.findOne(type);
// Increment the stored seq in a second, separate call.
// NOTE(review): read and increment are not a single operation — confirm whether
// concurrent callers can observe the same col.seq.
MyCounters.update(col._id, {$inc: {seq: 1}});
// barId is the seq value read before the increment.
var barId = col.seq;
// declare barObject + onInsertError
barObject.barId = barId;
// ...
// Store the object carrying the sequence number; onInsertError receives insert failures.
FooCollection.insert(barObject, onInsertError);
And FooCollection ends up having skipped sequence numbers up to 5000 sometimes.
If you want to increment a field on that particular document, you can use:
// Look up the counter document selected by `type`.
var col = MyCounters.findOne(type);
// Amount to add and the name of the field to add it to.
var valueOne = 1;
var nameItem = 'seq';
// Build the $inc argument with a computed key, i.e. { seq: 1 }.
var inc = {};
inc[ nameItem ] = valueOne;
// Apply the increment to the document that was just looked up.
MyCounters.update({ _id: col._id }, { '$inc': inc } )
But if you want to set the value to the highest seq found across all documents in the MyCounters collection plus one (max seq + 1), you can use:
// Take the seq of the document that sorts first by seq descending (the highest seq).
var count = MyCounters.findOne({}, {sort:{seq:-1}}).seq;
count = count + 1;
// Write the new value onto the document identified by col._id.
// NOTE(review): `col` is not declared in this snippet — presumably it is the
// document fetched with MyCounters.findOne(type) beforehand; confirm.
MyCounters.update({_id:col._id}, {$set:{seq:count}})
I hope it works for you. Thanks
refer to : https://stackoverflow.com/a/33968766/4287229

How should you query a collection via nested arrays in mongodb (Meteor)?

I don't think that this is a Meteor-specific question, but rather one about mongo and building mongo queries.
If I have the following structure,
{
username : someName,
contacts : [
{
userid : asdfae33rtqqxxx,
name : contactName,
status : friend
}
{
userid : asdfae33rtqqxxx,
name : anotherName,
status : pending
}
{
userid : asdfae33rtqqxxx,
name : contactName,
status : blocked
}
]
}
How could I pass in values from this array into a query against the users collection, to a) get the users in this array, or b) get all users in this array from the users collection with a particular status.
If this is not possible, how should I adjust my schema in order to make these sorts of queries possible?
This function will return a Meteor.users cursor based on an array of contacts and an optionally required status:
/**
 * Builds a Meteor.users cursor for the users referenced by a contacts array.
 *
 * @param {Array} contacts - contact entries, each read for `userid` and `status`.
 * @param {string} [requiredStatus] - when truthy, only contacts whose status is
 *   strictly equal to it contribute their userid.
 * @returns the result of Meteor.users.find over the collected ids.
 */
var usersByContacts = function(contacts, requiredStatus) {
    // Without a required status every contact qualifies.
    var wanted = _.filter(contacts, function(contact) {
        return !requiredStatus || contact.status === requiredStatus;
    });
    var ids = _.map(wanted, function(contact) {
        return contact.userid;
    });
    // Falsy ids are dropped before querying.
    var selector = {_id: {$in: _.compact(ids)}};
    return Meteor.users.find(selector);
};
You can use it like this:
// No status given: every contact's userid is used in the query.
var users1 = usersByContacts(thing.contacts);
// Only contacts whose status is 'pending' are used in the query.
var users2 = usersByContacts(thing.contacts, 'pending');
This assumes thing has the schema referenced in your question. Also note that if you find this is a common pattern in your code, you should consider turning usersByContacts into a transform.

MongoDB update sub document array or dictionary

I’m trying to decide which of the following schemas is most efficient for implementation with mongodb. I require to keep track of friend id’s & mutual friend counts for each user in a system (user_id is unique across the collection). The number of friends may be up to 100,000.
Schema 1
{
“_id” : “…”,
“user_id” : “1”,
friends : {
“2” : {
“id” : “2”,
“mutuals” : 3
}
“3” : {
“id” : “3”,
“mutuals”: “1”
}
“4” : {
“id” : “4”,
“mutuals”: “5”
}
}
}
Schema 2
{
“_id” : “…”,
“user_id” : “1”,
friends : [
{
“id” : “2”,
“mutuals” : 3
},
{
“id” : “3”,
“mutuals”: 1
},
{
“id” : “4”,
“mutuals”: 5
}
]
}
Requirements:
Given a user_id and friend id update the document such that if friend id exists increment mutuals by 1, else add new friend with a mutuals of 1
Given a user_id and friend id update the document such that if friend exists and mutual count > 1 then decrement mutual count by 1, else remove friend from document
With a list of ids, lookup in the document to identify which friend ids exist (I know this is something that can be done client side, but am interested in server side solution)
What indexes should be used to speed up the above?
In my work in progress I have implemented much of this with schema 1, but am now starting to realise it may not be as suitable as schema 2. However, I am having trouble finding the most efficient methods for the above questions.
AFAIK, points 1 & 2 cannot be done in a single statement in mongoDB.
You would probably have to query mongodb to check if the particular user_id, friend.id combination exists.
If it does then update else add to friend array.
Refer to the JavaScript code below:
use <dbname>;
// Increments the mutual count of FriendId in UserId's friends array, or adds
// the friend with a count of 1 when it is not there yet.
var FriendId = "9";
var UserId = "1";

// Matches the user's document only when the friend is already in the array.
// "friends.id" must be part of the query for the positional "$" used below.
var friendSelector = {"user_id": UserId, "friends.id": FriendId};

// Only existence matters, so fetch just _id instead of the whole friends array.
var Friends = db.Friends.findOne(friendSelector, {"_id": 1});

if (Friends != null) {
    print ("Friends is not null");
    // Update only the matched array element rather than loading, editing and
    // saving the full document (which can overwrite concurrent changes).
    db.Friends.update(friendSelector, {"$inc": {"friends.$.mutuals": 1}});
} else {
    print ("Friends is null");
    // Append the new friend. upsert creates the user's document when it does
    // not exist yet instead of failing on a null lookup.
    // NOTE(review): another writer adding the same friend between the check
    // above and this push would produce a duplicate entry — confirm whether
    // that can happen in this deployment.
    db.Friends.update(
        {"user_id": UserId},
        {"$push": {"friends": {"id": FriendId, "mutuals": 1}}},
        {"upsert": true}
    );
}
Please share if you find a better way to do this.

Meteor and DBRefs

I'm using meteor 0.3.7 in Win7(32) and trying to create a simple logging system using 2 MongoDB collections to store data that are linked by DBRef.
The current pseudo schema is :
Users {
username : String,
password : String,
created : Timestamp,
}
Logs {
user_id : DBRef {$id, $ref}
message : String
}
I use server methods to insert the logs so I can do some upserts on the clients collection.
Now I want to do an old "left join" and display a list of the last n logs with the embedded User name.
I don't want to embed the Logs in Users because the most used operation is getting the last n logs. Embedding in my opinion was going to have a big impact in performance.
What is the best approach to achieve this?
It would also be great if it were possible to edit the User name and have all log items reflect the changed name.
Regards
Playing around with Cursor.observe answered my question. It may not be the most effective way of doing this, but it solves my future problems of dereferencing DBRef "links".
So for the server we need to publish a special collection. One that can enumerate the cursor and for each document search for the corresponding DBRef.
Bear in mind this implementation is hardcoded and should be done as a package like UnRefCollection.
Server Side
CC.Logs = new Meteor.Collection("logs");
CC.Users = new Meteor.Collection("users");

// Publishes one page of logs into the client-side 'logsAndUsers' collection,
// attaching to each log (as `user`) the user document its user_id refers to.
//   page - 1-based page number, defaults to 1
//   size - logs per page, defaults to 100
Meteor.publish('logsAndUsers', function (page, size) {
    var self = this;
    // True while the initial result set is being collected; during that phase
    // logs are buffered so their users can be fetched with a single query.
    var startup = true;
    var startupList = [], uniqArr = [];
    page = page || 1;
    size = size || 100;
    var skip = (page - 1) * size;
    var cursor = CC.Logs.find({}, {limit : size, skip : skip});
    var handle = cursor.observe({
        added : function(doc, idx){
            var clone = _.clone(doc);
            var refId = clone.user_id.oid; // should search DBRefs
            if (startup){
                startupList.push(clone);
                if (!_.contains(uniqArr, refId))
                    uniqArr.push(refId);
            } else {
                // Clients added logs
                // Fixed: this previously read `refid`, an undefined name.
                var deref = CC.Users.findOne({_id : refId});
                clone.user = deref;
                self.set('logsAndUsers', clone._id, clone);
                self.flush();
            }
        },
        removed : function(doc, idx){
            self.unset('logsAndUsers', doc._id, _.keys(doc));
            self.flush();
        },
        changed : function(new_document, idx, old_document){
            // Send only the fields whose values differ from the old document.
            var set = {};
            _.each(new_document, function (v, k) {
                if (!_.isEqual(v, old_document[k]))
                    set[k] = v;
            });
            self.set('logsAndUsers', new_document._id, set);
            // Fields present before but missing now are unset.
            var dead_keys = _.difference(_.keys(old_document), _.keys(new_document));
            self.unset('logsAndUsers', new_document._id, dead_keys);
            self.flush();
        },
        moved : function(document, old_index, new_index){
            // Not used
        }
    });
    self.onStop(function(){
        handle.stop();
    });
    // Deref on first Run: one query for all users referenced by the buffered logs.
    var derefs = CC.Users.find({_id : {$in : uniqArr} }).fetch();
    _.forEach(startupList, function (item){
        // _.find stops at the first match. The previous inner _.forEach tried
        // to stop with `return false`, which Underscore's each does not honor.
        var owner = _.find(derefs, function (candidate){
            return item["user_id"].oid === candidate._id;
        });
        if (owner)
            item.user = owner;
        self.set('logsAndUsers', item._id, item);
    });
    // (`delete derefs` removed: deleting a var is a no-op and a syntax error in strict mode.)
    startup = false;
    self.complete();
    self.flush();
});
For each added logs document it'll search the users collection and try to add to the logs collection the missing information.
On the first run, the added function is called for each document in the logs collection, so I collect those documents in a startupList together with an array of unique user ids; that way the first run queries the users collection only once. It's a good idea to add a paging mechanism to speed things up.
Client Side
On the client, subscribe to the logsAndUsers collection, if you want to make changes do it directly to the Logs collection.
// Client-side collections. The constructor is Meteor.Collection (capital C), and
// the first name must match the 'logsAndUsers' name the server publishes into.
LogsAndUsers = new Meteor.Collection('logsAndUsers');
Logs = new Meteor.Collection('logs'); // Changes here are observed in the LogsAndUsers collection
// Re-subscribes whenever the 'page' session value changes; defaults to page 1.
Meteor.autosubscribe(function () {
    var page = Session.get('page') || 1;
    Meteor.subscribe('logsAndUsers', page);
});
Why not just also store the username in the logs collection as well?
Then you can query on them directly without needing any kind of "join"
If for some reason you need to be able to handle that username change, you just fetch the user object by name, then query on Logs with { user_id : user._id }

Removing documents while preserving at least one

I have a MongoDB collection containing history data with id and timestamp.
I want to delete data from the collection older than a specific
timestamp. But for every id at least one
document (the newest) must stay in the collection.
Suppose I have the following documents in my collection ...
{"id" : "11", "timestamp" : ISODate("2011-09-09T10:27:34.785Z")} //1
{"id" : "11", "timestamp" : ISODate("2011-09-08T10:27:34.785Z")} //2
{"id" : "22", "timestamp" : ISODate("2011-09-05T10:27:34.785Z")} //3
{"id" : "22", "timestamp" : ISODate("2011-09-01T10:27:34.785Z")} //4
... and I want to delete documents having a timestamp older than
2011-09-07 then
1 and 2 should not be deleted because they are newer.
4 should be deleted because it is older, but 3 should not be deleted
(although it is older) because
at least one document per id should stay in the collection.
Does anyone know how I can do this with casbah and/or on the mongo
console?
Regards,
Christian
I can think of a couple of ways. First, try this:
// Removes documents older than the cutoff, but only when a newer document with
// the same id exists, so at least one document per id always remains.
var cutoff = new ISODate("2011-09-07T00:00:00.000Z");

// Let the server select the deletion candidates instead of fetching the whole
// collection and comparing timestamps client-side.
db.testdata.find({"timestamp": {$lt: cutoff}}).forEach(function(data) {
    // Existence check only: one document projected to _id is enough.
    var newer = db.testdata.findOne(
        {"id": data.id, "timestamp": {$gt: data.timestamp}},
        {"_id": 1}
    );
    if (newer != null) {
        db.testdata.remove({"_id" : data._id});
    }
});
This does the job you want. Or you can use a MapReduce job to do it as well. Load this into a text file:
// Map step: emits, under the document's id, a reference to the document (_id)
// together with its timestamp.
var map = function() {
    var entry = {
        ref: this._id,
        timestamp: this.timestamp
    };
    emit(this.id, entry);
};
// Reduce step: for one id, keeps the newest value and removes from testdata
// every other value whose timestamp is older than the cutoff.
//
//   key    - the id emitted by map
//   values - objects of the form { ref, timestamp }
// Returns { ref, timestamp } of the newest value (both null for an empty list),
// the same shape map emits, so the result can be fed back into reduce.
//
// Fixed: the previous single pass kept whichever old value it saw first as the
// survivor, so an old document listed before a newer-than-cutoff one was never
// deleted and was returned in place of the newest.
//
// NOTE(review): this removes documents from inside reduce. MongoDB's mapReduce
// documentation says reduce should not access the database — confirm that the
// target server version permits it.
var reduce = function(key, values) {
    var cutoff = new ISODate("2011-09-07T00:00:00.000Z");
    var cutoffMs = cutoff.valueOf();
    var keep = null;
    var i;

    // Pass 1: find the newest value overall; it always survives.
    for (i = 0; i < values.length; ++i) {
        if (keep === null || values[i].timestamp.valueOf() > keep.timestamp.valueOf()) {
            keep = values[i];
        }
    }

    // Pass 2: everything else that falls into the delete range is removed.
    for (i = 0; i < values.length; ++i) {
        if (values[i] !== keep && values[i].timestamp.valueOf() < cutoffMs) {
            db.testdata.remove({_id : values[i].ref});
        }
    }

    if (keep === null) {
        return { ref: null, timestamp: null };
    }
    return { ref: keep.ref, timestamp: keep.timestamp };
};
Load the above file into the shell: load("file.js");
Then run it: db.testdata.mapReduce(map, reduce, {out: "results"});
Then remove the mapReduce output: db.results.drop();