I have a simple code:
results = mongo.group(
key = [ 'test','id' ],
....
)
# Count how many grouped results share each 'test' value.
temp_hash = {}
for result in results:
    test_value = result['test']
    # dict.get() supplies 0 the first time a value is seen; this replaces
    # the has_key() check (dict.has_key() no longer exists in Python 3).
    temp_hash[test_value] = temp_hash.get(test_value, 0) + 1
How can I simplify this code using only MongoDB logic?
Take a look at gcount sample at
http://www.mongodb.org/display/DOCS/Aggregation#Aggregation-Group. For example, you can do something like:
# Let the server count documents per distinct 'test' value.
result = coll.group(['test'],   # key: fields to group by
                    None,       # condition: no filter
                    # initial accumulator; the key must be a quoted string,
                    # a bare `count` would be an undefined Python name
                    {'count': 0},
                    # reduce (JavaScript, runs server-side): bump the counter
                    'function(obj, prev) {prev.count++;}')
Related
I have some tweets downloaded to my mongodb.
The tweet document looks something like this:
{
"_id" : NumberLong("542499449474273280"),
"retweeted" : false,
"in_reply_to_status_id_str" : null,
"created_at" : ISODate("2014-12-10T02:02:02Z"),
"hashtags" : [
"Canucks",
"allhabs",
"GoHabsGo"
]
...
}
I want to construct a query/aggregation/map-reduce that will give me the count of tweets that share the same two hashtags. For every pair of non-equal hashtags it should give me the count of tweets, e.g.:
{'count': 12, 'pair': ['malaria', 'Ebola']}
{'count': 1, 'pair': ['Nintendo', '8bit']}
{'count': 1, 'pair': ['guinea', 'Ebola']}
{'count': 1, 'pair': ['fitness', 'HungerGames']}
...
I've made a python script to do this:
# Collect every distinct hashtag used by any tweet.
hashtags = set()
tweets = db.tweets.find({}, {'hashtags': 1})
for t in tweets:
    # Tolerate tweets stored without a 'hashtags' field.
    hashtags.update(t.get('hashtags') or [])
hashtags = list(hashtags)
hashtag_count = []
# Visit each unordered pair exactly once: h2 only ranges over the tags that
# precede h1, which is what the original `i > j` test selected.
for i, h1 in enumerate(hashtags):
    for h2 in hashtags[:i]:
        # Number of tweets carrying both hashtags.
        count = db.tweets.find({'hashtags': {'$all': [h1, h2]}}).count()
        if count > 0:
            pair = {'pair': [h1, h2], 'count': count}
            # The original printed the undefined name `couple` (NameError).
            print(pair)
            db.hashtags_pairs.insert(pair)
But I want to make it just with a query or JS functions to use the map-reduce.
Any ideas?
There's no aggregation pipeline or query that can compute this from your given document structure, so you'll have to use map/reduce if you don't want to drastically change the collection structure or construct a secondary collection. The map/reduce, however, is straightforward: in the map phase, emit a pair (pair of hashtags, 1) for each pair of hashtags in the document, then sum the values for each key in the reduce phase.
// Map: for each tweet, emit every unordered pair of distinct hashtags once,
// with a count of 1.
var map = function() {
    // The tweet documents keep their tags in "hashtags" (not "tags");
    // fall back to an empty list for tweets without the field.
    var tags = this.hashtags || [];
    var k = tags.length;
    for (var i = 0; i < k; i++) {
        for (var j = 0; j < i; j++) {
            if (tags[i] != tags[j]) {
                // Sort so that (a, b) and (b, a) map to the same key.
                var ts = [tags[i], tags[j]].sort();
                emit({ "t0" : ts[0], "t1" : ts[1] }, 1);
            }
        }
    }
};
// Reduce: add up the per-tweet counts emitted for one hashtag pair.
var reduce = function(key, values) { return Array.sum(values); };
Scenario:
Collection A has 40 million records and each record has almost 20 fields.
Get 5 (defined)fields from A and change the field name and populate in collection B.
Example:
A
"_id" is the primary key here
{
"_id":123
"id":123
"title":"test"
"summary": "test"
"version":1
"parentid":12
}
B
{
"_id":123
"p$id":123
"p$parentid":12
"p$title":"test"
}
Can someone please suggest a good way to write a code for this scenario?
I wrote the code but it took 5 hrs to complete.
My Code:
config.py:
It has all Mongo DB related details.
Actual code:
from pymongo import MongoClient
import operator
import datetime
# Log the start time; the matching "End time" print closes the script.
print "Start time", datetime.datetime.now()
# Module-level state shared with primary() and compare() through `global`.
primary_dict = {}
# Rebuilt by primary() for each primary document.
primary_list = []
secondary_dict = {}
# Rebuilt by primary() for each primary document.
secondary_list = []
# _ids found in the primary collection but absent from the secondary one.
missing_id = []
# _ids whose primary and secondary documents differ.
mismatch_id = []
# Maps each primary-collection field name to its secondary-collection name.
# (The entries were missing their separating commas, a syntax error.)
alias_dict = {
    "_id": "_id",
    "id": "p$id",
    "title": "p$title",
    "parentid": "p$parentid",
}
def mongo_connect(host, port, db, collection):
    """Return the handle for ``collection`` in database ``db``.

    A new MongoClient for ``host``/``port`` is created on every call.
    """
    return MongoClient(host, port)[db][collection]
def primary():
    """Check every primary document against the secondary collection.

    For each primary document (projected to ``_id`` and ``title``) the
    secondary document with the same ``_id`` is looked up. If there is none,
    the ``_id`` is appended to ``missing_id``; otherwise the pair is passed
    to ``compare()`` as two one-element lists.

    NOTE(review): ``config`` is used here but no import for it is visible in
    this file; confirm it is imported before running.
    """
    # compare() historically read these module globals, so keep them updated.
    global primary_list
    global secondary_list
    primary_collection = mongo_connect(config.mongo_host, config.mongo_port, config.mongo_primary_db, config.mongo_primary_collection)
    secondary_collection = mongo_connect(config.mongo_host, config.mongo_port, config.mongo_secondary_db, config.mongo_secondary_collection)
    for dict1 in primary_collection.find({}, {"_id": 1, "title": 1}).batch_size(1000):
        target_id = dict1['_id']
        primary_list = [dict1]
        # Single lookup per document: the original called find_one() twice
        # for every match, doubling the round trips to the server.
        counterpart = secondary_collection.find_one({"_id": target_id})
        if counterpart is None:
            secondary_list = []
            missing_id.append(target_id)
            continue
        secondary_list = [counterpart]
        compare(primary_list, secondary_list)
def compare(list1, list2):
    """Record the ``_id`` of every primary document that differs from its twin.

    ``list1`` holds primary documents and ``list2`` the secondary documents
    at the same positions. A pair is a mismatch when the documents have a
    different number of fields, or when any primary field differs from the
    secondary field named by ``alias_dict``. Each mismatching ``_id`` is
    appended to the module-level ``mismatch_id`` list once.

    Raises:
        KeyError: if a primary field has no entry in ``alias_dict`` or the
            aliased field is absent from the secondary document.
    """
    # Iterate the arguments; the original ignored them and read the
    # primary_list / secondary_list globals instead.
    for l1, l2 in zip(list1, list2):
        if len(l1) != len(l2):
            mismatch_id.append(l1['_id'])
            continue
        for key, value in l1.items():
            if value != l2[alias_dict[key]]:
                mismatch_id.append(l1['_id'])
                # One entry per document is enough; stop at the first
                # differing field so the id is not reported repeatedly.
                break
# Run the comparison, then report the collected ids and the finish time.
primary()
print "Mismatch id list", mismatch_id
print "Missing Id list", missing_id
print "End time", datetime.datetime.now()
Well you could do this:
db.eval(function() {
    // Fields to read from each source document.
    var wanted = { id: 1, parentid: 1, title: 1 };
    var cursor = db.primary_collection.find({}, wanted);
    cursor.forEach(function(source) {
        var target = {};
        var names = Object.keys(source);
        for (var n = 0; n < names.length; n++) {
            var name = names[n];
            // Every field except _id is stored under a "p$"-prefixed name.
            if (name == "_id") {
                target[name] = source[name];
            } else {
                target["p$" + name] = source[name];
            }
        }
        db.secondary_collection.insert(target);
    });
})
Which uses db.eval() to execute the code on the server, which will be as fast as you will get.
But please read the documentation on this as you will be "locking" the database while this operation takes place. And of course you cannot do this across servers if that is your intent.
this is my collection structure :
coll{
id:...,
fieldA:{
fieldA1:[
{
...
}
],
fieldA2:[
{
text: "ciao",
},
{
text: "hello",
},
]
}
}
I want to extract all fieldA2 values in my collection, but if a fieldA2 value occurs two or more times I want to show it only once.
i try this
// Ask the server for the distinct values of fieldA.fieldA2.text in 'coll'.
// (Straight quotes and lowercase `db`: the typographic quotes and `Db` could not run.)
db.runCommand({distinct: 'coll', key: 'fieldA.fieldA2.text'})
but it does not work: this returns all fieldA1 in the collection.
so i try
// Group on the nested text value. cond is empty, so no documents are filtered out.
// (A stray "}" after cond closed the argument object early.)
db.coll.group( {
    key: { 'fieldA.fieldA2.text': 1 },
    cond: { },
    // The reduce body is empty, so nothing is accumulated per group.
    reduce: function ( curr, result ) { },
    initial: { }
} )
but this return an empty array...
How can I do this and see the execution time? Thank you very much...
Since you are running 2.0.4 (I recommend upgrading), you must run this through MR (I think, maybe there is a better way). Something like:
// Map: emit every fieldA2 text value with a count of 1.
map = function(){
    // Skip documents that have no fieldA / fieldA2 instead of throwing.
    var entries = (this.fieldA && this.fieldA.fieldA2) || [];
    // Index loop with a declared variable: for..in on an array walks keys
    // as strings, and an undeclared `i` leaks into the global scope.
    for (var i = 0; i < entries.length; i++) {
        // emit per text value so that this will group unique text values
        emit(entries[i].text, 1);
    }
}
// Reduce: MongoDB calls reduce(key, values); with a single parameter the
// function would receive the key and never see the emitted counts.
reduce = function(key, values){
    // Now lets just do a simple count of how many times that text value was seen
    var count = 0;
    for (var index = 0; index < values.length; index++) {
        count += values[index];
    }
    return count;
}
This will then give you a collection of documents where _id is the unique text value from fieldA2 and the value field is the number of times it appeared in the collection.
Again this is a draft and is not tested.
I think the answer is simpler than a Map/Reduce .. if you just want distinct values plus execution time, the following should work:
// Time a distinct() over the nested text field and print the elapsed milliseconds.
var startTime = new Date();
var values = db.coll.distinct('fieldA.fieldA2.text');
var endTime = new Date();
var elapsedMs = endTime - startTime;
print("Took " + elapsedMs + " ms");
That would result in a values array with a list of distinct fieldA.fieldA2.text values:
[ "ciao", "hello", "yo", "sayonara" ]
And a reported execution time:
Took 2 ms
I have a hash coming back from an XML datasource that looks like this:
{...,
'records' :{
'record' :[
{'availability' :{'$t' :'available'}, ...},
{'availability' :{'$t' :'available'}, ...}
]
}
};
I'd like to get all the record hashes into an array so I can filter() it and do some other operations. However, when I have this statement in my pre block,
raw_records = raw.pick("$..record");
the array that gets returned is an array of two empty strings:
var raw_records = ['', ''];
The odd thing is that I can pick out just availability with expected results:
availability = raw.pick("$..availability.$t");
producing
var availability = ['available', 'available'];
What's wrong with my first pick()?
EDIT: Here is a more complete version that should help with reproducing the problem. It's slightly different, since I'm using the JSON version of the web service now:
// Declares the hbll datasource; the URL ends in "terms=", so the value passed
// to datasource:hbll(...) presumably supplies the search term. Verify against the KRL docs.
global {
datasource hbll <- "https://svc.lib.byu.edu/services/catalog/v1/search/?field=isbn&format=json&terms=";
}
// Selected on Amazon product-page views; the group captured from the URL is bound to isbn.
rule new_rule {
select when pageview "amazon.com/.*/?dp/(.*)/" setting (isbn)
pre {
//This is the array with two empty strings...
raw = datasource:hbll(isbn);
// Predicate: true when a record's "availability" pick equals "available".
myfilter = function(x) { x.pick("availability") eq "available"; };
records = raw.filter(myfilter);
len = records.length();
availability = records.pick("$..availability");
// With more than one record take the first picked value, otherwise use the pick result as is.
middleman = len > 1 => availability[0] | availability;
available = middleman eq "available" => true | false;
url_list = records.pick("$..url");
// Same first-or-only selection as for availability.
url = len > 1 => url_list[0] | url_list;
msg = <<
<p>This book is available for checkout at the BYU Library.</p>
More information
>>;
}
// Show msg in a sticky notification.
notify("BYU Harold B. Lee Library", msg) with sticky=true;
}
I'm going to need a more complete example. The test app and results I got are below:
// Minimal test ruleset: picks from an inline copy of the hash instead of a live datasource.
ruleset a8x167 {
meta {
name "Pick - Array of Hashes"
description <<
Testing
>>
author "Sam Curren"
logging on
}
dispatch {}
global {
// Inline stand-in for the datasource response: two records, each marked available.
raw = {
'records' :{
'record' :[
{'availability' :{'$t' :'available'}},
{'availability' :{'$t' :'available'}}
]
}
};
}
rule test {
select when pageview ".*" setting ()
pre {
// The two picks under test: the record hashes themselves, and each record's availability.$t value.
raw_records = raw.pick("$..record");
availability = raw.pick("$..availability.$t");
}
notify("Hello World", "This is a sample rule.");
}
}
And Results:
var raw_records = [{'availability' :{'$t' :'available'}}, {'availability' :{'$t' :'available'}}];
var availability = ['available', 'available'];
I run an IRC bot and I have a function which returns 1 random url using Math.random at the moment, from my Mongodb collection.
I would like to refactor it to return x number of unique items, and for each subsequent invocation of the url fetching command .getlinks I would like that it keeps everything unique, so that a user doesn't see the same link unless all the possible links have been already returned.
Is there some algorithm or native mongodb function I could use for this?
Here's a sample scenario:
I have a total of 9 records in the collection. They have a _id and url field.
user a: .getlinks()
bot returns: http://unique-link-1, http://unique-link-2, http://unique-link-3, http://unique-link-4
user a: .getlinks()
bot returns: http://unique-link-5, http://unique-link-6, http://unique-link-7, http://unique-link-8
user a: .getlinks()
bot returns: http://unique-link-9, http://unique-link-6, http://unique-link-1, http://unique-link-3
Background information:
There's a total of about 200 links. I estimate that will grow to around 5000 links by the end of next year.
Currently the only thing I can think of is keeping an array of all returned items, and grabbing all items from the collection at once and getting a random one 4 times and making sure it's unique and hasn't been shown already.
// URLs already handed out in the current cycle, and the batch size per call.
var shown = [], amountToReturn = 4;

// Say up to amountToReturn random links, never repeating a link until every
// link has been shown once; then a new cycle starts.
// Assumes links.find() yields an array of {url: ...} items, as the original
// use of items.length implies. TODO confirm against the driver in use.
function getLinks() {
    var items = links.find(), returned = [];
    var notShown = function( item ) {
        return shown.indexOf( item.url ) == -1;
    };
    while ( returned.length < amountToReturn ) {
        var candidates = items.filter( notShown );
        if ( candidates.length === 0 ) {
            // Cycle exhausted: start over, but keep this batch free of repeats.
            shown = returned.slice();
            candidates = items.filter( notShown );
            // Fewer distinct links than requested: return what we have.
            if ( candidates.length === 0 ) break;
        }
        var rand = randItem( candidates );
        returned.push( rand.url );
        // Remember the link so later calls skip it (the original never recorded it).
        shown.push( rand.url );
    }
    message.say( returned.join(',') );
}
You should find a number of possible options to get random item(s) from Collection here ...
http://jira.mongodb.org/browse/SERVER-533
Another interesting method is documented here ...
http://cookbook.mongodb.org/patterns/random-attribute/
The method mentioned above basically creates a new key/value on the document using Math.random()
> db.docs.drop()
> db.docs.save( { key : 1, ..., random : Math.random() } )
> db.docs.save( { key : 1, ..., random : Math.random() } )
> db.docs.save( { key : 2, ..., random : Math.random() } )
... many more insertions with 'key : 2' ...
> db.docs.save( { key : 2, ..., random : Math.random() } )
...
Get random records from MongoDB via map/reduce
// map
function() {
emit(0, {k: this, v: Math.random()})
}
// reduce
function(k, v) {
var a = []
v.forEach(function(x) {
a = a.concat(x.a ? x.a : x)
})
return {a:a.sort(function(a, b) {
return a.v - b.v;
}).slice(0, 3 /*how many records you want*/)};
}
// finalize
function(k, v) {
return v.a.map(function(x) {
return x.k
})
}