how to declare query to fetch data from mongodb? - mongodb

import pandas as pd
from pymongo import MongoClient
import matplotlib.pyplot as plt
def _connect_mongo(host, port, db, username=None, password=None):
    """Open a connection to MongoDB and return a handle to one database.

    Args:
        host: Hostname or IP address of the MongoDB server.
        port: TCP port the server listens on.
        db: Name of the database to return.
        username: Optional user name; only used together with ``password``.
        password: Optional password; only used together with ``username``.

    Returns:
        The ``conn[db]`` database object of the newly created client.
    """
    if username and password:
        # Pass the credentials as client options rather than formatting them
        # into a 'mongodb://user:pass@host:port/db' URI, so special
        # characters in them need no escaping.
        conn = MongoClient(host, port, username=username, password=password)
    else:
        conn = MongoClient(host, port)
    return conn[db]
def read_mongo(db, collection, host, port, query, no_id=False):
    """Read documents from MongoDB and store them in a pandas DataFrame.

    Args:
        db: Name of the database.
        collection: Name of the collection to query.
        host: Hostname or IP address of the MongoDB server.
        port: TCP port the server listens on.
        query: Filter handed to ``find()``. It has to be a dict, for example
            ``{'lang': {'$exists': True}}``, not a string.
        no_id: When true, drop the ``_id`` column from the result.

    Returns:
        A DataFrame built from the list of matching documents.
    """
    # Connect to MongoDB (bound to a new name so the ``db`` argument,
    # which is the database *name*, is not overwritten)
    database = _connect_mongo(host=host, port=port, db=db)
    # Make a query to the specific DB and Collection
    cursor = database[collection].find(query)
    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))
    # An empty result has no columns at all, so check before deleting
    if no_id and '_id' in df.columns:
        del df['_id']
    return df
#initialization
db = 'twittersmall'
collection='twitterdata'
# NOTE(review): this is a str, but pymongo's find() needs a dict filter.
# The Python spelling of the shell query is {'lang': {'$exists': True}}:
# the key and the operator are quoted strings, and the value is Python's
# True rather than JavaScript's true.
query='{lang:{$exists: true}}'
host='localhost'
port=27017
# Load the matching documents into a DataFrame
var = read_mongo(db, collection, host, port, query)
print var
# Number of tweets per distinct value of the 'lang' column
tweets_by_lang = var['lang'].value_counts()
# Bar chart of the first five entries of that count
fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets' , fontsize=15)
ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')
In this code, I am trying to fetch the documents from MongoDB in which the lang field exists (its value may be null). So I assigned a filter to the query variable, to be used in the fetch operation. The problem is that when I initialize query='{lang:{$exists: true}}', query is a string, but it must be a dictionary. When I declare query={lang:{$exists: true}}, I get a syntax error, obviously because, as far as I know, a dictionary is declared as {'key':'value'}. And when I declare it as query={'lang':'{$exists: true}'}, it doesn't work either: I get a KeyError, even though there is a field called lang in the database.
So, how to declare this query and pass it in the method?
PS: query={lang:{$exists: true}} works when I use it in the WebStorm terminal, but I am currently working in a Jupyter notebook (which integrates IPython) so that I can create graphs and charts from the MongoDB data. I also use pandas for the DataFrame.

Related

Create a mongodb collection index with expireAfterSeconds using pymongo

I have a collection in mongodb. In my python program, I have a variable named coll point at it. I want to create an index on a specified field, digestedOn, which will cause expiration of the record after 7776000 seconds.
I know how to create a simple index in python: coll.create_index([( "digestedOn", pymongo.ASCENDING)]). Where do I stick the {"expireAfterSeconds": 7776000} part?
Here's my whole program, I need the last line fixed so that the index is created with expireAfterSeconds.
import pymongo
import ssl
def connect_to_mongo(host, port, ssls, user, password, auth_source):
    """Build a pymongo client for an authenticated server.

    Args:
        host: Hostname or IP address of the MongoDB server.
        port: TCP port the server listens on.
        ssls: Value forwarded as the client's ``ssl`` option.
        user: User name to authenticate with.
        password: Password for ``user``.
        auth_source: Database the credentials are checked against.

    Returns:
        The ``pymongo.MongoClient`` created with these settings.
    """
    client_options = {
        'ssl': ssls,
        'username': user,
        # CERT_NONE: the server certificate is not verified
        'ssl_cert_reqs': ssl.CERT_NONE,
        'password': password,
        'authSource': auth_source,
        'authMechanism': 'SCRAM-SHA-1',
        'maxPoolSize': None,
    }
    return pymongo.MongoClient(host, port, **client_options)
# Connect with TLS enabled, authenticating against the "admin" database
client = connect_to_mongo(host="10.10.10.10", port=27017, ssls=True, user="user",
password="password",auth_source="admin")
db = client['logs']
# Work on the first collection name reported for the "logs" database
colnames = db.list_collection_names()
coll = db[colnames[0]]
# Plain ascending index on digestedOn; no expiry is configured by this call
coll.create_index([( "digestedOn", pymongo.ASCENDING )])
Just pass it as a named parameter:
# expireAfterSeconds makes this a TTL index; 7776000 seconds is 90 days
coll.create_index([( "digestedOn", pymongo.ASCENDING )], expireAfterSeconds=7776000)

How to fetch mongo db data and pass in jmeter request

I have one post call that response is like this.
{
"status":0,
"message":"Prescription Created",
"jsonResponse":{},
"cid":"C5975K",
"pid":"Rx5975K-175A",
"prescriptionSource":"GO_RX_CTO",
"imageStatus":[]
}
By taking this pid , I have to do the query for the one more record. For example:
db.order.find({"pid":"Rx5975K-175A"})
and the result of this query should pass in one more jmeter request.
I have used MongoDB Script (DEPRECATED) .. But this wont work as its deprecated ..
Tried with JSR223 Sampler, but its not working in new jmeter 3.2
import com.mongodb.*
import com.mongodb.BasicDBObject
// Credentials built from the mongodb_user / mongodb_database / mongodb_password placeholders
MongoCredential coreCredential = MongoCredential.createCredential("${mongodb_user}", "${mongodb_database}", "${mongodb_password}".toCharArray());
// Client for the server named by mongodb_server, port 13017, using that credential
MongoClient coreMongoClient = new MongoClient(new ServerAddress("${mongodb_server}", 13017), Arrays.asList(coreCredential));
DB coreDB = coreMongoClient.getDB("${mongodb_database}");
DBCollection coll = coreDB.getCollection("order");
// NOTE(review): find() is called with no query (so no "pid" condition) and its result is discarded
coll.find();
You have to find your result based on "pid", but you are not passing it anywhere. After getting your collection, you need to create a query and search using that query.
import com.mongodb.*
import com.mongodb.BasicDBObject
// Credentials built from the mongodb_user / mongodb_database / mongodb_password placeholders
MongoCredential coreCredential = MongoCredential.createCredential("${mongodb_user}", "${mongodb_database}", "${mongodb_password}".toCharArray());
// Client for the server named by mongodb_server, port 13017, using that credential
MongoClient coreMongoClient = new MongoClient(new ServerAddress("${mongodb_server}", 13017), Arrays.asList(coreCredential));
DB coreDB = coreMongoClient.getDB("${mongodb_database}");
DBCollection coll = coreDB.getCollection("order");
// Query document equivalent to the shell filter {"pid": "Rx5975K-175A"}
// NOTE(review): the pid is hard-coded here; presumably it should come from the value
// extracted from the previous response - confirm how that value is stored in the test plan
BasicDBObject query = new BasicDBObject();
query.put("pid", "Rx5975K-175A");
// findOne returns a single matching document rather than a cursor
DBObject getData= coll.findOne(query);
and this will give you the desired result.

How to view child nodes of a parent from panda dataframe?

I was trying to extract data from mongodb. So, I was using panda as a dataframe. I was using twitter dataset. The dataset was in json and when I import it in the database it looks like this:
user:Object
id:1292598776
id_str:1292598776
name:ahmd
screen_name:sameh7753
location:
url:null
description:null
protected:false
followers_count:5
friends_count:76
listed_count:0
created_at:Sat Mar 23 21:59:37 +0000 2013
favourites_count:1
utc_offset:null
time_zone:null
geo_enabled:true
lang:ar
contributors_enabled:false
is_translator:false
profile_background_color:C0DEED
profile_use_background_image:true
default_profile:true
default_profile_image:false
follow_request_sent:null
So, here 'user' is the parent and under it there are many children.There are other fields too in the dataset.
So, I was trying to execute a query which will find any tweet, tweeted on 2013 and the location of the tweet is "US". And then I was storing those cursors in the panda data frame. So when I was printing the data frame I was expecting to see those screen_name but it was not getting printed and also I couldn't access those data.
Here is the code I was using:
import pandas as pd
from pymongo import MongoClient
import matplotlib.pyplot as plt
import re
pd.set_option('display.expand_frame_repr', False)
def _connect_mongo(host, port, db):
    """Return the database named ``db`` from a new client for host:port."""
    client = MongoClient(host, port)
    database = client[db]
    return database
def read_mongo(db, collection, host, port):
""" Read from Mongo and Store into DataFrame """
# Connect to MongoDB
db = _connect_mongo(host=host, port=port, db=db)
# NOTE(review): the second positional argument of find() is the projection,
# so {'place.country':'US'} selects which fields come back instead of
# filtering on the country. To filter, it has to be part of the first dict.
# NOTE(review): toArray() is a mongo-shell cursor method; pymongo cursors are
# materialised with list(cursor), as done below.
cursor = db[collection].find({'created_at':{'$regex': '2013'}},
{'place.country':'US'}, no_cursor_timeout=True).toArray()
print cursor
# Expand the cursor and construct the DataFrame
# (nested documents such as 'user' stay as dict values in a single column)
df = pd.DataFrame(list(cursor))
return df
# Connection settings; the trailing comments keep alternative names that were tried
db = 'twittersmall' #'twitter'
collection='twitterdata' #'twitterCol' #
#query={'lang':'{$exists: true}'}
host='localhost'
port=27017
# Load the tweets into a DataFrame and dump it
var = read_mongo(db, collection, host, port)
print var
It only prints under the user column in panda data frame this:
False {u'follow_request_sent':
u'profile_use_b...
And rest of the attributes don't get print and I can't even access them by writing var['user.screen_name'] stated in the python code.
How can I access the data?
First you have to include from pandas.io.json import json_normalize.
Now your read_mongo function should be like this-
def read_mongo(db, collection, host, port):
    """Read 2013 tweets placed in the US from MongoDB into a flat DataFrame.

    Args:
        db: Name of the database.
        collection: Name of the collection to query.
        host: Hostname or IP address of the MongoDB server.
        port: TCP port the server listens on.

    Returns:
        A DataFrame in which nested fields are flattened into dotted
        column names such as ``user.screen_name``.
    """
    # Imported locally so the function is self-contained; newer pandas
    # exposes json_normalize at the top level, older ones in pandas.io.json.
    try:
        from pandas import json_normalize
    except ImportError:
        from pandas.io.json import json_normalize

    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, db=db)
    # Both conditions go into the filter (first argument); a second positional
    # dict would be treated as a projection, not as another condition.
    tweet_filter = {'created_at': {'$regex': '2013'}, 'place.country': 'US'}
    cursor = database[collection].find(tweet_filter, no_cursor_timeout=True)
    try:
        documents = list(cursor)
    finally:
        # A cursor opened with no_cursor_timeout is not cleaned up by the
        # server on its own, so close it explicitly.
        cursor.close()
    return json_normalize(documents)
Here json_normalize flattens the fields that have children and makes them columns of the pandas DataFrame.

How to add data in right collection in pymongo?

I want to add data to the right collection, chosen by name. The code below runs fine: collections(db, name) returns the collection. But when I save it via rightCollection = collections(db, name) and insert with db.rightCollection.insert({"1" : "Righ collection"}), pymongo creates a collection named rightCollection, not Peter. I want to insert the data into Peter. Why does this happen, and how can I resolve it?
from pymongo import MongoClient
def collections(db, name):
    """Return the collection that data for ``name`` belongs in.

    Args:
        db: Database handle that exposes its collections as attributes.
        name: Name used to choose the collection.

    Returns:
        ``db.Peter`` when ``name`` equals ``'Peter'``, otherwise ``None``.
    """
    # Compare by value: ``is`` tests object identity and only matches when
    # the interpreter happens to reuse the very same string object.
    if name == 'Peter':
        return db.Peter
    return None
client = MongoClient()
db = client.myDB
name="Peter"
rightCollection = collections(db, name)
# NOTE(review): db.rightCollection is attribute access on the database, i.e.
# the collection literally named "rightCollection"; the local variable
# rightCollection assigned above is not used by this line.
db.rightCollection.insert({"1" : "Righ collection"})
Using pymongo 3.2.2, you don't need the collections function, you can just use the collection name directly:
from pymongo import MongoClient
client = MongoClient()
db = client.myDB
# Attribute access selects the collection by its literal name, "Peter"
db.Peter.insert_one({'1': 'Right collection'})
That should insert the document {'1': 'Right collection'} into collection Peter under database myDB. To verify that the data is inserted correctly, you can use the mongo shell:
> use myDB
> db.Peter.find()
{ "_id": ObjectId("57df7a4f98e914c98d540992"), "1": "Right collection" }
Or, if you need the name Peter to be defined in a variable, you can do:
from pymongo import MongoClient
client = MongoClient()
db = client.myDB
coll_name = 'Peter'
# Subscript access looks the collection up by the *value* of coll_name
db[coll_name].insert_one({'1': 'Right collection'})

iterating over a list of values of single key in a collection in mongodb

I have a collection of financial data stored in MongoDB. Each company symbol has its own data. The question is how to iterate over the collection, changing the value of the key (the company symbol) each time, so that the whole collection is printed out. This is my list of companies: ['TSLA','TYO','C','LULU','DTV','SHS',' ZNGA'], and this is my code, which returns the data of one company:
from pymongo import MongoClient
import csv
# Connection settings for the local MongoDB instance
host = "localhost"
port = 27017
databaseName = "finance000"
collection_name = "income"
# Handle to the "income" collection of the "finance000" database
client = MongoClient(host, port)
database = client[databaseName]
collection = database[collection_name]
def finance_data(symbol):
    """Extract two income figures from one company document.

    Args:
        symbol: A document from the collection; the values are read from
            the first entry of its ``'statement'`` list.

    Returns:
        A tuple ``(earnings_before_interest_and_taxes, total_revenue)``
        taken from the ``'content'`` of the respective entries.

    Raises:
        KeyError or IndexError: If the document lacks one of those fields.
    """
    first_statement = symbol['statement'][0]
    earnings = first_statement['EarningsBeforeInterestAndTaxes']['content']
    total_revenue = first_statement['TotalRevenue']['content']
    return earnings, total_revenue
# Cursor over the documents whose symbol is 'C' (a single hard-coded company)
i =collection.find({'symbol':'C'})
with open('D:/INCOMEdata.csv', "w") as output:
writer = csv.writer(output, lineterminator='\n')
# Each item yielded by the cursor is one document
for key in i :
print finance_data(key)
# One CSV row per document: (earnings before interest and taxes, total revenue)
writer.writerow(finance_data(key))
If I understood you correctly. You want to retrieve the document/data for a given company. The company is denoted by a symbol example 'TYSLA'.
If that is what you need. Here is how you do it (assuming each symbol is unique)
# find_one returns the first matching document as a dict (or None), not a cursor
company_data = collection.find_one({'symbol':'TYSLA'})
The above will return a dictionary. To access an element within the document you just use:
company_data['profit'] #profit is an example of an attribute name. You can use any attribute you like.
If several companies share the same symbol, use find(), which returns a cursor over all matching documents. To get each company, loop over the cursor, for example:
company_data = collection.find({'symbol':'TYSLA'})
Now to loop:
# Iterate the cursor; each item is one matching document
for one in company_data:
    print(one['profit'])  # single-argument print() works on Python 2.7 and 3
Now, to edit an attribute (for example, profit), use $set:
# update_one needs an update operator; $set changes only the named field
collection.update_one({'symbol':'TYSLA'},{'$set': {'profit':100}})
the above will change TYSLA's company profit to 100
Update
Assuming you have a collection with the symbol being any value of ['TSLA','TYO','C','LULU','DTV','SHS',' ZNGA']. to get the data for any of the symbols you use (assuming symbol contains any of the names (only one name)):
you use the $or as:
collection.find({"$or":[ {'symbol':'TSLA'},{'symbol':'TYO'},... ]},{}) #the ... are the rest of the names you need
The above returns the whole data for a given symbol. To return the specifics Total_Revenue and Earnings_BeforeInterestAndTaxes you use the following:
collection.find({"$or":[ {'symbol':'TSLA'},{'symbol':'TYO'},... ]},{'Total_Revenue':1,'Earnings_BeforeInterestAndTaxes':1, '_id':0 }) #_id is always returned unless you exclude it with '_id':0
I hope that helps.