Error connecting to MongoDB on Amazon EC2 using pymongo

I'm trying to use MongoDB on Amazon EC2 with Python 3.4.3.
I followed this answer: I modified the security group settings and edited /etc/mongod.conf (commented out bind_ip). However, when I run the following code, I get this error:
ServerSelectionTimeoutError: SSL handshake failed: [Errno 54] Connection reset by peer
What else should I do?
The code I run is:
import pymongo
import ssl
client = pymongo.MongoClient('ec2-**-**-*-**.us-west-2.compute.amazonaws.com', 27017,
                             ssl=True, ssl_keyfile='/Users/S/FILENAME.pem')
db = client["test"]
db.artist
collection = db.artist

import gzip
import json
from io import StringIO

with gzip.open('artist.json.gz', "rt") as a_file:
    count = 0
    bulk = []
    for line in a_file:
        jdata = json.load(StringIO(line))
        bulk.append(jdata)
        count += 1
        if 1000 < count:
            print('bulk insert!')
            collection.insert_many(bulk)
            bulk = []
            count = 0
    if len(bulk) > 0:
        collection.insert_many(bulk)
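One way to narrow this down is to separate the TLS handshake from the bulk-insert logic. The sketch below is a diagnostic, not code from the question: it reuses the same placeholder hostname and key file, and adds a short serverSelectionTimeoutMS so failures surface quickly instead of after the default 30-second wait.

import pymongo
from pymongo.errors import ServerSelectionTimeoutError

# Placeholders: use your own EC2 hostname and .pem key file.
client = pymongo.MongoClient(
    'ec2-**-**-*-**.us-west-2.compute.amazonaws.com', 27017,
    ssl=True, ssl_keyfile='/Users/S/FILENAME.pem',
    serverSelectionTimeoutMS=5000)   # fail after 5 s instead of 30 s

try:
    client.admin.command('ping')     # forces server selection and the TLS handshake
    print('connected')
except ServerSelectionTimeoutError as exc:
    print('connection failed:', exc)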

Related

Cannot connect to MongoDB via the Python web framework Bottle

I am not able to connect to my MongoDB through Bottle. The usual way works fine:
from pymongo import MongoClient
client = MongoClient('....uri_string....')
db = client.myCluster
collection = db.collection
collection.insert_one({'key': 'value'})
But if I try it this way, I am not able to connect.
from bottle import Bottle
from bottle.ext.mongo import MongoPlugin

app = Bottle()
database_name = 'myCluster'
db_uri = '....uri_string....'
db_plugin = MongoPlugin(uri=db_uri, db=database_name)
app.install(db_plugin)

@app.route('/')
def index(mongodb):
    mongodb['collection'].insert({'key': 'value'})
    mongodb['collection'].insert({'key': 'value2'})
    return 'Inserted Data'

app.run(debug=True, reloader=True)
I get the error:
pymongo.errors.OperationFailure: Authentication failed., full error: {'ok': 0, 'errmsg': 'Authentication failed.', 'code': 8000, 'codeName': 'AtlasError'}
I am using the free MongoDB Cloud.

Parameterize the find method in Python using Mongo

Files to upload will be named like WFSIV0101202001.318.tar.gz, WFSIV0101202001.2624.tar.gz, etc.
INPUT_FILE_PATH = 'C:\Files to upload'

try:
    import os
    from google.cloud import storage
    import sys
    import pymongo
    import pymongo.errors
    from pymongo import MongoClient
    from pymongo.errors import ConnectionFailure
except:
    print("missing modules")

try:
    mongo_client = MongoClient(host="xyz.com", port=27017)
    Db = mongo_client['abcd']
    coll = Db['shopper_journey_sitedata']
except ConnectionFailure:
    print("Connection failed")

date = []
# Thirdpartyid=[]
input_files = os.listdir(INPUT_FILE_PATH)
# looping through input files
for input_file in input_files:
    x = input_file.split(".")
    date.append(x[0][5:13])
    tp_site_id = x[1]
    # print(tp_site_id)
    cur = coll.find({"third_party_site_id": tp_site_id})
    for doc in cur:
        print(doc)
Now I want to parameterize the find() method for every id, so that on each iteration I query with the corresponding tp_site_id.
I tried the code above, but it gives an error like "Datas: name error".
You can do one thing:
coll.find({"third_party_site_id": {"$in": [318, 2624, 2621, 2622, 102, 78]}})
If Tid is an array, then you could replace 318 in your query with Tid[i].
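As a sketch of how that can be wired into the question's loop (an assumption here is that the ids are the string segments parsed from the filenames, so they are compared as strings; the host and collection names are reused from the question):

import os
from pymongo import MongoClient

INPUT_FILE_PATH = r'C:\Files to upload'  # same folder as in the question
coll = MongoClient(host="xyz.com", port=27017)['abcd']['shopper_journey_sitedata']

# collect every third-party site id from the file names, e.g. "318", "2624"
site_ids = [f.split(".")[1] for f in os.listdir(INPUT_FILE_PATH) if "." in f]

# one parameterized query per id ...
for site_id in site_ids:
    for doc in coll.find({"third_party_site_id": site_id}):
        print(doc)

# ... or a single $in query covering all of them
for doc in coll.find({"third_party_site_id": {"$in": site_ids}}):
    print(doc)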

How to create a mongoengine connection with ssh?

I'm trying to create a connection and add a document with mongoengine through an SSH tunnel.
A successful attempt with pymongo can be seen below, I simply want something similar with mongoengine. :-)
from auth import *
import pymongo
from sshtunnel import SSHTunnelForwarder

server = SSHTunnelForwarder(
    (HOST_IP, HOST_PORT),
    ssh_username=SSH_USER,
    ssh_password=SSH_PASS,
    remote_bind_address=('localhost', 27017)
)
server.start()

client = pymongo.MongoClient('127.0.0.1', server.local_bind_port)
db = client[MONGO_DB]
db.authenticate(MONGO_USER, MONGO_PASS)
coll = db.queue_db
coll.insert({"testFile42": 43})

server.stop()
With the tunnel still running (i.e. before server.stop()), mongoengine can connect to the same forwarded local port:

import mongoengine

mongoengine.connect(
    db=DB_NAME,
    host="127.0.0.1",
    port=server.local_bind_port
)
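If the database requires the same credentials as in the pymongo snippet, they can be passed to connect() as well, and documents can then be saved through the tunnel. A minimal sketch; TestDoc is a made-up document class, and the credential kwargs assume the same user as above:

import mongoengine

mongoengine.connect(
    db=DB_NAME,
    host="127.0.0.1",
    port=server.local_bind_port,
    username=MONGO_USER,   # assumption: the DB needs the same credentials
    password=MONGO_PASS
)

class TestDoc(mongoengine.Document):
    value = mongoengine.IntField()

TestDoc(value=43).save()   # roughly equivalent to coll.insert({"testFile42": 43})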

Flask/MongoDB error on local server using Raspberry Pi 3 (Raspbian OS)

I've made a local server using Flask and MongoDB which works great on Windows, but when I moved my code to the Raspberry Pi, I got an error that I couldn't figure out.
The code I'm using:
1) for the Flask server:
from flask import Flask
from flask import jsonify
from flask import request
import pymongo
import time
import datetime
import json

app = Flask(__name__)

client = pymongo.MongoClient("localhost", 27017)
db = client['mqtt-db']
obs_collection = db['mqtt-collection']

@app.route("/obs")
def obs():
    data_str = request.args.get("data")
    print(data_str)
    data = json.loads(data_str)
    print(data)
    data["date"] = datetime.datetime.now()
    obs_collection.save(data)
    return "success"

@app.route("/get_obs")
def get_obs():
    res = []
    for row in obs_collection.find():
        del row['_id']
        res.append(row)
    return jsonify(res)

@app.route("/delete_all")
def delete_all():
    res = obs_collection.delete_many({})
    return jsonify({"deleted": res.deleted_count})

if __name__ == "__main__":
    app.run(host="0.0.0.0", debug=True)
2) script for inserting messages into the db, using the MQTT protocol:
import paho.mqtt.client as mqtt
import pymongo
import json
import datetime

topic = "sensor"
host = "10.0.0.6"

client = pymongo.MongoClient("localhost", 27017)
db = client['mqtt-db']
mqtt_collection = db['mqtt-collection']

# The callback for when the client receives a CONNACK response from the server.
def on_connect(client, userdata, flags, rc):
    print("Connected with result code " + str(rc))
    # Subscribing in on_connect() means that if we lose the connection and
    # reconnect then subscriptions will be renewed.
    client.subscribe(topic)

# The callback for when a PUBLISH message is received from the server.
def on_message(client, userdata, msg):
    data_str = str(msg.payload)
    data = json.loads(data_str)
    print(data_str)
    print(data)
    data["date"] = datetime.datetime.now()
    mqtt_collection.save(data)
    print(msg.topic + " " + str(msg.payload))

client = mqtt.Client()
client.on_connect = on_connect
client.on_message = on_message
client.connect(host, 1883, 60)

# Blocking call that processes network traffic, dispatches callbacks and
# handles reconnecting.
# Other loop*() functions are available that give a threaded interface and a
# manual interface.
client.loop_forever()
The error occurs when I try to retrieve data from the server using the get_obs function.
The error is: "ValueError: dictionary update sequence element #0 has length 4; 2 is required"
Appreciate your help.
As @davidism suggested, the solution was to update to the latest version of Flask (e.g. pip install --upgrade Flask).

Match a running IPython notebook to a process

My server runs many long-running notebooks, and I'd like to monitor the notebooks' memory.
Is there a way to match a pid or process name to a notebook?
Since the question is about monitoring notebooks' memory, I've written a complete example showing the memory consumption of the running notebooks. It is based on the excellent @jcb91 answer and a few other answers (1, 2, 3, 4).
import json
import os
import os.path
import posixpath
import subprocess
import urllib2

import pandas as pd
import psutil


def show_notebooks_table(host, port):
    """Show table with info about running jupyter notebooks.

    Args:
        host: host of the jupyter server.
        port: port of the jupyter server.

    Returns:
        DataFrame with rows corresponding to running notebooks and following columns:
            * index: notebook kernel id.
            * path: path to notebook file.
            * pid: pid of the notebook process.
            * memory: notebook memory consumption in percentage.
    """
    notebooks = get_running_notebooks(host, port)
    prefix = long_substr([notebook['path'] for notebook in notebooks])
    df = pd.DataFrame(notebooks)
    df = df.set_index('kernel_id')
    df.index.name = prefix
    df.path = df.path.apply(lambda x: x[len(prefix):])
    df['pid'] = df.apply(lambda row: get_process_id(row.name), axis=1)
    # same notebook can be run in multiple processes
    df = expand_column(df, 'pid')
    df['memory'] = df.pid.apply(memory_usage_psutil)
    return df.sort_values('memory', ascending=False)


def get_running_notebooks(host, port):
    """Get kernel ids and paths of the running notebooks.

    Args:
        host: host at which the notebook server is listening. E.g. 'localhost'.
        port: port at which the notebook server is listening. E.g. 8888.
        username: name of the user who runs the notebooks.

    Returns:
        list of dicts {kernel_id: notebook kernel id, path: path to notebook file}.
    """
    # find which kernel corresponds to which notebook
    # by querying the notebook server api for sessions
    sessions_url = posixpath.join('http://%s:%d' % (host, port), 'api', 'sessions')
    response = urllib2.urlopen(sessions_url).read()
    res = json.loads(response)
    notebooks = [{'kernel_id': notebook['kernel']['id'],
                  'path': notebook['notebook']['path']} for notebook in res]
    return notebooks


def get_process_id(name):
    """Return process ids found by (partial) name or regex.

    Source: https://stackoverflow.com/a/44712205/304209.

    >>> get_process_id('kthreadd')
    [2]
    >>> get_process_id('watchdog')
    [10, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61]  # ymmv
    >>> get_process_id('non-existent process')
    []
    """
    child = subprocess.Popen(['pgrep', '-f', name], stdout=subprocess.PIPE, shell=False)
    response = child.communicate()[0]
    return [int(pid) for pid in response.split()]


def memory_usage_psutil(pid=None):
    """Get memory usage percentage by current process or by process specified by id, like in top.

    Source: https://stackoverflow.com/a/30014612/304209.

    Args:
        pid: pid of the process to analyze. If None, analyze the current process.

    Returns:
        memory usage of the process, in percentage like in top, values in [0, 100].
    """
    if pid is None:
        pid = os.getpid()
    process = psutil.Process(pid)
    return process.memory_percent()


def long_substr(strings):
    """Find longest common substring in a list of strings.

    Source: https://stackoverflow.com/a/2894073/304209.

    Args:
        strings: list of strings.

    Returns:
        longest substring which is found in all of the strings.
    """
    substr = ''
    if len(strings) > 1 and len(strings[0]) > 0:
        for i in range(len(strings[0])):
            for j in range(len(strings[0]) - i + 1):
                if j > len(substr) and all(strings[0][i:i + j] in x for x in strings):
                    substr = strings[0][i:i + j]
    return substr


def expand_column(dataframe, column):
    """Transform iterable column values into multiple rows.

    Source: https://stackoverflow.com/a/27266225/304209.

    Args:
        dataframe: DataFrame to process.
        column: name of the column to expand.

    Returns:
        copy of the DataFrame with the following updates:
            * for rows where column contains only 1 value, keep them as is.
            * for rows where column contains a list of values, transform them
              into multiple rows, each of which contains one value from the list in column.
    """
    tmp_df = dataframe.apply(
        lambda row: pd.Series(row[column]), axis=1).stack().reset_index(level=1, drop=True)
    tmp_df.name = column
    return dataframe.drop(column, axis=1).join(tmp_df)
An example call is show_notebooks_table('localhost', 8888); it returns a DataFrame indexed by kernel id, with path, pid and memory columns, sorted by memory usage.
I came here looking for the simple answer to this question, so I'll post it for anyone else looking.
import os
os.getpid()
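If the goal is memory monitoring from inside the notebook itself, that pid can be fed straight into psutil. A small sketch, not part of the original answer:

import os
import psutil

proc = psutil.Process(os.getpid())        # the kernel process running this notebook
print(proc.pid, proc.memory_info().rss)   # resident memory in bytes
print(proc.memory_percent())              # percentage of total RAM, as in top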
This is possible, although I could only think of the rather hackish solution I outline below. In summary:
1) Get the ports each kernel (id) is listening on from the corresponding json connection files residing in the server's security directory.
2) Parse the output of a call to netstat to determine which pid is listening on the ports found in step 1.
3) Query the server's sessions url to find which kernel id maps to which session, and hence to which notebook. See the ipython wiki for the api. Although not all of it works for me, running IPython 2.1.0, the sessions url does.
I suspect there is a much simpler way, but I'm not sure as yet where to find it.
import glob
import os.path
import posixpath
import re
import json
import subprocess
import urllib2

# the url and port at which your notebook server listens
server_path = 'http://localhost'
server_port = 8888

# the security directory of the notebook server, containing its connections files
server_sec_dir = 'C:/Users/Josh/.ipython/profile_default/security/'

# part 1 : open all the connection json files to find their port numbers
kernels = {}
for json_path in glob.glob(os.path.join(server_sec_dir, 'kernel-*.json')):
    control_port = json.load(open(json_path, 'r'))['control_port']
    key = os.path.basename(json_path)[7:-5]
    kernels[control_port] = {'control_port': control_port, 'key': key}

# part 2 : get netstat info for which processes use which tcp ports
netstat_output = subprocess.check_output(['netstat', '-ano'])

# parse the netstat output to map ports to PIDs
netstat_regex = re.compile(
    "^\s+\w+\s+"              # protocol word
    "\d+(\.\d+){3}:(\d+)\s+"  # local ip:port
    "\d+(\.\d+){3}:(\d+)\s+"  # foreign ip:port
    "LISTENING\s+"            # connection state
    "(\d+)$"                  # PID
)
for line in netstat_output.splitlines(False):
    match = netstat_regex.match(line)
    if match and match.lastindex == 5:
        port = int(match.group(2))
        if port in kernels:
            pid = int(match.group(5))
            kernels[port]['pid'] = pid

# reorganize kernels to use 'key' as keys
kernels = {kernel['key']: kernel for kernel in kernels.values()}

# part 3 : find which kernel corresponds to which notebook
# by querying the notebook server api for sessions
sessions_url = posixpath.join('%s:%d' % (server_path, server_port),
                              'api', 'sessions')
response = urllib2.urlopen(sessions_url).read()
for session in json.loads(response):
    key = session['kernel']['id']
    if key in kernels:
        nb_path = os.path.join(session['notebook']['path'],
                               session['notebook']['name'])
        kernels[key]['nb_path'] = nb_path

# now do what you will with the dict. I just print a pretty list version:
print(json.dumps(kernels.values(), sort_keys=True, indent=4))
outputs (for me, at the moment):
[
    {
        "key": "9142896a-34ca-4c01-bc71-e5709652cac5",
        "nb_path": "2015/2015-01-16\\serhsdh.ipynb",
        "pid": 11436,
        "port": 56173
    },
    {
        "key": "1ddedd95-5673-45a6-b0fb-a3083debb681",
        "nb_path": "Untitled0.ipynb",
        "pid": 11248,
        "port": 52191
    },
    {
        "key": "330343dc-ae60-4f5c-b9b8-e5d05643df19",
        "nb_path": "ipynb\\temp.ipynb",
        "pid": 4680,
        "port": 55446
    },
    {
        "key": "888ad49b-5729-40c8-8d53-0e025b03ecc6",
        "nb_path": "Untitled2.ipynb",
        "pid": 7584,
        "port": 55401
    },
    {
        "key": "26d9ddd2-546a-40b4-975f-07403bb4e048",
        "nb_path": "Untitled1.ipynb",
        "pid": 10916,
        "port": 55351
    }
]
Adding to Dennis Golomazov's answer, to make the code compatible with Python 3 and to allow logging into a password-protected session, I replaced the get_running_notebooks function with this one (source):
import requests
import posixpath
import json


def get_running_notebooks(host, port, password=''):
    """
    Get kernel ids and paths of the running notebooks.

    Args:
        host: host at which the notebook server is listening. E.g. 'localhost'.
        port: port at which the notebook server is listening. E.g. 8888.
        password: password of the notebook server, if it is password-protected.

    Returns:
        list of dicts {kernel_id: notebook kernel id, path: path to notebook file}.
    """
    BASE_URL = 'http://{0}:{1}/'.format(host, port)

    # Get the cookie data
    s = requests.Session()
    url = BASE_URL + 'login?next=%2F'
    resp = s.get(url)
    xsrf_cookie = resp.cookies['_xsrf']

    # Login with the password
    params = {'_xsrf': xsrf_cookie, 'password': password}
    res = s.post(url, data=params)

    # Find which kernel corresponds to which notebook
    # by querying the notebook server api for sessions
    url = posixpath.join(BASE_URL, 'api', 'sessions')
    ret = s.get(url)
    # print('Status code:', ret.status_code)

    # Get the notebook list
    res = json.loads(ret.text)
    notebooks = [{'kernel_id': notebook['kernel']['id'],
                  'path': notebook['notebook']['path']} for notebook in res]
    return notebooks
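Usage is the same as before, with the extra argument. A brief sketch; the password value is a placeholder:

notebooks = get_running_notebooks('localhost', 8888, password='my-server-password')
for nb in notebooks:
    print(nb['kernel_id'], nb['path'])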
Here is a solution that solves the access issue mentioned in other posts by first obtaining the access-token via jupyter lab list.
import requests
import psutil
import re
import os
import pandas as pd
# get all processes that have a ipython kernel and get kernel id
dfp = pd.DataFrame({'p': [p for p in psutil.process_iter() if 'ipykernel_launcher' in ' '.join(p.cmdline())]})
dfp['kernel_id'] = dfp.p.apply(lambda p: re.findall(r".+kernel-(.+)\.json", ' '.join(p.cmdline()))[0])
# get url to jupyter server with token and open once to get access
urlp = requests.utils.parse_url([i for i in os.popen("jupyter lab list").read().split() if 'http://' in i][0])
s = requests.Session()
res = s.get(urlp)
# read notebook list into dataframe and get kernel id
resapi = s.get(f'http://{urlp.netloc}/api/sessions')
dfn = pd.DataFrame(resapi.json())
dfn['kernel_id'] = dfn.kernel.apply(lambda item: item['id'])
# merge the process and notebook dataframes
df = dfn.merge(dfp, how = 'inner')
# add process info as desired
df['pid'] = df.p.apply(lambda p: p.pid)
df['mem [%]'] = df.p.apply(lambda p: p.memory_percent())
df['cpu [%]'] = df.p.apply(lambda p: p.cpu_percent())
df['status'] = df.p.apply(lambda p: p.status())
# reduce to columns of interest and sort
dfout = df.loc[:,['name','pid','mem [%]', 'cpu [%]','status']].sort_values('mem [%]', ascending=False)
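The inner merge links each notebook session from the API to the psutil process whose command line contains the same kernel id, since both frames share the kernel_id column. To print the resulting table (a small follow-up, not part of the original answer):

print(dfout.to_string(index=False))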
I asked a similar question and, in order to mark it as a duplicate, I "reverse engineered" Dennis Golomazov's answer with a focus on matching notebooks in a generic way (also manually):
1) Get the JSON from the api/sessions path of your Jupyter server (i.e. https://localhost:8888/api/sessions in most cases).
2) Parse the JSON. It is a sequence of session objects (nested dicts if parsed with the json module). Their path attributes point to the notebook files, and kernel.id is the kernel id, which is part of the path passed as an argument to python -m ipykernel_launcher (in my case {PATH}/python -m ipykernel_launcher -f {HOME}/.local/share/jupyter/runtime/kernel-{ID}.json).
3) Find the PID of the process run with that path (e.g. with pgrep -f {ID}).
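A short Python 3 sketch of those three steps; the server URL is a placeholder, and it assumes an unauthenticated local server and that pgrep is available:

import json
import subprocess
from urllib.request import urlopen

SESSIONS_URL = 'http://localhost:8888/api/sessions'   # step 1: query the sessions API

sessions = json.loads(urlopen(SESSIONS_URL).read())   # step 2: parse the JSON
for session in sessions:
    path = session['path']                  # notebook file
    kernel_id = session['kernel']['id']     # kernel id used in the connection file name
    # step 3: the kernel was launched with ...kernel-{ID}.json, so pgrep -f finds its process
    pids = subprocess.run(['pgrep', '-f', kernel_id],
                          capture_output=True, text=True).stdout.split()
    print(path, pids)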