How can I query my collection of tweets for a given variable (tweets with hashtags) using pymongo? - mongodb

I have a collection called "tweets" stored in my mongodb database called "test".
I connect to the db in the following manner:
import sys
import pymongo
from pymongo import Connection
connection = Connection()
db = connection.test
tweets = db.tweets
I get one document from my tweets by converting the cursor to a list and taking its first element:
list(tweets.find())[0]
This shows me that the structure of the document is as follows:
{u'_id': ObjectId('...'),
u'contributors': None,
u'coordinates': {u'coordinates': [-placeholder coordinate, placeholder coordinate],
u'type': u'Point'},
u'created_at': u'Mon Jun 24 17:53:47 +0000 2013',
u'entities': {u'hashtags': [],
u'symbols': [],
u'urls': [],
u'user_mentions': []},
u'favorite_count': 0,
u'favorited': False,
u'filter_level': u'medium',
u'geo': {u'coordinates': [40.81019674, -73.99020695], u'type': u'Point'},
u'id': 349223842700472320L,
u'id_str': u'349223842700472320',
u'in_reply_to_screen_name': None,
u'in_reply_to_status_id': None,
u'in_reply_to_status_id_str': None,
u'in_reply_to_user_id': None,
u'in_reply_to_user_id_str': None,
u'lang': u'en',
u'place': {u'attributes': {},
u'bounding_box': {u'coordinates': [[[placeholder coordinate, placeholder coordinate],
[-placeholder coordinate, placeholder coordinate],
[-placeholder coordinate, placeholder coordinate],
[-placeholder coordinate, placeholder coordinate]]],
u'type': u'Polygon'},
u'country': u'placeholder country',
u'country_code': u'example',
u'full_name': u'name, xx',
u'id': u'user id',
u'name': u'name',
u'place_type': u'city',
u'url': u'http://api.twitter.com/1/geo/id/1820d77fb3f65055.json'},
u'retweet_count': 0,
u'retweeted': False,
u'source': u'Twitter for iPhone',
u'text': u'example text',
u'truncated': False,
u'user': {u'contributors_enabled': False,
u'created_at': u'Sat Jan 22 13:42:59 +0000 2011',
u'default_profile': False,
u'default_profile_image': False,
u'description': u'example description',
u'favourites_count': 100,
u'follow_request_sent': None,
u'followers_count': 100,
u'following': None,
u'friends_count': 100,
u'geo_enabled': True,
u'id': placeholder_id,
u'id_str': u'placeholder_id',
u'is_translator': False,
u'lang': u'en',
u'listed_count': 0,
u'location': u'example place',
u'name': u'example name',
u'notifications': None,
u'profile_background_color': u'000000',
u'profile_background_image_url': u'http://a0.twimg.com/images/themes/theme19/bg.gif',
u'profile_background_image_url_https': u'https://si0.twimg.com/images/themes/theme19/bg.gif',
u'profile_background_tile': False,
u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/241527685/1363314054',
u'profile_image_url': u'http://a0.twimg.com/profile_images/378800000038841219/8a71d0776da0c48dcc4ef6fee9f78880_normal.jpeg',
u'profile_image_url_https': u'https://si0.twimg.com/profile_images/378800000038841219/8a71d0776da0c48dcc4ef6fee9f78880_normal.jpeg',
u'profile_link_color': u'000000',
u'profile_sidebar_border_color': u'FFFFFF',
u'profile_sidebar_fill_color': u'000000',
u'profile_text_color': u'000000',
u'profile_use_background_image': False,
u'protected': False,
u'screen_name': u'placeholder screen_name',
u'statuses_count': xxxx,
u'time_zone': u'placeholder time_zone',
u'url': None,
u'utc_offset': -21600,
u'verified': False}}
I then query for all documents with hashtags in my collection:
list(tweets.find({'entities.hashtags.text': {"$ne":None}}))
So far so good. Now, here is my problem. I would like to sort the documents in my collection by screen_name. I try:
users = tweets.find({'entities.hashtags.text': {"$ne":None}}, {"user.screen_name":1})
for user in users:
print user.get["user.screen_name"]
but get the following error message:
TypeError Traceback (most recent call last)
/Users/home/<ipython-input-98-ea29cbbcfe27> in <module>()
1 for user in users:
----> 2 print user.get["screen_name"]
3
TypeError: 'builtin_function_or_method' object has no attribute '__getitem__'
Any idea what I'm doing wrong, here/any idea how I can fix my code?
Thanks!

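You are using square brackets with the get method where you should use parentheses: get is a method, so it has to be called, e.g. user.get('user'), or you can index the document directly, e.g. user['user']. Also note that screen_name is nested inside the user sub-document, so the value you want is user['user']['screen_name'] (or user.get('user', {}).get('screen_name')). Dotted paths such as 'user.screen_name' only work inside MongoDB queries and projections, not as keys of the returned Python dict.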

Related

zeep.exceptions.Fault: Server was unable to process request. ---> Object reference not set to an instance of an object

I'm trying to send request and receive the response of a soap service using the python package zeep.
But I can't do this, I get this error message:
Traceback (most recent call last):
File "/home/oussama/PycharmProjects/pythonProject/main.py", line 44, in <module>
res = client.service.addShip(**data)
File "/usr/local/lib/python3.6/dist-packages/zeep/proxy.py", line 51, in __call__
kwargs,
File "/usr/local/lib/python3.6/dist-packages/zeep/wsdl/bindings/soap.py", line 135, in send
return self.process_reply(client, operation_obj, response)
File "/usr/local/lib/python3.6/dist-packages/zeep/wsdl/bindings/soap.py", line 229, in process_reply
return self.process_error(doc, operation)
File "/usr/local/lib/python3.6/dist-packages/zeep/wsdl/bindings/soap.py", line 333, in process_error
detail=fault_node.find("detail"),
zeep.exceptions.Fault: Server was unable to process request. ---> Object reference not set to an instance of an object.
Here is my code:
import zeep
client = zeep.Client(wsdl='http://track.smsaexpress.com/SECOM/SMSAwebService.asmx?WSDL')
data = {
'passKey': 'xxxxxxx',
'refNo': None,
'sentDate': None,
'idNo': None,
'cName': None,
'cntry': None,
'cCity': None,
'cZip': None,
'cPOBox': None,
'cMobile': None,
'cTel1': None,
'cTel2': None,
'cAddr1': None,
'cAddr2': None,
'shipType': None,
'PCs': 1,
'cEmail': None,
'carrValue': None,
'carrCurr': None,
'codAmt': None,
'weight': None,
'custVal': None,
'custCurr': None,
'insrAmt': None,
'insrCurr': None,
'itemDesc': None,
'sName': None,
'sContact': None,
'sAddr1': None,
'sAddr2': None,
'sCity': None,
'sPhone': None,
'sCntry': None,
'prefDelvDate': None,
'gpsPoints': None,
}
res = client.service.addShip(**data)
print(res)
Here (Link) you can find some info about the service
The zeep Client object is looking for a string and does not like the None keyword. Change each None to "" or '' (i.e. an empty string) and you should be good to go.
import zeep
client = zeep.Client(wsdl='http://track.smsaexpress.com/SECOM/SMSAwebService.asmx?WSDL')
data = {
'passKey': 'xxxxxxx',
'refNo': "",
'sentDate': "",
'idNo': "",
'cName': "",
'cntry': "",
'cCity': "",
'cZip': "",
'cPOBox': "",
'cMobile': "",
'cTel1': "",
'cTel2': "",
'cAddr1': "",
'cAddr2': "",
'shipType': "",
'PCs': 1,
'cEmail': "",
'carrValue': "",
'carrCurr': "",
'codAmt': "",
'weight': "",
'custVal': "",
'custCurr': "",
'insrAmt': "",
'insrCurr': "",
'itemDesc': "",
'sName': "",
'sContact': "",
'sAddr1': "",
'sAddr2': "",
'sCity': "",
'sPhone': "",
'sCntry': "",
'prefDelvDate': "",
'gpsPoints': "",
}
res = client.service.addShip(**data)
print(res)
I think the definition in the WSDL differs from the implementation on the server side. If you change the request so that all optional fields contain a valid value, it will return a result stating that the passKey is incorrect.
If you use a mocking tool such as SoapUI to mock the server side, it is perfectly fine to send a request with a dictionary that looks like this:
data = {'PCs' : 1}
As a side note, the WSDL implements both SOAP 1.1 and SOAP 1.2. If you mock it, make sure you use the correct endpoint URL; otherwise you will keep sending data to the original server.

Correct way to invoke the copy module with module param 'content'

I have a custom action plugin and I need to write out returned variable data on the controller to a file. I'm trying this locally right now.
copy_module_args = dict()
copy_module_args["content"] = 'test'
copy_module_args["dest"] = dest
copy_module_args["owner"] = owner
copy_module_args["group"] = group
copy_module_args["mode"] = mode
try:
result = merge_hash(result, self._execute_module(
module_name="copy",
module_args=copy_module_args,
task_vars=task_vars))
except (AnsibleError, TypeError) as err:
err_msg = "Failed to do stuff"
raise AnsibleActionFail(to_text(err_msg), to_text(err))
The result of ._execute_module is
fatal: [localhost]: FAILED! => {"changed": false, "msg": "Source None not found"}
The value of result is
{'msg': 'Source None not found', 'failed': True, 'invocation': {'module_args': {'content': 'VALUE_SPECIFIED_IN_NO_LOG_PARAMETER', 'dest': '/home/me/testfile', 'owner': 'me', 'group': 'me', 'mode': None, 'backup': False, 'force': True, 'follow': False, 'src': None, '_original_basename': None, 'validate': None, 'directory_mode': None, 'remote_src': None, 'local_follow': None, 'checksum': None, 'seuser': None, 'serole': None, 'selevel': None, 'setype': None, 'attributes': None, 'regexp': None, 'delimiter': None, 'unsafe_writes': None}}, '_ansible_parsed': True}
This invocation is trying to use the "src" param even though I'm only passing the "content" param. I know this because when I add "src" the failure message changes. I expected, from the docs and from reading the copy module and template module source, that at a bare minimum my implementation would result in:
- name: Copy using inline content
copy:
content: 'test'
dest: /home/me/testfile
Does anyone know what I'm missing or why "src" is being preferred over "content" even though it's not being specified?
The content: argument is just syntactic sugar for writing it to a tempfile, so I would guess you will need to take charge of that yourself, or find a way to invoke the copy action, which apparently runs before the copy module.
I was able to see that "content" was being handled in the action plugin, not the module. I've adapted what I found to fit my needs. I call the action plugin, instead of the module directly.
copy_module_args = dict()
copy_module_args["content"] = 'test'
copy_module_args["dest"] = dest
copy_module_args["owner"] = owner
copy_module_args["group"] = group
copy_module_args["mode"] = mode
copy_module_args["follow"] = True
copy_module_args["force"] = False
copy_action = self._task.copy()
copy_action.args.update(copy_module_args)
# Removing args passed in via the playbook that aren't meant for
# the copy module
for remove in ("arg1", "arg2", "arg3", "arg4"):
copy_action.args.pop(remove, None)
try:
copy_action = self._shared_loader_obj.action_loader.get('copy',
task=copy_action,
connection=self._connection,
play_context=self._play_context,
loader=self._loader,
templar=self._templar,
shared_loader_obj=self._shared_loader_obj)
result = merge_hash(result, copy_action.run(task_vars=task_vars))
This allows me to leverage copy how I originally intended, by utilising its idempotency and checksumming without having to write my own.
changed: [localhost] => {"changed": true, "checksum": "00830d74b4975d59049f6e0e7ce551477a3d9425", "dest": "/home/me/testfile", "gid": 1617705057, "group": "me", "md5sum": "6f007f4188a0d35835f4bb84a2548b66", "mode": "0644", "owner": "me", "size": 9, "src": "/home/me/.ansible/tmp/ansible-tmp-1560715301.737494-249856394953357/source", "state": "file", "uid": 1300225668}
And running it again,
ok: [localhost] => {"changed": false, "dest": "/home/me/testfile", "src": "/home/me/testfile/.ansible/tmp/ansible-local-9531902t7jt3/tmp_nq34zm5"}

How to run the getRole command using pymongo?

I want to check whether a role exists in a MongoDB database before I create a new one. I tried to do it the following way:
result = self.client[database].command("getRole", name=app_name)
Unfortunately I get the following error:
msg = msg or "%s"
raise OperationFailure(msg % errmsg, code, response)
pymongo.errors.OperationFailure: no such command: 'getRole', bad cmd: '{ getRole: 1, name: "test" }'
I am referring to this database command: https://docs.mongodb.com/manual/reference/method/db.getRole/
For createRole I can execute the command: https://docs.mongodb.com/manual/reference/method/db.createRole/#db.createRole
Shell methods db.* are different from Database commands.
Using the rolesInfo command you can get information for a particular role.
db.command({
'rolesInfo': {'role': 'noremove','db': 'test'},
'showPrivileges': True, 'showBuiltinRoles': True
})
The above command returns a result in this form when there is a matching role:
{'ok': 1.0,
'roles': [{'db': 'test',
'inheritedPrivileges': [{'actions': ['find', 'insert', 'update'],
'resource': {'collection': 'test', 'db': 'test'}}],
'inheritedRoles': [],
'isBuiltin': False,
'privileges': [{'actions': ['find', 'insert', 'update'],
'resource': {'collection': 'test', 'db': 'test'}}],
'role': 'noremove',
'roles': []}]}
When there is no matching role, you get this result:
{'ok': 1.0, 'roles': []}
Checking that a role exists comes down to checking the length of the "roles" list in the returned result, as follows:
noremove_role = db.command({
'rolesInfo': {'role': 'noremove','db': 'test'},
'showPrivileges': True, 'showBuiltinRoles': True
})
if not len(noremove_role['roles']):
# create role
pass
Is there a better way?
Yes, in keeping with the "ask forgiveness, not permission" philosophy, create the role and handle the exception that results from trying to add an existing role.
from pymongo.errors import DuplicateKeyError
import logging
logger = logging.getLogger()
try:
db.command(
'createRole', 'noremove',
privileges=[{
'actions': ['insert', 'update', 'find'],
'resource': {'db': 'test', 'collection': 'test'}
}],
roles=[])
except DuplicateKeyError:
logger.error('Role already exists.')
pass

Showing KeyError: 'schedules.tasks.run' while running the django celery for periodic tasks

I've created a class-based periodic task using djcelery to send emails to the client. The task performs the action and sends the email when it is called from the shell, but when it is run on the crontab schedule I get a KeyError: 'schedules.tasks.run'. I have added the following settings and created the tasks:
settings.py
import os
import djcelery
djcelery.setup_loader()
BROKER_URL = 'django://'
BROKER_HOST = "localhost"
BROKER_PORT = 5672
BROKER_USER = "guest"
BROKER_PASSWORD = "guest"
BROKER_VHOST = "/"
CELERYBEAT_SCHEDULER = 'djcelery.schedulers.DatabaseScheduler'
CELERY_RESULT_BACKEND = 'djcelery.backends.database:DatabaseBackend'
CELERYBEAT_SCHEDULE = {
"runs-every-30-seconds": {
"task": "schedules.tasks.EndingDrawslotScheduler.run",
"schedule": timedelta(seconds=30),
"args": (16, 16)
},
}
app.conf.timezone = 'UTC'
INSTALLED_APPS = ('djcelery',
'kombu.transport.django',)
Error-Info:
The full contents of the message body was:
{'utc': True, 'callbacks': None, 'id': '6ad19ff8-9825-4d54-a8b2-0a8322fc9fb1',
'args': [], 'taskset': None, 'retries': 0, 'timelimit': (None, None),
'kwargs': {}, 'expires': None, 'errbacks': None, 'chord': None, 'task':
'schedules.tasks.run', 'eta': None} (262b)
Traceback (most recent call last):
File "/home/s/proj/env/lib/python3.5/site-packages/celery/worker/consumer.py", line 465, in on_task_received strategies[type_](message, body,
KeyError: 'schedules.tasks.run'

How can I load data from mongodb collection into pandas' DataFrame?

I am new to pandas (well, to all things "programming"...), but have been encouraged to give it a try.
I have a mongodb database - "test" - with a collection called "tweets".
I access the database in ipython:
import sys
import pymongo
from pymongo import Connection
connection = Connection()
db = connection.test
tweets = db.tweets
the document structure of documents in tweets is as follows:
entities': {u'hashtags': [],
u'symbols': [],
u'urls': [],
u'user_mentions': []},
u'favorite_count': 0,
u'favorited': False,
u'filter_level': u'medium',
u'geo': {u'coordinates': [placeholder coordinate, -placeholder coordinate], u'type': u'Point'},
u'id': 349223842700472320L,
u'id_str': u'349223842700472320',
u'in_reply_to_screen_name': None,
u'in_reply_to_status_id': None,
u'in_reply_to_status_id_str': None,
u'in_reply_to_user_id': None,
u'in_reply_to_user_id_str': None,
u'lang': u'en',
u'place': {u'attributes': {},
u'bounding_box': {u'coordinates': [[[placeholder coordinate, placeholder coordinate],
[-placeholder coordinate, placeholder coordinate],
[-placeholder coordinate, placeholder coordinate],
[-placeholder coordinate, placeholder coordinate]]],
u'type': u'Polygon'},
u'country': u'placeholder country',
u'country_code': u'example',
u'full_name': u'name, xx',
u'id': u'user id',
u'name': u'name',
u'place_type': u'city',
u'url': u'http://api.twitter.com/1/geo/id/1820d77fb3f65055.json'},
u'retweet_count': 0,
u'retweeted': False,
u'source': u'Twitter for iPhone',
u'text': u'example text',
u'truncated': False,
u'user': {u'contributors_enabled': False,
u'created_at': u'Sat Jan 22 13:42:59 +0000 2011',
u'default_profile': False,
u'default_profile_image': False,
u'description': u'example description',
u'favourites_count': 100,
u'follow_request_sent': None,
u'followers_count': 100,
u'following': None,
u'friends_count': 100,
u'geo_enabled': True,
u'id': placeholder_id,
u'id_str': u'placeholder_id',
u'is_translator': False,
u'lang': u'en',
u'listed_count': 0,
u'location': u'example place',
u'name': u'example name',
u'notifications': None,
u'profile_background_color': u'000000',
u'profile_background_image_url': u'http://a0.twimg.com/images/themes/theme19/bg.gif',
u'profile_background_image_url_https': u'https://si0.twimg.com/images/themes/theme19/bg.gif',
u'profile_background_tile': False,
u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/241527685/1363314054',
u'profile_image_url': u'http://a0.twimg.com/profile_images/378800000038841219/8a71d0776da0c48dcc4ef6fee9f78880_normal.jpeg',
u'profile_image_url_https': u'https://si0.twimg.com/profile_images/378800000038841219/8a71d0776da0c48dcc4ef6fee9f78880_normal.jpeg',
u'profile_link_color': u'000000',
u'profile_sidebar_border_color': u'FFFFFF',
u'profile_sidebar_fill_color': u'000000',
u'profile_text_color': u'000000',
u'profile_use_background_image': False,
u'protected': False,
u'screen_name': u'placeholder screen_name',
u'statuses_count': xxxx,
u'time_zone': u'placeholder time_zone',
u'url': None,
u'utc_offset': -21600,
u'verified': False}}
Now, as far as I understand, pandas' main data structure - a spreadsheet-like table - is called DataFrame. How can I load the data from my "tweets" collection into pandas' DataFrame? And how can I query for a subdocument within the database?
Convert the cursor you got from MongoDB into a list before passing it to DataFrame:
import pandas as pd
df = pd.DataFrame(list(tweets.find()))
If you have data in MongoDB like this:
[
{
"name": "Adam",
"age": 27,
"address":{
"number": 4,
"street": "Main Road",
"city": "Oxford"
}
},
{
"name": "Steve",
"age": 32,
"address":{
"number": 78,
"street": "High Street",
"city": "Cambridge"
}
}
]
You can put the data straight into a dataframe like this:
from pandas import DataFrame
df = DataFrame(list(db.collection_name.find({})))
And you will get this output:
df.head()
| | name | age | address |
|----|---------|------|-----------------------------------------------------------|
| 1 | "Adam" | 27 | {"number": 4, "street": "Main Road", "city": "Oxford"} |
| 2 | "Steve" | 32 | {"number": 78, "street": "High Street", "city": "Cambridge"} |
However the subdocuments will just appear as JSON inside the subdocument cell. If you want to flatten objects so that subdocument properties are shown as individual cells you can use json_normalize without any parameters.
from pandas.io.json import json_normalize
datapoints = list(db.collection_name.find({}))
df = json_normalize(datapoints)
df.head()
This will give the dataframe in this format:
| | name | age | address.number | address.street | address.city |
|----|--------|------|----------------|----------------|--------------|
| 1 | Adam | 27 | 4 | "Main Road" | "Oxford" |
| 2 | Steve | 32 | 78 | "High Street" | "Cambridge" |
You can load your MongoDB data into a pandas DataFrame using this code. It works for me.
import pymongo
import pandas as pd
from pymongo import Connection
connection = Connection()
db = connection.database_name
input_data = db.collection_name
data = pd.DataFrame(list(input_data.find()))
Use:
df=pd.DataFrame.from_dict(collection)
This is the simplest technique to achieve your aim.
import pymongo
import pandas as pd
from pymongo import Connection
conn = Connection()
db = conn.your_database_name
input_data = db.your_collection_name
pandas_data_frame = pd.DataFrame(list(input_data.find()))
print(pandas_data_frame)