Write CSV to IBM Bluemix Object Storage from a DSX Python 2.7 notebook - ibm-cloud

I am trying to write a pandas dataframe as CSV to Bluemix Object Storage from a DSX Python notebook. I first save the dataframe to a 'local' CSV file. I then have a routine that attempts to write the file to Object Storage. I get a 413 response - object too large. The file is only about 3MB. Here's my code, based on a JSON example I found here: http://datascience.ibm.com/blog/working-with-object-storage-in-data-science-experience-python-edition/
import requests

def put_file(credentials, local_file_name):
    """This function writes file content to Object Storage V3."""
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['name'],
                                  'domain': {'id': credentials['domain']},
                                  'password': credentials['password']}}}}}
    headers = {'Content-Type': 'text/csv'}
    with open(local_file_name, 'rb') as f:
        resp1 = requests.post(url=url1, data=f, headers=headers)
    return resp1
Any help or pointers are much appreciated.

This code snippet from the tutorial worked fine for me (for a 12 MB file).
from io import BytesIO
import requests
import json
import pandas as pd

def put_file(credentials, local_file_name):
    """This function writes the local file's content to Bluemix Object Storage V3."""
    f = open(local_file_name, 'r')
    my_data = f.read()
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['username'],
                                  'domain': {'id': credentials['domain_id']},
                                  'password': credentials['password']}}}}}
    headers1 = {'Content-Type': 'application/csv'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    resp1_body = resp1.json()
    for e1 in resp1_body['token']['catalog']:
        if e1['type'] == 'object-store':
            for e2 in e1['endpoints']:
                if e2['interface'] == 'public' and e2['region'] == 'dallas':
                    url2 = ''.join([e2['url'], '/', credentials['container'], '/', local_file_name])
                    s_subject_token = resp1.headers['x-subject-token']
                    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}
                    resp2 = requests.put(url=url2, headers=headers2, data=my_data)
                    print resp2
I created a random pandas dataframe (this also needs numpy imported) using:
import numpy as np
df = pd.DataFrame(np.random.randint(0, 100, size=(1000000, 4)), columns=list('ABCD'))
saved it to CSV
df.to_csv('myPandasData_1000000.csv', index=False)
and then put it to Object Storage
put_file(credentials_1, 'myPandasData_1000000.csv')
You can get the credentials_1 object by clicking insert to code -> Insert credentials for any object in your object store.
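For reference, the credentials dict that put_file expects looks roughly like the sketch below. The username, password, domain_id and container keys are the ones the code above reads; the other keys and all values are illustrative placeholders, since the real dict is generated for you by "Insert to code".
# Illustrative only -- the real values come from "Insert to code" in DSX.
credentials_1 = {
    'auth_url': 'https://identity.open.softlayer.com',  # Keystone V3 identity endpoint
    'region': 'dallas',
    'username': '...',    # read as credentials['username'] above
    'password': '...',    # read as credentials['password'] above
    'domain_id': '...',   # read as credentials['domain_id'] above
    'container': '...'    # target Object Storage container
}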

Related

Open binary file data with Spark - ValueError: The truth value of a Series is ambiguous

I have the following binary file (mp3) whose audio I send to an Azure service to be transcribed.
The following code works in Databricks.
import os
import requests

url = "https://endpoint_service"
headers = {
    'Ocp-Apim-Subscription-Key': 'MyKey',
    'Content-Type': 'audio/mpeg'
}

def send_audio_transcript(url, payload, headers):
    """Send audio.mp3 to an Azure service to be transcribed to text."""
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()

full_path = <my_path>file.mp3
with open(full_path, mode='rb') as file:  # b is important -> binary
    fileContent = file.read()

send_audio_transcript(url, fileContent, headers)  # a POST request, it works
But my audio files are in sensitive storage in a Data Lake, and the only way to access them is with a Spark read.
Looking at the documentation, the way to read a binary file is:
df = spark.read.format("binaryFile").load(full_path)
display(df)
path || modificationTime || length || content
path || sometime || some_length || 2dnUwAC
First try:
content = df.content
test_service = send_audio_transcript(url, content , headers)
ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
Second try (convert Spark to pandas):
pandas_df = df.toPandas()
content = pandas_df["content"]
test_service = send_audio_transcript(url, content , headers)
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
What is the exact equivalent in Python/PySpark of:
with open(full_path, mode='rb') as file:  # b is important -> binary
    fileContent = file.read()
Your content data coming from Spark is not the same as the content data coming from opening the file.
From Spark (and later pandas) you have a pandas Series, but from opening the file you get a bytes object.
with open(full_path, mode='rb') as file:  # b is important -> binary
    fileContent = file.read()
print(type(fileContent))  # will return <class 'bytes'>
but from Spark
input_df = spark.read.format("binaryFile").load(full_path)
pandas_df = input_df.toPandas()
content = pandas_df['content']
print(type(content)) # return <class 'pandas.core.series.Series'>
In your case, to fix the problem you need to take just the first element of the Series.
content_good = content[0]
print(content_good)  # you now have your <class 'bytes'>, which is what you need
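With that in hand, the original call should work again; a minimal sketch, assuming the url and headers defined earlier in the question:
test_service = send_audio_transcript(url, content_good, headers)  # POST the raw bytes, just like with open(...).read()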

How to return a file from django API which was saved in a mongodb?

I am struggling to return a file, e.g. a PDF, which was uploaded to MongoDB.
I am able to upload a file to the database, but I am not able to retrieve the file again.
What should my endpoint (view) look like to return the file?
I am using django rest framework v3.12.4 with djongo v1.3.6. I use drf-yasg v1.20.0 for the documentation of the API.
Here are my settings, models.py, serializers.py, views.py and urls.py:
# app settings.py
DATABASES = {
    'default': {
        'ENGINE': 'djongo',
        'NAME': 'TAST_DB2',
        'CLIENT': {
            'host': 'localhost',
            'port': 27017,
            'username': 'root',
            'password': 'reallychangeme',  # :-)
            'authSource': 'admin',
            'authMechanism': 'SCRAM-SHA-1'
        }
    }
}
DEFAULT_FILE_STORAGE = 'mongo_storage.storage.GridFSStorage'
GRIDFS_DATABASE = 'myfiles'
BASE_URL = 'http://localhost:8085/'
UPLOADED_FILES_USE_URL = True
# models.py
from django.conf import settings
from django.db import models
from djongo.storage import GridFSStorage

grid_fs_storage = GridFSStorage(collection='myfiles', base_url=''.join([settings.BASE_URL, 'myfiles/']))

class TestStandardFile(models.Model):
    myfile = models.FileField(upload_to='teststandards1', storage=grid_fs_storage)
# serializers.py
from rest_framework import serializers
from teststandards.models import TestStandardFile

class TestStandardFileSerializer(serializers.ModelSerializer):
    class Meta:
        model = TestStandardFile
        fields = '__all__'
# views.py
from rest_framework.generics import ListCreateAPIView
from .models import TestStandardFile
from .serializers import TestStandardFileSerializer
from rest_framework.parsers import MultiPartParser

# used for the upload
class FileView(ListCreateAPIView):
    parser_classes = (MultiPartParser,)
    serializer_class = TestStandardFileSerializer
    queryset = TestStandardFile.objects.all()

# <---------- !ENDPOINT FOR FILE RETRIEVAL MISSING HERE???
# urls.py
urlpatterns = [
    re_path(r'^api/teststandards/file', api.FileView.as_view(), name='teststandard-file'),
    re_path(r'^myfiles/(?P<pk>[0-9]+)$', api.myfiles.as_view(), name='get-file'),
]
I can see my file properly uploaded to MongoDB with MongoDB Compass.
There are three collections in it:
TAST_DB2.teststandardsFiles
TAST_DB2.myfiles.teststandards1.files
TAST_DB2.myfiles.teststandards1.chunks
I assume that I need an endpoint which gives the file back as a response.
I tried to override the 'get' function of my 'myfiles' endpoint, but I don't know
how to get a handle on the requested file, or how to return it as an HttpResponse.
Any help is appreciated!
I finally got it to work. I created a RetrieveAPIView for retrieving a single entry and overrode the retrieve function. This is what my views.py looks like:
# views.py (continued) -- additional imports needed for the download view
from django.http import HttpResponse
from rest_framework.generics import ListCreateAPIView, RetrieveAPIView

# for upload
class FileView(ListCreateAPIView):
    parser_classes = (MultiPartParser,)
    serializer_class = TestStandardFileSerializer
    queryset = TestStandardFile.objects.all()

# download
class myfiles(RetrieveAPIView):
    parser_classes = (MultiPartParser,)
    serializer_class = TestStandardFileSerializer
    queryset = TestStandardFile.objects.all()

    def retrieve(self, request, *args, **kwargs):
        obj = self.get_object()
        response = HttpResponse(obj.myfile, content_type='application/octet-stream')
        response['Content-Disposition'] = 'attachment; filename=%s' % obj.myfile
        return response
"myfile" is the name of the FileField from my model. With using drf_spectacular for the swagger documentation. This generated a nice download button for the file retrieval. It helped a lot during testing the upload / download functionality.

Get list of all notebooks in my databricks workspace

How do I get a list of all notebooks in my workspace and store their names along with the full path in a CSV file? I have tried the Databricks CLI option, but it doesn't seem to have a recursive operation.
databricks workspace list
As we can see in the code, there is no recursive option:
https://github.com/databricks/databricks-cli/blob/master/databricks_cli/workspace/cli.py (def ls_cli)
An example solution is to import the CLI in Python and extend it:
from databricks_cli.sdk import ApiClient
from databricks_cli.sdk import service

host = "your_host"
token = "your_token"
client = ApiClient(host=host, token=token)
objects = []
workspace = service.WorkspaceService(client)

def list_workspace_objects(path):
    elements = workspace.list(path).get('objects')
    if elements is not None:
        for obj in elements:
            objects.append(obj)
            if obj['object_type'] == 'DIRECTORY':
                list_workspace_objects(obj['path'])

list_workspace_objects("/")
print(objects)
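Since the question asks for a CSV of names and full paths, here is a minimal sketch of that last step, assuming the objects list collected above and that each entry carries 'path' and 'object_type' keys as used in the code; the output filename is just an example.
import csv

# Write notebook names and full paths to a CSV file.
with open('notebooks.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['name', 'path'])
    for o in objects:
        if o['object_type'] == 'NOTEBOOK':
            writer.writerow([o['path'].rsplit('/', 1)[-1], o['path']])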
Alternatively, you can use the code below directly. Note: tested code.
from pyspark.sql.types import IntegerType
from pyspark.sql.types import *
from pyspark.sql import Row
import base64
import requests
import json

databricks_instance = "databricks Instance"
url_list = f"{databricks_instance}/api/2.0/workspace/list"
url_export = f"{databricks_instance}/api/2.0/workspace/export"
payload = json.dumps({
    "path": "/"
})
headers = {
    'Authorization': 'Bearer token',
    'Content-Type': 'application/json'
}
response = requests.request("GET", url_list, headers=headers, data=payload).json()
notebooks = []

# Recursively collect all notebooks under the given path, descending into directories.
def list_notebooks(mylist):
    for element in mylist['objects']:
        if element['object_type'] == 'NOTEBOOK':
            notebooks.append(element)
        if element['object_type'] == 'DIRECTORY':
            payload_inner = json.dumps({
                "path": element['path']
            })
            response_inner = requests.request("GET", url_list, headers=headers, data=payload_inner).json()
            if len(response_inner) != 0:
                list_notebooks(response_inner)
    return notebooks

result = list_notebooks(response)
print(result[0])

Inserting JSON into postgresql from Flask-Sql Alchemy

I want to insert JSON-type data into a PostgreSQL database from Flask,
e.g. {"a": [1, 2, 3], "b": [1, 2, 3]}
One example of such data is phone numbers and children: one person can have multiple phone numbers and children.
In the Flask view:
@app.route('/add', methods=['POST'])
def addRex():
    name = request.form['name']
    data = request.get_json(force=False, silent=False, cache=True)
    p = Projects(name=name, data=data)
    db.session.add(p)
    db.session.commit()
In the HTTP POST method:
def addData():
    name = input('Enter Name :')
    data = input('Enter Data :')
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    r = requests.post('http://localhost:5000/add',
                      data={'name': name}, json={'data': data})
    if r.status_code == 200:
        print(' Added Successfully!!')
    else:
        print('Already exists!')
How can I insert this kind of data into PostgreSQL from Flask?
Any help with my problem is appreciated.
Thanks in advance.
From the SQLAlchemy PostgreSQL dialect, you can use the JSON column type. Here is an example:
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.dialects.postgresql import JSON

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'postgres://username:password@localhost:5432/db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = True
db = SQLAlchemy(app)

class Person(db.Model):
    person_name = db.Column(db.Text, primary_key=True)
    details = db.Column(JSON)

# db.create_all()  ==> for creating the db
per_1 = Person(person_name='Batman', details={"phone_no": [5, 6, 7, 8, 9], "children": {"son": [
    "Dick Grayson", "Jason Todd", "Damian Wayne", "Tim Drake"], "daughter": ['Helena Wayne']}})
db.session.add(per_1)
db.session.commit()
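To tie this back to the question's view, a minimal sketch of a Flask route that accepts a JSON body and stores it in the JSON column could look like this; it assumes the Person model above, and the route name and payload keys are illustrative only.
from flask import request, jsonify

@app.route('/add', methods=['POST'])
def add_person():
    # Expect a JSON body like {"name": "Batman", "details": {"phone_no": [...], "children": {...}}}
    payload = request.get_json(force=True)
    p = Person(person_name=payload['name'], details=payload['details'])
    db.session.add(p)
    db.session.commit()
    return jsonify({'status': 'added'}), 200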

Webscraper not giving the right results with bs4

I'm trying to scrape the live billionaire net worth table here: https://www.bloomberg.com/billionaires/
This is my code so far. All I get is [] as the result in the Python shell.
Something has to be wrong with the "findAll"; I don't think I'm using the correct tags.
I tried using just "find".
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv
#Open page and grab html
my_url = ('https://www.bloomberg.com/billionaires/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#HTML Parser.
page_soup = soup(page_html, 'html.parser')
table = []
#Find table.
ele_table = page_soup.findAll('div',{'class':'dvz-content'})
print(ele_table)
I'm expecting the table to be printed out so I can get it into a CSV file.
The data is dynamically loaded. You can pull it from a script tag provided you supply the right headers. Regex out the required info, parse it with the json library, and hand it off to pandas to write the CSV.
from bs4 import BeautifulSoup as bs
import requests, re, json
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'if-none-match': 'W/^\\^5dbb59e6-91b10^\\^',
    'if-modified-since': 'Thu, 31 Oct 2019 22:02:14 GMT'  # this may be a safeguard for caching. Consider if added dynamically.
}
p = re.compile(r'window.top500 = (.*);')
r = requests.get('https://www.bloomberg.com/billionaires/', headers=headers)
data = json.loads(p.findall(r.text)[0])
df = pd.DataFrame(data)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)