How do I get a list of all notebooks in my workspace and store their names along with the full path in a CSV file? I have tried the Databricks CLI, but it doesn't seem to have a recursive operation.
databricks workspace list
As we can see in the code, there is no recursive option:
https://github.com/databricks/databricks-cli/blob/master/databricks_cli/workspace/cli.py (def ls_cli)
One solution is to import the CLI's SDK in Python and extend it:
from databricks_cli.sdk import ApiClient
from databricks_cli.sdk import service

host = "your_host"
token = "your_token"

client = ApiClient(host=host, token=token)
workspace = service.WorkspaceService(client)
objects = []

def list_workspace_objects(path):
    """Recursively collect every workspace object under the given path."""
    elements = workspace.list(path).get('objects')
    if elements is not None:
        for obj in elements:
            objects.append(obj)
            if obj['object_type'] == 'DIRECTORY':
                list_workspace_objects(obj['path'])

list_workspace_objects("/")
print(objects)
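Since the question asks for a CSV, here is a minimal sketch that writes the collected notebook names and paths out with the standard csv module (the output filename is an assumption):
import csv

# Each collected object carries 'path' and 'object_type' fields from the API.
with open("notebooks.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "path"])
    for obj in objects:
        if obj.get("object_type") == "NOTEBOOK":
            writer.writerow([obj["path"].rsplit("/", 1)[-1], obj["path"]])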
You can also use the code below directly. Note: this code has been tested.
import requests
import json

databricks_instance = "databricks Instance"
url_list = f"{databricks_instance}/api/2.0/workspace/list"
url_export = f"{databricks_instance}/api/2.0/workspace/export"

payload = json.dumps({
    "path": "/"
})
headers = {
    'Authorization': 'Bearer token',
    'Content-Type': 'application/json'
}
response = requests.request("GET", url_list, headers=headers, data=payload).json()
notebooks = []

# Recursively collect every notebook under the listing in `mylist`.
def list_notebooks(mylist):
    for element in mylist['objects']:
        if element['object_type'] == 'NOTEBOOK':
            notebooks.append(element)
        if element['object_type'] == 'DIRECTORY':
            payload_inner = json.dumps({
                "path": element['path']
            })
            response_inner = requests.request("GET", url_list, headers=headers, data=payload_inner).json()
            if len(response_inner) != 0:
                list_notebooks(response_inner)
    return notebooks

result = list_notebooks(response)
print(result[0])
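To finish the CSV step from the original question, the list of notebook dicts can be handed to pandas (a sketch; the output path is an assumption):
import pandas as pd

# 'path' and 'object_type' come straight from the workspace/list response.
df = pd.DataFrame(result)
df.to_csv("all_notebooks.csv", index=False)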
I am struggling to return a file, e.g. a PDF, which was uploaded to MongoDB. I am able to upload a file to the database, but I am not able to retrieve the file again. What should my endpoint (view) look like to return the file?
I am using Django REST framework v3.12.4 with djongo v1.3.6, and drf-yasg v1.20.0 for the documentation of the API. Here are my settings, models.py, serializers.py, views.py and urls.py:
# app settings.py
DATABASES = {
    'default': {
        'ENGINE': 'djongo',
        'NAME': 'TAST_DB2',
        'CLIENT': {
            'host': 'localhost',
            'port': 27017,
            'username': 'root',
            'password': 'reallychangeme',  # :-)
            'authSource': 'admin',
            'authMechanism': 'SCRAM-SHA-1'
        }
    }
}
DEFAULT_FILE_STORAGE = 'mongo_storage.storage.GridFSStorage'
GRIDFS_DATABASE = 'myfiles'
BASE_URL = 'http://localhost:8085/'
UPLOADED_FILES_USE_URL = True
# models.py
from django.conf import settings
from django.db import models
from djongo.storage import GridFSStorage

grid_fs_storage = GridFSStorage(collection='myfiles', base_url=''.join([settings.BASE_URL, 'myfiles/']))

class TestStandardFile(models.Model):
    myfile = models.FileField(upload_to='teststandards1', storage=grid_fs_storage)
# serializers.py
from rest_framework import serializers
from teststandards.models import TestStandardFile

class TestStandardFileSerializer(serializers.ModelSerializer):
    class Meta:
        model = TestStandardFile
        fields = '__all__'
# views.py
from rest_framework.generics import ListCreateAPIView
from rest_framework.parsers import MultiPartParser
from .models import TestStandardFile
from .serializers import TestStandardFileSerializer

# used for the upload
class FileView(ListCreateAPIView):
    parser_classes = (MultiPartParser,)
    serializer_class = TestStandardFileSerializer
    queryset = TestStandardFile.objects.all()

# <---------- endpoint for file retrieval missing here???
# urls.py
from django.urls import re_path
from . import api

urlpatterns = [
    re_path(r'^api/teststandards/file', api.FileView.as_view(), name='teststandard-file'),
    re_path(r'^myfiles/(?P<pk>[0-9]+)$', api.myfiles.as_view(), name='get-file'),
]
I can see my file properly uploaded to MongoDB with MongoDB Compass. There are three collections in it:
TAST_DB2.teststandardsFiles
TAST_DB2.myfiles.teststandards1.files
TAST_DB2.myfiles.teststandards1.chunks
I assume that I need an endpoint which gives the file back as a response. I tried to override the 'get' function of my 'myfiles' endpoint, but I don't know how to get a file handle for the requested file or how to return the file as an HttpResponse.
Any help is appreciated!
I finally got it to work. I created a RetrieveAPIView for the retrieval of one entry and overrode the retrieve function. This is what my views.py looks like:
from django.http import HttpResponse
from rest_framework.generics import ListCreateAPIView, RetrieveAPIView
from rest_framework.parsers import MultiPartParser
from .models import TestStandardFile
from .serializers import TestStandardFileSerializer

# for upload
class FileView(ListCreateAPIView):
    parser_classes = (MultiPartParser,)
    serializer_class = TestStandardFileSerializer
    queryset = TestStandardFile.objects.all()

# download
class myfiles(RetrieveAPIView):
    parser_classes = (MultiPartParser,)
    serializer_class = TestStandardFileSerializer
    queryset = TestStandardFile.objects.all()

    def retrieve(self, request, *args, **kwargs):
        obj = self.get_object()
        response = HttpResponse(obj.myfile, content_type='application/octet-stream')
        response['Content-Disposition'] = 'attachment; filename=%s' % obj.myfile
        return response
"myfile" is the name of the FileField from my model. With using drf_spectacular for the swagger documentation. This generated a nice download button for the file retrieval. It helped a lot during testing the upload / download functionality.
When I run this code in a Jupyter notebook it runs without any error, but when I run it in VS Code I get an error. It works with a URL and a token; I get the JSON back and export it to an Excel file.
This is the code:
import json
import requests
import pandas
from copy import deepcopy

url = "url"
headers = {"Cookie": "token"}
response = requests.get(url, headers=headers)
data = response.json()
print(data)

a = data['tickers'][0]['items'][0]['days'][0]['items']

def cross_join(left, right):
    """Merge every row on the left with every row on the right."""
    new_rows = []
    for left_row in left:
        for right_row in right:
            temp_row = deepcopy(left_row)
            for key, value in right_row.items():
                temp_row[key] = value
            new_rows.append(deepcopy(temp_row))
    return new_rows

def flatten_list(data):
    for elem in data:
        if isinstance(elem, list):
            yield from flatten_list(elem)
        else:
            yield elem

def json_to_dataframe(data_in):
    def flatten_json(data, prev_heading=''):
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '.' + key))
        elif isinstance(data, list):
            rows = []
            for i in range(len(data)):
                for elem in flatten_list(flatten_json(data[i], prev_heading)):
                    rows.append(elem)
        else:
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))

if __name__ == '__main__':
    json_data = a
    df = json_to_dataframe(json_data)
    print(df)
    df.to_excel("output.xlsx")
This is the error:
ModuleNotFoundError Traceback (most recent call last)
---> 59 df.to_excel("output.xlsx")
-> 2026 formatter.write(
--> 730 writer = ExcelWriter(stringify_path(writer), engine=engine)
---> 18 from openpyxl.workbook import Workbook
ModuleNotFoundError: No module named 'openpyxl'
What am I missing?
Run this command from a terminal in VS Code:
pip install openpyxl
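If that doesn't fix it, VS Code may be running a different Python interpreter than your Jupyter kernel, so the package lands in the wrong environment. A quick check, using only the standard library:
import sys
print(sys.executable)  # the interpreter VS Code is actually running
Install openpyxl with that same interpreter, e.g. python -m pip install openpyxl, and the import should resolve.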
I'm trying to scrape the live billionaires net-worth table here: https://www.bloomberg.com/billionaires/
This is my code so far. All I get is [] as the result in the Python shell. Something has to be wrong with the findAll; I don't think I'm using the correct tags. I also tried using just find.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv
#Open page and grab html
my_url = ('https://www.bloomberg.com/billionaires/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#HTML Parser.
page_soup = soup(page_html, 'html.parser')
table = []
#Find table.
ele_table = page_soup.findAll('div',{'class':'dvz-content'})
print(ele_table)
I'm expecting the table to be printed out so I can get it into a CSV file.
The data is dynamically loaded. You can pull it from a script tag, provided you supply the right headers: regex out the required info, parse it with the json library, and hand it off to pandas to write the CSV.
import requests, re, json
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'if-none-match': 'W/^\\^5dbb59e6-91b10^\\^',
    'if-modified-since': 'Thu, 31 Oct 2019 22:02:14 GMT'  # this may be a safeguard for caching. Consider adding dynamically.
}

# The page embeds the table data as a JavaScript assignment; capture the JSON literal.
p = re.compile(r'window.top500 = (.*);')
r = requests.get('https://www.bloomberg.com/billionaires/', headers=headers)
data = json.loads(p.findall(r.text)[0])
df = pd.DataFrame(data)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
I am trying to write a pandas dataframe as CSV to Bluemix Object Storage from a DSX Python notebook. I first save the dataframe to a 'local' CSV file, then a routine attempts to write the file to Object Storage. I get a 413 response ('object too large'), even though the file is only about 3 MB. Here's my code, based on a JSON example I found here: http://datascience.ibm.com/blog/working-with-object-storage-in-data-science-experience-python-edition/
import requests

def put_file(credentials, local_file_name):
    """This function writes file content to Object Storage V3."""
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['name'], 'domain': {'id': credentials['domain']},
            'password': credentials['password']}}}}}
    headers = {'Content-Type': 'text/csv'}
    with open(local_file_name, 'rb') as f:
        resp1 = requests.post(url=url1, data=f, headers=headers)
    return resp1
Any help or pointers are much appreciated.
This code snippet from the tutorial worked fine for me (for a 12 MB file).
from io import BytesIO
import requests
import json
import pandas as pd

def put_file(credentials, local_file_name):
    """This function writes the file content to Bluemix Object Storage V3."""
    f = open(local_file_name, 'r')
    my_data = f.read()
    # Authenticate against the Keystone V3 identity endpoint first.
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['username'], 'domain': {'id': credentials['domain_id']},
            'password': credentials['password']}}}}}
    headers1 = {'Content-Type': 'application/csv'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    resp1_body = resp1.json()
    # Find the public object-store endpoint for the dallas region in the service catalog.
    for e1 in resp1_body['token']['catalog']:
        if e1['type'] == 'object-store':
            for e2 in e1['endpoints']:
                if e2['interface'] == 'public' and e2['region'] == 'dallas':
                    url2 = ''.join([e2['url'], '/', credentials['container'], '/', local_file_name])
                    s_subject_token = resp1.headers['x-subject-token']
                    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}
                    resp2 = requests.put(url=url2, headers=headers2, data=my_data)
                    print(resp2)
I created a random pandas dataframe (with numpy imported as np) using:
import numpy as np
df = pd.DataFrame(np.random.randint(0, 100, size=(1000000, 4)), columns=list('ABCD'))
saved it to csv
df.to_csv('myPandasData_1000000.csv',index=False)
and then put it to object store
put_file(credentials_1,'myPandasData_1000000.csv')
You can get the credentials_1 object by clicking Insert to code -> Insert credentials on any object in your Object Storage.
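For reference, the put_file code above reads these keys from the credentials dict; a hypothetical placeholder version (DSX inserts the real values):
credentials_1 = {
    'username': '...',   # hypothetical placeholders only
    'domain_id': '...',
    'password': '...',
    'container': '...'
}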
How do I do HTTP PUT/POST from inside Groovy code without having to import any libraries (if at all possible)? I know there is a simple getText() method that Groovy adds to the java.net.URL class, which can be used without adding any dependencies. Is there a way to do a REST PUT in the same fashion?
You can do it with HttpURLConnection, in a similar way to how you would do it in plain Java:
def url = new URL('http://your_rest_endpoint')
def http = url.openConnection()
http.setDoOutput(true)
http.setRequestMethod('PUT')
http.setRequestProperty('User-agent', 'groovy script')
def out = new OutputStreamWriter(http.outputStream)
out.write('data')
out.close()
println http.inputStream.text // read the server response from it
If pulling in a dependency is acceptable, http-builder's RESTClient makes this concise. Note that the @Grab annotation must precede the imports it satisfies:
@Grab(group = 'org.codehaus.groovy.modules.http-builder', module = 'http-builder', version = '0.5.0')
import groovyx.net.http.RESTClient
import static groovyx.net.http.ContentType.JSON
import groovy.json.JsonSlurper
import groovy.json.JsonOutput

def url = "http://restapi3.apiary.io"
def client = new RESTClient(url)
def jsonObj = new JsonSlurper().parseText('{ "title": "Pick-up posters from Post-Office" }')
def response = client.put(path: "/notes/id",
        contentType: JSON,
        body: jsonObj,
        headers: [Accept: 'application/json'])
println("Status: " + response.status)
if (response.data) {
    println("Content Type: " + response.contentType)
    println("Headers: " + response.getAllHeaders())
    println("Body:\n" + JsonOutput.prettyPrint(JsonOutput.toJson(response.data)))
}