I'm trying to scrape the live billionaires net worth table here: https://www.bloomberg.com/billionaires/
This is my code so far. All I get is [] as the result in the Python shell.
Something has to be wrong with the findAll call; I don't think I'm targeting the correct tags.
I also tried using just find.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv
#Open page and grab html
my_url = ('https://www.bloomberg.com/billionaires/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#HTML Parser.
page_soup = soup(page_html, 'html.parser')
table = []
#Find table.
ele_table = page_soup.findAll('div',{'class':'dvz-content'})
print(ele_table)
I'm expecting the table to be printed out so I can get it into a CSV file.
The data is dynamically loaded, but you can pull it from a script tag in the page source, provided you supply the right request headers. Regex out the required info, parse it with the json library, then hand it off to pandas to write the CSV.
import requests, re, json
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'if-none-match': 'W/^\\^5dbb59e6-91b10^\\^',
    'if-modified-since': 'Thu, 31 Oct 2019 22:02:14 GMT'  # this may be a caching safeguard; consider generating it dynamically
}

# The table data is embedded in the page as "window.top500 = [...];"
p = re.compile(r'window.top500 = (.*);')
r = requests.get('https://www.bloomberg.com/billionaires/', headers=headers)
data = json.loads(p.findall(r.text)[0])
df = pd.DataFrame(data)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
Example output: the parsed entries as a DataFrame, written to Data.csv.
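As a side note, if you would rather not hard-code the if-modified-since value, here is a minimal sketch of generating it dynamically. It assumes a standard RFC 1123 date string is acceptable, which is what email.utils.formatdate produces:

from email.utils import formatdate

# formatdate(usegmt=True) yields an RFC 1123 date for "now",
# e.g. 'Thu, 31 Oct 2019 22:02:14 GMT'.
headers = {
    'user-agent': 'Mozilla/5.0',
    'if-modified-since': formatdate(usegmt=True),
}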
How do I get a list of all notebooks in my workspace and store their names along with the full path in a CSV file? I have tried the Databricks CLI option, but it doesn't seem to support recursive listing:
databricks workspace list
As we can see in the code, there is no recursive option:
https://github.com/databricks/databricks-cli/blob/master/databricks_cli/workspace/cli.py (def ls_cli)
An example solution is to import the databricks-cli SDK in Python and extend it:
from databricks_cli.sdk import ApiClient
from databricks_cli.sdk import service

host = "your_host"
token = "your_token"
client = ApiClient(host=host, token=token)

objects = []
workspace = service.WorkspaceService(client)

def list_workspace_objects(path):
    elements = workspace.list(path).get('objects')
    if elements is not None:
        for obj in elements:
            objects.append(obj)
            if obj['object_type'] == 'DIRECTORY':
                list_workspace_objects(obj['path'])

list_workspace_objects("/")
print(objects)
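To get the names and full paths into a CSV file, the collected objects list can then be written out with the csv module. This is a minimal sketch appended to the snippet above; it assumes each entry carries the 'path' and 'object_type' keys that the Workspace API returns:

import csv

# Write name, full path and type of every collected object to a CSV file.
with open('workspace_objects.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'path', 'object_type'])
    for obj in objects:
        writer.writerow([obj['path'].split('/')[-1], obj['path'], obj['object_type']])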
You can use the code below directly (note: tested code).
import requests
import json

databricks_instance = "databricks Instance"
url_list = f"{databricks_instance}/api/2.0/workspace/list"
url_export = f"{databricks_instance}/api/2.0/workspace/export"

payload = json.dumps({
    "path": "/"
})
headers = {
    'Authorization': 'Bearer token',
    'Content-Type': 'application/json'
}
response = requests.request("GET", url_list, headers=headers, data=payload).json()

notebooks = []

# Recursively collect every notebook under the listing passed in.
def list_notebooks(mylist):
    for element in mylist['objects']:
        if element['object_type'] == 'NOTEBOOK':
            notebooks.append(element)
        if element['object_type'] == 'DIRECTORY':
            payload_inner = json.dumps({
                "path": element['path']
            })
            response_inner = requests.request("GET", url_list, headers=headers, data=payload_inner).json()
            if len(response_inner) != 0:
                list_notebooks(response_inner)
    return notebooks

result = list_notebooks(response)
print(result[0])
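If you then want the notebook names and full paths in a CSV file, the returned list can be flattened with pandas. This is a minimal sketch building on the snippet above; it assumes each entry carries a 'path' key, as in the /api/2.0/workspace/list response:

import pandas as pd

# Turn the list of notebook objects into a DataFrame, derive the name
# from the last path segment, and write name + full path to CSV.
df = pd.DataFrame(result)
df['name'] = df['path'].str.split('/').str[-1]
df[['name', 'path']].to_csv('notebooks.csv', index=False)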
soap.find(id='productTitle').get_text(strip=True)
Output: 'NoneType' object has no attribute 'get_text'.
There's not a lot to go on since you didn't provide much information, but from what you did post, it looks like you've written soap.find instead of soup.find.
You could try something like this to fix it:
import requests
from bs4 import BeautifulSoup

URL = "Your url"
headers = {
    "User-Agent": '(search My user agent)'}

def product_title():
    req = requests.Session()
    page = req.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    productTitle = soup.find(id='productTitle').get_text(strip=True)
    print(productTitle)

product_title()
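Note that the original error ('NoneType' object has no attribute 'get_text') means find() returned None, so it can also help to guard the lookup before calling get_text(). A minimal sketch, reusing a soup object built as in the snippet above:

# find() returns None when no element with that id exists on the returned page
# (or the request was served a different page), so check before get_text().
title_tag = soup.find(id='productTitle')
if title_tag is not None:
    print(title_tag.get_text(strip=True))
else:
    print("No element with id='productTitle' found on the page")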
I'm using Python 3 to request share statistics from the LinkedIn API for the last 14 months.
It works fine when I hardcode the epoch values in the request link:
https://api.linkedin.com/v2/organizationalEntityShareStatistics?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3AXXXXX&timeIntervals=(timeRange:(start:1596206929000,end:1632938933000),timeGranularityType:DAY)
Obviously I don't want to be changing the end values each time I make a request, so I thought I'd declare a variable:
from datetime import datetime
epochcdt = datetime.now().timestamp()
And then just use it in the link instead of the hardcoded value:
https://api.linkedin.com/v2/organizationalEntityShareStatistics?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3AXXXXX&timeIntervals=(timeRange:(start:1596206929000,end:epochcdt),timeGranularityType:DAY)
But that doesn't work:
{'message': 'Internal Server Error', 'status': 500}
Can you please help me with this hopefully easy-to-solve problem?
Whole code:
import requests
import json
from liapiauth import auth, headers
from datetime import datetime

epochcdt = (datetime.now().timestamp())*1000

def organization_info(headers):
    response = requests.get('https://api.linkedin.com/v2/organizationalEntityShareStatistics?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3AXXXXX&timeIntervals=(timeRange:(start:1596206929000,end:{epochcdt}),timeGranularityType:DAY)', headers=headers)
    organization_info = response.json()
    return organization_info

if __name__ == '__main__':
    credentials = 'credentials.json'
    access_token = auth(credentials)
    headers = headers(access_token)
    organization_info = organization_info(headers)

    with open('lishare14m.json', 'w') as outfile:
        json.dump(organization_info, outfile)

    print(organization_info)
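One thing that stands out: the URL passed to requests.get is a plain string, so the literal text {epochcdt} is sent to the API instead of the timestamp. A minimal sketch of interpolating the value with an f-string (the start value and organization URN are copied from the question):

from datetime import datetime

# Milliseconds since the epoch, which is what the timeRange fields expect.
epochcdt = int(datetime.now().timestamp() * 1000)

# Note the f prefix on the last piece: without it, "{epochcdt}" is sent verbatim.
url = (
    'https://api.linkedin.com/v2/organizationalEntityShareStatistics'
    '?q=organizationalEntity&organizationalEntity=urn%3Ali%3Aorganization%3AXXXXX'
    f'&timeIntervals=(timeRange:(start:1596206929000,end:{epochcdt}),timeGranularityType:DAY)'
)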
Can write_files be merged? I can't seem to get the merge_how syntax correct. Only files in the last write_files are being created.
Thanks
The answer is yes.
When working with multi-part MIME, you must add the following to the header of each part.
Merge-Type: list(append)+dict(no_replace,recurse_list)+str()
Below is a modified version of the helper script provided in the cloud-init documentation.
#!/usr/bin/env python3
import click
import sys
import gzip as gz
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText


@click.command()
@click.option('--gzip', is_flag=True,
              help='gzip MIME message to current directory')
@click.argument('message', nargs=-1)
def main(message: str, gzip: bool) -> None:
    """This script creates a multi-part MIME suitable for cloud-init.

    A MESSAGE has the following format <filename>:<type>, where <filename> is the
    file that contains the data to add to the MIME message and <type> is the
    MIME content-type.

    MIME content-type may be one of the following:
        text/cloud-boothook
        text/cloud-config
        text/cloud-config-archive
        text/jinja2
        text/part-handler
        text/upstart-job
        text/x-include-once-url
        text/x-include-url
        text/x-shellscript

    EXAMPLE:
        write-mime-multipart afile.cfg:cloud-config bfile.sh:x-shellscript
    """
    combined_message = MIMEMultipart()
    for message_part in message:
        (filename, format_type) = message_part.split(':', 1)
        with open(filename) as fh:
            contents = fh.read()
        sub_message = MIMEText(contents, format_type, sys.getdefaultencoding())
        sub_message.add_header('Content-Disposition',
                               'attachment; filename="{}"'.format(filename))
        # The Merge-Type header tells cloud-init to merge lists and dicts across
        # parts instead of replacing them, so write_files entries are combined.
        sub_message.add_header('Merge-Type',
                               'list(append)+dict(no_replace,recurse_list)+str()')
        combined_message.attach(sub_message)

    if gzip:
        with gz.open('combined-userdata.txt.gz', 'wb') as fd:
            fd.write(bytes(combined_message))
    else:
        print(combined_message)


if __name__ == '__main__':
    main()
I am trying to write a pandas dataframe as CSV to Bluemix Object Storage from a DSX Python notebook. I first save the dataframe to a 'local' CSV file. I then have a routine that attempts to write the file to Object Storage. I get a 413 response - object too large. The file is only about 3MB. Here's my code, based on a JSON example I found here: http://datascience.ibm.com/blog/working-with-object-storage-in-data-science-experience-python-edition/
import requests

def put_file(credentials, local_file_name):
    """This function writes file content to Object Storage V3."""
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['name'], 'domain': {'id': credentials['domain']},
            'password': credentials['password']}}}}}
    headers = {'Content-Type': 'text/csv'}
    with open(local_file_name, 'rb') as f:
        resp1 = requests.post(url=url1, data=f, headers=headers)
    return resp1
Any help or pointers would be much appreciated.
This code snippet from the tutorial worked fine for me (for a 12 MB file).
import requests
import json
import pandas as pd

def put_file(credentials, local_file_name):
    """This function writes the content of a local file
    to Bluemix Object Storage V3."""
    f = open(local_file_name, 'r')
    my_data = f.read()
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': credentials['username'], 'domain': {'id': credentials['domain_id']},
            'password': credentials['password']}}}}}
    headers1 = {'Content-Type': 'application/csv'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    resp1_body = resp1.json()
    for e1 in resp1_body['token']['catalog']:
        if e1['type'] == 'object-store':
            for e2 in e1['endpoints']:
                if e2['interface'] == 'public' and e2['region'] == 'dallas':
                    url2 = ''.join([e2['url'], '/', credentials['container'], '/', local_file_name])
                    s_subject_token = resp1.headers['x-subject-token']
                    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}
                    resp2 = requests.put(url=url2, headers=headers2, data=my_data)
                    print(resp2)
I created a random pandas dataframe using:
df = pd.DataFrame(np.random.randint(0,100,size=(1000000, 4)), columns=list('ABCD'))
saved it to CSV:
df.to_csv('myPandasData_1000000.csv', index=False)
and then put it into Object Storage:
put_file(credentials_1, 'myPandasData_1000000.csv')
You can get the credentials_1 object by clicking Insert to code -> Insert credentials for any object in your object store.
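For reference, the credentials_1 dict the function indexes into looks roughly like this; the field names are taken from how the snippet above uses them, and the values are placeholders:

# Placeholder values; the real ones come from the "Insert credentials" menu in DSX.
credentials_1 = {
    'username': 'your_username',
    'password': 'your_password',
    'domain_id': 'your_domain_id',
    'container': 'your_container_name',
}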