How to read kaggle zip file dataset in the databricks - pyspark

I want to read the zip file dataset from the kaggle but I am unable to read that dataset:
# `import urllib` alone does not load the `request` submodule in Python 3,
# so the submodule is imported explicitly.
import urllib.request

# NOTE: Kaggle blocks dataset downloads for clients that are not logged in,
# so whatever this request stores under the .zip name may not be a zip archive.
urllib.request.urlretrieve("https://www.kaggle.com/himanshupoddar/zomato-bangalore-restaurants/downloads/zomato-bangalore-restaurants.zip", "/tmp/zomato-bangalore-restaurants.zip")
Then I run a shell script to extract the file:
%sh
# Stop at the first failing command: if unzip cannot open the archive there
# is no CSV to trim or delete, and running on only produces follow-up errors.
set -e
# Extract the archive into the current working directory.
unzip /tmp/zomato-bangalore-restaurants.zip
# Copy everything except the first (header) line into temp.csv.
tail -n +2 zomato-bangalore-restaurants.csv > temp.csv
# The full CSV is no longer needed once temp.csv exists.
rm zomato-bangalore-restaurants.csv
Then I got an error:
Archive: /tmp/zomato-bangalore-restaurants.zip
End-of-central-directory signature not found. Either this file is not
a zipfile, or it constitutes one disk of a multi-part archive. In the
latter case the central directory and zipfile comment will be found on
the last disk(s) of this archive.
unzip: cannot find zipfile directory in one of /tmp/zomato-bangalore-restaurants.zip or
/tmp/zomato-bangalore-restaurants.zip.zip, and cannot find /tmp/zomato-bangalore-restaurants.zip.ZIP, period.
tail: cannot open 'zomato-bangalore-restaurants.csv' for reading: No such file or directory
rm: cannot remove 'zomato-bangalore-restaurants.csv': No such file or directory

Note: Attempt to download a file from Kaggle is blocked because you are not logged in yet.
Here is the script to download all the competition data sets.
from requests import get, post
from os import mkdir, remove
from os.path import exists
from shutil import rmtree
import zipfile
def purge_all_downloads(db_full_path):
    """Remove the download directory and everything stored beneath it.

    Does nothing when ``db_full_path`` does not exist.
    """
    if exists(db_full_path):
        rmtree(db_full_path)
def datasets_are_available_locally(db_full_path, datasets):
    """Return True only if every dataset already exists as a local CSV file.

    ``db_full_path`` is used as a plain string prefix, so it must end with
    a path separator. Each dataset name is expected to map to a file named
    ``<name>.csv`` directly under that prefix.
    """
    if not exists(db_full_path):
        return False
    return all(exists(db_full_path + df + '.csv') for df in datasets)
def remove_zip_files(db_full_path, datasets):
    """Delete the ``<name>.csv.zip`` archive of every dataset.

    ``db_full_path`` is used as a plain string prefix (it must end with a
    path separator). A missing archive raises the error from ``os.remove``.
    """
    for df in datasets:
        remove(db_full_path + df + '.csv.zip')
def unzip(db_full_path, datasets):
    """Extract every ``<name>.csv.zip`` archive, then delete the archives.

    All archives are extracted into ``db_full_path`` before any of them is
    removed, so a corrupt archive aborts the run with every download still
    on disk.
    """
    for df in datasets:
        with zipfile.ZipFile(db_full_path + df + '.csv.zip', 'r') as zf:
            zf.extractall(db_full_path)
    remove_zip_files(db_full_path, datasets)
def download_datasets(competition, db_full_path, datasets, username, password):
    """Download and extract the competition datasets unless already present.

    Parameters:
        competition: competition name as it appears in the Kaggle URL.
        db_full_path: target directory, ending with a path separator; it is
            wiped and recreated before downloading.
        datasets: dataset names without the ``.csv`` extension.
        username, password: Kaggle credentials posted to the login URL.

    Raises:
        requests.HTTPError: when a download request returns an error status.
    """
    if datasets_are_available_locally(db_full_path, datasets):
        print('All the competition datasets have been downloaded, extraced and are ready for you !')
        return

    # Start from a clean directory so partial downloads never linger.
    purge_all_downloads(db_full_path)
    mkdir(db_full_path)
    kaggle_info = {'UserName': username, 'Password': password}
    for df in datasets:
        url = (
            'https://www.kaggle.com/account/login?ReturnUrl=' +
            '/c/' + competition + '/download/' + df + '.csv.zip'
        )
        request = post(url, data=kaggle_info, stream=True)
        # Fail here rather than saving an error page under a .zip name.
        request.raise_for_status()
        # The archive is binary data, so the file must be opened in 'wb' mode.
        with open(db_full_path + df + '.csv.zip', 'wb') as f:
            for chunk in request.iter_content(chunk_size=512 * 1024):
                if chunk:
                    f.write(chunk)
    # extract competition data
    unzip(db_full_path, datasets)
    print('done !')
For more details, refer "Download the competition data sets directly".
Hope this helps.

Related

Is there a tool that shows a distribution of lines of code per file of a folder?

I want to know how big files are within my repository in terms of lines of code, to see the 'health' of a repository.
In order to answer this, I would like to see a distribution (visualised or not) of the number of files for a specific range (can be 1):
#lines of code #files
1-10 1
11-20 23
etc...
(A histogram of this would be nice)
Is there a quick way to get this, for example with cloc or any other (command line) tool?
A combination of cloc and Pandas can handle this. First, capture the line counts with cloc to a csv file using --by-file and --csv switches, for example
cloc --by-file --csv --out data.csv curl-7.80.0.tar.bz2
then use the Python program below to aggregate and bin the data by folders:
./aggregate_by_folder.py data.csv
The code for aggregate_by_folder.py is
#!/usr/bin/env python
import sys
import os.path
import pandas as pd
def add_folder(df):
    """
    Return a Pandas dataframe with an additional 'folder' column
    containing each file's parent directory.

    Columns whose name contains the cloc banner text are dropped first.
    Rows without a filename keep a missing 'folder' value.
    """
    header = 'github.com/AlDanial/cloc'
    # Literal match: treated as a regex, the dots would match any character.
    banner_columns = df.columns[df.columns.str.contains(header, regex=False, na=False)]
    df = df.drop(banner_columns, axis=1)
    df['folder'] = df['filename'].dropna().apply(os.path.dirname)
    return df
def bin_by_folder(df, bin_width=50):
    """
    Return, per folder, how many files fall into each line-count bin.

    Bins are ``bin_width`` lines wide and start at 0. The default edges run
    up to 950 and are extended as far as needed to cover the largest file
    that belongs to a folder, so large files are counted and not dropped.
    """
    edges = list(range(0, 1000, bin_width))
    # Only rows with a folder take part in the groupby; ignoring the others
    # keeps a folder-less row from stretching the bins.
    largest = df.loc[df['folder'].notna(), 'code'].max()
    # A NaN maximum (no usable rows) compares False and leaves edges as is.
    while edges[-1] < largest:
        edges.append(edges[-1] + bin_width)
    return df.groupby('folder')['code'].value_counts(bins=edges).sort_index()
def file_count_by_folder(df):
    """
    Return a one-column dataframe ('file count') indexed by folder.

    The count is the number of rows per folder with a non-missing 'blank'
    value; only that column is aggregated.
    """
    per_folder = df.groupby('folder')['blank'].count()
    return per_folder.to_frame('file count')
def main():
    """
    Print per-folder line totals, file counts and a line-count histogram
    for the cloc CSV file named on the command line.

    Exits with status 1 after printing usage when the argument is missing.
    """
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} data.csv")
        print(" where the .csv file is created with")
        print(" cloc --by-file --csv --out data.csv my_code_base")
        # A usage error must not look like success to the calling shell.
        raise SystemExit(1)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', None)
    # None means "no limit"; the old -1 spelling is rejected by current pandas.
    pd.set_option('display.max_colwidth', None)
    df = add_folder(pd.read_csv(sys.argv[1]))
    # Sum only the numeric counters; text columns have no meaningful sum.
    print(pd.pivot_table(df, index=['folder'],
                         values=['blank', 'comment', 'code'], aggfunc='sum'))
    print('-' * 50)
    print(file_count_by_folder(df))
    print('-' * 50)
    print(bin_by_folder(df))


if __name__ == "__main__":
    main()

How to efficiently rename many files or partially select the names of these files during import?

How can I efficiently rename the files to the number contained in their name (see picture)? I did not succeed with Windows PowerToys, and I don't want to click each file and rename it to its number (e.g. 290) by hand.
or how to read the files in this order and define a name? If I try it with a script (see below) the following output occurs:
[![ValueError: invalid literal for int() with base 10: '211001_164357_P_Scripted_Powermeasurement_Wavelength_automatic_Powermeter1_0'][1]][1]
or how to select only the numbers (290 to 230 - see picture) within the name when reading?
Script:
#import libraries
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
data_location = r'C:\Users\...\Characterization_OPO\Data'
data_folder = Path(data_location)
data = {}
allist = list(data_folder.glob('*'))
for file in allist:
    # Path.name yields the file name on every platform; splitting the string
    # on '\\' only works for Windows-style paths.
    file_name = Path(file).name
    # NOTE: int() only succeeds when the text before the first '.' consists
    # of digits (e.g. '290.csv'); any longer name raises ValueError.
    wavelength = int(file_name.split('.')[0])
    # skipfooter is not supported by the default C parser, so the Python
    # engine is requested explicitly.
    tmp = pd.read_csv(file, skiprows=20, skipfooter=59, index_col="PixelNo",
                      engine='python')
    data[f'{wavelength} nm'] = tmp
    #data.plot(x='Wavelength',y='CCD_1', label=f"{wavelength} nm")
Picture:
I removed all the words with Windows PowerRename and then took the last three digits:
for file in allist:
    # Path.name yields the file name on every platform; splitting the string
    # on '\\' only works for Windows-style paths.
    file_name = Path(file).name
    # After renaming, the text before the first '.' is purely numeric.
    wavelength = int(file_name.split('.')[0])
    # skipfooter is not supported by the default C parser, so the Python
    # engine is requested explicitly.
    tmp = pd.read_csv(file, skiprows=26, skipfooter=5, engine='python')
    # Only the last three digits of the number are used as the wavelength.
    data[f'{wavelength % 1000} nm'] = tmp
    #data.plot(x='Wavelength',y='CCD_1', label=f"{wavelength} nm")

reading a s3 file with s3a format using pyspark - High

#
# Some constants
#
aws_profile = "your_profile"
aws_region = "your_region"
s3_bucket = "your_bucket"

#
# Reading the credentials of the chosen profile from the aws credential file
#
import os
import configparser

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")

#
# Configuring pyspark
#
# see https://github.com/jupyter/docker-stacks/issues/127#issuecomment-214594895
# and https://github.com/radanalyticsio/pyspark-s3-notebook/blob/master/s3-source-example.ipynb
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"
# If this doesn't work you might have to delete your ~/.ivy2 directory to reset your package cache.
# (see https://github.com/databricks/spark-redshift/issues/244#issuecomment-239950148)
import pyspark

# see https://github.com/databricks/spark-redshift/issues/298#issuecomment-271834485
# Java system properties have to be set before the SparkContext is created,
# so this call comes first.
pyspark.SparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
sc = pyspark.SparkContext()

# see https://stackoverflow.com/questions/28844631/how-to-set-hadoop-configuration-values-from-pyspark
hadoop_conf = sc._jsc.hadoopConfiguration()
# see https://stackoverflow.com/questions/43454117/how-do-you-use-s3a-with-spark-2-1-0-on-aws-us-east-2
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.access.key", access_id)
hadoop_conf.set("fs.s3a.secret.key", access_key)
# see http://blog.encomiabile.it/2015/10/29/apache-spark-amazon-s3-and-apache-mesos/
hadoop_conf.set("fs.s3a.connection.maximum", "100000")
# see https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
hadoop_conf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")

#
# Downloading the parquet file
#
sql = pyspark.sql.SparkSession(sc)
# NOTE(review): bucket and path are joined without a separator, so
# "your_path" presumably has to start with "/" — confirm.
path = s3_bucket + "your_path"
# Everything above configures the s3a connector (fs.s3a.*), so the URL must
# use the s3a:// scheme for those settings to apply.
dataS3 = sql.read.parquet("s3a://" + path)
I have even tried writing a file, thinking that my directory path was not correct: if the write had succeeded, I could have pinpointed where the path is actually pointing. But there is still no progress, and it says that no such path exists.
If you please could guide us in this regard, it would really be helpful. Thanks in advance.

Trying to install a corpus for countVectorizer in sklearn package

I am trying to load a corpus from my local drive into python at one time with a for loop and then read each text file and save it for analysis with countVectorizer. But, I am only getting the last file. How do I get the results from all of the files to be stored for analysis with countVectorizer?
This code brings out the text from last file in folder.
# Directory that is searched for the corpus *.txt files.
folder_path = "folder"
#import and read all files in animal_corpus
# NOTE(review): `txt` is reassigned for every file, so once the loop has
# finished only the text of the last file read is still held.
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
with open(filename, 'r') as f:
txt = f.read()
print(txt)
# MyList is built from the single `txt` value, so the vectorizer below is
# fitted on one document only.
MyList= [txt]
## Create a CountVectorizer object that you can use
MyCV1 = CountVectorizer()
## Call your MyCV1 on the data
DTM1 = MyCV1.fit_transform(MyList)
## get col names
ColNames=MyCV1.get_feature_names()
print(ColNames)
## convert DTM to DF
MyDF1 = pd.DataFrame(DTM1.toarray(), columns=ColNames)
print(MyDF1)
This code works, but would not work for a huge corpus that I am preparing it for.
#import and read text files
f1 = open("folder/animal_1.txt",'r')
f1r = f1.read()
f2 = open("/folder/animal_2.txt",'r')
f2r = f2.read()
f3 = open("/folder/animal_3.txt",'r')
f3r = f3.read()
#reassemble corpus in python
MyCorpus=[f1r, f2r, f3r]
## Create a CountVectorizer object that you can use
MyCV1 = CountVectorizer()
## Call your MyCV1 on the data
DTM1 = MyCV1.fit_transform(MyCorpus)
## get col names
ColNames=MyCV1.get_feature_names()
print(ColNames)
## convert DTM to DF
MyDF2 = pd.DataFrame(DTM1.toarray(), columns=ColNames)
print(MyDF2)
I figured it out. Just gotta keep grinding.
MyCorpus = []
#import and read all files in animal_corpus
# glob returns files in arbitrary order; sorting keeps the document order
# (and therefore the row order of the resulting matrix) the same on every run.
for filename in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
    with open(filename, 'r') as f:
        txt = f.read()
    # Appending inside the loop keeps every file, not only the last one.
    MyCorpus.append(txt)

Is it possible to copy all files from one S3 bucket to another with s3cmd?

I'm pretty happy with s3cmd, but there is one issue: How to copy all files from one S3 bucket to another? Is it even possible?
EDIT: I've found a way to copy files between buckets using Python with boto:
from boto.s3.connection import S3Connection
def copyBucket(srcBucketName, dstBucketName, maxKeys = 100):
    """Copy every key of one bucket into another, ``maxKeys`` keys per page.

    Each key keeps its name in the destination bucket. Progress and the
    duration of every copy are printed. Credentials come from the
    module-level ``awsAccessKey`` / ``awsSecretKey`` names.
    """
    # Local import: this snippet uses the timer without importing the module.
    import time

    conn = S3Connection(awsAccessKey, awsSecretKey)
    srcBucket = conn.get_bucket(srcBucketName)
    dstBucket = conn.get_bucket(dstBucketName)

    resultMarker = ''
    while True:
        keys = srcBucket.get_all_keys(max_keys=maxKeys, marker=resultMarker)

        for k in keys:
            print('Copying ' + k.key + ' from ' + srcBucketName + ' to ' + dstBucketName)
            # time.clock() no longer exists in current Python; wall-clock
            # time is also the relevant measure for a network copy.
            t0 = time.time()
            dstBucket.copy_key(k.key, srcBucketName, k.key)
            print(str(time.time() - t0) + '  seconds')

        # A short (or empty) page means the listing is exhausted.
        if len(keys) < maxKeys:
            print('Done')
            break

        # Continue the listing after the last key of this page.
        resultMarker = keys[-1].key
Syncing is almost as straight forward as copying. There are fields for ETag, size, and last-modified available for keys.
Maybe this helps others as well.
s3cmd sync s3://from/this/bucket/ s3://to/this/bucket/
For available options, please use:
$s3cmd --help
AWS CLI seems to do the job perfectly, and has the bonus of being an officially supported tool.
aws s3 sync s3://mybucket s3://backup-mybucket
http://docs.aws.amazon.com/cli/latest/reference/s3/sync.html
The answer with the most upvotes as I write this is this one:
s3cmd sync s3://from/this/bucket s3://to/this/bucket
It's a useful answer. But sometimes sync is not what you need (it deletes files, etc.). It took me a long time to figure out this non-scripting alternative to simply copy multiple files between buckets. (OK, in the case shown below it's not between buckets. It's between not-really-folders, but it works between buckets equally well.)
# Slightly verbose, slightly unintuitive, very useful:
s3cmd cp --recursive --exclude=* --include=file_prefix* s3://semarchy-inc/source1/ s3://semarchy-inc/target/
Explanation of the above command:
--recursive: In my mind, my requirement is not recursive. I simply want multiple files. But recursive in this context just tells s3cmd cp to handle multiple files. Great.
--exclude: It's an odd way to think of the problem. Begin by recursively selecting all files. Next, exclude all files. Wait, what?
--include: Now we're talking. Indicate the file prefix (or suffix or whatever pattern) that you want to include.
s3://sourceBucket/ s3://targetBucket/: This part is intuitive enough. Though technically it seems to violate the documented example from s3cmd help, which indicates that a source object must be specified: s3cmd cp s3://BUCKET1/OBJECT1 s3://BUCKET2[/OBJECT2]
You can also use the web interface to do so:
Go to the source bucket in the web interface.
Mark the files you want to copy (use shift and mouse clicks to mark several).
Press Actions->Copy.
Go to the destination bucket.
Press Actions->Paste.
That's it.
I needed to copy a very large bucket so I adapted the code in the question into a multi threaded version and put it up on GitHub.
https://github.com/paultuckey/s3-bucket-to-bucket-copy-py
It's actually possible. This worked for me:
from boto.s3.bucket import Bucket
from boto.s3.connection import S3Connection

AWS_ACCESS_KEY = 'Your access key'
AWS_SECRET_KEY = 'Your secret key'
SRC_BUCKET_NAME = 'Your source bucket'
DEST_BUCKET_NAME = 'Your destination bucket'

conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)
bucket = Bucket(conn, SRC_BUCKET_NAME)

for item in bucket:
    # Note: here you can put also a path inside the DEST_BUCKET_NAME,
    # if you want your item to be stored inside a folder, like this:
    # item.copy(DEST_BUCKET_NAME, '%s/%s' % (folder_name, item.key))
    # The copy is issued on the key itself, which takes the destination
    # bucket name and the destination key name.
    item.copy(DEST_BUCKET_NAME, item.key)
Thanks - I use a slightly modified version, where I only copy files that don't exist or are a different size, and check on the destination if the key exists in the source. I found this a bit quicker for readying the test environment:
def botoSyncPath(path):
    """
    Sync keys in specified path from source bucket to target bucket.

    Keys that are missing from the destination or differ in size are
    copied; destination keys with no counterpart in the source are deleted.
    Returns True on success and False when any step fails (the failure is
    logged with its traceback).
    """
    import logging

    try:
        conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        srcBucket = conn.get_bucket(AWS_SRC_BUCKET)
        destBucket = conn.get_bucket(AWS_DEST_BUCKET)
        for key in srcBucket.list(path):
            destKey = destBucket.get_key(key.name)
            if not destKey or destKey.size != key.size:
                key.copy(AWS_DEST_BUCKET, key.name)
        for key in destBucket.list(path):
            srcKey = srcBucket.get_key(key.name)
            if not srcKey:
                key.delete()
    except Exception:
        # Keep the boolean contract, but do not hide the reason, and let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not).
        logging.getLogger(__name__).exception("S3 sync of path %r failed", path)
        return False
    return True
I wrote a script that backs up an S3 bucket: https://github.com/roseperrone/aws-backup-rake-task
#!/usr/bin/env python
from boto.s3.connection import S3Connection
import re
import datetime
import sys
import time
def main():
    """Prune old backup buckets, then back up the source bucket.

    Expects four command-line arguments: S3 access id, S3 secret key,
    source bucket name and the number of backup buckets to keep.
    Exits with a usage message when they are missing or malformed.
    """
    if len(sys.argv) != 5:
        raise SystemExit(
            'Usage: ' + sys.argv[0] + ' S3_ID S3_KEY SRC_BUCKET NUM_BACKUP_BUCKETS')
    s3_ID = sys.argv[1]
    s3_key = sys.argv[2]
    src_bucket_name = sys.argv[3]
    # Validate the count before anything touches S3.
    try:
        num_backup_buckets = int(sys.argv[4])
    except ValueError:
        raise SystemExit('NUM_BACKUP_BUCKETS must be an integer, got ' + repr(sys.argv[4]))
    connection = S3Connection(s3_ID, s3_key)
    delete_oldest_backup_buckets(connection, num_backup_buckets)
    backup(connection, src_bucket_name)
def delete_oldest_backup_buckets(connection, num_backup_buckets):
    """Deletes the oldest backup buckets such that only the newest NUM_BACKUP_BUCKETS - 1 buckets remain.

    A backup bucket is one whose name starts with ``backup-YYYY-MM-DD``.
    ``num_backup_buckets`` may be an int or a numeric string. Each bucket
    is emptied before it is deleted.
    """
    # match() anchors at the start of the name, so the captured date is
    # exactly the text that gets parsed.
    name_pattern = re.compile(r'backup-(\d{4}-\d{2}-\d{2})')
    dated_backups = []
    for bucket in connection.get_all_buckets():
        found = name_pattern.match(bucket.name)
        if not found:
            continue
        try:
            backup_date = datetime.datetime.strptime(found.group(1), '%Y-%m-%d').date()
        except ValueError:
            # Right shape but not a real calendar date: not one of ours.
            continue
        dated_backups.append((backup_date, bucket))
    # Earliest first, so the buckets to delete sit at the front of the list.
    dated_backups.sort(key=lambda pair: pair[0])
    delete = len(dated_backups) - (int(num_backup_buckets) - 1)
    if delete <= 0:
        return
    for _, bucket in dated_backups[:delete]:
        print('Deleting the backup bucket, ' + bucket.name)
        # S3 refuses to delete a bucket that still contains keys.
        for key in bucket.list():
            key.delete()
        connection.delete_bucket(bucket.name)
def backup(connection, src_bucket_name):
    """Create today's ``backup-YYYY-MM-DD`` bucket and copy the source into it."""
    now = datetime.datetime.now()
    # strftime zero-fills month AND day, so the name always matches the
    # backup-YYYY-MM-DD pattern used when old backups are pruned.
    new_backup_bucket_name = now.strftime('backup-%Y-%m-%d')
    print("Creating new bucket " + new_backup_bucket_name)
    connection.create_bucket(new_backup_bucket_name)
    copy_bucket(src_bucket_name, new_backup_bucket_name, connection)
def copy_bucket(src_bucket_name, dst_bucket_name, connection, maximum_keys = 100):
    """Copy every key of the source bucket into the destination bucket.

    Keys are listed ``maximum_keys`` at a time and keep their names in the
    destination. Progress and the duration of each copy are printed.
    """
    src_bucket = connection.get_bucket(src_bucket_name)
    dst_bucket = connection.get_bucket(dst_bucket_name)

    result_marker = ''
    while True:
        keys = src_bucket.get_all_keys(max_keys=maximum_keys, marker=result_marker)

        for k in keys:
            print('Copying ' + k.key + ' from ' + src_bucket_name + ' to ' + dst_bucket_name)
            # time.clock() no longer exists in current Python; wall-clock
            # time is also the relevant measure for a network copy.
            t0 = time.time()
            dst_bucket.copy_key(k.key, src_bucket_name, k.key)
            print(str(time.time() - t0) + '  seconds')

        # A short (or empty) page means the listing is exhausted.
        if len(keys) < maximum_keys:
            print('Done backing up.')
            break

        # Continue the listing after the last key of this page.
        result_marker = keys[-1].key
if __name__ =='__main__':main()
I use this in a rake task (for a Rails app):
desc "Back up a file onto S3"
task :backup do
  # Locals, not constants: constants assigned inside a task body trigger
  # "already initialized constant" warnings when the task runs again.
  s3_id = "*****"
  s3_key = "*****"
  src_bucket = "primary-mzgd"
  num_backup_buckets = 2

  # The block form restores the previous working directory afterwards.
  Dir.chdir("#{Rails.root}/lib/tasks") do
    # Passing the arguments separately bypasses the shell, so characters in
    # the credentials can never be interpreted as shell syntax.
    ok = system("./do_backup.py", s3_id, s3_key, src_bucket, num_backup_buckets.to_s)
    abort "do_backup.py failed" unless ok
  end
end
mdahlman's code didn't work for me but this command copies all the files in the bucket1 to a new folder (command also creates this new folder) in bucket 2.
s3cmd cp --recursive --include=file_prefix* s3://bucket1/ s3://bucket2/new_folder_name/
s3cmd won't cp with only prefixes or wildcards but you can script the behavior with 's3cmd ls sourceBucket', and awk to extract the object name. Then use 's3cmd cp sourceBucket/name destBucket' to copy each object name in the list.
I use these batch files in a DOS box on Windows:
s3list.bat
rem Lists %1 with s3cmd and pipes the listing through gawk, which prints, for
rem every line matching /s3/, the text from "s3://" onwards wrapped in quotes.
rem NOTE(review): the escaped-quote sequence below looks mangled - verify it.
s3cmd ls %1 | gawk "/s3/{ print \"\\"\"\"substr($0,index($0,\"s3://\"))\"\\"\"\"; }"
s3copy.bat
rem Copy every object listed by s3list for %1 into %2.
rem The leading @ keeps each command from being echoed; "#" is not valid here.
@for /F "delims=" %%s in ('s3list %1') do @s3cmd cp %%s %2
You can also use s3funnel which uses multi-threading:
https://github.com/neelakanta/s3funnel
example (without the access key or secret key parameters shown):
s3funnel source-bucket-name list | s3funnel dest-bucket-name copy --source-bucket source-bucket-name --threads=10