from pymongo import MongoClient
from bson.objectid import ObjectId
import numpy as np
import gridfs
import os, os.path

i = 0
try:
    for file in os.listdir("/Users/sarthakgupta/Desktop/sae/Images"):
        if file.endswith(".png") or file.endswith(".jpg"):
            filename = "/Users/sarthakgupta/Desktop/sae/Images/" + file
            datafile = open(filename, "rb")
            thedata = datafile.read()
            datafile.close()
            c = MongoClient()
            i = i + 1
            db = c.trial5
            fs = gridfs.GridFS(db)
            t = "class" + str(i)
            stored = fs.put(thedata, filename=t)  # store under the "classN" name
except IOError:
    print("Image file %s not found" % filename)
    raise SystemExit
I stored images in MongoDB with the code above. Now I want to retrieve those images from the database by filename and store the image data (or pixels) for each filename in an array or list. For example, if there are 2 images with the filename "class1", they should end up in one array.
Create your fs variable like before, and:
data = fs.get_last_version(filename).read()
You could also query for a list of files like:
from bson import Regex
for f in fs.find({'filename': Regex(r'.*\.(png|jpg)')}):
    data = f.read()
Also, a comment about your code: it's very slow to recreate the MongoClient and GridFS instances for every iteration of your loop. Create them once before you start looping, and reuse them.
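To get the grouping you describe (all images that share a filename in one list), here is a minimal sketch, assuming the files were stored under names like "class1" as in the code above; decoding the bytes into pixel arrays would additionally need a library such as Pillow, which is not shown here:
from collections import defaultdict
from pymongo import MongoClient
import gridfs

client = MongoClient()        # create the client and GridFS once, outside any loop
db = client.trial5
fs = gridfs.GridFS(db)

# Group the raw image bytes by filename, so every version stored as
# "class1" ends up in the same list.
images_by_name = defaultdict(list)
for f in fs.find():
    images_by_name[f.filename].append(f.read())

class1_images = images_by_name["class1"]   # list of bytes objects for "class1"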
I have a bunch of CSV files in a mounted blob container and I need to calculate the 'SHA1' hash values for every file to store as inventory. I'm very new to Azure cloud and pyspark so I'm not sure how this can be achieved efficiently. I have written the following code in Python Pandas and I'm trying to use this in pyspark. It seems to work however it takes quite a while to run as there are thousands of CSV files. I understand that things work differently in pyspark, so can someone please guide if my approach is correct, or if there is a better piece of code I can use to accomplish this task?
import os
import subprocess
import hashlib
import pandas as pd
class File:
    def __init__(self, path):
        self.path = path

    def get_hash(self):
        hash = hashlib.sha1()
        with open(self.path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash.update(chunk)
        self.sha1hash = hash.hexdigest()
        return self.sha1hash
path = '/dbfs/mnt/data/My_Folder'  # Path to CSV files
cnt = 0
rlist = []
for root, subdirs, files in os.walk(path):
    for fi in files:
        if cnt < 10:  # check only 10 files for now as it takes ages!
            f = File(os.path.join(root, fi))
            cnt += 1
            hash_value = f.get_hash()
            results = {'File_Name': fi, 'File_Path': f.path, 'SHA1_Hash_Value': hash_value}
            rlist.append(results)
            print(fi)

print(str(cnt) + ' files processed')
df = pd.DataFrame(rlist)
#df.to_csv('/dbfs/mnt/workspace/Inventory/File_Hashes.csv', mode='a', header=False) #not sure how to write files in pyspark!
display(df)
Thanks
Since you want to treat the files as blobs and not read them into a table, I would recommend using spark.sparkContext.binaryFiles. This lands you an RDD of pairs where the key is the file path and the value is the file's contents as bytes, and you can calculate the hash in a map function (e.g. rdd.mapValues(sha1_of_bytes)).
For more information, refer to the documentation: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.SparkContext.binaryFiles.html#pyspark.SparkContext.binaryFiles
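A minimal sketch of that approach (the paths and column names here are placeholders rather than values from the question, and spark is assumed to be the active SparkSession, as on Databricks; binaryFiles hands you each file's full contents as bytes, so hashlib can hash it directly):
import hashlib

# key = file path, value = the file's contents as bytes
rdd = spark.sparkContext.binaryFiles('dbfs:/mnt/data/My_Folder/*.csv')

def sha1_of_bytes(content):
    return hashlib.sha1(content).hexdigest()

hashes = rdd.mapValues(sha1_of_bytes)      # (path, sha1) pairs

# Collect into a Spark DataFrame so it can be written out as the inventory
df = spark.createDataFrame(hashes, ['File_Path', 'SHA1_Hash_Value'])
df.write.mode('append').csv('dbfs:/mnt/workspace/Inventory/File_Hashes')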
I would like to write Python code that fetches the average volume of a stock from a given link using BeautifulSoup.
What I have done so far:
import bs4
import requests
from bs4 import BeautifulSoup
r=requests.get('https://finance.yahoo.com/quote/M/key-statistics?p=M')
soup=BeautifulSoup(r.content,"html.parser")
# p = soup.find_all(class_="Fw(500) Ta(end) Pstart(10px) Miw(60px)")[1].get_text
# p = soup.find_all('td')[2].get_text
# p = soup.find_all('table', class_='W(100%) Bdcl(c)')[70].tr.get_text
Anyway, I was able to get that number directly from the browser console using this command:
document.querySelectorAll('table tbody tr td')[71].innerText
"21.07M"
Please help with a basic explanation; I only know a little about the DOM.
You can use this logic to make it easier:
Use find_all to find all the spans in the html file
Search the spans for the correct label (Avg Vol...)
Use parent to go up the hierarchy to the full table row
Use find_all again from the parent to get the last cell which contains the value
Here is the updated code:
import bs4
import requests
from bs4 import BeautifulSoup
r=requests.get('https://finance.yahoo.com/quote/M/key-statistics?p=M')
soup=BeautifulSoup(r.content,"html.parser")
p = soup.find_all('span')
for s in p:                                # each span
    if s.text == 'Avg Vol (10 day)':       # starting cell
        pnt = s.parent.parent              # up 2 levels, to the table row
        print(pnt.find_all('td')[-1].text) # last table cell
Output
21.76M
I am trying to create multiple collections in MongoDB using the pymongo module as below, but it has been failing.
from pymongo import MongoClient

client = MongoClient('localhost:27017')
mydb = client.tstdata                   # connect to / create a db in mongodb
list1 = ["tst-1", "tst-2", "tst-3"]     # list of collections to be created
for each_val in list1:
    print(each_val)
    col = mydb.each_val                 # create collection
    col.insert_one({"name": "test"})
Instead of creating multiple collections from the values of the each_val variable, it just creates a single collection named "each_val". How can I fix this?
Use the syntax:
col = mydb[each_val]
The drivers do their best to (sort of) match the mongo shell, so db.x.insert_one will insert into a collection literally named x rather than into whatever collection a variable named x refers to, as you have seen.
According to your code, you are creating a collection literally named each_val.
You have to reference the collection name with square brackets in pymongo: mydb[each_val].
from pymongo import MongoClient
client = MongoClient('localhost:27017')
mydb = client.tstdata # connect/create a db in mongodb
list1 = ["tst-1","tst-2","tst-3"] # list of collections to be created
for each_val in list1:
    print(each_val)
    col = mydb[each_val]                # create collection
    col.insert_one({"name": "test"})
I'm trying to save a data frame to a CSV file using the following code: df.repartition(1).write.csv('path', sep=','). Besides the CSV file, other files are generated, as in the following snippet.
How do I save the df to a CSV file without generating those CRC files? In case that is not possible, how can I make pandas read only the CSV files out of all the other files, taking into consideration that there is a file with the format csv.crc?
For Pandas reading only the csv files you can do:
import pandas as pd
import os
from os import listdir
#you can change the suffix, csv will be the default
def find_csv_filenames(path_to_dir, suffix=".csv"):
    filenames = listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

your_dir = '/your_path_here/complete_route'
csv_files = find_csv_filenames(your_dir)
for filename in csv_files:
    print(pd.read_csv(your_dir + "/" + filename))
If you want to read all files in the same dataframe:
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job
dfs = [pd.read_csv(your_dir + "/" + filename) for filename in csv_files]
df = pd.concat(dfs, ignore_index=True)
I want to read a .mat file available at http://www.eigenvector.com/data/tablets/index.html. To access the data inside this file, I am trying the following:
import scipy.io as spio
import numpy as np
import matplotlib.pyplot as plt
mat = spio.loadmat('nir_shootout_2002.mat')
# http://pyhogs.github.io/reading-mat-files.html
def print_mat_nested(d, indent=0, nkeys=0):
    if nkeys > 0:
        d = {k: d[k] for k in list(d.keys())[:nkeys]}
    if isinstance(d, dict):
        for key, value in d.items():
            print('\t' * indent + 'Key: ' + str(key))
            print_mat_nested(value, indent + 1)
    if isinstance(d, np.ndarray) and d.dtype.names is not None:
        for n in d.dtype.names:
            print('\t' * indent + 'Field: ' + str(n))
            print_mat_nested(d[n], indent + 1)
print_mat_nested(mat, nkeys=1)
The above command shows that the first key in the dictionary is "validate_1" and that it has a field "data". To access this field, I try:
t = mat['validate_1']
print(t['data'])
It prints an array, but when I use np.shape(t['data']) it just returns (1, 1), whereas the data seems to be larger. I am not sure how to access the array inside t['data'].
Thanks for the help.
I found that the following works:
t = mat['validate_1']['data'][0,0]
print(np.shape(t))
It returns that t is an array of shape (40,650).
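For what it's worth, the extra [0,0] is needed because loadmat wraps MATLAB structs in (1, 1) object arrays. A possible alternative (a sketch, assuming the same file as above) is to pass squeeze_me=True and struct_as_record=False so the struct fields become plain attributes:
import scipy.io as spio
import numpy as np

# squeeze_me=True strips the (1, 1) wrapping around MATLAB structs,
# struct_as_record=False exposes struct fields as attributes.
mat = spio.loadmat('nir_shootout_2002.mat', squeeze_me=True, struct_as_record=False)

validate_1 = mat['validate_1']
print(np.shape(validate_1.data))   # should print (40, 650) per the answer above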