Sending Email From Databricks Notebooks - html-email

I want to send email from Databricks notebooks, based on this article: https://docs.databricks.com/user-guide/faq/send-email.html
I am following the steps, however I got an error: UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
I think the reason is that the function makeCompatibleImage contains this snippet: val = "<img src='data:image/png;base64,%s'>" % base64.standard_b64encode(png.read()), and probably there is something wrong with base64.standard_b64encode.
import numpy as np
import matplotlib.pyplot as plt
# Draw a polar bar chart with N random bars and append its HTML-embeddable
# form to the `html` list (`html` and `makeCompatibleImage` are defined
# outside this snippet).
# Compute pie slices
N = 20
# N evenly spaced angles in [0, 2*pi); endpoint=False leaves out 2*pi itself.
theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
# Random bar heights in [0, 10) and random bar widths in [0, pi/4).
radii = 10 * np.random.rand(N)
width = np.pi / 4 * np.random.rand(N)
ax = plt.subplot(111, projection='polar')
bars = ax.bar(theta, radii, width=width, bottom=0.0)
# Use custom colors and opacity
# NOTE(review): the statements after the `for` line are presumably the loop
# body; their indentation was lost in this copy.
for r, bar in zip(radii, bars):
# r / 10. rescales the bar height into [0, 1) before the colormap lookup.
bar.set_facecolor(plt.cm.viridis(r / 10.))
bar.set_alpha(0.5)
# Convert image add append to html array
html.append(makeCompatibleImage(ax))
#
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<command-890455078841631> in <module>()
16 bar.set_alpha(0.5)
17 # Convert image add append to html array
---> 18 html.append(makeCompatibleImage(ax))
<command-890455078841625> in makeCompatibleImage(image, withLabel)
11 val = None
12 with open(imageName) as png:
---> 13 val = "<img src='data:image/png;base64,%s'>" % base64.standard_b64encode(png.read())
14
15 displayHTML(val)
/databricks/python/lib/python3.6/codecs.py in decode(self, input, final)
319 # decode input (taking the buffer into account)
320 data = self.buffer + input
--> 321 (result, consumed) = self._buffer_decode(data, self.errors, final)
322 # keep undecoded input until the next call
323 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
I want to know how I can replicate this article.

Adding the following code in makeCompatibleImage function, to read the file in binary mode, worked for me:
with open(imageName, 'rb') as png:

Related

Problem with pytorch dataset.imageFolder with custom dataset in Google Colab

I am trying to load a dataset for a classification task using pytorch, this is the code i use:
# Preprocessing pipelines keyed by dataset split: 'train' applies random
# rotation / crop / flip, 'valid' applies a fixed resize and center crop.
data_transforms = {
'train': transforms.Compose([
transforms.RandomRotation(2.8),
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
# (0.5) is a parenthesized float, not a 1-tuple, so mean and std are scalars.
transforms.Normalize((0.5), (0.5))
]),
'valid': transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize((0.5), (0.5))
])
}
# Show the contents of the current working directory.
print(os.listdir())
# TODO: Load the datasets with ImageFolder
# One ImageFolder per split, rooted at <DatasetPersonale>/<split> and using
# the transform pipeline of the same name.
image_datasets = {x: datasets.ImageFolder(os.path.join("/content/drive/MyDrive/DatasetPersonale", x),
data_transforms[x])
for x in ['train', 'valid']}
# TODO: Using the image datasets and the transforms, define the dataloaders
batch_size = 32
# Both splits are shuffled and loaded with 4 worker processes.
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
shuffle=True, num_workers=4)
for x in ['train', 'valid']}
# Class names are taken from the 'train' split only.
class_names = image_datasets['train'].classes
print(class_names)
# Number of samples in each split.
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'valid']}
The code worked fine, but as my dataset was in grayscale I needed to convert it to RGB, so I used this code:
# Walk the training folder and rewrite every image whose mode is not RGB as
# an RGB JPEG, overwriting the original file in place.
rootdir = '/content/drive/MyDrive/DatasetPersonale/trainRGB'
print("Train")
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        filePath = os.path.join(subdir, file)
        # The context manager closes the file handle that Image.open leaves
        # open; convert() returns a new, fully loaded image, so it is still
        # usable after the handle is closed.
        with Image.open(filePath, mode="r") as img:
            print(img.mode)
            RGBimg = img.convert("RGB") if img.mode != "RGB" else None
        if RGBimg is not None:
            # `format` must be a string; the bare name `jpeg` is undefined
            # and raised NameError.
            RGBimg.save(filePath, format="JPEG")
Now my images are still JPEG, but they are RGB and not L. The problem is that if I rerun the code to load the dataset I get this error:
FileNotFoundError Traceback (most recent call last)
<ipython-input-15-3dace4b0f21b> in <module>()
19 image_datasets = {x: datasets.ImageFolder(os.path.join("/content/drive/MyDrive/DatasetPersonale", x),
20 data_transforms[x])
---> 21 for x in ['trainRGB', 'validRGB']}
22
23 # TODO: Using the image datasets and the trainforms, define the dataloaders
4 frames
<ipython-input-15-3dace4b0f21b> in <dictcomp>(.0)
19 image_datasets = {x: datasets.ImageFolder(os.path.join("/content/drive/MyDrive/DatasetPersonale", x),
20 data_transforms[x])
---> 21 for x in ['trainRGB', 'validRGB']}
22
23 # TODO: Using the image datasets and the trainforms, define the dataloaders
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/folder.py in __init__(self, root, transform, target_transform, loader, is_valid_file)
311 transform=transform,
312 target_transform=target_transform,
--> 313 is_valid_file=is_valid_file)
314 self.imgs = self.samples
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/folder.py in __init__(self, root, loader, extensions, transform, target_transform, is_valid_file)
144 target_transform=target_transform)
145 classes, class_to_idx = self.find_classes(self.root)
--> 146 samples = self.make_dataset(self.root, class_to_idx, extensions, is_valid_file)
147
148 self.loader = loader
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/folder.py in make_dataset(directory, class_to_idx, extensions, is_valid_file)
190 "The class_to_idx parameter cannot be None."
191 )
--> 192 return make_dataset(directory, class_to_idx, extensions=extensions, is_valid_file=is_valid_file)
193
194 def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]:
/usr/local/lib/python3.7/dist-packages/torchvision/datasets/folder.py in make_dataset(directory, class_to_idx, extensions, is_valid_file)
100 if extensions is not None:
101 msg += f"Supported extensions are: {', '.join(extensions)}"
--> 102 raise FileNotFoundError(msg)
103
104 return instances
FileNotFoundError: Found no valid file for the classes .ipynb_checkpoints. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp
Does someone know why this error appears? I checked the extension of all the files and they are jpeg.
Thank you.
Problem: torchvision's ImageFolder treats every sub-directory of the root folder as a class. The folder /content/drive/MyDrive/DatasetPersonale/trainRGB contains a hidden .ipynb_checkpoints directory, so it is picked up as a class, but it holds no files with a supported image extension (.jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp), and that raises the FileNotFoundError.
Solution: You can save all your images in a subfolder, e.g. 'images', and then change your root folder to /content/drive/MyDrive/DatasetPersonale/trainRGB/images, so that the .ipynb_checkpoints folder is no longer read alongside your images. Alternatively, delete the .ipynb_checkpoints directory from the dataset folder.

How to convert mnist dataset in array

Hello consider following code
# load the mnist training data CSV file into a list
# (one string per line of the CSV file)
training_data_file = open("Training_Set/mnist_train_100.csv", 'r')
training_data_list = training_data_file.readlines()
training_data_file.close()
# NOTE(review): indentation was lost in this copy, so which of the following
# statements belong to the loop body cannot be confirmed from here.
for record in training_data_list:
all_values = record.split(',')
# Skip the first column (presumably the label -- confirm against the CSV) and
# rescale the remaining values so that 0 maps to 0.01 and 255 maps to 1.0.
x_inputs = (np.asfarray(all_values[1:]) / 255.0 * 0.99) + 0.01
print("xinput=" + str(x_inputs))
print(len(training_data_list))
# The second positional parameter of np.array is `dtype`, so the integer
# passed here is interpreted as a data type; this is the call that raises
# "TypeError: data type not understood".
MyCompleteInput = np.array(x_inputs,len(training_data_list))
I want to put x_inputs and len(training_data_list) into an array so if I print the shape of the array I get an output of (784,100).
But if I run my code I get following error:
TypeError Traceback (most recent call last)
<ipython-input-38-b0f129f57bcb> in <module>()
11 print("xinput=" + str(x_inputs))
12 print(len(training_data_list))
---> 13 MyCompleteInput = np.array(x_inputs,len(training_data_list))
14
15
TypeError: data type not understood
Can somebody help me out? tnx
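The line should be
MyCompleteInput = np.array((x_inputs,len(training_data_list)))
With this change the TypeError goes away: the second positional argument of np.array is dtype, so the original call tried to interpret len(training_data_list) as a data type. You need another set of parentheses so that both values are passed together as a single object. Note, however, that this only packs the two objects into one argument; it does not by itself produce an array of shape (784, 100).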

UnicodeDecodeError: 'ascii' codec can't decode, with gensim, python3.5

I am using python 3.5 on both windows and Linux but get the same error:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc1 in position 0: ordinal not in range(128)
The error log is the following:
Reloaded modules: lazylinker_ext
Traceback (most recent call last):
File "<ipython-input-2-d60a2349532e>", line 1, in <module>
runfile('C:/Users/YZC/Google Drive/sunday/data/RA/data_20100101_20150622/w2v_coherence.py', wdir='C:/Users/YZC/Google Drive/sunday/data/RA/data_20100101_20150622')
File "C:\Users\YZC\Anaconda3\lib\site- packages\spyderlib\widgets\externalshell\sitecustomize.py", line 699, in runfile
execfile(filename, namespace)
File "C:\Users\YZC\Anaconda3\lib\site- packages\spyderlib\widgets\externalshell\sitecustomize.py", line 88, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "C:/Users/YZC/Google Drive/sunday/data/RA/data_20100101_20150622/w2v_coherence.py", line 70, in <module>
model = gensim.models.Word2Vec.load('model_all_no_lemma')
File "C:\Users\YZC\Anaconda3\lib\site-packages\gensim\models\word2vec.py", line 1485, in load
model = super(Word2Vec, cls).load(*args, **kwargs)
File "C:\Users\YZC\Anaconda3\lib\site-packages\gensim\utils.py", line 248, in load
obj = unpickle(fname)
File "C:\Users\YZC\Anaconda3\lib\site-packages\gensim\utils.py", line 912, in unpickle
return _pickle.loads(f.read())
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc1 in position 0: ordinal not in range(128)
1. I checked and found that the default encoding is utf-8:
import sys
sys.getdefaultencoding()
Out[2]: 'utf-8'
2. When reading the file, I also added .decode('utf-8').
3. I added a shebang line at the beginning and declared utf-8.
So I really don't know why Python couldn't read the file. Can anybody help me out?
Here is the code:
# -*- coding: utf-8 -*-
import gensim
import csv
import numpy as np
import math
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob, Word
class SpeechParser(object):
    """Iterate over a CSV file of speeches, yielding each speech as a word list.

    Iterating an instance opens ``filename``, skips the header row, and for
    every remaining row yields ``parse_speech(row[-2])`` (the second-to-last
    column). ``__init__`` also stores a lemmatizer function and the English
    stop-word list on the instance; the methods of this class do not use them.
    """

    # Translation table that deletes every ASCII punctuation character.
    _STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

    def __init__(self, filename):
        """Remember the CSV path and build the NLTK helpers."""
        self.filename = filename
        self.lemmatize = WordNetLemmatizer().lemmatize
        self.cached_stopwords = stopwords.words('english')

    def __iter__(self):
        """Yield the parsed word list of each data row in the CSV file."""
        # On Python 3 the csv module needs a text-mode file opened with
        # newline=''; combining binary mode with `encoding` raises ValueError.
        # errors='ignore' keeps the original "drop undecodable bytes" intent.
        with open(self.filename, 'r', encoding='utf-8', errors='ignore',
                  newline='') as csvfile:
            file_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            # Skip the header row; the default avoids StopIteration on an
            # empty file.
            next(file_reader, None)
            for row in file_reader:
                yield self.parse_speech(row[-2])

    def parse_speech(self, row):
        """Return the lower-cased, punctuation-free words of one speech.

        ``row`` is the speech text; ``bytes`` input is decoded as UTF-8 with
        undecodable bytes ignored.
        """
        if isinstance(row, bytes):
            row = row.decode('utf-8', 'ignore')
        cleaned = row.replace('\r\n', ' ').strip().lower()
        return cleaned.translate(self._STRIP_PUNCTUATION).split()

    # -- source: https://github.com/prateekpg2455/U.S-Presidential- Speeches/blob/master/speech.py --
    def pos(self, tag):
        """Map a POS tag to a WordNet constant by its first letter.

        Tags starting with J, V, N or R map to ADJ, VERB, NOUN or ADV
        respectively; any other tag returns ''.
        """
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('N'):
            return wordnet.NOUN
        if tag.startswith('R'):
            return wordnet.ADV
        return ''
if __name__ == '__main__':
    # Score the first 10 non-empty documents of sample.csv with a saved
    # Word2Vec model and print each document's average per-token score.
    # instantiate object
    sentences = SpeechParser("sample.csv")
    # load an existing model
    model = gensim.models.Word2Vec.load('model_all_no_lemma')
    print('\n-----------------------------------------------------------')
    print('MODEL:\t{0}'.format(model))
    # print log-probability of first 10 sentences
    docs_scored = 0
    print('\n------------- Scores for first 10 documents: -------------')
    for doc in sentences:
        if not doc:
            # An empty document would divide by zero below; skip it without
            # counting it towards the 10.
            continue
        print(sum(model.score(doc)) / len(doc))
        docs_scored += 1
        # Stop after exactly 10 documents (the previous `> 10` check let an
        # 11th one through).
        if docs_scored >= 10:
            break
    print('\n-----------------------------------------------------------')
It looks like a bug in Gensim when you try to use a Python 2 pickle file that has non-ASCII chars in it with Python 3.
The unpickle is happening when you call:
model = gensim.models.Word2Vec.load('model_all_no_lemma')
In Python 3, during the unpickle it wants to convert legacy byte strings to (Unicode) strings. The default action is to decode with 'ASCII' in strict mode.
The fix will be dependent on the encoding in your original pickle file and will require you to patch the gensim code.
I'm not familiar with gensim so you will have to try the following two options:
Force UTF-8
Chances are, your non-ASCII data is in UTF-8 format.
Edit C:\Users\YZC\Anaconda3\lib\site-packages\gensim\utils.py
Goto line 912
Change line to read:
return _pickle.loads(f.read(), encoding='utf-8')
Byte mode
Gensim in Python3 may happily work with byte strings:
Edit C:\Users\YZC\Anaconda3\lib\site-packages\gensim\utils.py
Goto line 912
Change line to read:
return _pickle.loads(f.read(), encoding='bytes')

How do I parse a captured packet in python?

I have captured a raw packet using Python's sockets:
# Capture raw link-layer frames and print the length and repr of each one.
# Protocol 0x0003 is ETH_P_ALL on Linux, i.e. frames of every protocol.
# The with-block closes the socket when the loop is interrupted.
with socket.socket(socket.AF_PACKET, socket.SOCK_RAW, socket.ntohs(0x0003)) as s:
    while True:
        # recv returns a bytes object of at most 4096 bytes.
        message = s.recv(4096)
        print(len(message))
        print(repr(message))
I assumed that the packet returned would be in hex string format, however the printout of print(repr(message)) get me something like this:
b'\x00\x1b\xac\x00Gd\x00\x14\xd1+\x1f\x19\x05\n\x124VxC!UUUU\x00\x00\x00\x00\xcd\xcc\xcc=\xcd\xccL>\x9a\x99\x99>\xcd\xcc\xcc>\x00\x00\x00?\x9a\x......'
which has weird non hex characters like !UUUU or =. What encoding is this, and how do I decode the packet?
I know what the packet looks like ahead of time for now, since I'm the one generating the packets using winpcapy:
from ctypes import *
from winpcapy import *
import zlib
import binascii
import time
from ChanPackets import base, FrMessage, FrTodSync, FrChanConfig, FlChan, RlChan
# Open the WinPcap adapter, build an FrMessage frame and send it, pacing the
# work to roughly one frame every 0.02 s.
# NOTE(review): block indentation was lost in this copy, so which statements
# belong to the loop body (in particular pcap_close) cannot be confirmed here.
while (1):
# Timestamp used below to measure how long this pass took.
now = time.time()
errbuf = create_string_buffer(PCAP_ERRBUF_SIZE)
# Placeholder value; immediately overwritten by pcap_open_live below.
fp = pcap_t
deviceName = b'\\Device\\NPF_{8F5BD2E9-253F-4659-8256-B3BCD882AFBC}'
fp = pcap_open_live(deviceName, 65536, 1, 1000, errbuf)
# A falsy handle means the adapter could not be opened.
# NOTE(review): `sys` is used here but is not among the imports shown with
# this snippet -- confirm it is imported.
if not bool(fp):
print ("\nUnable to open the adapter. %s is not supported by WinPcap\n" % deviceName)
sys.exit(2)
# FrMessage is a custom class that creates the packet
test = FrMessage('00:1b:ac:00:47:64', '00:14:d1:2b:1f:19', 0x12345678, 0x4321, 0x55555555, list(i/10 for i in range(320)))
# test.get_Raw_Packet() returns a c_bytes array needed for winpcap to send the packet
# A non-zero return from pcap_sendpacket is treated as a send failure.
if (pcap_sendpacket(fp, test.get_Raw_Packet(), test.packet_size) != 0):
print ("\nError sending the packet: %s\n" % pcap_geterr(fp))
sys.exit(3)
# Sleep for whatever remains of the 0.02 s budget for this pass.
elapsed = time.time() - now
if elapsed < 0.02 and elapsed > 0:
time.sleep(0.02 - elapsed)
pcap_close(fp)
Note: I would like to get an array of hex values representing each byte
What encoding is this, and how do I decode the packet?
What you see is the representation of bytes object in Python. As you might have guessed \xab represents byte 0xab (171).
which has weird non hex characters like !UUUU or =
Printable ASCII characters represent themselves i.e., instead of \x55 the representation contains just U.
What you have is a sequence of bytes. How to decode them depends on your application. For example, to decode a data packet that contains Ethernet frame, you could use scapy (Python 2):
>>> b = '\x00\x02\x157\xa2D\x00\xae\xf3R\xaa\xd1\x08\x00E\x00\x00C\x00\x01\x00\x00#\x06x<\xc0\xa8\x05\x15B#\xfa\x97\x00\x14\x00P\x00\x00\x00\x00\x00\x00\x00\x00P\x02 \x00\xbb9\x00\x00GET /index.html HTTP/1.0 \n\n'
>>> c = Ether(b)
>>> c.hide_defaults()
>>> c
<Ether dst=00:02:15:37:a2:44 src=00:ae:f3:52:aa:d1 type=0x800 |
<IP ihl=5L len=67 frag=0 proto=tcp chksum=0x783c src=192.168.5.21 dst=66.35.250.151 |
<TCP dataofs=5L chksum=0xbb39 options=[] |
<Raw load='GET /index.html HTTP/1.0 \n\n' |>>>>
I would like to get an array of hex values representing each byte
You could use binascii.hexlify():
>>> pkt = b'\x00\x1b\xac\x00Gd\x00'
>>> import binascii
>>> binascii.hexlify(pkt)
b'001bac00476400'
Or, if you want a list of hex strings:
>>> hexvalue = binascii.hexlify(pkt).decode()
>>> [hexvalue[i:i+2] for i in range(0, len(hexvalue), 2)]
['00', '1b', 'ac', '00', '47', '64', '00']
In python raw packet decode can be done using the scapy functions like IP(), TCP(), UDP() etc.
import sys
import socket
from scapy.all import *
# Read packets from a raw TCP/IPv4 socket forever, parsing each one with
# scapy's IP layer and printing the dissected fields.
s = socket.socket(socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_TCP)
while True:
    # recvfrom returns (data, address); only the data is dissected.
    payload, _sender = s.recvfrom(2000)
    IP(payload).show()

Need help identifying and computing a number representation

I need help identifying the following number format.
For example, the following number format in MIB:
0x94 0x78 = 2680
0x94 0x78 in binary: [1001 0100] [0111 1000]
It seems that if the MSB is 1, it means another character follows it. And if it is 0, it is the end of the number.
So the value 2680 is [001 0100] [111 1000], formatted properly is [0000 1010] [0111 1000]
What is this number format called and what's a good way for computing this besides bit manipulation and shifting to a larger unsigned integer?
I have seen this called either 7bhm (7-bit has-more) or VLQ (variable length quantity); see http://en.wikipedia.org/wiki/Variable-length_quantity
This is stored big-endian (most significant byte first), as opposed to the C# BinaryReader.Read7BitEncodedInt method described at Encoding an integer in 7-bit format of C# BinaryReader.ReadString
I am not aware of any method of decoding other than bit manipulation.
Sample PHP code can be found at
http://php.net/manual/en/function.intval.php#62613
or in Python I would do something like
def encode_7bhm(i):
    """Encode a non-negative integer as a big-endian 7-bit "has-more" string.

    The value is split into 7-bit groups, most significant group first.
    Every character except the last has bit 0x80 set to signal that another
    group follows.

    Args:
        i: Non-negative integer to encode.

    Returns:
        A ``str`` with one character (code point 0-255) per encoded byte.

    Raises:
        ValueError: If ``i`` is negative, which this format cannot represent.
    """
    if i < 0:
        raise ValueError("cannot encode a negative value: %r" % (i,))
    # Collect groups least-significant first, then reverse once; this avoids
    # repeated O(n) inserts at the front of the list.
    groups = [i & 0x7f]  # final group: continuation bit clear
    # Integer shift rather than `/=`, which is float division on Python 3.
    i >>= 7
    while i > 0:
        groups.append(0x80 | (i & 0x7f))
        i >>= 7
    return ''.join(chr(group) for group in reversed(groups))
def decode_7bhm(s):
    """Decode a big-endian 7-bit "has-more" value from the start of ``s``.

    Args:
        s: A ``str`` (one character per byte) or a ``bytes``/``bytearray``
            holding the encoded value. Anything after the terminating byte
            is ignored.

    Returns:
        The decoded non-negative integer.

    Raises:
        TypeError: If ``s`` ends before a byte with the 0x80 bit clear is
            found (including when ``s`` is empty).
    """
    o = 0
    for item in s:
        # Iterating bytes yields ints on Python 3; iterating str yields
        # one-character strings that still need ord().
        v = item if isinstance(item, int) else ord(item)
        o = (o << 7) | (v & 0x7f)
        if not v & 0x80:
            # found end of encoded value
            return o
    # out of string, and end not found - error!
    raise TypeError("truncated 7-bit has-more value")