I want to use Numpy to simulate the inference process of a quantized MobileNet V2 network, but the outcome is different with pytorch realized one - neural-network

Python version: 3.8
Pytorch version: 1.9.0+cpu
Platform: Anaconda Spyder5.0
To reproduce this problem, just copy every code below to a single file.
The ILSVRC2012_val_00000293.jpg file used in this code is shown below, you also need to download it and then change its destination in the code.
Some background of this problem:
I am now working on a project that aims to develop a hardware accelerator to complete the inference process of the MobileNet V2 network. I used pretrained quantized Pytorch model to simulate the outcome, and the result comes out very well.
In order to use hardware to complete this task, I wish to know every inputs and outputs as well as intermidiate variables during runing this piece of pytorch code. I used a package named torchextractor to fetch the outcomes of first layer, which in this case, is a 3*3 convolution layer.
import numpy as np
import torchvision
import torch
from torchvision import transforms, datasets
from PIL import Image
from torchvision import transforms
import torchextractor as tx
import math
#########################################################################################
##### Processing of input image
#########################################################################################
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
test_transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,])
preprocess = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
#image file destination
filename = "D:\Project_UM\MobileNet_VC709\MobileNet_pytorch\ILSVRC2012_val_00000293.jpg"
input_image = Image.open(filename)
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)
#########################################################################################
#########################################################################################
#########################################################################################
#----First verify that the torchextractor class should not influent the inference outcome
# ofmp of layer1 before putting into torchextractor
a,b,c = quantize_tensor(input_batch)# to quantize the input tensor and return an int8 tensor, scale and zero point
input_qa = torch.quantize_per_tensor(torch.tensor(input_batch.clone().detach()), b, c, torch.quint8)# Using quantize_per_tensor method of torch
# Load a quantized mobilenet_v2 model
model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True)
model_quantized.eval()
with torch.no_grad():
output = model_quantized.features[0][0](input_qa)# Ofmp of layer1, datatype : quantized_tensor
# print("FM of layer1 before tx_extractor:\n",output.int_repr())# Ofmp of layer1, datatype : int8 tensor
output1_clone = output.int_repr().detach().numpy()# Clone ofmp of layer1, datatype : ndarray
#########################################################################################
#########################################################################################
#########################################################################################
# ofmp of layer1 after adding torchextractor
model_quantized_ex = tx.Extractor(model_quantized, ["features.0.0"])#Capture of the module inside first layer
model_output, features = model_quantized_ex(input_batch)# Forward propagation
# feature_shapes = {name: f.shape for name, f in features.items()}
# print(features['features.0.0']) # Ofmp of layer1, datatype : quantized_tensor
out1_clone = features['features.0.0'].int_repr().numpy() # Clone ofmp of layer1, datatype : ndarray
if(out1_clone.all() == output1_clone.all()):
print('Model with torchextractor attached output the same value as the original model')
else:
print('Torchextractor method influence the outcome')
Here I define a numpy quantization scheme based on the quantization scheme proposed by
Quantization and Training of Neural Networks for Efficient
Integer-Arithmetic-Only Inference
# Convert a normal regular tensor to a quantized tensor with scale and zero_point
def quantize_tensor(x, num_bits=8):# to quantize the input tensor and return an int8 tensor, scale and zero point
qmin = 0.
qmax = 2.**num_bits - 1.
min_val, max_val = x.min(), x.max()
scale = (max_val - min_val) / (qmax - qmin)
initial_zero_point = qmin - min_val / scale
zero_point = 0
if initial_zero_point < qmin:
zero_point = qmin
elif initial_zero_point > qmax:
zero_point = qmax
else:
zero_point = initial_zero_point
# print(zero_point)
zero_point = int(zero_point)
q_x = zero_point + x / scale
q_x.clamp_(qmin, qmax).round_()
q_x = q_x.round().byte()
return q_x, scale, zero_point
#%%
# #############################################################################################
# --------- Simulate the inference process of layer0: conv33 using numpy
# #############################################################################################
# get the input_batch quantized buffer data
input_scale = b.item()
input_zero = c
input_quantized = a[0].detach().numpy()
# get the layer0 output scale and zero_point
output_scale = model_quantized.features[0][0].state_dict()['scale'].item()
output_zero = model_quantized.features[0][0].state_dict()['zero_point'].item()
# get the quantized weight with scale and zero_point
weight_scale = model_quantized.features[0][0].state_dict()["weight"].q_scale()
weight_zero = model_quantized.features[0][0].state_dict()["weight"].q_zero_point()
weight_quantized = model_quantized.features[0][0].state_dict()["weight"].int_repr().numpy()
# print(weight_quantized)
# print(weight_quantized.shape)
# bias_quantized,bias_scale,bias_zero= quantize_tensor(model_quantized.features[0][0].state_dict()["bias"])# to quantize the input tensor and return an int8 tensor, scale and zero point
# print(bias_quantized.shape)
bias = model_quantized.features[0][0].state_dict()["bias"].detach().numpy()
# print(input_quantized)
print(type(input_scale))
print(type(output_scale))
print(type(weight_scale))
Then I write a quantized 2D convolution using numpy, hope to figure out every details in pytorch data flow during the inference.
#%% numpy simulated layer0 convolution function define
def conv_cal(input_quantized, weight_quantized, kernel_size, stride, out_i, out_j, out_k):
weight = weight_quantized[out_i]
input = np.zeros((input_quantized.shape[0], kernel_size, kernel_size))
for i in range(weight.shape[0]):
for j in range(weight.shape[1]):
for k in range(weight.shape[2]):
input[i][j][k] = input_quantized[i][stride*out_j+j][stride*out_k+k]
# print(np.dot(weight,input))
# print(input,"\n")
# print(weight)
return np.multiply(weight,input).sum()
def QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, kernel_size, stride, padding, ofm_size):
output = np.zeros((weight_quantized.shape[0],ofm_size,ofm_size))
input_quantized_padding = np.full((input_quantized.shape[0],input_quantized.shape[1]+2*padding,input_quantized.shape[2]+2*padding),0)
zero_temp = np.full(input_quantized.shape,input_zero)
input_quantized = input_quantized - zero_temp
for i in range(input_quantized.shape[0]):
for j in range(padding,padding + input_quantized.shape[1]):
for k in range(padding,padding + input_quantized.shape[2]):
input_quantized_padding[i][j][k] = input_quantized[i][j-padding][k-padding]
zero_temp = np.full(weight_quantized.shape, weight_zero)
weight_quantized = weight_quantized - zero_temp
for i in range(output.shape[0]):
for j in range(output.shape[1]):
for k in range(output.shape[2]):
# output[i][j][k] = (weight_scale*input_scale)*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i] #floating_output
output[i][j][k] = weight_scale*input_scale/output_scale*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i]/output_scale + output_zero
output[i][j][k] = round(output[i][j][k])
# int_output
return output
Here I input the same image, weight, and bias together with their zero_point and scale, then compare this "numpy simulated" result to the PyTorch calculated one.
quantized_model_out1_int8 = np.squeeze(features['features.0.0'].int_repr().numpy())
print(quantized_model_out1_int8.shape)
print(quantized_model_out1_int8)
out1_np = QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, 3, 2, 1, 112)
np.save("out1_np.npy",out1_np)
for i in range(quantized_model_out1_int8.shape[0]):
for j in range(quantized_model_out1_int8.shape[1]):
for k in range(quantized_model_out1_int8.shape[2]):
if(out1_np[i][j][k] < 0):
out1_np[i][j][k] = 0
print(out1_np)
flag = np.zeros(quantized_model_out1_int8.shape)
for i in range(quantized_model_out1_int8.shape[0]):
for j in range(quantized_model_out1_int8.shape[1]):
for k in range(quantized_model_out1_int8.shape[2]):
if(quantized_model_out1_int8[i][j][k] == out1_np[i][j][k]):
flag[i][j][k] = 1
out1_np[i][j][k] = 0
quantized_model_out1_int8[i][j][k] = 0
# Compare the simulated result to extractor fetched result, gain the total hit rate
print(flag.sum()/(112*112*32)*100,'%')
If the "numpy simulated" results are the same as the extracted one, call it a hit. Print the total hit rate, it shows that numpy gets 92% of the values right. Now the problem is, I have no idea why the rest 8% of values come out wrong.
Comparison of two outcomes:
The picture below shows the different values between Numpy one and PyTorch one, the sample channel is index[1]. The left upper corner is Numpy one, and the upright corner is PyTorch one, I have set all values that are the same between them to 0, as you can see, most of the values just have a difference of 1(This can be view as the error brought by the precision loss of fixed point arithmetics), but some have large differences, e.g. the value[1][4], 121 vs. 76 (I don't know why)
Focus on one strange value:
This code is used to replay the calculation process of the value[1][4], originally I was expecting a trial and error process could lead me to solve this problem, to get my wanted number of 76, but no matter how I tried, it didn't output 76. If you want to try this, I paste this code for your convenience.
#%% A test code to check the calculation process
weight_quantized_sample = weight_quantized[2]
M_t = input_scale * weight_scale / output_scale
ifmap_t = np.int32(input_quantized[:,1:4,7:10])
weight_t = np.int32(weight_quantized_sample)
bias_t = bias[2]
bias_q = bias_t/output_scale
res_t = 0
for ch in range(3):
ifmap_offset = ifmap_t[ch]-np.int32(input_zero)
weight_offset = weight_t[ch]-np.int32(weight_zero)
res_ch = np.multiply(ifmap_offset, weight_offset)
res_ch = res_ch.sum()
res_t = res_t + res_ch
res_mul = M_t*res_t
# for n in range(1, 30):
# res_mul = multiply(n, M_t, res_t)
res_t = round(res_mul + output_zero + bias_q)
print(res_t)
Could you help me out of this, have been stuck here for a long time.

I implemented my own version of quantized convolution and got from 99.999% to 100% hitrate (and mismatch of a single value is by 1 that I can consider to be a rounding issue). The link on the paper in the question helped a lot.
But I found that your formulas are the same as mine. So I don't know what was your issue. As I understand quantization in pytorch is hardware dependent.
Here is my code:
def my_Conv2dRelu_b2(input_q, conv_layer, output_shape):
'''
Args:
input_q: quantized tensor
conv_layer: quantized tensor
output_shape: the pre-computed shape of the result
Returns:
'''
output = np.zeros(output_shape)
# extract needed float numbers from quantized operations
weights_scale = conv_layer.weight().q_per_channel_scales()
input_scale = input_q.q_scale()
weights_zp = conv_layer.weight().q_per_channel_zero_points()
input_zp = input_q.q_zero_point()
# extract needed convolution parameters
padding = conv_layer.padding
stride = conv_layer.stride
# extract float numbers for results
output_zp = conv_layer.zero_point
output_scale = conv_layer.scale
conv_weights_int = conv_layer.weight().int_repr()
input_int = input_q.int_repr()
biases = conv_layer.bias().numpy()
for k in range(input_q.shape[0]):
for i in range(conv_weights_int.shape[0]):
output[k][i] = manual_convolution_quant(
input_int[k].numpy(),
conv_weights_int[i].numpy(),
biases[i],
padding=padding,
stride=stride,
image_zp=input_zp, image_scale=input_scale,
kernel_zp=weights_zp[i].item(), kernel_scale=weights_scale[i].item(),
result_zp=output_zp, result_scale=output_scale
)
return output
def manual_convolution_quant(image, kernel, b, padding, stride, image_zp, image_scale, kernel_zp, kernel_scale,
result_zp, result_scale):
H = image.shape[1]
W = image.shape[2]
new_H = H // stride[0]
new_W = W // stride[1]
results = np.zeros([new_H, new_W])
M = image_scale * kernel_scale / result_scale
bias = b / result_scale
paddedIm = np.pad(
image,
[(0, 0), (padding[0], padding[0]), (padding[1], padding[1])],
mode="constant",
constant_values=image_zp,
)
s = kernel.shape[1]
for i in range(new_H):
for j in range(new_W):
patch = paddedIm[
:, i * stride[0]: i * stride[0] + s, j * stride[1]: j * stride[1] + s
]
res = M * ((kernel - kernel_zp) * (patch - image_zp)).sum() + result_zp + bias
if res < 0:
res = 0
results[i, j] = round(res)
return results
Code to compare pytorch and my own version.
def calc_hit_rate(array1, array2):
good = (array1 == array2).astype(np.int).sum()
all = array1.size
return good / all
# during inference
y2 = model.conv1(y1)
y2_int = torch.int_repr(y2)
y2_int_manual = my_Conv2dRelu_b2(y1, model.conv1, y2.shape)
print(f'y2 hit rate= {calc_hit_rate(y2.int_repr().numpy(), y2_int_manual)}') #hit_rate=1.0

Related

How do I train a new network with a convolution kernel?

Doing a final project for my school and need help because I'm not sure how to execute it.
I'm investigating how image distortion will affect ANN's learning ability. Have no background in coding whatsoever so any help will be hugely appreciated! Can't use TensorFlow as part of requirements.
This is what my prof used for network classification which is standard from the textbook we're using.
# neural network class definition
class neuralNetwork:
# initialise the neural network
def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
# set number of nodes in each input, hidden, output layer
self.inodes = inputnodes
self.hnodes = hiddennodes
self.onodes = outputnodes
self.wih = numpy.random.normal(0.0, pow(self.inodes, -0.5), (self.hnodes, self.inodes))
self.who = numpy.random.normal(0.0, pow(self.hnodes, -0.5), (self.onodes, self.hnodes))
self.lr = learningrate
self.activation_function = lambda x: scipy.special.expit(x)
pass
# train the neural network
def train(self, inputs_list, targets_list):
inputs = numpy.array(inputs_list, ndmin=2).T
targets = numpy.array(targets_list, ndmin=2).T
hidden_inputs = numpy.dot(self.wih, inputs)
hidden_outputs = self.activation_function(hidden_inputs)
final_inputs = numpy.dot(self.who, hidden_outputs)
final_outputs = self.activation_function(final_inputs)
output_errors = targets - final_outputs
hidden_errors = numpy.dot(self.who.T, output_errors)
self.who += self.lr * numpy.dot((output_errors * final_outputs * (1.0 - final_outputs)), numpy.transpose(hidden_outputs))
self.wih += self.lr * numpy.dot((hidden_errors * hidden_outputs * (1.0 - hidden_outputs)), numpy.transpose(inputs))
pass
# query the neural network
def query(self, inputs_list):
inputs = numpy.array(inputs_list, ndmin=2).T
hidden_inputs = numpy.dot(self.wih, inputs)
hidden_outputs = self.activation_function(hidden_inputs)
final_inputs = numpy.dot(self.who, hidden_outputs)
final_outputs = self.activation_function(final_inputs)
return final_outputs
# number of input, hidden and output nodes
input_nodes = 784
hidden_nodes = 200
output_nodes = 10
learning_rate = 0.1
# create instance of neural network
n = neuralNetwork(input_nodes,hidden_nodes,output_nodes, learning_rate)
# load the mnist training data CSV file into a list
training_data_file = open("mnist_train.csv", 'r')
training_data_list = training_data_file.readlines()
training_data_file.close()
# train the neural network
epochs = 5
for e in range(epochs):
for record in training_data_list:
all_values = record.split(',')
inputs = (numpy.asfarray(all_values[1:]) / 255.0 * 0.99) + 0.01
targets = numpy.zeros(output_nodes) + 0.01
targets[int(all_values[0])] = 0.99
n.train(inputs, targets)
pass
pass
# load the mnist test data CSV file into a list
test_data_file = open("mnist_test.csv", 'r')
test_data_list = test_data_file.readlines()
test_data_file.close()
# test the neural network
# scorecard for how well the network performs, initially empty
scorecard = []
for record in test_data_list:
all_values = record.split(',')
correct_label = int(all_values[0])
inputs = (numpy.asfarray(all_values[1:]) / 255.0 * 0.99) + 0.01
outputs = n.query(inputs)
label = numpy.argmax(outputs)
if (label == correct_label):
scorecard.append(1)
else:
scorecard.append(0)
pass
pass
# calculate the performance score, the fraction of correct answers
scorecard_array = numpy.asarray(scorecard)
print ("performance = ", scorecard_array.sum() / scorecard_array.size)
So I came up with a sharpening kernel and basically what I want to accomplish is to apply this kernel to the network so that the images in the training dataset is now 'sharpened'. (Also am using the convolution method given by my prof) Where am I supposed to add this to the class above??
sharpen_kernel = np.array([[0, -1, 0],
[-1, 5, -1],
[0, -1, 0]])
# convolve your image with the kernel
matplotlib.rcParams['figure.figsize'] = 20,20
conv_image = numpy.ones((28,28))
print("shape of image_array",numpy.shape(image_array))
step = 3
i=0
while i < 25:
i+=1
j = 0
while j < 25 :
sub_image = image_array[i:(i+step),j:(j+step):] # array[starty : endy , start x : end x : step]
sub_image = numpy.reshape(sub_image,(1,(step ** 2)))
kernel = numpy.reshape(sharpen_kernel, ((step ** 2),1))
conv_scalar = numpy.dot(sub_image,kernel)
conv_image[i,j] = conv_scalar
j+=1
pass
pass
Basically want to measure the relationship of the performance of ANN and quality of image (e.g. blur, sharpening). Help!

Precison issue with sigmoid activation function for Tensorflow/Keras 2.3.1

Assuming the following forward pass in a classic ANN
(Based on https://mattmazur.com/2015/03/17/a-step-by-step-backpropagation-example/):
Now let's use a sigmoid activation on that, I get:
So far so good, now let's check the result of this calculation in python:
1 / (1+ math.exp(-0.3775)) # ... = 0.5932699921071872, OK
However this is double precision and since Keras uses float32 let's calculate the same thing but with float32, I get:
w1 = np.array(0.15, dtype=np.float32)
i1 = np.array(0.05, dtype=np.float32)
w2 = np.array(0.2, dtype=np.float32)
i2 = np.array(0.1, dtype=np.float32)
b1 = np.array(0.35, dtype=np.float32)
k1 = np.array(1, dtype=np.float32)
k2= np.array(1, dtype=np.float32)
n1 = w1 * i1 + w2 * i2 + b1 * k1
np.array(k2 / (1 + np.exp(-n1)), dtype=np.float32) # --->array(0.59327, dtype=float32)
Ok so normally I should be able to get 0.59327 for our out_{h1} using Keras float32 framework.
Let's now try to get this result using Keras:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
import numpy as np
model = Sequential()
model.add(Dense(2,activation='sigmoid')) # Build only one layer (enough for example)
model.compile(optimizer = SGD(lr = 0.5),loss='mse') # Don't really care for this example, simply for compilation
input = np.array([[0.05, 0.10]]) # Set input, same as the ones provided in the example
output = np.array([[0.01, 0.99]]) # Don't really care for this example since we want to check activation only
weights = [np.array([[0.15, 0.20 ], # Required for set_weights
[0.25, 0.30]], dtype=np.float32),
np.array([0.35, 0.35], dtype=np.float32)]
model.build(input.shape) # Required for set_weights
model.set_weights(weights) # Set the weights to be the same as the one provided in the example
model.predict(np.array([[0.05, 0.10]], dtype=np.float32)) # This can be seen as out_{h1}
# array([[0.5944759 , 0.59628266]], dtype=float32)
# NOK: 0.5944759 != 0.59327
Can someone explain to me why I get 0.5944759 instead of 0.59327? The result seem far from the expected ouput and if possible provide an example of calculation and/or a way to get the expected output of 0.59327.
Please note this example was done using:
tensorflow 2.3.1
numpy 1.18.5
python 3.8.12
Thx for your help.
Tensorflow perform the Dense layer row to column:
It multiplies each row of your input with a column in the weights matrix, You need to transpose your weights matrix, if you do it you will get the correct result.
I validated using your code just modified:
weights = [np.array([[0.15, 0.20], # Required for set_weights
[0.2, 0.30]], dtype=np.float32),
np.array([0.35, 0.35], dtype=np.float32)]
To (pay attention to the transpose operator T:
weights = [np.array([[0.15, 0.20], # Required for set_weights
[0.2, 0.30]], dtype=np.float32).T,
np.array([0.35, 0.35], dtype=np.float32)]

Seemingly inconsistent tensor sizes in pytorch

I'm building a convolutional autoencoder, but want the encoding to be in a linear form so I can more easily feed it as input into an MLP. I have two convolutional layers on the encoder along with a linear inner layer to reduce dimension. This encoding is then fed into the corresponding decoder.
When I flatten the output of the second convolutional layer, based on my calculation (using the standard formula: Calculate the Output size in Convolution layer) should come out to a 1x100352 rank 1 tensor. However, when I set the input dimension of the linear layer to be 100352, the flattened rank 1 tensor has dimension 1x50176. Then comes the weird part.
I tried changing the input dimension of the linear layer to be 50176, assuming I had miscalculated. When I do this, the reshaped rank 1 tensor confusingly becomes 1x100352, and then the aforementioned weight matrix becomes 50176x256 as expected.
This response to modifying the linear layer's input dimension doesn't make sense to me. That hyperparameter controls the weight matrix correctly, but I guess I'm uncertain why it has any bearing on the linear layer's input since that's just a reshaped tensor output from a convolutional layer whose hyperparameters are unrelated to the hyperparameter in question.
I apologize if I'm just missing something obvious. I'm very new to pytorch, and I couldn't find any other posts which discussed this sort of issue.
Here's what I believe to be the minimal reproducible example:
import os
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets
from torch.utils.data import DataLoader
from torchvision.utils import save_image
class convAutoEncoder(nn.Module):
def __init__(self,**kwargs):
super().__init__()
#Creating network structure
#Encoder portion of autoencoder
self.enc1 = nn.Conv2d(in_channels = kwargs["inputChannels"], out_channels = kwargs["channelsEncoderMid"], kernel_size = kwargs["kernelSize"])
self.enc2 = nn.Conv2d(in_channels = kwargs["channelsEncoderMid"], out_channels = kwargs["channelsEncoderInner"], kernel_size = kwargs["kernelSize"])
self.enc3 = nn.Linear(in_features = kwargs["intoLinear"], out_features = kwargs["linearEncoded"])
#Decoder portion of autoencoder
self.dec1 = nn.Linear(in_features = kwargs["linearEncoded"], out_features = kwargs["intoLinear"])
self.dec2 = nn.ConvTranspose2d(in_channels = kwargs["channelsEncoderInner"], out_channels = kwargs["channelsDecoderMid"], kernel_size = kwargs["kernelSize"])
self.dec3 = nn.ConvTranspose2d(in_channels = kwargs["channelsDecoderMid"], out_channels = kwargs["inputChannels"], kernel_size = kwargs["kernelSize"])
def forward(self,x):
#Encoding
x = F.relu(self.enc1(x))
x = F.relu(self.enc2(x))
x = x.reshape(1,-1)
x = x.squeeze()
x = F.relu(self.enc3(x))
#Decoding
x = F.relu(self.dec1(x))
x = x.reshape([32,4,28,28])
x = F.relu(self.dec2(x))
x = F.relu(self.dec3(x))
return x
def encodeDecodeConv(numEpochs = 20, input_Channels = 3, batchSize = 32,
channels_Encoder_Inner = 4, channels_Encoder_Mid = 8, into_Linear = 100352,
linear_Encoded = 256, channels_Decoder_Mid = 8, kernel_Size = 3,
learningRate = 1e-3):
#Pick a device. If GPU available, use that. Otherwise, use CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#Define data transforms
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
#Define training dataset
trainSet = datasets.CIFAR10(root = './data', train = True, download = True, transform = transform)
#Define testing dataset
testSet = datasets.CIFAR10(root = './data', train = False, download = True, transform = transform)
#Define data loaders
trainLoader = DataLoader(trainSet, batch_size = batchSize, shuffle = True)
testLoader = DataLoader(testSet, batch_size = batchSize, shuffle = True)
#Initialize neural network
model = convAutoEncoder(inputChannels = input_Channels, channelsEncoderMid = channels_Encoder_Mid, channelsEncoderInner = channels_Encoder_Inner, intoLinear = into_Linear, linearEncoded = linear_Encoded, channelsDecoderMid = channels_Decoder_Mid, kernelSize = kernel_Size)
#Optimization setup
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(),lr = learningRate)
lossTracker = []
for epoch in range(numEpochs):
loss = 0
for data,_ in trainLoader:
data = data.to(device)
optimizer.zero_grad()
outputs = model(data)
train_loss = criterion(outputs,data)
train_loss.backward()
optimizer.step()
loss += train_loss.item()
loss = loss/len(trainLoader)
print('Epoch {} of {}, Train loss: {:.3f}'.format(epoch+1,numEpochs,loss))
encodeDecodeConv()
Edit2: Somewhere in the CIFAR10 dataset, the data appears to change dimension. After playing around with print statements more, I discovered that setting the relevant hyperparameter to 100352 works great for many entries, but then seemingly one image pops up that has a different size. Not sure why that would occur, though.

Kalman Filter (pykalman): Value for obs_covariance and model without intercept

I am looking at the KalmanFilter from pykalman shown in examples:
pykalman documentation
Example 1
Example 2
and I am wondering
observation_covariance=100,
vs
observation_covariance=1,
the documentation states
observation_covariance R: e(t)^2 ~ Gaussian (0, R)
How should the value be set here correctly?
Additionally, is it possible to apply the Kalman filter without intercept in the above module?
The observation covariance shows how much error you assume to be in your input data. Kalman filter works fine on normally distributed data. Under this assumption you can use the 3-Sigma rule to calculate the covariance (in this case the variance) of your observation based on the maximum error in the observation.
The values in your question can be interpreted as follows:
Example 1
observation_covariance = 100
sigma = sqrt(observation_covariance) = 10
max_error = 3*sigma = 30
Example 2
observation_covariance = 1
sigma = sqrt(observation_covariance) = 1
max_error = 3*sigma = 3
So you need to choose the value based on your observation data. The more accurate the observation, the smaller the observation covariance.
Another point: you can tune your filter by manipulating the covariance, but I think it's not a good idea. The higher the observation covariance value the weaker impact a new observation has on the filter state.
Sorry, I did not understand the second part of your question (about the Kalman Filter without intercept). Could you please explain what you mean?
You are trying to use a regression model and both intercept and slope belong to it.
---------------------------
UPDATE
I prepared some code and plots to answer your questions in details. I used EWC and EWA historical data to stay close to the original article.
First of all here is the code (pretty the same one as in the examples above but with a different notation)
from pykalman import KalmanFilter
import numpy as np
import matplotlib.pyplot as plt
# reading data (quick and dirty)
Datum=[]
EWA=[]
EWC=[]
for line in open('data/dataset.csv'):
f1, f2, f3 = line.split(';')
Datum.append(f1)
EWA.append(float(f2))
EWC.append(float(f3))
n = len(Datum)
# Filter Configuration
# both slope and intercept have to be estimated
# transition_matrix
F = np.eye(2) # identity matrix because x_(k+1) = x_(k) + noise
# observation_matrix
# H_k = [EWA_k 1]
H = np.vstack([np.matrix(EWA), np.ones((1, n))]).T[:, np.newaxis]
# transition_covariance
Q = [[1e-4, 0],
[ 0, 1e-4]]
# observation_covariance
R = 1 # max error = 3
# initial_state_mean
X0 = [0,
0]
# initial_state_covariance
P0 = [[ 1, 0],
[ 0, 1]]
# Kalman-Filter initialization
kf = KalmanFilter(n_dim_obs=1, n_dim_state=2,
transition_matrices = F,
observation_matrices = H,
transition_covariance = Q,
observation_covariance = R,
initial_state_mean = X0,
initial_state_covariance = P0)
# Filtering
state_means, state_covs = kf.filter(EWC)
# Restore EWC based on EWA and estimated parameters
EWC_restored = np.multiply(EWA, state_means[:, 0]) + state_means[:, 1]
# Plots
plt.figure(1)
ax1 = plt.subplot(211)
plt.plot(state_means[:, 0], label="Slope")
plt.grid()
plt.legend(loc="upper left")
ax2 = plt.subplot(212)
plt.plot(state_means[:, 1], label="Intercept")
plt.grid()
plt.legend(loc="upper left")
# check the result
plt.figure(2)
plt.plot(EWC, label="EWC original")
plt.plot(EWC_restored, label="EWC restored")
plt.grid()
plt.legend(loc="upper left")
plt.show()
I could not retrieve data using pandas, so I downloaded them and read from the file.
Here you can see the estimated slope and intercept:
To test the estimated data I restored the EWC value from the EWA using the estimated parameters:
About the observation covariance value
By varying the observation covariance value you tell the Filter how accurate the input data is (normally you just describe your confidence in the observation using some datasheets or your knowledge about the system).
Here are estimated parameters and the restored EWC values using different observation covariance values:
You can see the filter follows the original function better with a bigger confidence in observation (smaller R). If the confidence is low (bigger R) the filter leaves the initial estimate (slope = 0, intercept = 0) very slowly and the restored function is far away from the original one.
About the frozen intercept
If you want to freeze the intercept for some reason, you need to change the whole model and all filter parameters.
In the normal case we had:
x = [slope; intercept] #estimation state
H = [EWA 1] #observation matrix
z = [EWC] #observation
Now we have:
x = [slope] #estimation state
H = [EWA] #observation matrix
z = [EWC-const_intercept] #observation
Results:
Here is the code:
from pykalman import KalmanFilter
import numpy as np
import matplotlib.pyplot as plt
# only slope has to be estimated (it will be manipulated by the constant intercept) - mathematically incorrect!
const_intercept = 10
# reading data (quick and dirty)
Datum=[]
EWA=[]
EWC=[]
for line in open('data/dataset.csv'):
f1, f2, f3 = line.split(';')
Datum.append(f1)
EWA.append(float(f2))
EWC.append(float(f3))
n = len(Datum)
# Filter Configuration
# transition_matrix
F = 1 # identity matrix because x_(k+1) = x_(k) + noise
# observation_matrix
# H_k = [EWA_k]
H = np.matrix(EWA).T[:, np.newaxis]
# transition_covariance
Q = 1e-4
# observation_covariance
R = 1 # max error = 3
# initial_state_mean
X0 = 0
# initial_state_covariance
P0 = 1
# Kalman-Filter initialization
kf = KalmanFilter(n_dim_obs=1, n_dim_state=1,
transition_matrices = F,
observation_matrices = H,
transition_covariance = Q,
observation_covariance = R,
initial_state_mean = X0,
initial_state_covariance = P0)
# Creating the observation based on EWC and the constant intercept
z = EWC[:] # copy the list (not just assign the reference!)
z[:] = [x - const_intercept for x in z]
# Filtering
state_means, state_covs = kf.filter(z) # the estimation for the EWC data minus constant intercept
# Restore EWC based on EWA and estimated parameters
EWC_restored = np.multiply(EWA, state_means[:, 0]) + const_intercept
# Plots
plt.figure(1)
ax1 = plt.subplot(211)
plt.plot(state_means[:, 0], label="Slope")
plt.grid()
plt.legend(loc="upper left")
ax2 = plt.subplot(212)
plt.plot(const_intercept*np.ones((n, 1)), label="Intercept")
plt.grid()
plt.legend(loc="upper left")
# check the result
plt.figure(2)
plt.plot(EWC, label="EWC original")
plt.plot(EWC_restored, label="EWC restored")
plt.grid()
plt.legend(loc="upper left")
plt.show()

Theano ANN "TypeError: randint() takes at least 1 positional argument (0 given)"

This is the error that I'm receiving:
File "mtrand.pyx",line 1192, in mtrand.RandomState.randint(numpy/random/mtrand/mtrand.c:14128)
I am somewhat new to coding, but I really want to get started with simple ANNs so I decided to start this project.
TypeError: randint() takes at least 1 positional argument (0 given)
# -- coding: utf-8 --
"""
Created on Sun Sep 18 14:56:44 2016
#author: Jamoonie
"""
##theano practice
import numpy as np
import theano
import theano.tensor as T
from sklearn.datasets import load_digits
digits=load_digits()
print (digits.data.shape)
train_x = list(digits.data)
#print train_x.count
train_x = np.array(train_x)
#print train_x
train_y = list(digits.target)
#print train_y.count
train_y = np.array(train_y)
#print train_y
#q = T.matrix('q') checking how matrix dot products work, and how the row,col of the W0 should be set up
#q = np.zeros([5,10])
#print q
#p = T.matrix('p')
#p = np.zeros([10,5])
#
#print np.dot(q,p)
nn_input_dim = train_x.shape[1] ## if shape[0] it yields 1797, which is the number of rows
print nn_input_dim ##shows 64; shape[1] yields 1 row thus 64 columns! which are the layers of data we want to apply
nn_hdim0 = 10
nn_output_dim = len(train_y)
#nn_hdim0 = np.transpose(np.zeros(digits.data.shape))
#print nn_hdim0
epsilon = 0.008
batch_size = 100 ## how much data input per iteration
X = T.matrix('X')
y = T.lvector('y')
## set weight shapeswith random values
#W0 = np.transpose(np.zeros(digits.data.shape))
W0 = theano.shared(np.random.randn(nn_input_dim,nn_hdim0),name='W0') ##the shape of W0 should be row=input_dim, col=# hidden nodes
b0 = theano.shared(np.zeros(nn_hdim0),name='b0')
W1 = theano.shared(np.random.randn(nn_hdim0,nn_output_dim),name='W1') ## shape of W1 should have row=#hidden nodes, col = output dimension
b1 = theano.shared(np.zeros(nn_output_dim),name='b1')
z0 = X.dot(W0)+b0
a0 = T.nnet.softmax(z0) ## first hidden layer result
z1 = a0.dot(W1)+b1
a1 = T.nnet.softmax(z1) ## final result or prediction
loss = T.nnet.categorical_crossentropy(a1,y).mean() ## howmuch the prediction differrs from the real result
prediction = T.argmax(a1,axis=1) ## the maximum values of a1, presented in index posn 1
fwd_propagation = theano.function([X],a1) ## forward propagation function dpeneding on the array of X values and final prediction
calc_loss = theano.function([X,y],loss)
predict= theano.function([X],prediction)
accuracy = theano.function([X],T.sum(T.eq(prediction,train_y))) ## T.eq is elementwise. so this does an elementwise sum of prediction and train_y
dW0 = T.grad(loss,W0)
dW1 = T.grad(loss,W1)
db0=T.grad(loss,b0)
db1=T.grad(loss,b1)
np.random.randint()
gradient_step = theano.function(
[X,y], ##for each set of X,y values
updates=((W1,W1-epsilon*dW1), ##updates W1 by deltaW1(error)*learning rate and subtracting from original W1
(W0,W0-epsilon*dW0),
(b1,b1-epsilon*db1),
(b0,b0-epsilon*db0)))
def build(iterations = 80000):
W1.set_value(np.random.randn(nn_hdim0,nn_output_dim)/np.sqrt(nn_input_dim)) ## why dividing by the sqrt of nn_input_dim,i'm not sure, but they're meant to be random anyway.
W0.set_value(np.random.randn(nn_input_dim,nn_hdim0)/np.sqrt(nn_input_dim))
b1.set_value(np.zeros(nn_output_dim))
b0.set_value(np.zeros(nn_hdim0))
for i in range(0, iterations):
batch_indicies=np.random.randint(0,17,size=100)
batch_x,batch_y=train_x[batch_indicies],train_y[batch_indicies]
gradient_step(batch_x,batch_y)
##so we're providing the values now for the weights, biases and input output values
if i%2000==0:
print("loss after iteration %r: %r" % (i, calc_loss(train_x,train_y)))
print(accuracy(train_x))
if i==80000:
print (W0,b0,W1,b1)
build()
As per the documentation, you need to at-least specify the lowest value of integer to be drawn from the distribution. If you want a random number less than 213 (to be exact between 0 and 213) then you would do r = np.random.randint(213), and if you want a random number between some range let's say 213 and 537 then you would do, r = np.random.randint(213, 537). Also you are trying to get a random number from randint(..) without even storing it to any variable (or passing to any function), which is useless. I would suggest going through basic Theano tutorials to get started, start from here.