I'm trying to implement a deconvolution (transposed convolution) layer in Theano, but I don't think it's working correctly. Any suggestions?
from theano.tensor.nnet.abstract_conv import AbstractConv2d_gradInputs


class ConvTransposeLayer(object):
    """
    Deconv layer.
    Built from the following references:
    - http://deeplearning.net/software/theano_versions/dev/tutorial/conv_arithmetic.html
    - A guide to convolution arithmetic for deep learning (https://arxiv.org/pdf/1603.07285v1.pdf)
    - Lasagne deconv layer (https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/conv.py)

    Performs the backward pass of a 2D convolution (also called transposed
    convolution, fractionally-strided convolution or deconvolution in the
    literature) on its input and optionally applies an elementwise nonlinearity.
    """
    def __init__(self, input, output_shape, filters, filter_shape,
                 activation=None, border_mode='valid', stride=(2, 2), filter_flip=False):
        # track input
        self.input = input
        # --------------------
        # A deconv is implemented by swapping the forward and backward passes of a
        # convolution: we apply the gradient-with-respect-to-inputs op of the
        # corresponding forward convolution.
        deconv_op = AbstractConv2d_gradInputs(
            imshp=output_shape,
            kshp=filter_shape,
            border_mode=border_mode,
            subsample=stride,
            filter_flip=filter_flip
        )
        # The op takes (filters, input, spatial output size)
        output = deconv_op(filters, input, output_shape[2:])
        # --------------------
        # Run through the activation function, if any
        if activation is None:
            self.output = output
        else:
            self.output = activation(output)
It is used like:
conv_layer_3_encoder = Layers.ConvLayer(
    self.rng,
    input=droput_conv_2.output,
    input_filter_count=96,
    output_filter_count=192,
    filter_shape=(3, 3),
    stride=(2, 2),
    activation=activations.relu
)
droput_conv_3 = Layers.DropoutLayer(
    self.rng,
    input=conv_layer_3_encoder.output,
    p=0.2
)
# -----------------
# DECODING LAYER 3 + DROPOUT
deconv_layer_3_decoder = Layers.ConvTransposeLayer(
    input=droput_conv_3.output,
    output_shape=image_shape_layer_3,
    filters=conv_layer_3_encoder.filters,
    filter_shape=filter_shape_layer_3,
    activation=activations.relu
)
droput_deconv_3 = Layers.DropoutLayer(
    self.rng,
    input=deconv_layer_3_decoder.output,
    p=0.2
)
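One thing I'm checking is whether output_shape is consistent with the input size, stride and kernel size. For a 'valid' transposed convolution with stride s and kernel k (no padding), the conv-arithmetic guide gives an output size of o = s * (i - 1) + k. A minimal sketch of that sanity check (the helper name and example sizes are just illustrative, not from my actual code):

def expected_deconv_size(input_size, kernel_size, stride):
    # Output spatial size of a 'valid' (no padding) transposed convolution,
    # from "A guide to convolution arithmetic for deep learning":
    # o = s * (i - 1) + k
    return stride * (input_size - 1) + kernel_size

# e.g. a 16x16 feature map, 3x3 filters, stride 2 -> expected 33x33 output
print(expected_deconv_size(16, 3, 2))  # 33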
I'm new to neural networks and I'm building one that reads handwritten digits. The loss hovers around 2.2-2.3 and I'm not sure why it isn't converging. I tried adjusting the learning rate, but it doesn't seem to make a difference.
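For reference, a loss of about 2.3 is what CrossEntropyLoss gives for essentially uniform random predictions over 10 classes (-ln(1/10) ≈ 2.303), so it looks like the model isn't learning at all. A quick check of that baseline:

import math
import torch
import torch.nn.functional as F

# Cross-entropy of uniform (all-zero) logits over 10 classes equals ln(10) ~= 2.3026,
# which matches the plateau I'm seeing.
uniform_logits = torch.zeros(1, 10)
print(F.cross_entropy(uniform_logits, torch.tensor([3])).item())  # ~2.3026
print(math.log(10))                                               # 2.302585...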
import torch
import torch.nn as nn

# TODO: Define function to create our own neural network
# Parameters
input_size = 784   # Hint: image size is 28x28, and we want to flatten the image
num_classes = 10   # Hint: our labels are the digits 0-9
num_epochs = 5     # Number of times we loop through the entire training dataset, can be pretty arbitrary

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class NN(nn.Module):
    ############ YOUR CODE STARTS HERE ############
    # 1. Initialize our own NN model
    def __init__(self, input_size, num_classes):
        super(NN, self).__init__()
        self.flatten = nn.Flatten()
        # Use ReLU activation function
        self.relu = nn.ReLU()
        # Input layer
        self.input_layer = nn.Linear(input_size, 13)
        # Hidden layers: use at least 1 hidden layer!
        self.hidden1 = nn.Linear(13, 6)
        # Output layer
        self.output_layer = nn.Linear(6, num_classes)

    # 2. Define method for forwarding input data
    def forward(self, sample):
        sample = self.flatten(sample)
        out = self.input_layer(sample)
        out = self.relu(out)
        out = self.hidden1(out)
        out = self.relu(out)          # activation function
        out = self.output_layer(out)  # forward to output layer
        return out


nn_model = NN(input_size, num_classes).to(device)
print("My NN Model: ", nn_model)
Loss function and optimizer:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=.0001)
Training:
total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Reshape our images from 2D (28x28) to 1D (784)
        images = images.view(-1, 28*28).to(device)
        labels = labels.to(device)

        # Forward pass and loss calculation
        output = nn_model(images)
        loss = loss_function(output, labels)

        # Backward pass: compute gradients, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print out training progress
        if (i+1) % 100 == 0:
            print(f'epoch {epoch+1} / {num_epochs}, step {i+1}/{total_steps}, loss = {loss.item():.4f}')
Any help is appreciated!
I am using Sklearn to train a multilayer perceptron regressor (MLPRegressor) on 12 features and one output. The StandardScaler() is fit to the training data and applied to all input data. After a training period with architectural optimization, I get a model that is seemingly quite accurate (<10% error). I now need to extract the weights and biases in order to implement the prediction in real time on a system that interacts with a person. This is done with my_model.coefs_ for the weights and my_model.intercepts_ for the biases. The weights are appropriately shaped for the number of nodes in my model and the biases have the appropriate length for each layer.
The problem is that when I implement the matrix algebra in MATLAB, I get wildly different predictions from what my_model.predict() yields.
My reconstruction process for a 2-hidden-layer MLP (with 11 nodes in the first hidden layer and 10 nodes in the second):
scale()   % elementwise subtract the feature mean and divide by the feature stdev
scaled_obs = scale(raw_obs)
% Up to this point the results from MATLAB == Sklearn
w1 = [12x11]   % weights from the input layer to the first hidden layer
w2 = [11x10]
w3 = [10x1]
b1 = [11x1]    % bias added to the first hidden layer after w1 has been applied
b2 = [10x1]
b3 = [1x1]
my_prediction = ((( scaled_obs * w1 + b1') * w2 + b2') * w3 + b3);
I also tried
my_prediction2 = ((( scaled_obs * w1 .* b1') * w2 .* b2') * w3 .* b3);   % because nothing worked...
for my specific data:
Sklearn prediction = 1.731
my_prediction = -50.347
my_prediction2 = -3.2075
Is there another weight/bias that I am skipping when extracting relevant params from my_model? Is my order of operations in the reconstruction flawed?
In my opinion my_prediction = ((( scaled_obs * w1 + b1') * w2 + b2') * w3 + b3); is correct; the only missing part is the activation function. What activation function did you pass to the model? By default MLPRegressor uses relu as the activation for every hidden layer, while the output layer uses the identity activation, basically f(x) = x, so you don't have to do anything for that last step.
If you selected relu, or if you didn't select an activation at all (relu is the default), then you have to apply it after each hidden layer, e.g. in numpy as np.maximum(0, your_layer1_calculation); I am not sure how this is done in MATLAB.
So the final formula would be:
layer1 = np.dot(scaled_inputs, w1) + b1
layer2 = np.dot(np.maximum(0, layer1), w2) + b2
...
layer_n = np.dot(np.maximum(0, layer_(n-1)), w_n) + b_n
output = layer_n   # identity activation on the output layer
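Putting that together for the 12 -> 11 -> 10 -> 1 network described above, a minimal numpy sketch (assuming relu hidden activations, the sklearn default; mlp_predict is just an illustrative helper name):

import numpy as np

def mlp_predict(scaled_obs, coefs, intercepts):
    # Reproduce MLPRegressor.predict() for relu hidden layers + identity output
    a = scaled_obs                                   # shape (n_samples, 12), already scaled
    for w, b in zip(coefs[:-1], intercepts[:-1]):    # hidden layers: 12->11 and 11->10
        a = np.maximum(0, np.dot(a, w) + b)          # affine transform followed by relu
    return np.dot(a, coefs[-1]) + intercepts[-1]     # output layer: affine only (identity)

# usage, assuming my_model is the trained MLPRegressor and scaler the fitted StandardScaler:
# scaled_obs = scaler.transform(raw_obs.reshape(1, -1))
# print(mlp_predict(scaled_obs, my_model.coefs_, my_model.intercepts_))
# print(my_model.predict(scaled_obs))   # should match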
Let's evaluate usage of this line in the block of code given below.
L1_delta = L1_error * nonlin(L1,True) # line 36
import numpy as np  # line 1

# sigmoid function
def nonlin(x, deriv=False):
    if deriv == True:
        return x*(1-x)
    return 1/(1+np.exp(-x))

# input dataset
X = np.array([ [0,0,1],
               [0,1,1],
               [1,0,1],
               [1,1,1] ])

# output dataset
y = np.array([[0,0,1,1]]).T

# seed random numbers to make calculation
# deterministic (just a good practice)
np.random.seed(1)

# initialize weights randomly with mean 0
syn0 = 2*np.random.random((3,1)) - 1

for iter in range(1000):
    # forward propagation
    L0 = X
    L1 = nonlin(np.dot(L0, syn0))

    # how much did we miss?
    L1_error = y - L1

    # multiply how much we missed by the
    # slope of the sigmoid at the values in L1
    L1_delta = L1_error * nonlin(L1, True)  # line 36

    # update weights
    syn0 += np.dot(L0.T, L1_delta)

print("Output After Training:")
print(L1)
I wanted to know: is this line required? Why do we need the factor of the derivative of the sigmoid?
I have seen many similar logistic regression examples where the derivative of the sigmoid is not used.
For example:
https://github.com/chayankathuria/LogReg01/blob/master/GradientDescent.py
Yes, the line is indeed required. You need the derivative of the activation function (in this case the sigmoid) because the final output depends on the weights only indirectly, through the activations.
That's why you need to apply the chain rule, which is where the derivative of the sigmoid appears.
I recommend you take a look at this post regarding backpropagation: https://datascience.stackexchange.com/questions/28719/a-good-reference-for-the-back-propagation-algorithm
It explains the mathematics behind backpropagation quite well.
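To make that concrete: since L1 = sigmoid(np.dot(L0, syn0)), the chain rule needs sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z)), which is exactly what nonlin(L1, True) = L1*(1-L1) computes. A quick numerical check of that identity (the finite-difference step size is just an illustrative choice):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

z = np.array([-2.0, -0.5, 0.0, 1.5])
analytic = sigmoid(z) * (1 - sigmoid(z))                   # what nonlin(L1, True) computes
eps = 1e-6
numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2*eps)  # central finite difference
print(np.allclose(analytic, numeric))                      # True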
What impact does the fact that the ReLU activation function is not differentiable everywhere have?
How to implement the ReLU function in Numpy implements relu as the elementwise maximum of 0 and the matrix/vector elements.
Does this mean that for gradient descent we do not take the derivative of the relu function?
Update:
From Neural network backpropagation with RELU, this text aids in understanding:
The ReLU function is defined as: for x > 0 the output is x, i.e. f(x) = max(0,x)
So for the derivative f'(x) it's actually:
if x < 0, output is 0. if x > 0, output is 1.
The derivative f'(0) is not defined. So it's usually set to 0, or you modify the activation function to be f(x) = max(e,x) for a small e.
Generally: A ReLU is a unit that uses the rectifier activation function. That means it works exactly like any other hidden layer, except that instead of tanh(x), sigmoid(x) or whatever activation you use, you'll use f(x) = max(0,x).
If you have written code for a working multilayer network with sigmoid activation, it's literally 1 line of change. Nothing about forward- or back-propagation changes algorithmically. If you haven't got the simpler model working yet, go back and start with that first. Otherwise your question isn't really about ReLUs but about implementing a NN as a whole.
But this still leaves some confusion: backpropagation typically uses the derivative of the activation function when computing the gradient of the cost, so how does this affect the cost function's gradient for relu?
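In other words, I assume the "derivative" used in backprop is just an indicator function, something like this sketch of my understanding (the helper names are illustrative):

import numpy as np

def relu(x):
    return np.maximum(0, x)

def relu_grad(x):
    # 1 where x > 0, 0 where x < 0 (and, as written, 0 at exactly x == 0)
    return (x > 0).astype(x.dtype)

x = np.array([-1.5, 0.0, 2.0])
print(relu(x))       # [0.  0.  2.]
print(relu_grad(x))  # [0.  0.  1.]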
The standard answer is that the input to ReLU is rarely exactly zero, see here for example, so it doesn't make any significant difference.
Specifically, for ReLU to get a zero input, the dot product of one entire row of the input to a layer with one entire column of the layer's weight matrix would have to be exactly zero. Even if you have an all-zero input sample, there should still be a bias term in the last position, so I don't really see this ever happening.
However, if you want to test for yourself, try implementing the derivative at zero as 0, 0.5, and 1 and see if anything changes.
The PyTorch docs give a simple neural network with numpy example with one hidden layer and relu activation. I have reproduced it below with a fixed random seed and three options for setting the behavior of the ReLU gradient at 0. I have also added a bias term.
import numpy as np

np.random.seed(1)

# N = batch size, D_in = input dim, H = hidden dim, D_out = output dim
N, D_in, H, D_out = 4, 2, 30, 1

# Create random input and output data, and append a bias column of ones to x
x = np.random.randn(N, D_in)
x = np.c_[x, np.ones(x.shape[0])]
y = np.random.randn(N, D_out)

# Randomly initialize weights (the +1 accounts for the bias column)
w1 = np.random.randn(D_in + 1, H)
w2 = np.random.randn(H, D_out)

learning_rate = 0.002
loss_col = []
for t in range(200):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)  # using ReLU as activation function
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()  # sum-of-squares loss
    loss_col.append(loss)
    print(t, loss, y_pred)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)             # the last layer's error
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)          # the hidden layer's error
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0                            # grad at zero = 1
    # grad_h[h <= 0] = 0                         # grad at zero = 0
    # grad_h[h < 0] = 0; grad_h[h == 0] *= 0.5   # grad at zero = 0.5
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
I want to convert a pre-trained CNN (like VGG-16) to a fully convolutional network in Pytorch. How can I do so?
You can do that as follows (see comments for description):
import torch
import torch.nn as nn
from torchvision import models

# 1. LOAD PRE-TRAINED VGG16
model = models.vgg16(pretrained=True)

# 2. GET CONV LAYERS
features = model.features

# 3. GET FULLY CONNECTED LAYERS
fcLayers = nn.Sequential(
    # stop at the last layer
    *list(model.classifier.children())[:-1]
)

# 4. CONVERT FULLY CONNECTED LAYERS TO CONVOLUTIONAL LAYERS
### convert the first fc layer to a conv layer with a 512x7x7 kernel
fc = fcLayers[0].state_dict()
in_ch = 512
out_ch = fc["weight"].size(0)

firstConv = nn.Conv2d(in_ch, out_ch, kernel_size=7)

### copy the weights from the fc layer, reshaped into conv filters
firstConv.load_state_dict({"weight": fc["weight"].view(out_ch, in_ch, 7, 7),
                           "bias": fc["bias"]})

# CREATE A LIST OF CONVS
convList = [firstConv]

# Similarly convert the remaining linear layers to 1x1 conv layers
for module in fcLayers[1:]:
    if isinstance(module, nn.Linear):
        # Convert the nn.Linear to nn.Conv2d
        fc = module.state_dict()
        in_ch = fc["weight"].size(1)
        out_ch = fc["weight"].size(0)
        conv = nn.Conv2d(in_ch, out_ch, kernel_size=1)
        conv.load_state_dict({"weight": fc["weight"].view(out_ch, in_ch, 1, 1),
                              "bias": fc["bias"]})
        convList += [conv]
    else:
        # Append other layers such as ReLU and Dropout unchanged
        convList += [module]

# Set the conv layers as an nn.Sequential module
convLayers = nn.Sequential(*convList)
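As a quick usage check, building on the variables defined above (a sketch; fullConvNet is just an illustrative name, and the printed shapes assume the standard VGG16 feature extractor with the converted 7x7 conv at stride 1): the converted classifier can be chained after the conv features and applied to images larger than 224x224 to get a spatial grid of outputs instead of a single vector.

# Chain the original conv features with the converted classifier
fullConvNet = nn.Sequential(features, convLayers)
fullConvNet.eval()

with torch.no_grad():
    # A 224x224 input yields a 1x1 spatial map of the 4096-dim fc features
    out_small = fullConvNet(torch.randn(1, 3, 224, 224))
    # A larger input yields a spatial grid of outputs
    out_large = fullConvNet(torch.randn(1, 3, 448, 448))

print(out_small.shape)  # e.g. torch.Size([1, 4096, 1, 1])
print(out_large.shape)  # e.g. torch.Size([1, 4096, 8, 8])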