Convolutional Neural Networks in MATLAB: adding one more layer

What I'm trying to do is verify that adding one more layer to the CNN increases accuracy.
The code is below; it comes from https://github.com/lhoang29/DigitRecognition/blob/master/cnnload.m
I'm at the beginner stage with CNNs and am trying to add one more layer, consisting of a convolution and a pooling stage. I tried several ways but it doesn't seem to work. Could someone show me how to add one more layer?
Thank you.
Code for main function:
clear all; close all; clc;
maxtrain = 10000;
iter = 10;
eta = 0.01;
%% Data Load
trlblid = fopen('train-labels-idx1-ubyte');
trimgid = fopen('train-images-idx3-ubyte');
tslblid = fopen('t10k-labels-idx1-ubyte');
tsimgid = fopen('t10k-images-idx3-ubyte');
% read train labels
fread(trlblid, 4);
numtrlbls = toint(fread(trlblid, 4));
trainlabels = fread(trlblid, numtrlbls);
% read train data
fread(trimgid, 4);
numtrimg = toint(fread(trimgid, 4));
trimgh = toint(fread(trimgid, 4));
trimgw = toint(fread(trimgid, 4));
trainimages = permute(reshape(fread(trimgid,trimgh*trimgw*numtrimg),trimgh,trimgw,numtrimg), [2 1 3]);
% read test labels
fread(tslblid, 4);
numtslbls = toint(fread(tslblid, 4));
testlabels = fread(tslblid, numtslbls);
% read test data
fread(tsimgid, 4);
numtsimg = toint(fread(tsimgid, 4));
tsimgh = toint(fread(tsimgid, 4));
tsimgw = toint(fread(tsimgid, 4));
testimages = permute(reshape(fread(tsimgid, tsimgh*tsimgw*numtsimg),tsimgh,tsimgw,numtsimg), [2 1 3]);
%% CNN Training
[missimages, misslabels] = cnntrain(trainlabels,trainimages,testlabels,testimages,maxtrain,iter,eta);
%% CNN Testing
showmiss(missimages,misslabels,testimages,testlabels,25,2);
Code for training:
function [missimages, misslabels] = cnntrain(trainlabels,trainimages,testlabels,testimages,maxtrain,iter,eta)
fn = 5; % number of kernels for layer 1
ks = 5; % size of kernel
[h,w,n] = size(trainimages);
n = min(n,maxtrain);
% normalize data to [-1,1] range
nitrain = (trainimages / 255) * 2 - 1;
nitest = (testimages / 255) * 2 - 1;
% train with backprop
h1 = h-ks+1;
w1 = w-ks+1;
A1 = zeros(h1,w1,fn);
h2 = h1/2;
w2 = w1/2;
I2 = zeros(h2,w2,fn);
A2 = zeros(h2,w2,fn);
A3 = zeros(10,1);
% kernels for layer 1
W1 = randn(ks,ks,fn) * .01;
B1 = ones(1,fn);
% scale parameter and bias for layer 2
S2 = randn(1,fn) * .01;
B2 = ones(1,fn);
% weights and bias parameters for fully-connected output layer
W3 = randn(h2,w2,fn,10) * .01;
B3 = ones(10,1);
% true outputs
Y = eye(10)*2-1;
for it=1:iter
err = 0;
for im=1:n
%------------ FORWARD PROP ------------%
% Layer 1: convolution with bias followed by sigmoidal squashing
for fm=1:fn
A1(:,:,fm) = convn(nitrain(:,:,im),W1(end:-1:1,end:-1:1,fm),'valid') + B1(fm);
end
Z1 = tanh(A1);
% Layer 2: average/subsample with scaling and bias
for fm=1:fn
I2(:,:,fm) = avgpool(Z1(:,:,fm));
A2(:,:,fm) = I2(:,:,fm) * S2(fm) + B2(fm);
end
Z2 = tanh(A2);
% Layer 3: fully connected
for cl=1:10
A3(cl) = convn(Z2,W3(end:-1:1,end:-1:1,end:-1:1,cl),'valid') + B3(cl);
end
Z3 = tanh(A3); % Final output
err = err + .5 * norm(Z3 - Y(:,trainlabels(im)+1),2)^2;
%------------ BACK PROP ------------%
% Compute error at output layer
Del3 = (1 - Z3.^2) .* (Z3 - Y(:,trainlabels(im)+1));
% Compute error at layer 2
Del2 = zeros(size(Z2));
for cl=1:10
Del2 = Del2 + Del3(cl) * W3(:,:,:,cl);
end
Del2 = Del2 .* (1 - Z2.^2);
% Compute error at layer 1
Del1 = zeros(size(Z1));
for fm=1:fn
Del1(:,:,fm) = (S2(fm)/4)*(1 - Z1(:,:,fm).^2);
for ih=1:h1
for iw=1:w1
Del1(ih,iw,fm) = Del1(ih,iw,fm) * Del2(floor((ih+1)/2),floor((iw+1)/2),fm);
end
end
end
% Update bias at layer 3
DB3 = Del3; % gradient w.r.t bias
B3 = B3 - eta*DB3;
% Update weights at layer 3
for cl=1:10
DW3 = DB3(cl) * Z2; % gradient w.r.t weights
W3(:,:,:,cl) = W3(:,:,:,cl) - eta * DW3;
end
% Update scale and bias parameters at layer 2
for fm=1:fn
DS2 = convn(Del2(:,:,fm),I2(end:-1:1,end:-1:1,fm),'valid');
S2(fm) = S2(fm) - eta * DS2;
DB2 = sum(sum(Del2(:,:,fm)));
B2(fm) = B2(fm) - eta * DB2;
end
% Update kernel weights and bias parameters at layer 1
for fm=1:fn
DW1 = convn(nitrain(:,:,im),Del1(end:-1:1,end:-1:1,fm),'valid');
W1(:,:,fm) = W1(:,:,fm) - eta * DW1;
DB1 = sum(sum(Del1(:,:,fm)));
B1(fm) = B1(fm) - eta * DB1;
end
end
disp(['Error: ' num2str(err) ' at iteration ' num2str(it)]);
end
miss = 0;
numtest=size(testimages,3);
missimages = zeros(1,numtest);
misslabels = zeros(1,numtest);
for im=1:numtest
for fm=1:fn
A1(:,:,fm) = convn(nitest(:,:,im),W1(end:-1:1,end:-1:1,fm),'valid') + B1(fm);
end
Z1 = tanh(A1);
% Layer 2: average/subsample with scaling and bias
for fm=1:fn
I2(:,:,fm) = avgpool(Z1(:,:,fm));
A2(:,:,fm) = I2(:,:,fm) * S2(fm) + B2(fm);
end
Z2 = tanh(A2);
% Layer 3: fully connected
for cl=1:10
A3(cl) = convn(Z2,W3(end:-1:1,end:-1:1,end:-1:1,cl),'valid') + B3(cl);
end
Z3 = tanh(A3); % Final output
[pm,pl] = max(Z3);
if pl ~= testlabels(im)+1
miss = miss + 1;
missimages(miss) = im;
misslabels(miss) = pl - 1;
end
end
disp(['Miss: ' num2str(miss) ' out of ' num2str(numtest)]);
end
function [pr] = avgpool(img)
pr = zeros(size(img)/2);
for r=1:2:size(img,1)
for c=1:2:size(img,2)
pr((r+1)/2,(c+1)/2) = (img(r,c)+img(r+1,c)+img(r,c+1)+img(r+1,c+1))/4;
end
end
end
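Incidentally, the same 2x2 average pooling can be sketched without the explicit loops (a loop-free equivalent, assuming the image height and width are even, as they are here):
% sketch: loop-free 2x2 average pooling equivalent to avgpool above
tmp = conv2(img, ones(2)/4, 'valid');  % each entry = mean of a 2x2 block
pr  = tmp(1:2:end, 1:2:end);           % keep only the non-overlapping blocks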
Code for showing the misclassified test images:
function [] = showmiss(missim,misslab,testimages,testlabels,numshow,numpages)
nummiss = nnz(missim);
page = 1;
showsize = floor(sqrt(numshow));
for f=1:numshow:nummiss
figure(floor(f/numshow) + 1);
for m=f:min(nummiss,f+numshow-1)
subplot(showsize,showsize,m-f+1);
imshow(testimages(:,:,missim(m)));
title(strcat(num2str(testlabels(missim(m))), ':', num2str(misslab(m))));
end
page = page + 1;
if page > numpages
break;
end
end
end
Code for the toint function:
function [x] = toint(b)
x = b(1)*16777216 + b(2)*65536 + b(3)*256 + b(4);
end
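For reference, here is a minimal, untested sketch of what the forward pass of an extra convolution + pooling stage could look like, reusing the names from cnntrain above; fn2, ks2, W1b, B1b, A1b and Z1b are assumed new names, and the backward pass (deltas for the new kernels, Z2 and Z1) as well as the size of W3 would also have to be updated, which this sketch does not show:
% sketch: forward pass of an added convolution stage on top of Z2 (12x12x5 here)
fn2 = 10;  % number of feature maps in the new stage (arbitrary choice)
ks2 = 5;   % kernel size of the new stage (arbitrary choice)
W1b = randn(ks2,ks2,fn,fn2) * .01;  % one ks2-by-ks2 kernel per (input map, output map) pair
B1b = ones(1,fn2);
h1b = h2 - ks2 + 1;  w1b = w2 - ks2 + 1;  % 12-5+1 = 8 for MNIST
A1b = zeros(h1b,w1b,fn2);
for fm2 = 1:fn2
    acc = zeros(h1b,w1b);
    for fm = 1:fn  % each new map sums cross-correlations over all previous maps
        acc = acc + conv2(Z2(:,:,fm), rot90(W1b(:,:,fm,fm2),2), 'valid');
    end
    A1b(:,:,fm2) = acc + B1b(fm2);
end
Z1b = tanh(A1b);
% followed by the same avgpool / scale / bias / tanh stage as layer 2, and with
% the output layer resized, e.g. W3 = randn(h1b/2, w1b/2, fn2, 10) * .01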

Related

Neural Network Backpropagation Algorithm Implementation

I implemented a neural network backpropagation algorithm in MATLAB, however it is not training correctly. The training data is a matrix X = [x1, x2], dimension 2 x 200, and I have a target matrix T = [target1, target2], dimension 2 x 200. The first 100 columns in T can be [1; -1] for class 1, and the second 100 columns in T can be [-1; 1] for class 2.
theta = 0.1; % criterion to stop
eta = 0.1; % step size
Nh = 10; % number of hidden nodes
For some reason the total training error is always 1.000; it never gets close to theta, so it runs forever.
I used the standard backpropagation update formulas; the total training error is the mean squared error of the network outputs, averaged over all 200 training samples (computed with immse in the code below).
The code is well documented below. I would appreciate any help.
clear;
close all;
clc;
%%('---------------------')
%%('Generating dummy data')
%%('---------------------')
d11 = [2;2]*ones(1,70)+2.*randn(2,70);
d12 = [-2;-2]*ones(1,30)+randn(2,30);
d1 = [d11,d12];
d21 = [3;-3]*ones(1,50)+randn([2,50]);
d22 = [-3;3]*ones(1,50)+randn([2,50]);
d2 = [d21,d22];
hw5_1 = d1;
hw5_2 = d2;
save hw5.mat hw5_1 hw5_2
x1 = hw5_1;
x2 = hw5_2;
% step 1: Construct training data matrix X=[x1,x2], dimension 2x200
training_data = [x1, x2];
% step 2: Construct target matrix T=[target1, target2], dimension 2x200
target1 = repmat([1; -1], 1, 100); % class 1
target2 = repmat([-1; 1], 1, 100); % class 2
T = [target1, target2];
% step 3: normalize training data
training_data = training_data - mean(training_data(:));
training_data = training_data / std(training_data(:));
% step 4: specify parameters
theta = 0.1; % criterion to stop
eta = 0.1; % step size
Nh = 10; % number of hidden nodes, actual hidden nodes should be 11 (including a bias)
Ni = 2; % dimension of input vector = number of input nodes, actual input nodes should be 3 (including a bias)
No = 2; % number of class = number of out nodes
% step 5: Initialize the weights
a = -1/sqrt(No);
b = +1/sqrt(No);
inputLayerToHiddenLayerWeight = (b-a).*rand(Ni, Nh) + a
hiddenLayerToOutputLayerWeight = (b-a).*rand(Nh, No) + a
J = inf;
p = 1;
% activation function
% f(net) = a*tanh(b*net),
% f'(net) = a*b*sech2(b*net)
a = 1.716;
b = 2/3;
while J > theta
% step 6: randomly choose one training sample vector from X,
% together with its target vector
k = randi([1, size(training_data, 2)]);
input_X = training_data(:,k);
input_T = T(:,k);
% step 7: Calculate net_j values for hidden nodes in layer 1
% hidden layer output before activation function applied
netj = inputLayerToHiddenLayerWeight' * input_X;
% step 8: Calculate hidden node output Y using activation function
% apply activation function to hidden layer neurons
Y = a*tanh(b*netj);
% step 9: Calculate net_k values for output nodes in layer 2
% output layer output before activation function applied
netk = hiddenLayerToOutputLayerWeight' * Y;
% step 10: Calculate output node output Z using the activation function
% apply activation function to the output layer neurons
Z = a*tanh(b*netk);
% step 11: Calculate sensitivity delta_k = (target - Z) * f'(Z)
% find the error between the expected_output and the neuron output
% we got using the weights
% delta_k = (expected - output) * activation(output)
delta_k = [];
for i=1:size(Z)
yi = Z(i,:);
expected_output = input_T(i,:);
delta_k = [delta_k; (expected_output - yi) ...
* a*b*(sech(b*yi)).^2];
end
% step 12: Calculate sensitivity
% delta_j = Sum_k(delta_k * hidden-to-out weights) * f'(net_j)
% error = (weight_k * error_j) * activation(output)
delta_j = [];
for j=1:size(Y)
yi = Y(j,:);
error = 0;
for k=1:size(delta_k)
error = error + delta_k(k,:)*hiddenLayerToOutputLayerWeight(j, k);
end
delta_j = [delta_j; error * (a*b*(sech(b*yi)).^2)];
end
% step 13: update weights
%2x10
inputLayerToHiddenLayerWeight = [];
for i=1:size(input_X)
xi = input_X(i,:);
wji = [];
for j=1:size(delta_j)
wji = [wji, eta * xi * delta_j(j,:)];
end
inputLayerToHiddenLayerWeight = [inputLayerToHiddenLayerWeight; wji];
end
inputLayerToHiddenLayerWeight
%10x2
hiddenLayerToOutputLayerWeight = [];
for j=1:size(Y)
yi = Y(j,:);
wjk = [];
for k=1:size(delta_k)
wjk = [wjk, eta * delta_k(k,:) * yi];
end
hiddenLayerToOutputLayerWeight = [hiddenLayerToOutputLayerWeight; wjk];
end
hiddenLayerToOutputLayerWeight
% Mean Square Error
J = 0;
for j=1:size(training_data, 2)
X = training_data(:,j);
t = T(:,j);
netj = inputLayerToHiddenLayerWeight' * X;
Y = a*tanh(b*netj);
netk = hiddenLayerToOutputLayerWeight' * Y;
Z = a*tanh(b*netk);
J = J + immse(t, Z);
end
J = J/size(training_data, 2)
p = p + 1;
if p == 4
break;
end
end
% testing neural network using the inputs
test_data = [[2; -2], [-3; -3], [-2; 5], [3; -4]];
for i=1:size(test_data, 2)
end
Weight decay isn't essential for neural network training.
What I did notice was that your feature normalization wasn't correct.
The usual formula for scaling data to the range of 0 to 1 is
(x - min) / (max - min)
Note: you apply this to every element within the array (or vector). Data inputs for a NN need to be within the range of [0,1]. (Technically they can be a little bit outside of that, roughly [-3,3], but values further from 0 make training difficult.)
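For example, a minimal sketch of that scaling applied per feature to the 2 x 200 training matrix from the question (bsxfun is used so it also works on older MATLAB versions):
% sketch: per-feature min-max scaling of training_data (2 x 200) into [0,1]
Xmin = min(training_data, [], 2);
Xmax = max(training_data, [], 2);
training_data01 = bsxfun(@rdivide, bsxfun(@minus, training_data, Xmin), Xmax - Xmin);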
Edit:
I am unaware of this activation function
a = 1.716;
b = 2/3;
% f(net) = a*tanh(b*net),
% f'(net) = a*b*sech2(b*net)
It seems like a variation on tanh.
Could you elaborate on what it is?
If your net still doesn't work, give me an update and I'll look at your code more closely.

Output of k3_1 is capped at -3.1445e+24

I'm solving a system of ODEs using RK4. The plot comes out as a straight line, which seems to be because k3_1 is capped at -3.1445e+24. I don't understand why it is capped.
function RK4system_MNModel()
parsec = 3.08*10^18;
r_1 = 8.5*1000.0*parsec; % in cm
z_1 = 0.0; % in cm also
theta_1 = 0.0;
grav = 6.6720*10^-8;
amsun = 1.989*10^33; % in grams
amg = 1.5e11*amsun; % in grams
gm = grav*amg; % constant
q = 0.9; % axial ratio
u_1 = 130.0; % in cm/sec
w_1 = 95*10^4.0; % in cm/sec
v = 180*10^4.0; % in cm/sec
vcirc = sqrt(gm/r_1); % circular speed (constant)
nsteps = 50000;
deltat = 5.0*10^11; % in seconds
angmom = r_1*v; % these are the same
angmom2 = angmom^2.0;
e = -gm/r_1+u_1*u_1/2.0+angmom2/(2.0*r_1*r_1);
time=0.0;
for i=1:nsteps
k3_1 = deltat*u_1 %%%%% THIS LINE
k4_1 = deltat*(-gm*r_1/((r_1^2.0+(1+sqrt(1+z_1^2.0))^2.0)^1.5) + angmom2/(r_1^3.0)); % u'=-dphi_dr+lz^2/(r^3.0) with lz=vi*ri this gives deltau
k5_1 = deltat*(angmom/(r_1^2.0)); % theta'=lz/r^2 this gives deltatheta
k6_1 = deltat*w_1;
k7_1 = deltat*(-gm*z_1*(1+sqrt(1+z_1^2.0))/(sqrt(1+z_1^2.0)*(r_1^2.0+(1+sqrt(1+z_1^2.0))^2.0)^1.5));
r_2 = r_1+k3_1/2.0;
u_2 = u_1+k4_1/2.0;
theta_2 = theta_1+k5_1/2.0;
z_2 = z_1 + k6_1/2.0;
w_2 = w_1 + k7_1/2.0;
k3_2 = deltat*u_2;
k4_2 = deltat*(-gm*r_2/((r_2^2.0+(1+sqrt(1+z_2^2.0))^2.0)^1.5)+angmom2/(r_2^3.0));
k5_2 = deltat*(angmom/(r_2^2.0)); % theta'=lz/r^2 =====> deltatheta
k6_2 = deltat*w_2;
k7_2 = deltat*(-gm*z_2*(1+sqrt(1+z_2^2.0))/(sqrt(1+z_2^2.0)*(r_2^2.0+(1+sqrt(1+z_2^2.0))^2.0)^1.5));
r_3 = r_1+k3_2/2.0;
u_3 = u_1+k4_2/2.0;
theta_3 = theta_1+k5_2/2.0;
z_3 = z_1 + k6_2/2.0;
w_3 = w_1 + k7_2/2.0;
k3_3 = deltat*u_3; % r'=u
k4_3 = deltat*(-gm*r_3/((r_3^2.0+(1+sqrt(1+z_3^2.0))^2.0)^1.5)+angmom2/(r_3^3.0));% u'=-dphi_dr+lz^2/(r^3.0)
k5_3 = deltat*(angmom/(r_3^2.0)); % theta'=lz/r^2
k6_3 = deltat*w_3;
k7_3 = deltat*(-gm*z_3*(1+sqrt(1+z_3^2.0))/(sqrt(1+z_3^2.0)*(r_3^2.0+(1+sqrt(1+z_3^2.0))^2.0)^1.5));
r_4 = r_1+k3_2;
u_4 = u_1+k4_2;
theta_4 = theta_1+k5_2;
z_4 = z_1 + k6_2;
w_4 = w_1 + k7_2;
k3_4 = deltat*u_4; % r'=u
k4_4 = deltat*(-gm*r_4/((r_4^2.0+(1+sqrt(1+z_4^2.0))^2.0)^1.5)+angmom2/(r_4^3.0)); % u'=-dphi_dr+lz^2/(r^3.0)
k5_4 = deltat*(angmom/(r_4^2.0)); % theta'=lz/r^2
k6_4 = deltat*w_4;
k7_4 = deltat*(-gm*z_4*(1+sqrt(1+z_4^2.0))/(sqrt(1+z_4^2.0)*(r_4^2.0+(1+sqrt(1+z_4^2.0))^2.0)^1.5));
r_1 = r_1+(k3_1+2.0*k3_2+2.0*k3_3+k3_4)/6.0; % New value of R for next step
u_1 = u_1+(k4_1+2.0*k4_2+2.0*k4_3+k4_4)/6.0; % New value of U for next step
theta_1 = theta_1+(k5_1+2.0*k5_2+2.0*k5_3+k5_4)/6.0; % New value of theta
z_1 = z_1+(k6_1+2.0*k6_2+2.0*k6_3+k6_4)/6.0;
w_1 = w_1+(k7_1+2.0*k7_2+2.0*k7_3+k7_4)/6.0;
e = -gm/r_1+u_1*u_1/2.0+angmom2/(2.0*r_1*r_1); % energy
ecc = (1.0+(2.0*e*angmom2)/(gm^2.0))^0.5; % eccentricity
x(i) = r_1*cos(theta_1)/(1000.0*parsec); % X for plotting orbit
y(i) = r_1*sin(theta_1)/(1000.0*parsec); % Y for plotting orbit
time = time+deltat;
r(i) = r_1;
z(i) = z_1;
time1(i)= time;
end
Note that the anomaly occurs on the indicated line.
It's not k3_1 that's capped; it's the calculation of u_1 that returns a value of -3.1445e+24 / deltat (deltat is constant).
u_1 is calculated in the line:
u_1 = u_1+(k4_1+2.0*k4_2+2.0*k4_3+k4_4)/6.0;
After the first iteration, this returns:
u_1(1) = 6.500e+13 % Hard coded before the loop
u_1(2) = -1.432966614767040e+04 % Calculated using the equation above
u_1(3) = -2.878934017859105e+04 % Calculated using the equation above
u_1(4) = -4.324903004768405e+04
Based on the equation u_1(n+1) = u_1(n) + du, it looks like du should represent a relatively small difference. The difference between the first two values is very large, so I'm assuming it is this calculation that's incorrect.
If you find that that calculation is correct, then your error is in one of these lines:
k4_1 = deltat*(-gm*r_1/((r_1^2.0+(1+sqrt(1+z_1^2.0))^2.0)^1.5)+angmom2/(r_1^3.0)); % u'=-dphi_dr+lz^2/(r^3.0) with lz=vi*ri this gives delta
k4_2 = deltat*(-gm*r_2/((r_2^2.0+(1+sqrt(1+z_2^2.0))^2.0)^1.5)+angmom2/(r_2^3.0));
k4_3 = deltat*(-gm*r_3/((r_3^2.0+(1+sqrt(1+z_3^2.0))^2.0)^1.5)+angmom2/(r_3^3.0));% u'=-dphi_dr+lz^2/(r^3.0)
k4_4 = deltat*(-gm*r_4/((r_4^2.0+(1+sqrt(1+z_4^2.0))^2.0)^1.5)+angmom2/(r_4^3.0)); % u'=-dphi_dr+lz^2/(r^3.0)
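One quick way to narrow it down is to print the two competing terms of k4_1 for the very first step, before entering the loop; this is only a debugging sketch using the variables already defined above, not part of the original code:
% sketch: compare the gravitational and angular-momentum terms of k4_1 at t = 0
gravTerm = -gm*r_1/((r_1^2.0+(1+sqrt(1+z_1^2.0))^2.0)^1.5);
angTerm  = angmom2/(r_1^3.0);
fprintf('grav term: %e, angular momentum term: %e, deltat*(sum): %e\n', ...
        gravTerm, angTerm, deltat*(gravTerm + angTerm));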

What's wrong with my matlab programming of a hopfield neural network?

Sorry if the code is sloppy. The code is supposed to set up a Hopfield network from memory vectors of firing rates (a cross, a square, etc.), converting between membrane potential and firing rate with the function f(ui) = tanh(beta * ui) and evolving according to the rule referred to as equation 1 in the code (an Euler-style step u_i = u_i + 0.05*(-u_i + sum_j w_ij*f(u_j))). But for some reason it's not working: memories are not retrieved and instead the image disappears in some way. Please help.
w = zeros(100);
beta = 4;
% Memory 1: a cross
m1 = ones(10,10);
m1(:, 4) = 0;
m1(4, :) = 0;
init = m1(1:4,:);
m1 = (m1*2-1);
% Memory 2: a square
m2 = ones(10,10);
m2(5:7, 5:7) = 0;
m2 = (m2*2-1);
% Memory 3: a hollow square
m3 = ones(10,10);
m3(:, 1) = 0;
m3(:, 10) = 0;
m3(1, :) = 0;
m3(10, :) = 0;
m3 = (m3*2-1);
% build weight network
for i = 1:100
for j = 1:100
if i ~= j
w(i,j) = w(i,j) + m1(i) * m1(j) + m2(i) * m2(j) + m3(i) * m3(j);
end
end
end
% Initial condition
f1 = [init; rand(6,10)];
f1 = f1*2-1;
f1 = 0.9*f1;
%invert to produce membrane potential
u1 = atanh(f1)/beta;
% evolve according to equation 1
weight = 0;
rate = 0;
evolve = u1;
for t = 1:1
u1 = evolve;
for i = 1:10
for j = 1:10
weight = w(i,:); % weighted connection w/ every other neuron
rate = tanh(beta*u1(:));
evolve(i,j) = u1(i,j) + .05* (-u1(i,j) + (weight * rate));
end
end
end
u1 = evolve;
f1 = tanh(beta * u1);
imshow(reshape(f1, 10, 10), [-1 , 1], 'InitialMagnification', 'fit');

Application of Neural Network in MATLAB

I asked a question a few days ago, but I guess it was a little too complicated and I don't expect to get any answer.
My problem is that I need to use an ANN for classification. I've read that a much better cost function (or loss function, as some books call it) is the cross-entropy, that is J(w) = -1/m * sum_i( yi*ln(hw(xi)) + (1-yi)*ln(1 - hw(xi)) ); i indexes the training examples from the training matrix X. I tried to apply it in MATLAB but I find it really difficult. There are a couple of things I don't know:
should I sum the outputs over all training data (i = 1, ..., N, where N is the number of training examples)
is the gradient calculated correctly
is the numerical gradient (gradApprox) calculated correctly
I have the following MATLAB code. I realise I may be asking for a trivial thing, but anyway I hope someone can give me some clues on how to find the problem. I suspect the problem is in the gradient calculation.
Many thanks.
Main script:
close all
clear all
L = @(x) (1 + exp(-x)).^(-1);
NN = @(x,theta) theta{2}*[ones(1,size(x,1));L(theta{1}*[ones(size(x,1),1) x]')];
% theta = [10 -30 -30];
x = [0 0; 0 1; 1 0; 1 1];
y = [0.9 0.1 0.1 0.1]';
theta0 = 2*rand(9,1)-1;
options = optimset('gradObj','on','Display','iter');
thetaVec = fminunc(@costFunction,theta0,options,x,y);
theta = cell(2,1);
theta{1} = reshape(thetaVec(1:6),[2 3]);
theta{2} = reshape(thetaVec(7:9),[1 3]);
NN(x,theta)'
Cost function:
function [jVal,gradVal,gradApprox] = costFunction(thetaVec,x,y)
persistent index;
% 1 x x
% 1 x x
% 1 x x
% x = 1 x x
% 1 x x
% 1 x x
% 1 x x
m = size(x,1);
if isempty(index) || index > size(x,1)
index = 1;
end
L = @(x) (1 + exp(-x)).^(-1);
NN = @(x,theta) theta{2}*[ones(1,size(x,1));L(theta{1}*[ones(size(x,1),1) x]')];
theta = cell(2,1);
theta{1} = reshape(thetaVec(1:6),[2 3]);
theta{2} = reshape(thetaVec(7:9),[1 3]);
Dew = cell(2,1);
DewApprox = cell(2,1);
% Forward propagation
a0 = x(index,:)';
z1 = theta{1}*[1;a0];
a1 = L(z1);
z2 = theta{2}*[1;a1];
a2 = L(z2);
% Back propagation
d2 = 1/m*(a2 - y(index))*L(z2)*(1-L(z2));
Dew{2} = [1;a1]*d2;
d1 = [1;a1].*(1 - [1;a1]).*theta{2}'*d2;
Dew{1} = [1;a0]*d1(2:end)';
% NNRes = NN(x,theta)';
% jVal = -1/m*sum(NNRes-y)*NNRes*(1-NNRes);
jVal = -1/m*(a2 - y(index))*a2*(1-a2);
gradVal = [Dew{1}(:);Dew{2}(:)];
gradApprox = CalcGradApprox(0.0001);
index = index + 1;
function output = CalcGradApprox(epsilon)
output = zeros(size(gradVal));
for n=1:length(thetaVec)
thetaVecMin = thetaVec;
thetaVecMax = thetaVec;
thetaVecMin(n) = thetaVec(n) - epsilon;
thetaVecMax(n) = thetaVec(n) + epsilon;
thetaMin = cell(2,1);
thetaMax = cell(2,1);
thetaMin{1} = reshape(thetaVecMin(1:6),[2 3]);
thetaMin{2} = reshape(thetaVecMin(7:9),[1 3]);
thetaMax{1} = reshape(thetaVecMax(1:6),[2 3]);
thetaMax{2} = reshape(thetaVecMax(7:9),[1 3]);
a2min = NN(x(index,:),thetaMin)';
a2max = NN(x(index,:),thetaMax)';
jValMin = -1/m*(a2min-y(index))*a2min*(1-a2min);
jValMax = -1/m*(a2max-y(index))*a2max*(1-a2max);
output(n) = (jValMax - jValMin)/2/epsilon;
end
end
end
EDIT:
Below I present the correct version of my costFunction for those who may be interested.
function [jVal,gradVal,gradApprox] = costFunction(thetaVec,x,y)
m = size(x,1);
L = @(x) (1 + exp(-x)).^(-1);
NN = @(x,theta) L(theta{2}*[ones(1,size(x,1));L(theta{1}*[ones(size(x,1),1) x]')]);
theta = cell(2,1);
theta{1} = reshape(thetaVec(1:6),[2 3]);
theta{2} = reshape(thetaVec(7:9),[1 3]);
Delta = cell(2,1);
Delta{1} = zeros(size(theta{1}));
Delta{2} = zeros(size(theta{2}));
D = cell(2,1);
D{1} = zeros(size(theta{1}));
D{2} = zeros(size(theta{2}));
jVal = 0;
for in = 1:size(x,1)
% Forward propagation
a1 = [1;x(in,:)']; % added bias to a0
z2 = theta{1}*a1;
a2 = [1;L(z2)]; % added bias to a1
z3 = theta{2}*a2;
a3 = L(z3);
% Back propagation
d3 = a3 - y(in);
d2 = theta{2}'*d3.*a2.*(1 - a2);
Delta{2} = Delta{2} + d3*a2';
Delta{1} = Delta{1} + d2(2:end)*a1';
jVal = jVal + sum( y(in)*log(a3) + (1-y(in))*log(1-a3) );
end
D{1} = 1/m*Delta{1};
D{2} = 1/m*Delta{2};
jVal = -1/m*jVal;
gradVal = [D{1}(:);D{2}(:)];
gradApprox = CalcGradApprox(x(in,:),0.0001);
% Nested function to calculate gradApprox
function output = CalcGradApprox(x,epsilon)
output = zeros(size(thetaVec));
for n=1:length(thetaVec)
thetaVecMin = thetaVec;
thetaVecMax = thetaVec;
thetaVecMin(n) = thetaVec(n) - epsilon;
thetaVecMax(n) = thetaVec(n) + epsilon;
thetaMin = cell(2,1);
thetaMax = cell(2,1);
thetaMin{1} = reshape(thetaVecMin(1:6),[2 3]);
thetaMin{2} = reshape(thetaVecMin(7:9),[1 3]);
thetaMax{1} = reshape(thetaVecMax(1:6),[2 3]);
thetaMax{2} = reshape(thetaVecMax(7:9),[1 3]);
a3min = NN(x,thetaMin)';
a3max = NN(x,thetaMax)';
jValMin = 0;
jValMax = 0;
for inn=1:size(x,1)
jValMin = jValMin + sum( y(inn)*log(a3min) + (1-y(inn))*log(1-a3min) );
jValMax = jValMax + sum( y(inn)*log(a3max) + (1-y(inn))*log(1-a3max) );
end
jValMin = 1/m*jValMin;
jValMax = 1/m*jValMax;
output(n) = (jValMax - jValMin)/2/epsilon;
end
end
end
I've only had a quick eyeball over your code. Here are some pointers.
Q1
should I sum the outputs over all training data (i = 1, ..., N, where
N is the number of training examples)
If you are talking in relation to the cost function, it is normal to sum and normalise by the number of training examples in order to allow comparison between datasets of different sizes.
I can't tell from the code whether you have a vectorised implementation, which would change the answer. Note that the sum function will only sum up a single dimension at a time - meaning if you have an (M by N) array, sum will result in a 1 by N array.
The cost function should have a scalar output.
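For example, a vectorised sketch of the cross-entropy cost for a column of predictions h and labels y (both assumed to be m x 1), which returns a single scalar:
% sketch: scalar cross-entropy cost for predictions h and labels y (both m x 1)
J = -(1/m) * sum( y .* log(h) + (1 - y) .* log(1 - h) );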
Q2
is the gradient calculated correctly
The gradient is not calculated correctly - specifically the deltas look wrong. Try following Andrew Ng's notes [PDF]; they are very good.
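For reference, with a sigmoid output layer and the cross-entropy cost the usual per-example deltas reduce to the form below; this is only a sketch using the variable names from the corrected costFunction in the question:
% sketch: per-example deltas for a sigmoid network trained with cross-entropy
d3 = a3 - y(in);                           % output delta: the sigmoid' factor cancels
d2 = (theta{2}' * d3) .* a2 .* (1 - a2);   % hidden delta (drop the bias entry before use)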
Q3
is the numerical gradient (gradApprox) calculated correctly.
This line looks a bit suspect. Does this make more sense?
output(n) = (jValMax - jValMin)/(2*epsilon);
EDIT: I actually can't make heads or tails of your gradient approximation. You should only use forward propagation and small tweaks in the parameters to compute the gradient. Good luck!
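A minimal sketch of such a check, assuming a helper costFun(thetaVec) (not in the question's code) that runs forward propagation over the whole training set and returns the scalar cost:
% sketch: central-difference gradient check against a scalar cost function
epsilon    = 1e-4;
gradApprox = zeros(size(thetaVec));
for n = 1:numel(thetaVec)
    e = zeros(size(thetaVec));
    e(n) = epsilon;
    gradApprox(n) = (costFun(thetaVec + e) - costFun(thetaVec - e)) / (2*epsilon);
end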

Continuous RBM: Poor performance only for negative valued input data?

I tried to port this Python implementation of a continuous RBM to MATLAB:
http://imonad.com/rbm/restricted-boltzmann-machine/
I generated 2-dimensional training data in the shape of a (noisy) circle and trained the RBM with 2 visible and 8 hidden units. To test the implementation I fed uniformly distributed random data to the RBM and plotted the reconstructed data (same procedure as used in the link above).
Now the confusing part: with training data in the range of (0,1)x(0,1) I get very satisfying results, however with training data in the range (-0.5,0.5)x(-0.5,0.5) or (-1,0)x(-1,0) the RBM reconstructs only data in the very top right of the circle. I don't understand what causes this; is it just a bug in my implementation that I don't see?
[Plots omitted: blue dots are the training data, red dots are the reconstructions.]
Here is my implementation of the RBM:
Training:
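% note: this script assumes dat (the N-by-2 training data), datRange (the
% [min max] range passed to sigFun) and filename already exist in the workspace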
maxepoch = 300;
ksteps = 10;
sigma = 0.2; % cd standard deviation
learnW = 0.5; % learning rate W
learnA = 0.5; % learning rate A
nVis = 2; % number of visible units
nHid = 8; % number of hidden units
nDat = size(dat, 1);% number of training data points
cost = 0.00001; % cost
moment = 0.9; % momentum
W = randn(nVis+1, nHid+1) / 10; % weights
dW = randn(nVis+1, nHid+1) / 1000; % change of weights
sVis = zeros(1, nVis+1); % state of visible neurons
sVis(1, end) = 1.0; % bias
sVis0 = zeros(1, nVis+1); % initial state of visible neurons
sVis0(1, end) = 1.0; % bias
sHid = zeros(1, nHid+1); % state of hidden neurons
sHid(1, end) = 1.0; % bias
aVis = 0.1*ones(1, nVis+1);% A visible
aHid = ones(1, nHid+1); % A hidden
err = zeros(1, maxepoch);
e = zeros(1, maxepoch);
for epoch = 1:maxepoch
wPos = zeros(nVis+1, nHid+1);
wNeg = zeros(nVis+1, nHid+1);
aPos = zeros(1, nHid+1);
aNeg = zeros(1, nHid+1);
for point = 1:nDat
sVis(1:nVis) = dat(point, :);
sVis0(1:nVis) = sVis(1:nVis); % initial sVis
% positive phase
activHid;
wPos = wPos + sVis' * sHid;
aPos = aPos + sHid .* sHid;
% negative phase
activVis;
activHid;
for k = 1:ksteps
activVis;
activHid;
end
tmp = sVis' * sHid;
wNeg = wNeg + tmp;
aNeg = aNeg + sHid .* sHid;
delta = sVis0(1:nVis) - sVis(1:nVis);
err(epoch) = err(epoch) + sum(delta .* delta);
e(epoch) = e(epoch) - sum(sum(W' * tmp));
end
dW = dW*moment + learnW * ((wPos - wNeg) / numel(dat)) - cost * W;
W = W + dW;
aHid = aHid + learnA * (aPos - aNeg) / (numel(dat) * (aHid .* aHid));
% error
err(epoch) = err(epoch) / (nVis * numel(dat));
e(epoch) = e(epoch) / numel(dat);
disp(['epoch: ' num2str(epoch) ' err: ' num2str(err(epoch)) ...
' ksteps: ' num2str(ksteps)]);
end
save(['rbm_' filename '.mat'], 'W', 'err', 'aVis', 'aHid');
activHid.m:
sHid = (sVis * W) + randn(1, nHid+1);
sHid = sigFun(aHid .* sHid, datRange);
sHid(end) = 1.; % bias
activVis.m:
sVis = (W * sHid')' + randn(1, nVis+1);
sVis = sigFun(aVis .* sVis, datRange);
sVis(end) = 1.; % bias
sigFun.m:
function [sig] = sigFun(X, datRange)
a = ones(size(X)) * datRange(1);
b = ones(size(X)) * (datRange(2) - datRange(1));
c = ones(size(X)) + exp(-X);
sig = a + (b ./ c);
end
Reconstruction:
nSamples = 2000;
ksteps = 10;
nVis = 2;
nHid = 8;
sVis = zeros(1, nVis+1); % state of visible neurons
sVis(1, end) = 1.0; % bias
sHid = zeros(1, nHid+1); % state of hidden neurons
sHid(1, end) = 1.0; % bias
input = rand(nSamples, 2);
output = zeros(nSamples, 2);
for sample = 1:nSamples
sVis(1:nVis) = input(sample, :);
for k = 1:ksteps
activHid;
activVis;
end
output(sample, :) = sVis(1:nVis);
end
RBMs were originally designed to work only with binary data, but they also work with data between 0 and 1; it's part of the algorithm.
As your input is in the range of [0 1] for both x and y, this is why the reconstructions stay in that area. Changing the input to input = (rand(nSamples, 2)*2) - 1; results in input sampled from the range [-1 1], and therefore the red dots will be more spread out around the circle.
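Along the same lines, one way to handle negative-valued training data is to rescale it into [0,1] before training and map the reconstructions back afterwards; a minimal sketch, assuming dat holds the training points and output holds the reconstructions from the script above:
% sketch: shift/scale the training data into [0,1], then undo it on the output
datMin = min(dat(:));  datMax = max(dat(:));
dat01  = (dat - datMin) / (datMax - datMin);     % train the RBM on dat01 instead of dat
recon  = output * (datMax - datMin) + datMin;    % reconstructions back in the original range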