Convolutional Neural Networks in MATLAB: adding one more layer
What I'm trying to do is verify that adding one more layer to a CNN makes the accuracy go higher.
The code is from https://github.com/lhoang29/DigitRecognition/blob/master/cnnload.m
I'm a beginner with CNNs and I'm trying to add one more layer, consisting of a convolution stage and a pooling stage. I have tried several approaches, but none of them seems to work. Could someone show me how to add one more layer? (A rough sketch of the kind of forward-pass extension I mean is included after the training code below.)
Thank you. The full code is below.
Code for main function:
clear all; close all; clc;
maxtrain = 10000;
iter = 10;
eta = 0.01;
%% Data Load
trlblid = fopen('train-labels-idx1-ubyte');
trimgid = fopen('train-images-idx3-ubyte');
tslblid = fopen('t10k-labels-idx1-ubyte');
tsimgid = fopen('t10k-images-idx3-ubyte');
% read train labels
fread(trlblid, 4);
numtrlbls = toint(fread(trlblid, 4));
trainlabels = fread(trlblid, numtrlbls);
% read train data
fread(trimgid, 4);
numtrimg = toint(fread(trimgid, 4));
trimgh = toint(fread(trimgid, 4));
trimgw = toint(fread(trimgid, 4));
trainimages = permute(reshape(fread(trimgid,trimgh*trimgw*numtrimg),trimgh,trimgw,numtrimg), [2 1 3]);
% read test labels
fread(tslblid, 4);
numtslbls = toint(fread(tslblid, 4));
testlabels = fread(tslblid, numtslbls);
% read test data
fread(tsimgid, 4);
numtsimg = toint(fread(tsimgid, 4));
tsimgh = toint(fread(tsimgid, 4));
tsimgw = toint(fread(tsimgid, 4));
testimages = permute(reshape(fread(tsimgid, tsimgh*tsimgw*numtsimg),tsimgh,tsimgw,numtsimg), [2 1 3]);
%% CNN Training
[missimages, misslabels] = cnntrain(trainlabels,trainimages,testlabels,testimages,maxtrain,iter,eta);
%% CNN Testing
showmiss(missimages,misslabels,testimages,testlabels,25,2);
Code for training:
function [missimages, misslabels] = cnntrain(trainlabels,trainimages,testlabels,testimages,maxtrain,iter,eta)
fn = 5; % number of kernels for layer 1
ks = 5; % size of kernel
[h,w,n] = size(trainimages);
n = min(n,maxtrain);
% normalize data to [-1,1] range
nitrain = (trainimages / 255) * 2 - 1;
nitest = (testimages / 255) * 2 - 1;
% train with backprop
h1 = h-ks+1;
w1 = w-ks+1;
A1 = zeros(h1,w1,fn);
h2 = h1/2;
w2 = w1/2;
I2 = zeros(h2,w2,fn);
A2 = zeros(h2,w2,fn);
A3 = zeros(10,1);
% kernels for layer 1
W1 = randn(ks,ks,fn) * .01;
B1 = ones(1,fn);
% scale parameter and bias for layer 2
S2 = randn(1,fn) * .01;
B2 = ones(1,fn);
% weights and bias parameters for fully-connected output layer
W3 = randn(h2,w2,fn,10) * .01;
B3 = ones(10,1);
% true outputs
Y = eye(10)*2-1;
for it=1:iter
    err = 0;
    for im=1:n
        %------------ FORWARD PROP ------------%
        % Layer 1: convolution with bias followed by sigmoidal squashing
        for fm=1:fn
            A1(:,:,fm) = convn(nitrain(:,:,im),W1(end:-1:1,end:-1:1,fm),'valid') + B1(fm);
        end
        Z1 = tanh(A1);
        % Layer 2: average/subsample with scaling and bias
        for fm=1:fn
            I2(:,:,fm) = avgpool(Z1(:,:,fm));
            A2(:,:,fm) = I2(:,:,fm) * S2(fm) + B2(fm);
        end
        Z2 = tanh(A2);
        % Layer 3: fully connected
        for cl=1:10
            A3(cl) = convn(Z2,W3(end:-1:1,end:-1:1,end:-1:1,cl),'valid') + B3(cl);
        end
        Z3 = tanh(A3); % Final output
        err = err + .5 * norm(Z3 - Y(:,trainlabels(im)+1),2)^2;
        %------------ BACK PROP ------------%
        % Compute error at output layer
        Del3 = (1 - Z3.^2) .* (Z3 - Y(:,trainlabels(im)+1));
        % Compute error at layer 2
        Del2 = zeros(size(Z2));
        for cl=1:10
            Del2 = Del2 + Del3(cl) * W3(:,:,:,cl);
        end
        Del2 = Del2 .* (1 - Z2.^2);
        % Compute error at layer 1
        Del1 = zeros(size(Z1));
        for fm=1:fn
            Del1(:,:,fm) = (S2(fm)/4)*(1 - Z1(:,:,fm).^2);
            for ih=1:h1
                for iw=1:w1
                    Del1(ih,iw,fm) = Del1(ih,iw,fm) * Del2(floor((ih+1)/2),floor((iw+1)/2),fm);
                end
            end
        end
        % Update bias at layer 3
        DB3 = Del3; % gradient w.r.t bias
        B3 = B3 - eta*DB3;
        % Update weights at layer 3
        for cl=1:10
            DW3 = DB3(cl) * Z2; % gradient w.r.t weights
            W3(:,:,:,cl) = W3(:,:,:,cl) - eta * DW3;
        end
        % Update scale and bias parameters at layer 2
        for fm=1:fn
            DS2 = convn(Del2(:,:,fm),I2(end:-1:1,end:-1:1,fm),'valid');
            S2(fm) = S2(fm) - eta * DS2;
            DB2 = sum(sum(Del2(:,:,fm)));
            B2(fm) = B2(fm) - eta * DB2;
        end
        % Update kernel weights and bias parameters at layer 1
        for fm=1:fn
            DW1 = convn(nitrain(:,:,im),Del1(end:-1:1,end:-1:1,fm),'valid');
            W1(:,:,fm) = W1(:,:,fm) - eta * DW1;
            DB1 = sum(sum(Del1(:,:,fm)));
            B1(fm) = B1(fm) - eta * DB1;
        end
    end
    disp(['Error: ' num2str(err) ' at iteration ' num2str(it)]);
end
miss = 0;
numtest=size(testimages,3);
missimages = zeros(1,numtest);
misslabels = zeros(1,numtest);
for im=1:numtest
    for fm=1:fn
        A1(:,:,fm) = convn(nitest(:,:,im),W1(end:-1:1,end:-1:1,fm),'valid') + B1(fm);
    end
    Z1 = tanh(A1);
    % Layer 2: average/subsample with scaling and bias
    for fm=1:fn
        I2(:,:,fm) = avgpool(Z1(:,:,fm));
        A2(:,:,fm) = I2(:,:,fm) * S2(fm) + B2(fm);
    end
    Z2 = tanh(A2);
    % Layer 3: fully connected
    for cl=1:10
        A3(cl) = convn(Z2,W3(end:-1:1,end:-1:1,end:-1:1,cl),'valid') + B3(cl);
    end
    Z3 = tanh(A3); % Final output
    [pm,pl] = max(Z3);
    if pl ~= testlabels(im)+1
        miss = miss + 1;
        missimages(miss) = im;
        misslabels(miss) = pl - 1;
    end
end
disp(['Miss: ' num2str(miss) ' out of ' num2str(numtest)]);
end
function [pr] = avgpool(img)
pr = zeros(size(img)/2);
for r=1:2:size(img,1)
    for c=1:2:size(img,2)
        pr((r+1)/2,(c+1)/2) = (img(r,c)+img(r+1,c)+img(r,c+1)+img(r+1,c+1))/4;
    end
end
end
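To make the question concrete, a second convolution and pooling stage written in the same style as the existing forward pass might look roughly like the sketch below. The names (fn2, ks2, W2c, B2c, S3p, B3p, W4, B4) are placeholders I introduced for illustration, not names from the repository, and the matching backpropagation updates for these new parameters would still have to be added:
% extra sizes for a second convolution + pooling stage (sketch only)
fn2 = 10;            % number of feature maps in the new convolution layer
ks2 = 5;             % kernel size of the new convolution layer
h3 = h2 - ks2 + 1;   % 12 - 5 + 1 = 8 for MNIST
w3 = w2 - ks2 + 1;
h4 = h3/2;           % 4 after 2x2 average pooling
w4 = w3/2;
% parameters of the new stage
W2c = randn(ks2,ks2,fn,fn2) * .01;  % each new map is connected to all fn previous maps
B2c = ones(1,fn2);
S3p = randn(1,fn2) * .01;           % scale for the new pooling stage
B3p = ones(1,fn2);
% the fully-connected output layer now reads the new maps
W4 = randn(h4,w4,fn2,10) * .01;
B4 = ones(10,1);
% forward pass of the new stage, inserted right after Z2 = tanh(A2);
A2c = zeros(h3,w3,fn2);
I3 = zeros(h4,w4,fn2);
A3p = zeros(h4,w4,fn2);
for fm2=1:fn2
    % convolve the whole stack of pooled maps with a 3-D kernel
    A2c(:,:,fm2) = convn(Z2,W2c(end:-1:1,end:-1:1,end:-1:1,fm2),'valid') + B2c(fm2);
end
Z2c = tanh(A2c);
for fm2=1:fn2
    I3(:,:,fm2) = avgpool(Z2c(:,:,fm2));
    A3p(:,:,fm2) = I3(:,:,fm2) * S3p(fm2) + B3p(fm2);
end
Z3p = tanh(A3p);
% output layer: same pattern as before, but reading Z3p and using W4, B4
for cl=1:10
    A3(cl) = convn(Z3p,W4(end:-1:1,end:-1:1,end:-1:1,cl),'valid') + B4(cl);
end
Z3 = tanh(A3);
With the 28x28 MNIST images this gives 24x24 maps after the first convolution, 12x12 after the first pooling stage, 8x8 after the new convolution and 4x4 after the new pooling stage, so the output weights W4 have size h4 x w4 x fn2 x 10 instead of h2 x w2 x fn x 10. The backprop section would then need deltas and gradient updates for W2c, B2c, S3p and B3p, following the same pattern as the existing layer 1 and layer 2 updates.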
Code for displaying the misclassified test images:
function [] = showmiss(missim,misslab,testimages,testlabels,numshow,numpages)
nummiss = nnz(missim);
page = 1;
showsize = floor(sqrt(numshow));
for f=1:numshow:nummiss
    figure(floor(f/numshow) + 1);
    for m=f:min(nummiss,f+numshow-1)
        subplot(showsize,showsize,m-f+1);
        imshow(testimages(:,:,missim(m)), []); % empty display range scales the 0-255 double image
        title(strcat(num2str(testlabels(missim(m))), ':', num2str(misslab(m))));
    end
    page = page + 1;
    if page > numpages
        break;
    end
end
end
Code for the toint helper function:
function [x] = toint(b)
x = b(1)*16777216 + b(2)*65536 + b(3)*256 + b(4);
end
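For reference, toint just interprets the four header bytes read from the IDX files as a big-endian 32-bit integer (the MNIST IDX headers are big-endian). A quick sanity check at the prompt, with byte values made up for illustration:
toint([0; 0; 3; 232])   % 0*2^24 + 0*2^16 + 3*256 + 232 = 1000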