My MATLAB neural network backpropagation algorithm seems buggy

Here is my code. I think it is wrong because the difference between the gradient it computes and my numerical estimate is too large. It doesn't seem to be due to wrongly inverting matrices, etc.
For context, Y is the output layer, X is the input layer, and there is only 1 hidden layer. Theta1 contains the weights for the input layer and Theta2 contains the weights for the hidden layer.
for t = 1:m
% do fw prop again...
a1 = [1 X(i,:)];
a2 = [1 sigmoid(a1 * Theta1')];
a3 = sigmoid(a2 * Theta2');
delta_3 = a3' - Y(:, t);
delta_2 = Theta2' * delta_3 .* a2' .* (1 - a2)';
delta_2 = delta_2(2:end,:);
Theta1_grad = Theta1_grad + delta_2 * [1 X(i, :)];
Theta2_grad = Theta2_grad + delta_3 * [1 sigmoid([1 X(i,:)] * Theta1')];
end
grad = [Theta1_grad(:) ; Theta2_grad(:)];
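For comparison, here is a minimal sketch of how such a loop usually looks when the indexing is consistent (an illustration only, assuming X is m x n with no bias column, Y is K x m, and Theta1/Theta2 store the bias weights in their first column); the main suspect in the snippet above is the mix of the loop variable t and a stale index i, plus the missing division by m if that isn't done elsewhere:
for t = 1:m
    % forward pass for example t
    a1 = [1 X(t, :)];                                   % 1 x (n+1), bias prepended
    a2 = [1 sigmoid(a1 * Theta1')];                     % 1 x (h+1)
    a3 = sigmoid(a2 * Theta2');                         % 1 x K
    % backward pass
    delta_3 = a3' - Y(:, t);                            % K x 1 output error
    delta_2 = (Theta2' * delta_3) .* a2' .* (1 - a2');  % (h+1) x 1
    delta_2 = delta_2(2:end);                           % drop the bias term -> h x 1
    Theta1_grad = Theta1_grad + delta_2 * a1;           % h x (n+1)
    Theta2_grad = Theta2_grad + delta_3 * a2;           % K x (h+1)
end
grad = (1/m) * [Theta1_grad(:) ; Theta2_grad(:)];       % divide by m here if not done elsewhere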

Related

What type of (probably syntactic) mistake am I making when using a generic function on an array for plotting?

I am trying to plot a geodesic on a 3D surface (tractrix) with Matlab. This worked for me in the past when I didn't need to parametrize the surface (see here). However, the tractrix called for parametrization, chain rule differentiation, and collection of u, v, x, y and f(x,y) values.
After many mistakes I think that I'm getting the right values for x = f1(u,v) and y = f2(u,v), describing a spiral right at the base of the surface.
What I can't understand is why the z value, or height, of the 3D plot of the curve is consistently zero, when I'm applying the same mathematical formula that allowed me to plot the surface in the first place, i.e. f = @(x,y) a.* (y - tanh(y)).
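For reference, the chain-rule quantities that the dfdu and dfdv handles in the code below approximate are df/du = (df/dx)(dx/du) + (df/dy)(dy/du) and df/dv = (df/dx)(dx/dv) + (df/dy)(dy/dv), with x = f1(u,v) and y = f2(u,v), and with each partial derivative replaced by a central difference of step eps.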
Here is the code, which runs without any errors on Octave. I've added notes in upper case on the crucial calls. Also note that I have restricted the number of geodesic lines to 1 to decrease the execution time.
a = 0.3;
u = 0:0.1:(2 * pi);
v = 0:0.1:5;
[X,Y] = meshgrid(u,v);
% NOTE THAT THESE FORMULAS RESULT IN A SUCCESSFUL PLOT OF THE SURFACE:
x = a.* cos(X) ./ cosh(Y);
y = a.* sin(X) ./ cosh(Y);
z = a.* (Y - tanh(Y));
h = surf(x,y,z);
zlim([0, 1.2]);
set(h,'edgecolor','none')
colormap summer
hold on
% THESE ARE THE GENERIC FUNCTIONS (f) WHICH DON'T SEEM TO WORK AT THE END:
f = @(x,y) a.* (y - tanh(y));
f1 = @(u,v) a.* cos(u) ./ cosh(v);
f2 = @(u,v) a.* sin(u) ./ cosh(v);
dfdu = @(u,v) ((f(f1(u,v)+eps, f2(u,v)) - f(f1(u,v) - eps, f2(u,v)))/(2 * eps) .*
(f1(u+eps,v)-f1(u-eps,v))/(2*eps) +
(f(f1(u,v), f2(u,v)+eps) - f(f1(u,v), f2(u,v)-eps))/(2 * eps) .*
(f2(u+eps,v)-f2(u-eps,v))/(2*eps));
dfdv = @(u,v) ((f(f1(u,v)+eps, f2(u,v)) - f(f1(u,v) - eps, f2(u,v)))/(2 * eps) .*
(f1(u,v+eps)-f1(u,v-eps))/(2*eps) +
(f(f1(u,v), f2(u,v)+eps) - f(f1(u,v), f2(u,v)-eps))/(2 * eps) .*
(f2(u,v+eps)-f2(u,v-eps))/(2*eps));
% Normal vector to the surface:
N = @(u,v) [- dfdu(u,v), - dfdv(u,v), 1]; % Normal vec to surface at any pt.
% Some colors to draw the lines:
C = {'k','r','g','y','m','c'};
for s = 1:1 % No. of lines to be plotted.
% Starting points:
u0 = [0, u(length(u))];
v0 = [0, v(length(v))];
du0 = 0.001;
dv0 = 0.001;
step_size = 0.00005; % Will determine the progression rate from pt to pt.
eta = step_size / sqrt(du0^2 + dv0^2); % Normalization.
eps = 0.0001; % Epsilon
max_num_iter = 100000; % Number of dots in each line.
% Semi-empty vectors to collect results:
U = [[u0(s), u0(s) + eta*du0], zeros(1,max_num_iter - 2)];
V = [[v0(s), v0(s) + eta*dv0], zeros(1,max_num_iter - 2)];
for i = 2:(max_num_iter - 1) % Creating the geodesic:
ut = U(i);
vt = V(i);
xt = f1(ut,vt);
yt = f2(ut,vt);
ft = f(xt,yt);
utm1 = U(i - 1);
vtm1 = V(i - 1);
xtm1 = f1(utm1,vtm1);
ytm1 = f2(utm1,vtm1);
ftm1 = f(xtm1,ytm1);
usymp = ut + (ut - utm1);
vsymp = vt + (vt - vtm1);
xsymp = f1(usymp,vsymp);
ysymp = f2(usymp,vsymp);
fsymp = ft + (ft - ftm1);
df = fsymp - f(xsymp,ysymp); % Is the surface changing? How much?
n = N(ut,vt); % Normal vector at point t
gamma = df * n(3); % Scalar x change f x z value of N
xtp1 = xsymp - gamma * n(1); % Gamma to modulate incre. x & y.
ytp1 = ysymp - gamma * n(2);
U(i + 1) = usymp - gamma * n(1);
V(i + 1) = vsymp - gamma * n(2);
end
% THE PROBLEM! f(f1(U,V),f2(U,V)) below YIELDS ALL ZEROS!!! The expected values are between 0 and 1.2.
P = [f1(U,V); f2(U,V); f(f1(U,V),f2(U,V))]; % Compiling results into a matrix.
units = 35; % Determines speed (smaller, faster)
packet = floor(size(P,2)/units);
P = P(:,1: packet * units);
for k = 1:packet:(packet * units)
hold on
plot3(P(1, k:(k+packet-1)), P(2,(k:(k+packet-1))), P(3,(k:(k+packet-1))),
'.', 'MarkerSize', 5,'color',C{s})
drawnow
end
end
The answer is to Cris Luengo's credit, who noticed that the upper-case variable Y, used to calculate the height z of the curve, was in the parametrization space u,v (as intended for the surface), and not in the manifold x,y! I don't use Matlab/Octave other than for occasional simulations, and I was trying every other syntactic permutation I could think of without realizing that f has to be fed directly from v, as intended. I have now renamed the variables to make this cleaner.
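In other words, the crux of the change is which variables feed the height function:
% Before: the meshgrid output named Y is really the parameter v, so passing the
% manifold coordinate f2(U,V) as the second argument of f gives the wrong height:
%   [X,Y] = meshgrid(u,v);   z = a.*(Y - tanh(Y));   ...   f(f1(U,V), f2(U,V))
% After: renaming the grids to U,V makes it clear that f must be fed the parameter directly:
%   [U,V] = meshgrid(u,v);   z = a.*(V - tanh(V));   ...   f(Uc,Vc)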
Here is the revised code:
a = 0.3;
u = 0:0.1:(3 * pi);
v = 0:0.1:5;
[U,V] = meshgrid(u,v);
x = a.* cos(U) ./ cosh(V);
y = a.* sin(U) ./ cosh(V);
z = a.* (V - tanh(V));
h = surf(x,y,z);
zlim([0, 1.2]);
set(h,'edgecolor','none')
colormap summer
hold on
f = @(x,y) a.* (y - tanh(y));
f1 = @(u,v) a.* cos(u) ./ cosh(v);
f2 = @(u,v) a.* sin(u) ./ cosh(v);
dfdu = @(u,v) ((f(f1(u,v)+eps, f2(u,v)) - f(f1(u,v) - eps, f2(u,v)))/(2 * eps) .*
(f1(u+eps,v)-f1(u-eps,v))/(2*eps) +
(f(f1(u,v), f2(u,v)+eps) - f(f1(u,v), f2(u,v)-eps))/(2 * eps) .*
(f2(u+eps,v)-f2(u-eps,v))/(2*eps));
dfdv = @(u,v) ((f(f1(u,v)+eps, f2(u,v)) - f(f1(u,v) - eps, f2(u,v)))/(2 * eps) .*
(f1(u,v+eps)-f1(u,v-eps))/(2*eps) +
(f(f1(u,v), f2(u,v)+eps) - f(f1(u,v), f2(u,v)-eps))/(2 * eps) .*
(f2(u,v+eps)-f2(u,v-eps))/(2*eps));
% Normal vector to the surface:
N = @(u,v) [- dfdu(u,v), - dfdv(u,v), 1]; % Normal vec to surface at any pt.
% Some colors to draw the lines:
C = {'y','r','k','m','w',[0.8 0.8 1]}; % Color scheme
for s = 1:6 % No. of lines to be plotted.
% Starting points:
u0 = [0, -pi/2, 2*pi, 4*pi/3, pi/4, pi];
v0 = [0, 0, 0, 0, 0, 0];
du0 = [0, -0.0001, 0.001, - 0.001, 0.001, -0.01];
dv0 = [0.1, 0.01, 0.001, 0.001, 0.0005, 0.01];
step_size = 0.00005; % Will determine the progression rate from pt to pt.
eta = step_size / sqrt(du0(s)^2 + dv0(s)^2); % Normalization.
eps = 0.0001; % Epsilon
max_num_iter = 180000; % Number of dots in each line.
% Semi-empty vectors to collect results:
Uc = [[u0(s), u0(s) + eta*du0(s)], zeros(1,max_num_iter - 2)];
Vc = [[v0(s), v0(s) + eta*dv0(s)], zeros(1,max_num_iter - 2)];
for i = 2:(max_num_iter - 1) % Creating the geodesic:
ut = Uc(i);
vt = Vc(i);
xt = f1(ut,vt);
yt = f2(ut,vt);
ft = f(xt,yt);
utm1 = Uc(i - 1);
vtm1 = Vc(i - 1);
xtm1 = f1(utm1,vtm1);
ytm1 = f2(utm1,vtm1);
ftm1 = f(xtm1,ytm1);
usymp = ut + (ut - utm1);
vsymp = vt + (vt - vtm1);
xsymp = f1(usymp,vsymp);
ysymp = f2(usymp,vsymp);
fsymp = ft + (ft - ftm1);
df = fsymp - f(xsymp,ysymp); % Is the surface changing? How much?
n = N(ut,vt); % Normal vector at point t
gamma = df * n(3); % Scalar x change f x z value of N
xtp1 = xsymp - gamma * n(1); % Gamma to modulate incre. x & y.
ytp1 = ysymp - gamma * n(2);
Uc(i + 1) = usymp - gamma * n(1);
Vc(i + 1) = vsymp - gamma * n(2);
end
x = f1(Uc,Vc);
y = f2(Uc,Vc);
P = [x; y; f(Uc,Vc)]; % Compiling results into a matrix.
units = 35; % Determines speed (smaller, faster)
packet = floor(size(P,2)/units);
P = P(:,1: packet * units);
for k = 1:packet:(packet * units)
hold on
plot3(P(1, k:(k+packet-1)), P(2,(k:(k+packet-1))), P(3,(k:(k+packet-1))),
'.', 'MarkerSize', 5,'color',C{s})
drawnow
end
end

Neural Network implementation in octave/matlab

I'm trying to make a simple neural network formed by three layers to solve a binary classification problem. The first two layers have eight neurons each (+ the bias units). I'm using fminunc. This is my cost function:
function [jVal, gradient] = cost2(thetaVec, X, y)
  Theta1 = reshape(thetaVec(1:72), 8, 9);  % my weights used for
  Theta2 = reshape(thetaVec(73:81), 1, 9); % forward propagation
  Delta1 = 0; % Delta is divided into Delta1 and Delta2 for simplicity but
  Delta2 = 0; % they're combined to eventually calculate the gradient
  jVal = 0;   % the value of the cost function
  m = length(y);
  for i = 1:m
    a1 = X(i, :);                    % X size: 3x9, a1 size: 1x9
    z2 = Theta1 * a1';
    a2 = 1 ./ (1 + exp(-z2));        % a2 size: 8x1
    a2 = [ones(columns(a2), 1) a2']; % bias unit added to a2: a2 size: 1x9
    z3 = Theta2 * a2';
    a3 = 1 ./ (1 + exp(-z3));        % a3 = h(x(i)) size: 1x1
    jVal += (-1/m) * (y(i) * log(a3) + (1 - y(i)) * log(1 - a3));
    delta3 = a3 - y(i);                          % delta3 size: 1x1
    delta2 = Theta2' * delta3 .* a2 .* (1 - a2); % delta2 size: 9x9
    Delta2 += delta3 * a2'; % I use Delta1 and Delta2 as accumulators
    Delta1 += delta2 * a1'; % size Delta2: 9x1, size Delta1: 9x1
  endfor
  jVal = jVal/m;            % average of jVal
  Delta = [Delta1; Delta2]; % Deltas are combined. Size Delta: 18x1
  gradient = (1/m) * Delta; % size gradient: 18x1
endfunction
My main:
% the values of the vector from which I derive my weights are chosen randomly
INIT_EPSILON = 0.1; % within this interval
Theta1 = rand(8, 9) * (2*INIT_EPSILON) - INIT_EPSILON;
Theta2 = rand(1, 9) * (2*INIT_EPSILON) - INIT_EPSILON;
thetaVec = [ Theta1(:); Theta2(:)];
options = optimset('GradObj', 'on', 'MaxIter', 10000);
[optTheta, functionVal, exitFlag] = fminunc(@(t) cost2(t, X, y), thetaVec, options)
The gradient should be a 9x9 matrix, but instead it is 18x1, so I can't use fminunc. I have tried modifying the backpropagation part of my cost function several times to obtain a 9x9 gradient (in particular, by changing delta2), but it never worked; the output was:
optTheta = %a vector of various values
functionVal = 0.71681 %or a similar value
exitFlag = 1
So, even though the exit flag was 1, it didn't converge. Where am I going wrong?
You currently have the following code:
delta3 = a3 - y(i); % (1×1)
delta2 = Theta2' * delta3 .* a2 .* (1 - a2); % (9×1).*(1×9) = (9×9)
Delta2 += delta3 * a2'; % (1×1)*(9×1) = (9×1)
Delta1 += delta2 * a1'; % (9×9)*(9×1) = (9×1)
I think instead it should be something like:
delta3 = a3 - y(i); % (1×1)
delta2 = Theta2 * delta3 .* a2 .* (1 - a2); % (1×9).*(1×9) = (1×9)
Delta2 += delta3 * a2'; % (1×1)*(9×1) = (9×1)
Delta1 += delta2.' * a1; % (9×1)* (1×9) = (9×9)
And then you discard the gradients of the bias in Delta1 at each step, ending up with an (8×9) matrix as your ongoing Delta1 component. (You may need to transpose Delta1 first; I haven't followed your transpositions closely.)
Finally, the whole point of the vertical concatenation step at the end is to "implode" your matrices back into a single-column long vector form, so that they follow the same "specification" as the input thetaVec. Therefore you'd take your (8×9) and (1×9) Delta1 and Delta2 objects and 'implode' them, i.e.:
[Delta1(:) ; Delta2(:)]
UPDATE
Continuing here the discussion in the comments above.
Consider what Delta1 and Delta2 are. They are the total error (over all observations) corresponding to the elements in Theta1 and Theta2 respectively. In other words, Delta1 should have the same size as Theta1, and Delta2 should have the same size as Theta2. Now that you have included the matrix sizes in your code, you can see instantly that this is not the case.
Furthermore, since each iteration adds to these matrices, the result of each iteration should be a Delta1 and Delta2 of the right size, such that you add the errors contributed at each iteration, to get the total error (per parameter) over all iterations.
Also, consider what delta2 and delta3 are. These are also errors, but they do not refer to errors in the parameters; they refer to errors in nodes. In other words, they show the contribution / responsibility of each node in a layer to the final error. Therefore each needs to be a vector with the same number of elements as there are nodes in the respective layer. You can already see that it makes no sense for delta2 to have a size of 9x9!
So the logic of the algorithm is this. Take the contribution of the nodes to the error, and backpropagate it, both to the previous nodes and to the parameters between them.
E.g. multiply each delta3 (in this case there's only one node) with each node in layer 2, to find how errors were distributed over each parameter element in Theta2.
Similarly, to obtain delta2, we considered the error in delta3, and distributed it to each node in layer 2, by following the parameter multiplication backwards. Then on top of that, we multiplied by the gradient (since this defines to what extent / rate that error would have been propagated forwards).
Now that we've dealt with layer 3 and its interaction with layer 2, we move on to 2 and its interaction with 1. So, similarly, multiply each delta2 (there are 9 nodes in layer 2, so delta2 should have 9 elements) with each node in layer 1. This gives a 9x9 matrix reflecting how each node in layer 1, through parameters Theta1, gave rise to the errors for each node of layer 2. However, because we do not care about the contributions to the 'bias' node of layer 2, we remove this part from Delta1, which leaves us with an 8x9 matrix, exactly the same size as Theta1.
In theory you could have also backpropagated delta2 to find a delta1, but since we have no use for it, we skip this step. After all, what we really care about is the error in the parameters, not in the nodes. (i.e. we only care about the errors in the nodes at each step because we need them to get the errors in the parameters from the layer before).
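Putting this together, a minimal sketch of what the backpropagation part of cost2 could look like under this reading (an illustration only, assuming the 8-9-1 layout from the question, with Delta1 and Delta2 initialised to zeros of the same sizes as Theta1 and Theta2 rather than to 0):
Delta1 = zeros(size(Theta1));                     % 8x9, same size as Theta1
Delta2 = zeros(size(Theta2));                     % 1x9, same size as Theta2
for i = 1:m
  a1 = X(i, :);                                   % 1x9 (bias already included in X)
  a2 = [1, (1 ./ (1 + exp(-(Theta1 * a1'))))'];   % 1x9 hidden activations with bias
  a3 = 1 ./ (1 + exp(-(Theta2 * a2')));           % 1x1 output
  delta3 = a3 - y(i);                             % 1x1 output-node error
  delta2 = (Theta2 * delta3) .* a2 .* (1 - a2);   % 1x9 hidden-node errors (incl. bias)
  Delta2 = Delta2 + delta3 * a2;                  % 1x9, accumulates errors for Theta2
  Delta1 = Delta1 + delta2(2:end)' * a1;          % 8x9, bias node dropped, accumulates for Theta1
endfor
gradient = (1/m) * [Delta1(:); Delta2(:)];        % 72 + 9 = 81x1, same layout as thetaVec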

heterogeneous class recognition with ANN / MLP

I have put together a classifying 3-layer artificial neural network that appears to work on other datasets. Playing around with some artificial datasets that I made, I was unable to correctly predict between two classes when one class was positive in one feature or in another feature.
Clearly class 1 can be identified by asking whether either feature 1 or feature 2 is equal to 1, but I can't get the algorithm to predict the dataset correctly (there are 20 examples following this pattern in the dataset).
Can ANN/MLPs recognize this type of pattern? If so, what am I missing? If not, are there other methods that can predict this type of pattern (maybe SVM)?
I used Octave, as that was what was used in the online course offered on Coursera. I have listed most of the code here, although it is structured slightly differently when I run it. As you can see, I do use bias units on the first and second layers, and I have also varied the number of hidden units in the second layer from 1 to 5 with no improvement over random guessing.
% Load dataset
y = [1; 1; 2; 2]
X = [1, 0; 0, 1; 0, 0; 0, 0]
m = size(X, 1);
Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), hidden_layer_size, (input_layer_size + 1));
Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), num_labels, (hidden_layer_size + 1));
% Randomly initialize weight parameters
initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size);
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels);
initial_nn_params = [initial_Theta1(:) ; initial_Theta2(:)];
% Add bias units to layers and feedforward
Xbias = [ones(m,1), X];
L2bias = [ones(m,1), sigmoid(Xbias*Theta1')];
L3 = sigmoid(L2bias * Theta2');
% Create class matrix Y
Y = zeros(m, num_labels);
for r = 1:m;
Y(r, y(r)) = 1;
end
% Set cost function
J = (sum(sum(Y.*log(L3) + (1-Y).*log(1-L3))))/-m + lambda*(sum(sum((Theta1(:,2:columns(Theta1))).^2)) + sum(sum((Theta2(:,2:columns(Theta2))).^2)))/2/m;
% Initialize weight gradient matrices
D2 = zeros(rows(Theta2),columns(Theta2));
D1 = zeros(rows(Theta1),columns(Theta1));
% Calculate gradient with backpropagation
for t = 1:m;
a1 = [1 X(t,:)]';
z2 = Theta1*a1;
a2 = [1; sigmoid(z2)];
z3 = Theta2*a2;
a3 = sigmoid(z3);
d3 = a3 - Y(t,:)';
d2 = (Theta2'*d3)(2:end).*sigmoidGradient(z2);
D2 = D2 + d3*a2';
D1 = D1 + d2*a1';
end
Theta2_grad = D2/m;
Theta1_grad = D1/m;
Theta2_grad(:,2:end) = Theta2_grad(:,2:end) + lambda*Theta2(:,2:end)/m;
Theta1_grad(:,2:end) = Theta1_grad(:,2:end) + lambda*Theta1(:,2:end)/m;
% Unroll gradients
grad = [Theta1_grad(:) ; Theta2_grad(:)];
% Compute cost (Feed forward)
[J,grad] = nnCostFunction(initial_nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda);
% Create "short hand" for the cost function to be minimized using fmincg
costFunction = @(p) nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, lambda);
% Train the neural network using fmincg
options = optimset('MaxIter', 1000);
[nn_params, cost] = fmincg(costFunction, initial_nn_params, options);
% Obtain Theta1 and Theta2 back from nn_params
Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), hidden_layer_size, (input_layer_size + 1));
Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), num_labels, (hidden_layer_size + 1));
A neural network can recognize any such pattern; the Universal Approximation Theorem (as well as many other results) proves that.
The most obvious reason I can think of is a lack of bias neurons, although for a more valuable answer you would have to include your code.
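For intuition, the pattern described (class 1 whenever feature 1 or feature 2 equals 1) is essentially a logical OR, which even a single sigmoid unit with a bias weight can represent. A quick, purely illustrative sanity check in Octave with hand-picked (not learned) weights:
sigmoid = @(z) 1 ./ (1 + exp(-z));
X = [1, 0; 0, 1; 0, 0; 0, 0];            % the four example rows from the question
w = [-10; 20; 20];                       % [bias; w1; w2], chosen by hand
p = sigmoid([ones(size(X,1),1), X] * w)  % ~[1; 1; 0; 0]: high for class 1, low for class 2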

Neural Networks: Sigmoid Activation Function for continuous output variable

Okay, so I am in the middle of Andrew Ng's machine learning course on Coursera and would like to adapt the neural network which I completed as part of assignment 4.
In particular, the neural network which I had completed correctly as part of the assignment was as follows:
Sigmoid activation function: g(z) = 1/(1+e^(-z))
10 output units, each which could take 0 or 1
1 hidden layer
Back-propagation method used to minimize cost function
Cost function:
J(Theta) = -(1/m) * sum_{i=1..m} sum_{k=1..K} [ y_k^(i) * log(h(x^(i))_k) + (1 - y_k^(i)) * log(1 - h(x^(i))_k) ] + (lambda/(2m)) * sum_{l=1..L-1} sum_{i=1..s_l} sum_{j=1..s_(l+1)} (Theta_ji^(l))^2
where L = number of layers, s_l = number of units in layer l, m = number of training examples, K = number of output units
Now I want to adjust the exercise so that there is one continuous output unit that takes any value in [0,1], and I am trying to work out what needs to change. So far I have:
Replaced the data with my own, i.e. such that the output is a continuous variable between 0 and 1
Updated references to the number of output units
Updated the cost function in the back-propagation algorithm to:
J = (1/(2m)) * sum_{i=1..m} (a_3^(i) - y^(i))^2
where a_3 is the value of the output unit determined from forward propagation.
I am certain that something else must change, as the gradient checking method shows that the gradient determined by back-propagation and the one given by the numerical approximation no longer match up. I did not change the sigmoid gradient; it is left at f(z)*(1-f(z)), where f(z) is the sigmoid function 1/(1+e^(-z)). Nor did I update the numerical approximation of the derivative; it is simply (J(theta+e) - J(theta-e))/(2e).
Can anyone advise of what other steps would be required?
Coded in Matlab as follows:
% FORWARD PROPAGATION
% input layer
a1 = [ones(m,1),X];
% hidden layer
z2 = a1*Theta1';
a2 = sigmoid(z2);
a2 = [ones(m,1),a2];
% output layer
z3 = a2*Theta2';
a3 = sigmoid(z3);
% BACKWARD PROPAGATION
delta3 = a3 - y;
delta2 = delta3*Theta2(:,2:end).*sigmoidGradient(z2);
Theta1_grad = (delta2'*a1)/m;
Theta2_grad = (delta3'*a2)/m;
% COST FUNCTION
J = 1/(2 * m) * sum( (a3-y).^2 );
% Implement regularization with the cost function and gradients.
Theta1_grad(:,2:end) = Theta1_grad(:,2:end) + Theta1(:,2:end)*lambda/m;
Theta2_grad(:,2:end) = Theta2_grad(:,2:end) + Theta2(:,2:end)*lambda/m;
J = J + lambda/(2*m)*( sum(sum(Theta1(:,2:end).^2)) + sum(sum(Theta2(:,2:end).^2)));
I have since realised that this question is similar to that asked by @Mikhail Erofeev on Stack Overflow; however, in this case I wish the continuous variable to be between 0 and 1 and therefore use a sigmoid function.
First, your cost function should be:
J = 1/m * sum( (a3-y).^2 );
I think your Theta2_grad = (delta3'*a2)/m; is expected to match the numerical approximation once delta3 is changed to delta3 = 1/2 * (a3 - y);.
Check this slide for more details.
EDIT:
In case there is some minor discrepancy between our codes, I have pasted my code below for your reference. The code has already been compared with the numerical approximation function checkNNGradients(lambda); the relative difference is less than 1e-4 (it does not meet the 1e-11 requirement by Dr. Andrew Ng, though).
function [J grad] = nnCostFunctionRegression(nn_params, ...
input_layer_size, ...
hidden_layer_size, ...
num_labels, ...
X, y, lambda)
Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
hidden_layer_size, (input_layer_size + 1));
Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
num_labels, (hidden_layer_size + 1));
m = size(X, 1);
J = 0;
Theta1_grad = zeros(size(Theta1));
Theta2_grad = zeros(size(Theta2));
X = [ones(m, 1) X];
z1 = sigmoid(X * Theta1');
zs = z1;
z1 = [ones(m, 1) z1];
z2 = z1 * Theta2';
ht = sigmoid(z2);
y_recode = zeros(length(y),num_labels);
for i=1:length(y)
y_recode(i,y(i))=1;
end
y = y_recode;
regularization=lambda/2/m*(sum(sum(Theta1(:,2:end).^2))+sum(sum(Theta2(:,2:end).^2)));
J=1/(m)*sum(sum((ht - y).^2))+regularization;
delta_3 = 1/2*(ht - y);
delta_2 = delta_3 * Theta2(:,2:end) .* sigmoidGradient(X * Theta1');
delta_cap2 = delta_3' * z1;
delta_cap1 = delta_2' * X;
Theta1_grad = ((1/m) * delta_cap1)+ ((lambda/m) * (Theta1));
Theta2_grad = ((1/m) * delta_cap2)+ ((lambda/m) * (Theta2));
Theta1_grad(:,1) = Theta1_grad(:,1)-((lambda/m) * (Theta1(:,1)));
Theta2_grad(:,1) = Theta2_grad(:,1)-((lambda/m) * (Theta2(:,1)));
grad = [Theta1_grad(:) ; Theta2_grad(:)];
end
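For reference, a minimal central-difference gradient check of the kind referred to above might look like the sketch below (this is not the course's checkNNGradients; costFunc stands for any handle returning [J, grad], for example the function above with its extra arguments bound, and theta is the unrolled parameter vector):
e = 1e-4;                                   % perturbation size
numgrad = zeros(size(theta));
for p = 1:numel(theta)
  perturb = zeros(size(theta));
  perturb(p) = e;
  numgrad(p) = (costFunc(theta + perturb) - costFunc(theta - perturb)) / (2 * e);
end
[~, grad] = costFunc(theta);                % analytic gradient from back-propagation
relative_difference = norm(numgrad - grad) / norm(numgrad + grad)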
If you want to have a continuous output, try not applying the sigmoid activation when computing the output value:
a1 = [ones(m, 1) X];
a2 = sigmoid(a1 * Theta1');
a2 = [ones(m, 1) a2];
a3 = a2 * Theta2';   % no sigmoid on the output layer
ht = a3;
Normalize the input before using it in nnCostFunction. Everything else remains the same.
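A minimal normalization sketch (my own illustration of that step; here a per-column z-score of an m-by-n X, written with bsxfun so it also works on older MATLAB versions without implicit broadcasting):
mu = mean(X);                      % 1 x n column means
sigma = std(X);                    % 1 x n column standard deviations
sigma(sigma == 0) = 1;             % avoid division by zero for constant features
Xnorm = bsxfun(@rdivide, bsxfun(@minus, X, mu), sigma);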

Octave backpropagation implementation issues

I wrote code to implement steepest descent backpropagation, with which I am having issues. I am using the Machine CPU dataset and have scaled the inputs and outputs into the range [0, 1].
The code in MATLAB/Octave is as follows:
steepest descent backpropagation
%SGD = Steepest Gradient Descent
function weights = nnSGDTrain (X, y, nhid_units, gamma, max_epoch, X_test, y_test)
iput_units = columns (X);
oput_units = columns (y);
n = rows (X);
W2 = rand (nhid_units + 1, oput_units);
W1 = rand (iput_units + 1, nhid_units);
train_rmse = zeros (1, max_epoch);
test_rmse = zeros (1, max_epoch);
for (epoch = 1:max_epoch)
delW2 = zeros (nhid_units + 1, oput_units)';
delW1 = zeros (iput_units + 1, nhid_units)';
for (i = 1:rows(X))
o1 = sigmoid ([X(i,:), 1] * W1); %1xn+1 * n+1xk = 1xk
o2 = sigmoid ([o1, 1] * W2); %1xk+1 * k+1xm = 1xm
D2 = o2 .* (1 - o2);
D1 = o1 .* (1 - o1);
e = (y_test(i,:) - o2)';
delta2 = diag (D2) * e; %mxm * mx1 = mx1
delta1 = diag (D1) * W2(1:(end-1),:) * delta2; %kxm * mx1 = kx1
delW2 = delW2 + (delta2 * [o1 1]); %mx1 * 1xk+1 = mxk+1 %already transposed
delW1 = delW1 + (delta1 * [X(i, :) 1]); %kx1 * 1xn+1 = k*n+1 %already transposed
end
delW2 = gamma .* delW2 ./ n;
delW1 = gamma .* delW1 ./ n;
W2 = W2 + delW2';
W1 = W1 + delW1';
[dummy train_rmse(epoch)] = nnPredict (X, y, nhid_units, [W1(:);W2(:)]);
[dummy test_rmse(epoch)] = nnPredict (X_test, y_test, nhid_units, [W1(:);W2(:)]);
printf ('Epoch: %d\tTrain Error: %f\tTest Error: %f\n', epoch, train_rmse(epoch), test_rmse(epoch));
fflush (stdout);
end
weights = [W1(:);W2(:)];
% plot (1:max_epoch, test_rmse, 1);
% hold on;
plot (1:max_epoch, train_rmse(1:end), 2);
% hold off;
end
predict
%Now SFNN Only
function [o1 rmse] = nnPredict (X, y, nhid_units, weights)
iput_units = columns (X);
oput_units = columns (y);
n = rows (X);
W1 = reshape (weights(1:((iput_units + 1) * nhid_units),1), iput_units + 1, nhid_units);
W2 = reshape (weights((((iput_units + 1) * nhid_units) + 1):end,1), nhid_units + 1, oput_units);
o1 = sigmoid ([X ones(n,1)] * W1); %nxiput_units+1 * iput_units+1xnhid_units = nxnhid_units
o2 = sigmoid ([o1 ones(n,1)] * W2); %nxnhid_units+1 * nhid_units+1xoput_units = nxoput_units
rmse = RMSE (y, o2);
end
RMSE function
function rmse = RMSE (a1, a2)
rmse = sqrt (sum (sum ((a1 - a2).^2))/rows(a1));
end
I have also trained the same dataset using the R RSNNS package's mlp, and the RMSE for the train set (first 100 examples) is around 0.03. But in my implementation I cannot achieve an RMSE lower than 0.14. Sometimes the errors grow for higher learning rates, and no learning rate gets me an RMSE below 0.14. A paper I referred to also reports an RMSE of around 0.03 for the train set.
I wanted to know where the problem in the code is. I have followed Raul Rojas' book and confirmed that things are okay.
In the backpropagation code, the line
e = (y_test(i,:) - o2)';
is not correct, because o2 is the output computed from a training example while the difference is being taken against an example from the test set, y_test. The line should have been:
e = (y(i,:) - o2)';
which correctly finds the difference between the output predicted by the current model and the target output of the corresponding training example.
It took me 3 days to find this one; I am fortunate enough to have found this freaking bug, which had stopped me from going on to further modifications.